def np_conv(na, nw, padding, stride=1): batch, in_channel, in_height, in_width = na.shape _, num_filter, kernel_h, kernel_w = nw.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w)) pad_h = pad_top + pad_bottom pad_w = pad_left + pad_right out_channel = num_filter out_height = (in_height - kernel_h + pad_h) // stride_h + 1 out_width = (in_width - kernel_w + pad_w) // stride_w + 1 nb = np.zeros((batch, out_channel, out_height, out_width)) for n in range(batch): for f in range(out_channel): for c in range(in_channel): if pad_h > 0 or pad_w > 0: apad = np.zeros((in_height + pad_h, in_width + pad_w)) apad[pad_top : pad_top + in_height, pad_left : pad_left + in_width] = na[n, c] else: apad = na[n, c] out = scipy.signal.convolve2d(apad, np.rot90(np.rot90(nw[f, c])), mode="valid") nb[n, f] += out[::stride, ::stride] return nb
def get_ref_data(): out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype) input_np = np.random.uniform(size=in_shape).astype(dtype) dilated_out_grad_np = tvm.topi.testing.dilate_python( out_grad_np, [1, stride_h, stride_w, 1]) pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( [padding_h, padding_w], (filter_h, filter_w)) padded_input_np = np.zeros( (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_channel)) padded_input_np[:, pad_top:in_h + pad_top, pad_left:in_w + pad_left, :] = input_np weight_grad_np = np.zeros( (filter_h, filter_w, in_channel, channel_multiplier)) for c in range(in_channel): for m in range(channel_multiplier): for b in range(batch): weight_grad_np[:, :, c, m] += signal.convolve2d( padded_input_np[b, :, :, c], np.rot90( dilated_out_grad_np[b, :, :, c * channel_multiplier + m % channel_multiplier], 2, ), mode="valid", )[0:filter_h, 0:filter_w] return (out_grad_np, input_np, weight_grad_np)
def conv2d_hwcn_python(a_np, w_np, stride, padding): """Convolution operator in HWCN layout. Parameters ---------- a_np : numpy.ndarray 4-D with shape [in_height, in_width, in_channel, batch] w_np : numpy.ndarray 4-D with shape [filter_height, filter_width, in_channel, num_filter] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str or a list/tuple of 2 or 4 ints Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 2 ints Returns ------- b_np : np.ndarray 4-D with shape [out_height, out_width, out_channel, batch] """ in_height, in_width, in_channel, batch = a_np.shape kernel_h, kernel_w, _, num_filter = w_np.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) pad_h = pad_top + pad_bottom pad_w = pad_left + pad_right # compute the output shape out_channel = num_filter out_height = (in_height - kernel_h + pad_h) // stride_h + 1 out_width = (in_width - kernel_w + pad_w) // stride_w + 1 # change the layout from HWCN to NCHW at = a_np.transpose((3, 2, 0, 1)) wt = w_np.transpose((3, 2, 0, 1)) bt = np.zeros((batch, out_channel, out_height, out_width)) # computation for n in range(batch): for f in range(out_channel): for c in range(in_channel): if pad_h > 0 or pad_w > 0: apad = np.zeros((in_height + pad_h, in_width + pad_w)) apad[pad_top:pad_top + in_height, pad_left:pad_left + in_width] = at[n, c] else: apad = at[n, c] out = scipy.signal.convolve2d(apad, np.rot90(np.rot90(wt[f, c])), mode="valid") bt[n, f] += out[::stride, ::stride] return bt.transpose((2, 3, 1, 0))
def conv2d_grad(orig, grad): """Gradient of conv2d""" attrs = orig.attrs data, weight = orig.args data_shape = get_const_tuple(data.checked_type.shape) weight_shape = get_const_tuple(weight.checked_type.shape) _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape) _, _, in_h, in_w = data_shape _, _, filter_h, filter_w = weight_shape # infer output_padding fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( get_const_tuple(attrs.padding), (filter_h, filter_w)) stride_h, stride_w = get_const_tuple(attrs.strides) out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w output_padding = (in_h - out_h, in_w - out_w) assert attrs.data_layout == "NCHW", "only support NCHW data layout" assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout" assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout" if attrs.out_dtype in ["", None]: assert data.checked_type, "Call InferType first." out_dtype = data.checked_type.dtype else: out_dtype = attrs.out_dtype backward_data = _nn.conv2d_transpose( grad, weight, strides=attrs.strides, padding=attrs.padding, dilation=attrs.dilation, groups=attrs.groups, output_padding=output_padding, out_dtype=out_dtype, ) backward_weight = _nn.conv2d_backward_weight( grad, data, strides=attrs.strides, padding=attrs.padding, dilation=attrs.dilation, groups=attrs.groups, channels=attrs.channels, kernel_size=(filter_h, filter_w), grad_layout=attrs.out_layout if attrs.out_layout else attrs.data_layout, data_layout=attrs.data_layout, kernel_layout=attrs.kernel_layout, out_dtype=out_dtype, ) return [backward_data, backward_weight]
def compile_depthwise_NHWC_int8_arm( batch, in_channel, in_size, kernel, depth_multiplier, stride, padding, add_bias=False, dilation=1, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right in_height = in_width = in_size A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int16") W = te.placeholder((kernel, kernel, in_channel, depth_multiplier), name="W", dtype="int16") bias = te.placeholder((in_channel * depth_multiplier, ), name="bias", dtype="int32") dtype = "int32" device = "llvm -device=arm_cpu -mtriple=aarch64-linux-gnu" compute = topi.arm_cpu.compute_depthwise_conv2d_nhwc schedule = topi.arm_cpu.schedule_depthwise_conv2d_nhwc if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Compiling on arm AArch64 target: %s" % device) with tvm.target.Target(device): assert topi.arm_cpu.arm_utils.is_aarch64_arm( ), "AArch64 target not recognized" C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C += bias ins_outs = [A, W, bias, C] else: ins_outs = [A, W, C] s = schedule([C]) func = tvm.build( s, ins_outs, device, name="depthwise_conv2d", )
def conv2d_transpose_packed(cfg, data, kernel, strides, padding, out_dtype, output_padding=(0, 0)): """Packed conv2d_transpose compute""" ishape = get_const_tuple(data.shape) kshape = get_const_tuple(kernel.shape) b, c_i, i_h, i_w, t_b, t_ci = ishape c_o, _, k_h, k_w, t_co, t_ci = kshape stride_h, stride_w = strides opad_h, opad_w = output_padding # FIXME(tmoreau89): currently IR pass breaks when output padding != (0,0) assert opad_h == 0 and opad_w == 0, "VTA does not support output padding for now" # derive padding parameters fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( padding, (k_h, k_w)) bpad_top = k_h - 1 - fpad_top bpad_bottom = k_h - 1 - fpad_bottom + opad_h bpad_left = k_w - 1 - fpad_left bpad_right = k_w - 1 - fpad_right + opad_w # padding stage dilated_input = topi.nn.dilate(data, [1, 1, stride_h, stride_w, 1, 1]) data_pad = topi.nn.pad(dilated_input, [0, 0, bpad_top, bpad_left, 0, 0], [0, 0, bpad_bottom, bpad_right, 0, 0]) # convolution transpose stage out_h = (i_h - 1) * stride_h - fpad_top - fpad_bottom + k_h + opad_h out_w = (i_w - 1) * stride_w - fpad_left - fpad_right + k_w + opad_w oshape = (b, c_o, out_h, out_w, t_b, t_co) d_c = te.reduce_axis((0, c_i), name="d_c") d_h = te.reduce_axis((0, k_h), name="d_h") d_w = te.reduce_axis((0, k_w), name="d_w") d_ci = te.reduce_axis((0, t_ci), name="d_ci") out = te.compute( oshape, lambda i_n, i_c, i_h, i_w, j_n, j_c: te.sum( data_pad(i_n, d_c, i_h + d_h, i_w + d_w, j_n, d_ci).astype( out_dtype) * kernel[i_c, d_c, d_h, d_w, j_c, d_ci].astype( out_dtype), axis=[d_c, d_h, d_w, d_ci], ), tag="packed_conv2d_transpose", name="res", ) cfg.add_flop(2 * np.prod(topi.utils.get_const_tuple(oshape)) * kshape[2] * kshape[3] * ishape[1] * ishape[-1]) return out
def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): """Depthwise convolution operator in NCHW layout. Parameters ---------- input_np : numpy.ndarray 4-D with shape [batch, in_channel, in_height, in_width] filter_np : numpy.ndarray 4-D with shape [in_channel, channel_multiplier, filter_height, filter_width] stride : list / tuple of 2 ints [stride_height, stride_width] padding : str 'VALID' or 'SAME' Returns ------- output_np : np.ndarray 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_channel, in_height, in_width = input_np.shape _, channel_multiplier, filter_height, filter_width = filter_np.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (filter_height, filter_width)) pad_h = pad_top + pad_bottom pad_w = pad_left + pad_right out_channel = in_channel * channel_multiplier out_height = (in_height - filter_height + pad_h) // stride_h + 1 out_width = (in_width - filter_width + pad_w) // stride_w + 1 output_np = np.zeros((batch, out_channel, out_height, out_width)) for i in range(batch): for j in range(out_channel): apad = input_np[i, j // channel_multiplier, :, :] if pad_h or pad_w: apad = np.pad(apad, [(pad_top, pad_bottom), (pad_left, pad_right)]) conv = _convolve2d( apad, np.rot90(filter_np[j // channel_multiplier, j % channel_multiplier, :, :], k=2), ) output_np[i, j, :, :] = conv[::stride_h, ::stride_w, ] return output_np
def conv_bwd(N, CI, HI, WI, CO, HO, WO, KSIZE, stride, padding, dtype): strides = (stride, stride) shape_data = (N, CI, HI, WI) shape_weight = (CO, CI, KSIZE, KSIZE) shape_grad_output = (N, CO, HO, WO) # given tensor data = te.placeholder(shape_data, name="data", dtype=dtype) weight = te.placeholder(shape_weight, name="weight", dtype=dtype) grad_output = te.placeholder(shape_grad_output, name="grad_output", dtype=dtype) # grad_data out_h = (HO - 1) * strides[0] - 2 * padding + KSIZE out_w = (WO - 1) * strides[1] - 2 * padding + KSIZE output_padding = (HI - out_h, WI - out_w) grad_data = topi.nn.conv2d_transpose_nchw(grad_output, weight, strides, padding, dtype, output_padding) # grad_weight dilation_h, dilation_w = (1, 1) batch, in_channel, in_h, in_w = shape_data out_channel, _, filter_h, filter_w = shape_weight grad_output_tmp = topi.tile(grad_output, [1, in_channel, 1, 1]) grad_output_tmp = topi.reshape( grad_output_tmp, [batch * in_channel * out_channel, 1, HO, WO]) data_tmp = topi.reshape(data, [1, in_channel * batch, HI, WI]) grad_weight = topi.nn.group_conv2d_nchw(data_tmp, grad_output_tmp, stride=(dilation_h, dilation_w), padding=padding, dilation=strides, groups=in_channel * batch, out_dtype=dtype) # infer shape of grad_weight _, _, grad_h, grad_w = shape_grad_output fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( padding, (filter_h, filter_w)) padded_weight_grad_h = (in_h - (grad_h - 1) * strides[0] - 1 + fpad_top + fpad_bottom) // dilation_h + 1 padded_weight_grad_w = (in_w - (grad_w - 1) * strides[1] - 1 + fpad_left + fpad_right) // dilation_w + 1 grad_weight = topi.reshape(grad_weight, [ batch, in_channel, out_channel, padded_weight_grad_h, padded_weight_grad_w ]) grad_weight = topi.sum(grad_weight, axis=0) grad_weight = topi.transpose(grad_weight, [1, 0, 2, 3]) if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w: grad_weight = topi.strided_slice( grad_weight, begin=[0, 0, 0, 0], end=[out_channel, in_channel, filter_h, filter_w]) return [data, weight, grad_output, grad_data, grad_weight] return [data, weight, grad_output, grad_data, grad_weight]
def _conv2d_nchw_python(a_np, w_np, stride, padding): """Convolution operator in NCHW layout. Parameters ---------- a_np : numpy.ndarray 4-D with shape [batch, in_channel, in_height, in_width] w_np : numpy.ndarray 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str or a list/tuple of 2 or 4 ints Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 2 ints Returns ------- b_np : np.ndarray 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_channel, in_height, in_width = a_np.shape num_filter, _, kernel_h, kernel_w = w_np.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel_h, kernel_w)) pad_h = pad_top + pad_bottom pad_w = pad_left + pad_right # compute the output shape out_channel = num_filter out_height = (in_height - kernel_h + pad_h) // stride_h + 1 out_width = (in_width - kernel_w + pad_w) // stride_w + 1 b_np = np.zeros((batch, out_channel, out_height, out_width)) # computation for n in range(batch): for f in range(out_channel): for c in range(in_channel): if pad_h > 0 or pad_w > 0: apad = np.zeros((in_height + pad_h, in_width + pad_w)) apad[pad_top:pad_top + in_height, pad_left:pad_left + in_width] = a_np[n, c] else: apad = a_np[n, c] out = scipy.signal.convolve2d(apad, np.rot90(np.rot90(w_np[f, c])), mode="valid") b_np[n, f] += out[::stride_h, ::stride_w] return b_np
def make_ethosu_conv2d( ifm, ifm_channels, ofm_channels, kernel_shape, padding, strides, dilation, lut=relay.const([], dtype="int8"), activation="NONE", ifm_layout="NHWC", ofm_layout="NHWC", weight_dtype="int8", scale_bias_dtype="uint8", rounding_mode="TFL", upscale="NONE", ): # conv params weight_shape = (ofm_channels, kernel_shape[0], kernel_shape[1], ifm_channels) padding = get_pad_tuple(padding, kernel_shape) scale_bias_data = generate_weights_data((weight_shape[0], 10), scale_bias_dtype) scale_bias = relay.const(scale_bias_data, dtype=scale_bias_dtype) weight_data = generate_weights_data(weight_shape, weight_dtype) weight = relay.const(weight_data, dtype=weight_dtype) conv = ethosu_ops.ethosu_conv2d( ifm, weight, scale_bias, lut=lut, ifm_scale=0.5, ifm_zero_point=10, weight_zero_point=12, ofm_scale=0.25, ofm_zero_point=14, kernel_shape=kernel_shape, ofm_channels=ofm_channels, strides=strides, padding=padding, dilation=dilation, activation=activation, clip_min=10 if activation == "CLIP" else 0, clip_max=100 if activation == "CLIP" else 0, rounding_mode=rounding_mode, upscale=upscale, ifm_layout=ifm_layout, ofm_layout=ofm_layout, ) return conv
def make_ethosu_depthwise_conv2d( ifm, channels, kernel_shape, padding, strides, dilation, activation="NONE", ifm_layout="NHWC", ofm_layout="NHWC", weight_dtype="int8", scale_bias_dtype="uint8", rounding_mode="TFL", ): # params weight_shape = (channels, kernel_shape[0], kernel_shape[1], 1) padding = get_pad_tuple(padding, kernel_shape) scale_bias_data = generate_weights_data((weight_shape[0], 10), scale_bias_dtype) scale_bias = relay.const(scale_bias_data, dtype=scale_bias_dtype) weight_data = generate_weights_data(weight_shape, weight_dtype) weight = relay.const(weight_data, dtype=weight_dtype) depthwise = ethosu_ops.ethosu_depthwise_conv2d( ifm, weight, scale_bias, lut=relay.const([], dtype="int8"), ifm_scale=0.6, ifm_zero_point=11, weight_zero_point=13, ofm_scale=0.26, ofm_zero_point=15, kernel_shape=kernel_shape, ofm_channels=channels, strides=strides, padding=padding, dilation=dilation, activation=activation, clip_min=15 if activation == "CLIP" else 0, clip_max=105 if activation == "CLIP" else 0, rounding_mode=rounding_mode, upscale="NONE", ifm_layout=ifm_layout, ofm_layout=ofm_layout, ) return depthwise
def get_ref_data(): out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) dilated_out_grad_np = tvm.topi.testing.dilate_python( out_grad_np, [1, stride_h, stride_w, 1]) # padding params in forward propagation fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( [padding_h, padding_w], (filter_h, filter_w)) # padding params in backward propagation bpad_top = filter_h - 1 - fpad_top bpad_bottom = (filter_h - 1 - fpad_bottom) + (stride_h - 1) bpad_left = filter_w - 1 - fpad_left bpad_right = (filter_w - 1 - fpad_right) + (stride_w - 1) padded_out_grad = np.zeros(( batch, dilated_out_grad_np.shape[1] + bpad_top + bpad_bottom, dilated_out_grad_np.shape[2] + bpad_left + bpad_right, out_channel, )) padded_out_grad[:, bpad_top:dilated_out_grad_np.shape[1] + bpad_top, bpad_left:dilated_out_grad_np.shape[2] + bpad_left, :, ] = dilated_out_grad_np in_grad_np = np.zeros((batch, in_h, in_w, in_channel)) for b in range(batch): for c in range(in_channel): for m in range(channel_multiplier): in_grad_np[b, :, :, c] += signal.convolve2d( padded_out_grad[b, :, :, c * channel_multiplier + m], filter_np[:, :, c, m], mode="valid", )[0:in_h, 0:in_w] return (out_grad_np, filter_np, in_grad_np)
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): """Compute function for Cortex-M7 SIMD implementation of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 if isinstance(strides, int): stride_h = stride_w = strides else: stride_h, stride_w = strides if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation batch_size, in_height, in_width, in_channels = data.shape kernel_h, kernel_w, out_channels, _ = kernel.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 pad_top, pad_left, pad_down, pad_right = get_pad_tuple( padding, (dilated_kernel_h, dilated_kernel_w)) out_height = simplify( (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) out_width = simplify( (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] padded_data = pad(data, pad_before, pad_after, name="padded_data") rc = te.reduce_axis((0, in_channels), name="rc") ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") conv = te.compute( (batch_size, out_height, out_width, out_channels), lambda nn, yy, xx, ff: te.sum( padded_data[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * kernel[ ry, rx, ff, rc].astype(out_dtype), axis=[ry, rx, rc], ), name="conv2d", tag="conv2d_nhwc", ) ########################### # Config Space Definition # ########################### n, oh, ow, co = ( cfg.axis(batch_size.value), cfg.axis(out_height.value), cfg.axis(out_width.value), cfg.axis(out_channels.value), ) kh, kw, ci = ( cfg.reduce_axis(kernel_h.value), cfg.reduce_axis(kernel_w.value), cfg.reduce_axis(in_channels.value), ) owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) cio, cii = cfg.define_split( "tile_ci", ci, policy="factors", num_outputs=2, # TODO: check case with in_channels.value % 4 != 0 with AutoTVM filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0, ) coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) cfg.define_reorder( "reorder_0_simd", [n, oh, owo, owi, coo, coi, kh, kw, cio, cii], policy="candidate", candidate=[ [n, oh, kh, kw, owo, coo, cio, owi, coi, cii], [n, oh, kh, kw, coo, owo, cio, owi, coi, cii], [n, kh, kw, oh, owo, coo, cio, owi, coi, cii], [n, kh, kw, oh, coo, owo, cio, owi, coi, cii], ], ) cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) cfg.define_knob("unroll_explicit", [0, 1]) if cfg.is_fallback: cfg.fallback_split("tile_ow", [-1, out_width.value]) cfg.fallback_split("tile_ci", [-1, in_channels.value]) cfg.fallback_split("tile_co", [-1, out_channels.value]) return conv
def test_conv2d_nchw( self, hexagon_session: Session, batch, in_channel, in_size, num_filter, kernel, stride, padding, dtype, ref_data, dilation, add_bias, apply_relu, ): target_hexagon = tvm.target.hexagon("v68") pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right a_np, w_np, b_np, c_np = ref_data A = te.placeholder(a_np.shape, name="A", dtype=dtype) W = te.placeholder(w_np.shape, name="W", dtype=dtype) bias = te.placeholder(b_np.shape, name="bias", dtype=dtype) if "int" in dtype: tol = {"atol": 0, "rtol": 0} elif dtype == "float32": tol = {"rtol": 1e-4, "atol": 2e-4} elif dtype == "float16": # A summation in float16 with a single accumulator very # quickly runs into large rounding errors. At some point, # this tolerance should be schedule-dependent for to avoid # false negatives. num_values_summed = in_channel * kernel * kernel gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max() tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2} with tvm.target.Target(target_hexagon): fcompute = topi.nn.conv2d_nchw fschedule = topi.hexagon.schedule_conv2d_nchw C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if apply_relu: C = topi.nn.relu(C) s = fschedule([C]) func_name = "conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format( dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation, ) func = tvm.build( s, [A, W, bias, C], tvm.target.Target(target_hexagon, host=target_hexagon), name=func_name, ) mod = hexagon_session.load_module(func) dev = hexagon_session.device a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) mod[func_name](a, w, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
def depthwise_conv2d_with_workload_nchw( batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1 ): in_width = in_height filter_channel = in_channel filter_width = filter_height stride_h = stride_w = stride if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub padt, padl, padb, padr = get_pad_tuple(padding, (filter_height, filter_width)) padding_args = (padt, padl, padb, padr) else: padding_args = padding # placeholder Input = te.placeholder((batch, in_channel, in_height, in_width), name="Input") Filter = te.placeholder( (filter_channel, channel_multiplier, filter_height, filter_width), name="Filter" ) Scale = te.placeholder((in_channel * channel_multiplier,), name="Scale") Shift = te.placeholder((in_channel * channel_multiplier,), name="Shift") dtype = "float32" def check_target(target, dev): print("Running on target: %s" % target) impl_list = tvm.topi.testing.dispatch(target, _depthwise_conv2d_nchw_implement)[:] if target == "llvm" and channel_multiplier == 1 and dilation == 1: impl_list.append( (topi.x86.depthwise_conv2d_nchw, topi.x86.schedule_depthwise_conv2d_nchw) ) for fcompute, fschedule in impl_list: with tvm.target.Target(target): # declare DepthwiseConv2d = fcompute( Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype ) ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # schedule s1 = fschedule(DepthwiseConv2d) s2 = fschedule(ScaleShift) s3 = fschedule(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], target) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], target) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], target) # Prepare pod type for test data closure input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) scale_shape = get_const_tuple(Scale.shape) shift_shape = get_const_tuple(Shift.shape) scale_shift_shape = get_const_tuple(ScaleShift.shape) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.nchw") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) dilated_filter_np = tvm.topi.testing.dilate_python( filter_np, (1, 1, dilation, dilation) ) scale_np = np.random.uniform(size=scale_shape).astype(dtype) shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nchw( input_np, dilated_filter_np, stride, padding ) scale_shift_scipy = np.zeros(shape=scale_shift_shape) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, c, :, :] = ( depthwise_conv2d_scipy[:, c, :, :] * scale_np[c] + shift_np[c] ) relu_scipy = np.maximum(scale_shift_scipy, 0) return ( input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy, ) # Get the test data ( input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy, ) = get_ref_data() def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape) wkl = _get_workload( Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype ) # check if tile_ow candidates are the factors of the right output weight. with tvm.target.Target(target): cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) if "llvm" in target: verify_workload_padding() input_tvm = tvm.nd.array(input_np, dev) filter_tvm = tvm.nd.array(filter_np, dev) scale_tvm = tvm.nd.array(scale_np, dev) shift_tvm = tvm.nd.array(shift_np, dev) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), dev, ) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), dev ) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), dev ) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, dev, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, dev, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, dev, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean tvm.testing.assert_allclose( depthwise_conv2d_tvm.numpy(), depthwise_conv2d_scipy, rtol=1e-5 ) tvm.testing.assert_allclose(scale_shift_tvm.numpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.numpy(), relu_scipy, rtol=1e-5) for target, dev in tvm.testing.enabled_targets(): with autotvm.tophub.context(target): # load tophub pre-tuned parameters check_target(target, dev)
def test_conv2d_nchw( self, target, dev, batch, in_channel, in_size, num_filter, kernel, stride, padding, dtype, ref_data, dilation, add_bias, apply_relu, ): target = tvm.target.Target(target) is_cudnn_target = target.kind.name == "cuda" and "cudnn" in target.attrs.get("libs", []) if target.kind.name == "vulkan" and dtype == "float16": if not target.attrs.get("supports_float16", False) or not target.attrs.get( "supports_16bit_buffer", False ): pytest.xfail("Vulkan device does not support float16") if ( target.kind.name == "cuda" and dtype == "float16" and not tvm.contrib.nvcc.have_fp16(dev.compute_version) ): pytest.xfail("CUDA float16 intrinsics not available") pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right has_asymmetric_padding = (pad_top != pad_bottom) or (pad_left != pad_right) if is_cudnn_target and has_asymmetric_padding: pytest.xfail("CuDNN does not support asymmetric padding") a_np, w_np, b_np, c_np = ref_data A = te.placeholder(a_np.shape, name="A", dtype=dtype) W = te.placeholder(w_np.shape, name="W", dtype=dtype) bias = te.placeholder(b_np.shape, name="bias", dtype=dtype) if "int" in dtype: tol = {"atol": 0, "rtol": 0} elif dtype == "float32": tol = {"rtol": 1e-4, "atol": 2e-4} elif dtype == "float16": # A summation in float16 with a single accumulator very # quickly runs into large rounding errors. At some point, # this tolerance should be schedule-dependent for to avoid # false negatives. num_values_summed = in_channel * kernel * kernel gap_size = np.nextafter(c_np.max(), np.inf, dtype=c_np.dtype) - c_np.max() tol = {"rtol": 1e-3, "atol": num_values_summed * gap_size / 2} with autotvm.tophub.context(target): # load tophub pre-tuned parameters if is_cudnn_target: fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn else: fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement(target) with target: if is_cudnn_target: C = fcompute( A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype ) else: C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if apply_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, bias, C], target, name="conv2d_{}_{}_{}_{}_{}_{}_{}_{}_{}".format( dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation, ), ) func(a, w, b, c) tvm.testing.assert_allclose(c.numpy(), c_np, **tol)
def verify_conv2d_hwnc( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, dtype="int4" ): """Test the conv2d with tensorcore for hwnc layout""" pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print( "Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation) ) # choose dtype from int4, int8 assert dtype in ["int4", "int8"] in_height = in_width = in_size A = te.placeholder((in_height, in_width, batch, in_channel), name="A", dtype=dtype) W = te.placeholder((kernel, kernel, num_filter, in_channel), name="W", dtype=dtype) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) @memoize("topi.tests.test_topi_conv2d_hwnc.verify_conv2d_hwnc") def get_ref_data(): if dtype == "int4": a_np = np.random.randint(low=-8, high=7, size=a_shape).transpose((2, 0, 1, 3)) w_np = np.random.randint(low=-8, high=7, size=w_shape) dw_np = topi.testing.dilate_python( w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation) ) elif dtype == "int8": a_np = ( np.random.randint(low=-128, high=127, size=a_shape) .transpose((2, 0, 1, 3)) .astype(dtype) ) w_np = np.random.randint(low=-128, high=127, size=w_shape).astype(dtype) dw_np = topi.testing.dilate_python( w_np.transpose((0, 1, 3, 2)), (1, 1, dilation, dilation) ) c_np = topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding) return a_np, w_np, c_np def convert_int32_into_int4(a_int32): """convert int32 values into int4 Parameters ---------- a_int32 : int Return ------ a_int4 : int """ I, J, K, L = a_int32.shape a_int4 = np.zeros(shape=(I, J, K, L // 8), dtype=np.int32) for i in range(I): for j in range(J): for k in range(K): for l in range(L // 8): for m in range(min(8, L - l * 8)): a_int4[i, j, k, l] = a_int4[i, j, k, l] | ( (a_int32[i, j, k, l * 8 + m] & 0xF) << ((7 - m) * 4) ) return a_int4 a_np, w_np, c_np = get_ref_data() if dtype == "int4": a_np = convert_int32_into_int4(a_np) w_np = convert_int32_into_int4(w_np) def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if not nvcc.have_tensorcore(dev.compute_version): print("skip because gpu does not support Tensor Cores") return print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = topi.testing.dispatch(target, _conv2d_hwnc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, dtype, "int32") s = fschedule([C]) a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), dev) w = tvm.nd.array(w_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol) check_target("cuda")
def compile_conv2d_NHWC_gemm_int8_arm( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter, ), name="bias", dtype="int8") dtype = "int32" devices = [ ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu", topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, ), ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, ), ( "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", topi.arm_cpu.compute_conv2d_NHWC_quantized_native, topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, ), # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension # ( # "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm", # topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, # topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, # ), ] for device_tuple in devices: target = device_tuple[0] compute = device_tuple[1] schedule = device_tuple[2] dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Compiling on arm AArch64 target: %s" % target) with tvm.target.Target(target): assert is_aarch64_arm(), "AArch64 target not recognized" C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = schedule([C]) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), )
def deformable_conv2d_nchw_python(a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups): """Deformable convolution operator in NCHW layout. Parameters ---------- a_np : numpy.ndarray 4-D with shape [batch, in_channel, in_height, in_width] offset_np : numpy.ndarray 4-D with shape [batch, deformable_groups * filter_height * filter_width * 2, out_height, out_width] w_np : numpy.ndarray 4-D with shape [num_filter, in_channel, filter_height, filter_width] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str or a list/tuple of 2 or 4 ints Padding size, or ['VALID', 'SAME'], or [pad_height, pad_width] for 2 ints, or [pad_top, pad_left, pad_bottom, pad_right] for 2 ints dilation : int or a list/tuple of two ints Dilation size, or [dilate_height, dilate_width] deformable_groups : int Number of deformable groups groups : int Number of groups Returns ------- b_np : np.ndarray 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_channel, in_height, in_width = a_np.shape out_channel, _, kernel_h, kernel_w = w_np.shape out_height, out_width = offset_np.shape[-2:] dtype = a_np.dtype ic_per_dgroup = in_channel // deformable_groups assert groups == 1, "deformable_conv2d_nchw_python does not support groups > 1" if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride pad_top, pad_left, _, _ = get_pad_tuple(padding, (kernel_h, kernel_w)) if isinstance(dilation, int): dilation_h = dilation_w = dilation else: dilation_h, dilation_w = dilation def _bilinear(n, c, h, w): low_h, low_w = int(h), int(w) high_h = min(low_h + 1, in_height - 1) high_w = min(low_w + 1, in_width - 1) y_lerp = h - low_h x_lerp = w - low_w bottom = (1 - x_lerp) * a_np[n, c, low_h, low_w] + x_lerp * a_np[ n, c, low_h, high_w] top = (1 - x_lerp) * a_np[n, c, high_h, low_w] + x_lerp * a_np[n, c, high_h, high_w] return (1 - y_lerp) * bottom + y_lerp * top a_deform = np.zeros( (batch, in_channel, out_height, out_width, kernel_h, kernel_w), dtype=dtype) for n, h, w in itertools.product(range(batch), range(out_height), range(out_width)): offset = offset_np[n, :, h, w].reshape(deformable_groups, kernel_h, kernel_w, 2) in_h = h * stride_h - pad_top in_w = w * stride_w - pad_left index_h_base, index_w_base = np.meshgrid( np.arange(in_h, in_h + kernel_h * dilation_h, dilation_h, dtype=offset_np.dtype), np.arange(in_w, in_w + kernel_w * dilation_w, dilation_w, dtype=offset_np.dtype), indexing="ij", ) for c, kh, kw in itertools.product(range(in_channel), range(kernel_h), range(kernel_w)): dg = c // ic_per_dgroup index_h = index_h_base + offset[dg, ..., 0] index_w = index_w_base + offset[dg, ..., 1] y, x = index_h[kh, kw], index_w[kh, kw] if y < 0 or y >= in_height or x < 0 or x >= in_width: continue a_deform[n, c, h, w, kh, kw] = _bilinear(n, c, y, x) b_np = np.zeros((batch, out_channel, out_height, out_width), dtype=dtype) for n, c, f, h, w in itertools.product(range(batch), range(in_channel), range(out_channel), range(out_height), range(out_width)): b_np[n, f, h, w] += np.tensordot(a_deform[n, c, h, w], w_np[f, c]) return b_np
def verify_conv2d_NHWC_gemm_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_height, in_width, in_channel), name="A", dtype="int8") W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter, ), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1)) c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding).astype(dtype) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) check_target("llvm")
def verify_conv2d_NCHWc_int8( in_dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) dtype = A.dtype out_dtype = "int32" if in_dtype == "int8" else "uint32" lo = -128 if in_dtype == "int8" else 0 hi = 127 if in_dtype == "int8" else 255 def check_target(target, compute, schedule, oc_block_factor, build_only): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return bias = te.placeholder( (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype=out_dtype) bias_shape = get_const_tuple(bias.shape) @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=lo, high=hi, size=a_shape).astype(out_dtype) w_np = np.random.randint(low=lo, high=hi, size=w_shape).astype(out_dtype) b_np = np.random.uniform(size=bias_shape).astype(out_dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python( a_np, dw_np, stride, padding).astype(out_dtype) # convert to NCHWc _, _, out_height, out_width = c_np.shape c_np = c_np.reshape( (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)).transpose(0, 1, 3, 4, 2) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(out_dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() with tvm.target.Target(target): C = compute( A, W, (stride, stride), padding, (dilation, dilation), "NCHW", "NCHW", out_dtype, ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = schedule([C]) a = tvm.nd.array(a_np.astype(dtype), dev) w = tvm.nd.array(w_np.astype(dtype), dev) b = tvm.nd.array(b_np.astype(out_dtype), dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: compile_args = [A, W, bias, C] run_args = [a, w, b, c] else: compile_args = [A, W, C] run_args = [a, w, c] func = tvm.build( s, compile_args, target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) if build_only: return print("Running on target: %s" % target) func(*run_args) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) targets = [ ( "cuda", lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8( a, w, s, p, d, l, o), topi.cuda.schedule_conv2d_NCHWc_int8, 4, False, ), # Disable on CI since it does not support spirv int8 dot product # ( # "vulkan -from_device=0", # lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8(a, w, s, p, d, l, o), # topi.cuda.schedule_conv2d_NCHWc_int8, # 4, # False, # ), ] build_only_aarch64 = platform.machine() != "aarch64" targets.append(( "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon,+v8.2a,+dotprod", topi.arm_cpu.conv2d_NCHWc_int8, topi.arm_cpu.schedule_conv2d_NCHWc_int8, 8, build_only_aarch64, )) if in_dtype == "int8": targets += [ ( "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon", topi.arm_cpu.conv2d_NCHWc_int8, topi.arm_cpu.schedule_conv2d_NCHWc_int8, 8, build_only_aarch64, ), ( "rocm -mattr=+dotprod", lambda a, w, s, p, d, l, ol, o: topi.cuda.conv2d_NCHWc_int8( a, w, s, p, d, l, o), topi.cuda.schedule_conv2d_NCHWc_int8, 4, False, ), ] for target, compute, schedule, oc_block_factor, build_only in targets: check_target(target, compute, schedule, oc_block_factor, build_only)
def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding, dilation=1): in_width = in_height filter_channel = in_channel filter_width = filter_height stride_w = stride_h if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) padding_args = (pad_h, pad_w) else: padding_args = padding # placeholder Input = te.placeholder((batch, in_height, in_width, in_channel), name="Input") Filter = te.placeholder( (filter_height, filter_width, filter_channel, channel_multiplier), name="Filter") Scale = te.placeholder((in_channel * channel_multiplier, ), name="Scale") Shift = te.placeholder((in_channel * channel_multiplier, ), name="Shift") dtype = "float32" def check_device(device, ctx): print("Running on target: %s" % device) fcompute, fschedule = tvm.topi.testing.dispatch( device, _depthwise_conv2d_nhwc_implement) with tvm.target.Target(device): # declare DepthwiseConv2d = fcompute(Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # schedule s1 = fschedule(DepthwiseConv2d) s2 = fschedule(ScaleShift) s3 = fschedule(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare pod type for test data closure input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) scale_shape = get_const_tuple(Scale.shape) shift_shape = get_const_tuple(Shift.shape) scale_shift_shape = get_const_tuple(ScaleShift.shape) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc.v2") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) dilated_filter_np = tvm.topi.testing.dilate_python( filter_np, (dilation, dilation, 1, 1)) scale_np = np.random.uniform(size=scale_shape).astype(dtype) shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nhwc( input_np, dilated_filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=scale_shift_shape) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, :, :, c] = ( depthwise_conv2d_scipy[:, :, :, c] * scale_np[c] + shift_np[c]) relu_scipy = np.maximum(scale_shift_scipy, 0) return ( input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy, ) # Get the test data ( input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy, ) = get_ref_data() # prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) for device, ctx in tvm.testing.enabled_targets(): with autotvm.tophub.context( device): # load tophub pre-tuned parameters check_device(device, ctx)
def verify_conv2d_NCHWc_int8( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype="int8") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype="int8") bias = te.placeholder( (num_filter // oc_block_factor, 1, 1, oc_block_factor), name="bias", dtype="int8") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) # convert to NCHWc _, _, out_height, out_width = c_np.shape c_np = c_np.reshape( (batch, num_filter // oc_block_factor, oc_block_factor, out_height, out_width)).transpose(0, 1, 3, 4, 2) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding, (dilation, dilation), "NCHW", dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_NCHWc_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) for target in ["cuda"]: check_target(target)
def verify_conv2d_nchw( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, use_cudnn=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # check if tile_ow candidates are the factors of the right output weight. cfg = autotvm.get_config() _fallback_schedule(cfg, wkl) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return print("Running on target: %s" % target) if "cudnn" in target: fcompute, fschedule = topi.cuda.conv2d_cudnn, topi.cuda.schedule_conv2d_cudnn else: fcompute, fschedule = tvm.topi.testing.get_conv2d_nchw_implement( target) with tvm.target.Target(target): if "cudnn" in target: C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), 1, "NCHW", dtype) else: C = fcompute(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) if "llvm" in target: verify_workload_padding() a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-4) for target, dev in tvm.testing.enabled_targets(): with autotvm.tophub.context( target): # load tophub pre-tuned parameters check_target(target) if use_cudnn: check_target("cuda -model=unknown -libs=cudnn")
def _conv2d_transpose_nchw_python(a_np, w_np, stride, padding, output_padding): """Transposed convolution operator in NCHW layout. Parameters ---------- a_np : numpy.ndarray 4-D with shape [batch, in_channel, in_height, in_width] w_np : numpy.ndarray 4-D with shape [in_channel, num_filter, filter_height, filter_width] stride : int or a list/tuple of two ints Stride size, or [stride_height, stride_width] padding : int or str Padding size, or ['VALID', 'SAME'] output_padding : int or a list/tuple of two ints Use to disambiguate the output shape. Returns ------- b_np : np.ndarray 4-D with shape [batch, out_channel, out_height, out_width] """ batch, in_c, in_h, in_w = a_np.shape _, out_c, filter_h, filter_w = w_np.shape if isinstance(stride, int): stride_h = stride_w = stride else: stride_h, stride_w = stride if isinstance(output_padding, int): opad_h = opad_w = output_padding else: opad_h, opad_w = output_padding assert opad_h < stride_h and opad_w < stride_w # dilate stage dilated_a_np = tvm.topi.testing.dilate_python(a_np, [1, 1, stride_h, stride_w]) # padding stage fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( padding, (filter_h, filter_w)) bpad_top = filter_h - 1 - fpad_top bpad_bottom = filter_h - 1 - fpad_bottom + opad_h bpad_left = filter_w - 1 - fpad_left bpad_right = filter_w - 1 - fpad_right + opad_w padded_a_np = np.zeros(( batch, in_c, dilated_a_np.shape[2] + bpad_top + bpad_bottom, dilated_a_np.shape[3] + bpad_left + bpad_right, )) padded_a_np[:, :, bpad_top:dilated_a_np.shape[2] + bpad_top, bpad_left:dilated_a_np.shape[3] + bpad_left, ] = dilated_a_np # convolution stage out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h + opad_h out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w + opad_w b_np = np.zeros((batch, out_c, out_h, out_w)) for n in range(batch): for f in range(out_c): for c in range(in_c): out = scipy.signal.convolve2d(padded_a_np[n, c], w_np[c, f], mode="valid") b_np[n, f] += out return b_np
def verify_conv2d_nchw_int8( in_dtype, batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A", dtype=in_dtype) W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W", dtype=in_dtype) bias = te.placeholder((num_filter, 1, 1), name="bias", dtype=in_dtype) a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.randint(low=-128, high=127, size=a_shape).astype(dtype) w_np = np.random.randint(low=-128, high=128, size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding).astype(dtype) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def verify_workload_padding(): _, _, out_height, out_width = get_const_tuple(c_np.shape) wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) # for testing functionality, # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, # regardless of the performance. int32_lanes, num_int8_elements = num_filter, in_channel # check if tile_ow candidates are the factors of the right output weight. cfg = autotvm.get_config() fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) ow_tile = np.prod(cfg["tile_ow"].size) tvm.testing.assert_allclose(ow_tile, out_width) def check_target(target): dev = tvm.device(target, 0) if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return if target == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % target) with tvm.target.Target(target): C = topi.cuda.conv2d_nchw_int8(A, W, (stride, stride), padding, (dilation, dilation), dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.cuda.schedule_conv2d_nchw_int8([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func = tvm.build( s, [A, W, bias, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], target, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5) verify_workload_padding() for target in ["cuda"]: check_target(target)
def conv2d_grad(orig, grad): """Gradient of conv2d""" attrs = orig.attrs data, weight = orig.args data_shape = get_const_tuple(data.checked_type.shape) weight_shape = get_const_tuple(weight.checked_type.shape) _, _, grad_h, grad_w = get_const_tuple(orig.checked_type.shape) batch, in_channel, in_h, in_w = data_shape out_channel, _, filter_h, filter_w = weight_shape # infer output_padding fpad_top, fpad_left, fpad_bottom, fpad_right = get_pad_tuple( get_const_tuple(attrs.padding), (filter_h, filter_w)) stride_h, stride_w = get_const_tuple(attrs.strides) dilation_h, dilation_w = get_const_tuple(attrs.dilation) out_h = (grad_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h out_w = (grad_w - 1) * stride_w - fpad_left - fpad_right + filter_w output_padding = (in_h - out_h, in_w - out_w) assert attrs.data_layout == "NCHW", "only support NCHW data layout" assert attrs.kernel_layout == "OIHW", "only support OIHW kernel layout" assert attrs.out_layout in ["", "NCHW"], "only support NCHW output layout" backward_data = _nn.conv2d_transpose( grad, weight, strides=attrs.strides, padding=attrs.padding, dilation=attrs.dilation, groups=attrs.groups, output_padding=output_padding, ) grad = tile(grad, [1, in_channel // attrs.groups, 1, 1]) grad = reshape(grad, [-1, 1, 0, 0]) # batch * oc * ic // groups, 1, oh, ow data = reshape(data, [1, -1, 0, 0]) # 1, batch * ic, ih, iw backward_weight = _nn.conv2d( data, grad, strides=attrs.dilation, padding=attrs.padding, dilation=attrs.strides, groups=in_channel * batch, ) # infer shape of backward_weight padded_weight_grad_h = (in_h - (grad_h - 1) * stride_h - 1 + fpad_top + fpad_bottom) // dilation_h + 1 padded_weight_grad_w = (in_w - (grad_w - 1) * stride_w - 1 + fpad_left + fpad_right) // dilation_w + 1 backward_weight = reshape( backward_weight, [ batch, in_channel // attrs.groups, out_channel, padded_weight_grad_h, padded_weight_grad_w, ], ) backward_weight = _sum(backward_weight, axis=0) backward_weight = transpose(backward_weight, [1, 0, 2, 3]) assert padded_weight_grad_h >= filter_h assert padded_weight_grad_w >= filter_w if padded_weight_grad_h > filter_h or padded_weight_grad_w > filter_w: backward_weight = strided_slice( backward_weight, begin=[0, 0, 0, 0], end=[out_channel, in_channel // attrs.groups, filter_h, filter_w], ) return [backward_data, backward_weight]
def verify_conv2d_nchw( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, devices=["cuda", "llvm -device=arm_cpu", "opencl -device=mali"], ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation)) in_height = in_width = in_size A = te.placeholder((batch, in_channel, in_height, in_width), name="A") W = te.placeholder((num_filter, in_channel, kernel, kernel), name="W") bias = te.placeholder((num_filter, 1, 1), name="bias") a_shape = get_const_tuple(A.shape) w_shape = get_const_tuple(W.shape) bias_shape = get_const_tuple(bias.shape) dtype = A.dtype @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw") def get_ref_data(): a_np = np.random.uniform(size=a_shape).astype(dtype) w_np = np.random.uniform(size=w_shape).astype(dtype) b_np = np.random.uniform(size=bias_shape).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: b_np = np.random.uniform(size=bias_shape).astype(dtype) c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return a_np, w_np, b_np, c_np a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nchw_winograd_implement) C = fcompute(A, W, stride, padding, dilation, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol) for device in devices: check_device(device)
def verify_conv2d_NCHWc( batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation=1, add_bias=False, add_relu=False, dtype="float32", ): pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple( padding, (kernel, kernel)) padding_sum = pad_top + pad_left + pad_bottom + pad_right in_height = in_width = in_size print( "Workload: (%d, %d, %d, %d, %d, %d, %d)" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum)) # for testing functionality, # we choose arbitrary block size that can divide the channel, # regardless of the performance. oc_block = 1 for bn in range(16, 0, -1): if num_filter % bn == 0: oc_block = bn break ic_block = 1 for bn in range(oc_block, 0, -1): if in_channel % bn == 0: ic_block = bn break A = te.placeholder( (batch, in_channel // ic_block, in_height, in_width, ic_block), name="A") W = te.placeholder( (num_filter // oc_block, in_channel // ic_block, kernel, kernel, ic_block, oc_block), name="W", ) bias = te.placeholder((num_filter // oc_block, 1, 1, oc_block), name="bias") @memoize("topi.tests.test_topi_conv2d_NCHWc.verify_conv2d_NCHWc") def get_ref_data(): a_np = np.random.uniform(size=(batch, in_channel, in_height, in_width)).astype(dtype) w_np = np.random.uniform(size=(num_filter, in_channel, kernel, kernel)).astype(dtype) b_np = np.random.uniform(size=(num_filter, 1, 1)).astype(dtype) dw_np = tvm.topi.testing.dilate_python(w_np, (1, 1, dilation, dilation)) c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride, padding) if add_bias: c_np += b_np if add_relu: c_np = np.maximum(c_np, 0) return ( _transform_data(a_np, ic_block), _transform_kernel(w_np, ic_block, oc_block), _transform_bias(b_np, oc_block), _transform_data(c_np, oc_block), ) a_np, w_np, b_np, c_np = get_ref_data() def check_device(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): C = topi.x86.conv2d_NCHWc( A, W, (stride, stride), padding, (dilation, dilation), "NCHW%dc" % ic_block, "NCHW%dc" % oc_block, dtype, ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = topi.x86.schedule_conv2d_NCHWc([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-3) # test llvm only for now since conv2d_NCHWc implement is missing in other backend. for device in ["llvm"]: with autotvm.tophub.context( device): # load tophub pre-tuned parameters check_device(device)
def depthwise_conv2d_with_workload_NCHWc(batch, in_channel, in_height, channel_multiplier, filter_height, stride, padding, dilation=1): in_width = in_height filter_channel = in_channel filter_width = filter_height stride_h = stride_w = stride assert ( channel_multiplier == 1 ), "depthwise_conv2d_NCHWc currently does not support channel multiplier > 1." pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) padding_args = (pad_h, pad_w) out_channel = filter_channel * channel_multiplier # for testing functionality, # we choose arbitrary block size that can divide the channel, # regardless of the performance. oc_block = 1 for bn in range(16, 0, -1): if out_channel % bn == 0: oc_block = bn break ic_block = 1 for bn in range(oc_block, 0, -1): if in_channel % bn == 0: ic_block = bn break # placeholder Input = te.placeholder( (batch, in_channel // ic_block, in_height, in_width, ic_block), name="Input") Filter = te.placeholder( (out_channel // oc_block, 1, filter_height, filter_width, 1, oc_block), name="Filter") in_layout = "NCHW%dc" % ic_block out_layout = "NCHW%dc" % oc_block dtype = "float32" def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): # declare DepthwiseConv2d = topi.x86.depthwise_conv2d_NCHWc( Input, Filter, (stride_h, stride_w), padding, (dilation, dilation), in_layout, out_layout, dtype, ) # TODO: add scale_shift implement for NCHWc and add test here Relu = topi.nn.relu(DepthwiseConv2d) # schedule s1 = topi.x86.schedule_depthwise_conv2d_NCHWc(DepthwiseConv2d) s2 = topi.x86.schedule_depthwise_conv2d_NCHWc(Relu) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Relu], device) # Prepare pod type for test data closure input_shape = (batch, in_channel, in_height, in_width) filter_shape = (filter_channel, channel_multiplier, filter_height, filter_width) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.NCHWc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) # correctness with scipy dw_np = tvm.topi.testing.dilate_python( filter_np, (1, 1, dilation, dilation)).astype(dtype) depthwise_conv2d_scipy = tvm.topi.testing.depthwise_conv2d_python_nchw( input_np, dw_np, stride, padding) relu_scipy = np.maximum(depthwise_conv2d_scipy, 0) return ( _transform_data(input_np, ic_block), _transform_kernel(filter_np, oc_block), _transform_data(depthwise_conv2d_scipy, oc_block), _transform_data(relu_scipy, oc_block), ) # Get the test data (input_np, filter_np, depthwise_conv2d_scipy, relu_scipy) = get_ref_data() input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) f1(input_tvm, filter_tvm, depthwise_conv2d_tvm) # launch kernel 2 (depthwise_conv2d + relu) f2(input_tvm, filter_tvm, relu_tvm) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) # test llvm only for now since depthwise_conv2d_NCHWc implement is missing in other backend. for device in ["llvm"]: with autotvm.tophub.context( device): # load tophub pre-tuned parameters check_device(device)