def verify_conv2d_transpose_nchw(batch, in_channel, in_size, num_filter, kernel,
                                 stride, padding, output_padding):
    in_height, in_width = in_size
    kernel_height, kernel_width = kernel
    stride_height, stride_width = stride
    pad_top, pad_left, pad_bottom, pad_right = padding

    A = te.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = te.placeholder((in_channel, num_filter, kernel_height, kernel_width), name='W')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_transpose.verify_conv2d_transpose_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np, w_np, stride, padding, output_padding)
        c_np = np.maximum(b_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device, ctx):
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv2d_transpose_nchw_implement)
            B = fcompute(A, W,
                         [stride_height, stride_width],
                         [pad_top, pad_left, pad_bottom, pad_right],
                         A.dtype, output_padding)
            C = topi.nn.relu(B)
            s1 = fschedule([B])
            s2 = fschedule([C])
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)

        func1 = tvm.build(s1, [A, W, B], device)
        func2 = tvm.build(s2, [A, W, C], device)
        func1(a, w, b)
        func2(a, w, c)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device, ctx in tvm.testing.enabled_targets():
        check_device(device, ctx)
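# Illustrative driver for verify_conv2d_transpose_nchw above. The workload values
# (1x3x224x224 input, 32 filters, 3x3 kernel, stride 2, symmetric padding 1,
# output padding 1) are assumptions chosen as an example, not taken from the
# original test list.
def test_conv2d_transpose_nchw_example():
    verify_conv2d_transpose_nchw(1, 3, (224, 224), 32, (3, 3), (2, 2),
                                 (1, 1, 1, 1), (1, 1))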
def _compute_conv1d(attrs, inputs, out_type):
    """Compute definition of conv1d"""
    strides = get_const_tuple(attrs.strides)
    padding = get_const_tuple(attrs.padding)
    dilation = get_const_tuple(attrs.dilation)
    out_dtype = attrs.out_dtype
    out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
    return [topi_compute(inputs[0], inputs[1], strides, padding, dilation, out_dtype)]
def compute_conv3d_transpose(attrs, inputs, out_dtype):
    """Compute definition of conv3d_transpose"""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    output_padding = get_const_tuple(attrs.output_padding)
    out_dtype = attrs.out_dtype
    out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
    out = topi_compute(inputs[0], inputs[1], strides, padding, out_dtype, output_padding)
    return [out]
def get_ref_data():
    a_np = generate_quantized_np(get_const_tuple(A.shape), activation_bits, input_type)
    w_np = generate_quantized_np(get_const_tuple(W.shape), weight_bits, input_type)
    if unipolar:
        w_ = np.copy(w_np).astype(out_dtype)
        for x in np.nditer(w_, op_flags=['readwrite']):
            x[...] = 1 if x == 1 else -1
        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_, stride, padding).astype(out_dtype)
    else:
        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, w_np, stride, padding).astype(out_dtype)
    return a_np, w_np, b_np
def _compute_dilation2d(attrs, inputs, out_type):
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilations = get_const_tuple(attrs.dilations)
    data_layout = attrs.get_str("data_layout")
    out_dtype = attrs.out_dtype
    out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
    args = [inputs[0], inputs[1], strides, padding, dilations]
    if need_data_layout:
        args.append(data_layout)
    args.append(out_dtype)
    return [topi_compute(*args)]
def _compute_deformable_conv2d(attrs, inputs, out_dtype):
    assert attrs.data_layout == "NCHW"
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    dilation = get_const_tuple(attrs.dilation)
    deformable_groups = attrs.deformable_groups
    groups = attrs.groups
    out_dtype = attrs.out_dtype
    out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
    out = topi_compute(inputs[0], inputs[1], inputs[2], strides, padding,
                       dilation, deformable_groups, groups, out_dtype)
    return [out]
def verify_pool_grad(n, ic, ih, kh, sh, padding, pool_type, ceil_mode,
                     count_include_pad=True, add_relu=False):
    """verify function of pool_grad"""
    iw = ih
    kw = kh
    sw = sh
    pt, pl, pb, pr = padding
    A = te.placeholder((n, ic, ih, iw), name='A')
    B = topi.nn.pool(A, kernel=[kh, kw], stride=[sh, sw], padding=padding,
                     pool_type=pool_type, ceil_mode=ceil_mode,
                     layout="NCHW", count_include_pad=count_include_pad)
    dtype = A.dtype

    bshape = get_const_tuple(B.shape)
    ashape = get_const_tuple(A.shape)
    if ceil_mode:
        assert bshape[2] == int(math.ceil(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(math.ceil(float(ashape[3] - kw + pl + pr) / sw) + 1)
    else:
        assert bshape[2] == int(math.floor(float(ashape[2] - kh + pt + pb) / sh) + 1)
        assert bshape[3] == int(math.floor(float(ashape[3] - kw + pl + pr) / sw) + 1)

    OutGrad = te.placeholder(bshape, name='OutGrad')
    PoolGrad = topi.nn.pool_grad(OutGrad, A, kernel=[kh, kw], stride=[sh, sw],
                                 padding=padding, pool_type=pool_type, ceil_mode=ceil_mode,
                                 layout="NCHW", count_include_pad=count_include_pad)
    if add_relu:
        PoolGrad = topi.nn.relu(PoolGrad)

    a_np = np.random.uniform(low=0.001, size=(n, ic, ih, iw)).astype(dtype)
    out_grad_np = np.random.uniform(low=0.001, size=bshape).astype(dtype)
    pool_grad_np = tvm.topi.testing.pool_grad_nchw(a_np, out_grad_np, pool_size=(kh, kw),
                                                   strides=(sh, sw), padding=padding,
                                                   pool_type=pool_type, ceil_mode=ceil_mode,
                                                   count_include_pad=count_include_pad)
    if add_relu:
        pool_grad_np = np.maximum(pool_grad_np, 0.)

    def check_device(device, ctx):
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s_func = tvm.topi.testing.dispatch(device, _pool_grad_schedule)
            s = s_func(PoolGrad)

        a = tvm.nd.array(a_np, ctx)
        out_grad = tvm.nd.array(out_grad_np, ctx)
        pool_grad = tvm.nd.array(np.zeros(get_const_tuple(PoolGrad.shape), dtype=dtype), ctx)
        f = tvm.build(s, [A, OutGrad, PoolGrad], device)
        f(a, out_grad, pool_grad)
        tvm.testing.assert_allclose(pool_grad.asnumpy(), pool_grad_np, rtol=1e-5)

    for device, ctx in tvm.testing.enabled_targets():
        check_device(device, ctx)
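# Illustrative calls for verify_pool_grad above. The shapes, window sizes, and
# padding are assumptions picked as examples; padding is passed as the
# four-element [top, left, bottom, right] list the function unpacks.
def test_pool_grad_example():
    verify_pool_grad(1, 16, 32, 2, 2, [0, 0, 0, 0], 'max', False)
    verify_pool_grad(1, 16, 32, 2, 2, [1, 1, 1, 1], 'avg', False, count_include_pad=False)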
def verify_deformable_conv2d_nchw(batch, in_channel, in_size, num_filter, kernel, stride,
                                  padding, dilation=1, deformable_groups=1, groups=1):
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding,
           dilation, deformable_groups, groups))

    A = te.placeholder((batch, in_channel, in_size, in_size), name='A')
    out_size = (in_size - (kernel - 1) * dilation - 1 + 2 * padding) // stride + 1
    Offset = te.placeholder((batch, deformable_groups * kernel * kernel * 2,
                             out_size, out_size), name='offset')
    W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W')
    bias = te.placeholder((num_filter, 1, 1), name='bias')

    a_shape = get_const_tuple(A.shape)
    offset_shape = get_const_tuple(Offset.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_deformable_conv2d_nchw.verify_deformable_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        offset_np = np.random.randn(*offset_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        c_np = tvm.topi.testing.deformable_conv2d_nchw_python(
            a_np, offset_np, w_np, stride, padding, dilation, deformable_groups, groups)
        return a_np, offset_np, w_np, c_np

    a_np, offset_np, w_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        fcompute, fschedule = tvm.topi.testing.dispatch(device, _deformable_conv2d_implement)
        with tvm.target.create(device):
            C = fcompute(A, Offset, W, stride, padding, dilation,
                         deformable_groups, groups, dtype)
            s = fschedule([C])

            a = tvm.nd.array(a_np, ctx)
            offset = tvm.nd.array(offset_np, ctx)
            w = tvm.nd.array(w_np, ctx)
            c = tvm.nd.empty(c_np.shape, dtype=c_np.dtype, ctx=ctx)

            func = tvm.build(s, [A, Offset, W, C], device)
            func(a, offset, w, c)
            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device in ['llvm', 'cuda']:
        check_device(device)
def conv2d_NCHWc_shape_func(attrs, inputs, _):
    """
    Shape function for contrib_conv2d_NCHWc op.
    """
    strides = get_const_tuple(attrs.strides)
    padding = get_const_tuple(attrs.padding)
    dilation = get_const_tuple(attrs.dilation)
    out_layout = attrs.out_layout
    oc_bn = int(out_layout[4:-1])

    return [_conv2d_NCHWc_shape_func(inputs[0], inputs[1],
                                     convert(strides), convert(padding),
                                     convert(dilation), convert(oc_bn))]
def verify_leaky_relu(m, alpha):
    A = te.placeholder((m,), name='A')
    B = topi.nn.leaky_relu(A, alpha)
    s = te.create_schedule([B.op])

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0) + a_np * (a_np < 0) * alpha
    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    foo = tvm.build(s, [A, B], "llvm", name="leaky_relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
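# Illustrative call for verify_leaky_relu above; the vector length and alpha are
# example assumptions, not values taken from the original test suite.
def test_leaky_relu_example():
    verify_leaky_relu(1024, 0.1)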
def get_ref_data(a_shape, b_shape, input_dtype):
    a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
    b_np = generate_quantized_np(get_const_tuple(b_shape), weight_bits, input_dtype)
    if unipolar:
        b_ = np.copy(b_np).astype(out_dtype)
        for x in np.nditer(b_, op_flags=['readwrite']):
            x[...] = 1 if x == 1 else -1
        c_np = np.dot(a_np, b_.T)
    else:
        c_np = np.dot(a_np, b_np.T)
    return a_np, b_np, c_np
def check_device(device):
    ctx = tvm.context(device, 0)
    if not ctx.exist:
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)

    # build the kernel
    f = tvm.build(schedule, [Input, Out_grad, Weight_grad], device)
    # prepare pod type for test data closure
    dtype = Out_grad.dtype
    out_grad_shape = get_const_tuple(Out_grad.shape)
    in_shape = get_const_tuple(Input.shape)

    # use memoize to pickle the test data for next time use
    @memoize("topi.tests.test_topi_depthwise_conv2d_backward_weight.nhwc")
    def get_ref_data():
        out_grad_np = np.random.uniform(size=out_grad_shape).astype(dtype)
        input_np = np.random.uniform(size=in_shape).astype(dtype)
        dilated_out_grad_np = tvm.topi.testing.dilate_python(
            out_grad_np, [1, stride_h, stride_w, 1])

        pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
            [padding_h, padding_w], (filter_h, filter_w))
        padded_input_np = np.zeros(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_channel))
        padded_input_np[:, pad_top:in_h + pad_top, pad_left:in_w + pad_left, :] = input_np

        weight_grad_np = np.zeros((filter_h, filter_w, in_channel, channel_multiplier))
        for c in range(in_channel):
            for m in range(channel_multiplier):
                for b in range(batch):
                    weight_grad_np[:, :, c, m] += signal.convolve2d(
                        padded_input_np[b, :, :, c],
                        np.rot90(dilated_out_grad_np[
                            b, :, :, c * channel_multiplier + m % channel_multiplier], 2),
                        mode='valid')[0:filter_h, 0:filter_w]
        return (out_grad_np, input_np, weight_grad_np)

    (out_grad_np, input_np, weight_grad_np) = get_ref_data()

    out_grad_tvm = tvm.nd.array(out_grad_np, ctx)
    input_tvm = tvm.nd.array(input_np, ctx)
    weight_grad_tvm = tvm.nd.array(np.zeros(shape=fshape, dtype=dtype), ctx)
    # launch the kernel
    timer = f.time_evaluator(f.entry_name, ctx, number=1)
    tcost = timer(input_tvm, out_grad_tvm, weight_grad_tvm).mean
    tvm.testing.assert_allclose(weight_grad_np, weight_grad_tvm.asnumpy(), rtol=1e-4)
def compute_bitserial_conv2d(attrs, inputs, out_dtype):
    """Compute definition for bitserial conv2d."""
    padding = get_const_tuple(attrs.padding)
    strides = get_const_tuple(attrs.strides)
    activation_bits = attrs.activation_bits
    weight_bits = attrs.weight_bits
    pack_dtype = attrs.pack_dtype
    out_dtype = attrs.out_dtype
    unipolar = attrs.unipolar
    return [topi_compute(inputs[0], inputs[1], strides, padding, activation_bits,
                         weight_bits, pack_dtype, out_dtype, unipolar)]
def verify_conv1d_transpose_ncw(batch, in_channel, in_size, num_filter, kernel,
                                stride, padding, output_padding):
    in_width = in_size
    A = te.placeholder((batch, in_channel, in_width), name='A')
    W = te.placeholder((in_channel, num_filter, kernel), name='W')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv1d_transpose.verify_conv1d_transpose_ncw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = tvm.topi.testing.conv1d_transpose_ncw_python(
            a_np, w_np, stride, padding, output_padding)
        c_np = np.maximum(b_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        with tvm.target.create(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv1d_transpose_ncw_implement)
            B = fcompute(A, W, stride, padding, A.dtype, output_padding)
            C = topi.nn.relu(B)
            s1 = fschedule([B])
            s2 = fschedule([C])
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)

        func1 = tvm.build(s1, [A, W, B], device)
        func2 = tvm.build(s2, [A, W, C], device)
        func1(a, w, b)
        func2(a, w, c)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device in get_all_backend():
        check_device(device)
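# Illustrative call for verify_conv1d_transpose_ncw above; the workload
# (batch 1, 3 input channels, width 224, 32 filters, kernel 5, stride 2,
# padding 1, output padding (0,)) is an example assumption.
def test_conv1d_transpose_ncw_example():
    verify_conv1d_transpose_ncw(1, 3, 224, 32, 5, 2, 1, (0,))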
def test_conv2d_hwcn_map():
    batch = 64
    in_channel = 128
    in_height = 16
    in_width = 16
    num_filter = 128
    kernel = 3
    stride = 2
    padding = 'SAME'

    A = te.placeholder((in_height, in_width, in_channel, batch), name='A')
    W = te.placeholder((kernel, kernel, in_channel, num_filter), name='W')
    B = topi.nn.conv2d_hwcn(A, W, stride, padding)
    C = topi.nn.relu(B)
    s1 = topi.cuda.schedule_conv2d_hwcn([B])
    s2 = topi.cuda.schedule_conv2d_hwcn([C])

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    w_np = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
    b_np = tvm.topi.testing.conv2d_hwcn_python(a_np, w_np, stride, padding)
    c_np = np.maximum(b_np, 0)

    def check_device(device):
        if not tvm.runtime.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
        with tvm.transform.PassContext(config={
                "tir.UnrollLoop": {
                    "auto_unroll_max_step": 128,
                    "explicit_unroll": device == "rocm"
                }
        }):
            func1 = tvm.build(s1, [A, W, B], device)
            func1(a, w, b)
            tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
            func2 = tvm.build(s2, [A, W, C], device)
            func2(a, w, c)
            tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for device in ['cuda', 'opencl', 'rocm']:
        check_device(device)
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(device)
    k = 10.0
    dilation = (1, 1)
    with tvm.target.create(device):
        A = te.placeholder((batch, in_channel, in_size, in_size), name='A')
        W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W')
        B = conv2d_nchw(A, W, stride, padding, dilation, A.dtype)
        if typ == "add":
            C = B + k
        elif typ == "sub":
            C = B - k
        elif typ == "mul":
            C = B * k
        elif typ == "div":
            C = B / k
        else:
            raise NotImplementedError()
        s = schedule_conv2d_nchw([C])

    foo = tvm.build(s, [A, W, B, C], device, name="conv2d_scalar_" + typ)

    a_npy = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)
    w_npy = np.random.uniform(size=get_const_tuple(W.shape)).astype(W.dtype)
    b_npy = tvm.topi.testing.conv2d_nchw_python(a_npy, w_npy, stride, padding)
    c_npy = np.random.uniform(size=get_const_tuple(B.shape)).astype(B.dtype)
    if typ == "add":
        c_npy = b_npy + k
    elif typ == "sub":
        c_npy = b_npy - k
    elif typ == "mul":
        c_npy = b_npy * k
    elif typ == "div":
        c_npy = b_npy / k
    else:
        raise NotImplementedError()

    a_nd = tvm.nd.array(a_npy, ctx)
    w_nd = tvm.nd.array(w_npy, ctx)
    b_nd = tvm.nd.array(np.empty(b_npy.shape).astype(B.dtype), ctx)
    c_nd = tvm.nd.array(np.empty(c_npy.shape).astype(C.dtype), ctx)
    foo(a_nd, w_nd, b_nd, c_nd)
    tvm.testing.assert_allclose(c_nd.asnumpy(), c_npy, rtol=1E-4, atol=1E-4)
def verify_conv2d_nhwc(batch, in_channel, in_size, num_filter, kernel, stride,
                       padding, dilation=1):
    in_height = in_width = in_size

    A = te.placeholder((batch, in_height, in_width, in_channel), name="A")
    W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nhwc.verify_nhwc.v2")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np, (dilation, dilation, 1, 1))
        b_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride, padding)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()

    def check_device(device):
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv2d_nhwc_implement)
            B = fcompute(A, W, stride, padding, dilation, dtype)
            s = fschedule([B])
        ctx = tvm.context(device, 0)
        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        func = tvm.build(s, [A, W, B], device)
        func(a, w, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device in ["llvm", "cuda"]:
        check_device(device)
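# Illustrative calls for verify_conv2d_nhwc above. The workloads are example
# assumptions; the second exercises the dilation path.
def test_conv2d_nhwc_example():
    verify_conv2d_nhwc(1, 256, 32, 256, 3, 1, "SAME")
    verify_conv2d_nhwc(1, 128, 16, 128, 5, 2, "VALID", dilation=2)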
def verify_bitserial_conv2d_nhwc(batch, in_size, in_channel, num_filter, kernel, stride,
                                 padding, activation_bits, weight_bits, unipolar):
    in_height = in_width = in_size
    input_dtype = 'uint32'
    out_dtype = 'int32'

    with tvm.target.Target('llvm'):
        A = te.placeholder((batch, in_height, in_width, in_channel),
                           dtype=input_dtype, name='A')
        W = te.placeholder((kernel, kernel, in_channel, num_filter),
                           dtype=input_dtype, name='W')
        B = topi.x86.bitserial_conv2d_nhwc(A, W, stride, padding, activation_bits,
                                           weight_bits, input_dtype, out_dtype, unipolar)
        s = topi.x86.schedule_bitserial_conv2d_nhwc([B])

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)

    @memoize("topi.tests.test_topi_bitseral_conv2d_nhwc")
    def get_ref_data():
        a_np = generate_quantized_np(get_const_tuple(a_shape), activation_bits, input_dtype)
        w_np = generate_quantized_np(get_const_tuple(w_shape), weight_bits, input_dtype)
        if unipolar:
            w_ = np.copy(w_np).astype(out_dtype)
            for x in np.nditer(w_, op_flags=['readwrite']):
                x[...] = 1 if x == 1 else -1
            b_np = tvm.topi.testing.conv2d_nhwc_python(
                a_np, w_, stride, padding).astype(out_dtype)
        else:
            b_np = tvm.topi.testing.conv2d_nhwc_python(
                a_np, w_np, stride, padding).astype(out_dtype)
        return a_np, w_np, b_np

    a_np, w_np, b_np = get_ref_data()

    ctx = tvm.cpu(0)
    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
    func = tvm.build(s, [A, W, B], 'llvm')
    func(a, w, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
def check_device(device, host="llvm"):
    ctx = tvm.context(device, 0)
    if not tvm.runtime.enabled(host):
        return
    if not ctx.exist:
        print("skip because %s is not enabled.." % device)
        return

    sout = te.create_schedule(out.op)
    mout = tvm.build(sout, [out] + inputs)
    out_shape = get_const_tuple(out.shape)

    l, h = data_range
    input_data = [
        tvm.nd.array(
            np.random.uniform(l, h, size=get_const_tuple(input.shape)).astype(input.dtype))
        for input in inputs
    ]

    ones = topi.full_like(out, 1.0)
    # we provide head to sum and reduce the output dimension,
    # which equals to grad(out.sum(), inputs)
    grads = te.gradient(out, inputs, head=ones)
    grad_sched = te.create_schedule([grad.op for grad in grads])
    mgrad = tvm.build(grad_sched, list(grads) + inputs)
    # print(tvm.lower(grad_sched, list(grads) + inputs, simple_mode=True))

    grad_data = [tvm.nd.empty(get_const_tuple(i.shape), g.dtype)
                 for i, g in zip(inputs, grads)]

    mgrad(*grad_data, *input_data)
    g_res = [g.asnumpy() for g in grad_data]

    if desired_grads:
        assert isinstance(desired_grads, list)
        for actual, desired in zip(g_res, desired_grads):
            assert_allclose(actual, desired, rtol=0.1, atol=1e-2)
    else:
        def forward(*in_data):
            out_data = tvm.nd.empty(out_shape, out.dtype)
            mout(out_data, *[tvm.nd.array(d) for d in list(in_data)])
            return out_data.asnumpy().sum()
        check_numerical_grads(forward, [d.asnumpy() for d in input_data], g_res)
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.Target(device):
        fcompute, fschedule = tvm.topi.testing.dispatch(device, _group_conv2d_nchw_implement)
        C = fcompute(A, W, stride, padding, dilation, groups, dtype)
        if add_bias:
            C = topi.add(C, bias)
        if add_relu:
            C = topi.nn.relu(C)
        s = fschedule([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    if add_bias:
        func = tvm.build(s, [A, W, bias, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding, dilation, groups))
        func(a, w, b, c)
    else:
        func = tvm.build(s, [A, W, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding, dilation, groups))
        func(a, w, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    if device == "cuda" and not tvm.contrib.nvcc.have_int8(ctx.compute_version):
        print("Skip because int8 intrinsics are not available")
        return
    print("Running on target: %s" % device)
    with tvm.target.Target(device):
        C = topi.cuda.group_conv2d_NCHWc_int8(A, W, stride, padding, dilation, groups, dtype)
        if add_bias:
            C = topi.add(C, bias)
        if add_relu:
            C = topi.nn.relu(C)
        s = topi.cuda.schedule_group_conv2d_NCHWc_int8([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    if add_bias:
        func = tvm.build(s, [A, W, bias, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding, dilation, groups))
        func(a, w, b, c)
    else:
        func = tvm.build(s, [A, W, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding, dilation, groups))
        func(a, w, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def _compute_roi_align(attrs, inputs, out_type):
    assert attrs.layout == "NCHW"
    pooled_size = get_const_tuple(attrs.pooled_size)
    return [topi_compute(inputs[0], inputs[1],
                         pooled_size=pooled_size,
                         spatial_scale=attrs.spatial_scale,
                         sample_ratio=attrs.sample_ratio)]
def dilation2d_strategy(attrs, inputs, out_type, target):
    """dilation2d_strategy generic strategy"""
    logger.warning("dilation2d_strategy is not optimized for this platform.")
    strategy = _op.OpStrategy()
    dilations = get_const_tuple(attrs.dilations)
    layout = attrs.data_layout
    kernel_layout = attrs.kernel_layout

    assert layout in ["NCHW", "NHWC"]
    (dilation_h, dilation_w) = dilations
    if dilation_h < 1 or dilation_w < 1:
        raise ValueError("dilation should be positive value")

    if layout == "NCHW":
        assert kernel_layout == "IHW"
        strategy.add_implementation(
            wrap_compute_dilation2d(topi.image.dilation2d_nchw),
            wrap_topi_schedule(topi.generic.schedule_dilation2d_nchw),
            name="dilation2d_nchw.generic")
    elif layout == "NHWC":
        assert kernel_layout == "HWI"
        strategy.add_implementation(
            wrap_compute_dilation2d(topi.image.dilation2d_nhwc),
            wrap_topi_schedule(topi.generic.schedule_dilation2d_nhwc),
            name="dilation2d_nhwc.generic")
    else:
        raise RuntimeError("Unsupported dilation2d layout {}".format(layout))
    return strategy
def split_shape_func(attrs, inputs, _):
    """
    Shape function for split op.
    """
    if isinstance(attrs.indices_or_sections, (int, tvm.tir.IntImm)):
        indices_or_sections = get_const_int(attrs.indices_or_sections)
        assert indices_or_sections > 0, "Slice count must be > 0"
    else:
        indices_or_sections = list(get_const_tuple(attrs.indices_or_sections))
        assert sorted(indices_or_sections)[0] > 0 and \
            indices_or_sections == sorted(indices_or_sections), \
            "split_indices must be sorted"

    axis = get_const_int(attrs.axis)
    if axis < 0:
        axis += get_const_int(inputs[0].shape[0])

    num_out = (indices_or_sections if isinstance(indices_or_sections, int)
               else len(indices_or_sections) + 1)
    if isinstance(indices_or_sections, int):
        indices_or_sections = [indices_or_sections]
    return [
        _split_shape_func(inputs[0], convert(i), convert(indices_or_sections), convert(axis))
        for i in range(num_out)
    ]
def squeeze_shape_func(attrs, inputs, _):
    """
    Shape function for squeeze op.
    """
    axis = attrs.axis if attrs.axis is None else get_const_tuple(attrs.axis)
    keep_axes = []
    remove_axes = []
    if axis is not None:
        for i in range(inputs[0].shape[0].value):
            if i not in axis:
                keep_axes.append(i)
            else:
                remove_axes.append(i)

    # Due to current relay type system, it is possible even
    # a static kernel function needs shape function. To handle
    # this case, we allow axis to be None in squeeze shape func
    # for now.
    # TODO(kevinthesun): Enhance relay type system to avoid this.
    if keep_axes:
        out = _squeeze_shape_func(inputs[0], convert(keep_axes), convert(remove_axes))
    else:
        out = te.compute((), lambda *indices: 0)
    return [out]
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    print("Running on target: %s" % device)
    with tvm.target.Target(device):
        C = topi.x86.conv2d_NCHWc(A, W, (stride, stride), (padding, padding),
                                  (dilation, dilation),
                                  'NCHW%dc' % ic_block, "NCHW%dc" % oc_block, dtype)
        s = topi.x86.schedule_conv2d_NCHWc([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func = tvm.build(s, [A, W, C], device,
                     name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                     (batch, in_channel, in_size, num_filter, kernel, stride,
                      padding, dilation))
    # print(tvm.lower(s, [A, W, C], simple_mode=True))
    func(a, w, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3)
def verify_global_pool(dshape, pool_type, layout='NCHW'):
    """verify function of global_pool"""
    assert layout in ["NCHW", "NHWC"]
    A = te.placeholder(shape=dshape, name='A')
    B = topi.nn.global_pool(A, pool_type=pool_type, layout=layout)
    B = topi.nn.relu(B)

    a_np = np.random.uniform(size=get_const_tuple(A.shape)).astype(A.dtype)

    axis = (layout.find('H'), layout.find('W'))
    if pool_type == 'avg':
        b_np = np.mean(a_np, axis=axis, keepdims=True)
    elif pool_type == 'max':
        b_np = np.max(a_np, axis=axis, keepdims=True)
    b_np = np.maximum(b_np, 0.0)

    def check_device(device, ctx):
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s_func = tvm.topi.testing.dispatch(device, _adaptive_pool_schedule)
            if device == "cuda":
                s = s_func(B, layout)
            else:
                s = s_func(B)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx)
        f = tvm.build(s, [A, B], device)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)

    for device, ctx in tvm.testing.enabled_targets():
        check_device(device, ctx)
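# Illustrative calls for verify_global_pool above, covering both layouts and both
# pool types; the shapes are example assumptions.
def test_global_pool_example():
    verify_global_pool((1, 1024, 7, 7), 'avg')
    verify_global_pool((1, 7, 7, 1024), 'max', layout='NHWC')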
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    if not nvcc.have_tensorcore(ctx.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % device)
    with tvm.target.create(device):
        fcompute, fschedule = tvm.topi.testing.dispatch(
            device, _conv2d_nhwc_tensorcore_implement)
        C = fcompute(A, W, stride, padding, dilation, 'float32')
        if add_bias:
            C = topi.add(C, bias)
        if add_relu:
            C = topi.nn.relu(C)
        s = fschedule([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    if add_bias:
        func = tvm.build(s, [A, W, bias, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding_sum, dilation))
        func(a, w, b, c)
    else:
        func = tvm.build(s, [A, W, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding_sum, dilation))
        func(a, w, c)

    rtol = 1e-3
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def check_device(device):
    ctx = tvm.context(device, 0)
    if not tvm.testing.device_enabled(device):
        print("Skip because %s is not enabled" % device)
        return
    if not nvcc.have_tensorcore(ctx.compute_version):
        print("skip because gpu does not support Tensor Cores")
        return
    print("Running on target: %s" % device)
    with tvm.target.Target(device):
        fcompute, fschedule = topi.testing.dispatch(
            device, _conv2d_hwnc_tensorcore_implement)
        C = fcompute(A, W, stride, padding, dilation, dtype, "int32")
        s = fschedule([C])

    a = tvm.nd.array(a_np.transpose((1, 2, 0, 3)), ctx)
    w = tvm.nd.array(w_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    func = tvm.build(
        s, [A, W, C], device,
        name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
        (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation),
    )
    func(a, w, c)

    rtol = 1e-3
    tvm.testing.assert_allclose(c.asnumpy().transpose((2, 0, 1, 3)), c_np, rtol=rtol)
def check_device(device, ctx):
    print("Running on target: %s" % device)
    fcompute, fschedule = tvm.topi.testing.dispatch(device, _conv3d_ncdhw_implement)
    with tvm.target.Target(device):
        C = fcompute(A, W, (stride, stride, stride), padding,
                     (dilation, dilation, dilation), dtype)
        if add_bias:
            C = topi.add(C, bias)
        if add_relu:
            C = topi.nn.relu(C)
        s = fschedule([C])

    a = tvm.nd.array(a_np, ctx)
    w = tvm.nd.array(w_np, ctx)
    b = tvm.nd.array(b_np, ctx)
    c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx)
    if add_bias:
        func = tvm.build(s, [A, W, bias, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding_sum, dilation))
        func(a, w, b, c)
    else:
        func = tvm.build(s, [A, W, C], device,
                         name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                         (batch, in_channel, in_size, num_filter, kernel, stride,
                          padding_sum, dilation))
        func(a, w, c)
    tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-4)