def depthwise_conv2d_with_workload_nhwc(batch, in_channel, in_height, channel_multiplier, filter_height, stride_h, padding): in_width = in_height filter_channel = in_channel filter_width = filter_height stride_w = stride_h # placeholder Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') Filter = tvm.placeholder( (filter_height, filter_width, filter_channel, channel_multiplier), name='Filter') Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale') Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift') # declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc( Input, Filter, stride=[stride_h, stride_w], padding=padding) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # schedule s1 = schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) s2 = schedule_depthwise_conv2d_nhwc(ScaleShift) s3 = schedule_depthwise_conv2d_nhwc(Relu) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) # build the kernels f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare pod type for test data closure dtype = Input.dtype input_shape = get_const_tuple(Input.shape) filter_shape = get_const_tuple(Filter.shape) scale_shape = get_const_tuple(Scale.shape) shift_shape = get_const_tuple(Shift.shape) scale_shift_shape = get_const_tuple(ScaleShift.shape) # Use memoize, pickle the test data for next time use. @memoize("topi.tests.test_topi_depthwise_conv2d.nhwc") def get_ref_data(): input_np = np.random.uniform(size=input_shape).astype(dtype) filter_np = np.random.uniform(size=filter_shape).astype(dtype) scale_np = np.random.uniform(size=scale_shape).astype(dtype) shift_np = np.random.uniform(size=shift_shape).astype(dtype) # correctness with scipy depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc( input_np, filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=scale_shift_shape) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, :, :, c] = depthwise_conv2d_scipy[:, :, :, c] * scale_np[ c] + shift_np[ c] relu_scipy = np.maximum(scale_shift_scipy, 0) return (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) # Get the test data (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy) = get_ref_data() # prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # launch kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # launch kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # launch kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean relu_scipy = np.maximum(scale_shift_scipy, 0) np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) check_device("opencl") check_device("cuda") check_device("metal")
def test_depthwise_conv2d_nhwc(): """You may test different settings.""" batch = 1 in_channel = 256 in_height = 96 in_width = 96 filter_channel = in_channel channel_multiplier = 1 filter_height = 3 filter_width = 3 stride_h = 1 stride_w = 1 padding = 'SAME' # or 'VALID' # Placeholder Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') Filter = tvm.placeholder( (filter_height, filter_width, filter_channel, channel_multiplier), name='Filter') Stride = [stride_h, stride_w] Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale') Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift') # Declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, Stride, padding) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # Schedule s1 = schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) s2 = schedule_depthwise_conv2d_nhwc(ScaleShift) s3 = schedule_depthwise_conv2d_nhwc(Relu) input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype( Input.dtype) filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype( Filter.dtype) scale_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Scale.dtype) shift_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Shift.dtype) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape), dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array( np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # Measure time cost of kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) print("Stride = (%d, %d)" % (stride_h, stride_w)) print("padding = %s\n" % padding) print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape))) print("average time cost of 1000 runs (depthwise_conv2d) = %g us" % (tcost_1 * 1e6)) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" % (tcost_2 * 1e6)) print( "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" % (tcost_3 * 1e6)) # correctness depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc( input_np, filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape)) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:, :, :, c] = depthwise_conv2d_scipy[:, :, :, c] * scale_np[ c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) np.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print("success") for device in ['cuda', 'opencl', 'rocm']: with tvm.build_config(auto_unroll_max_step=128, detect_global_barrier=False, restricted_func=True): check_device(device)
def test_depthwise_conv2d_nhwc(): """You may test different settings.""" batch = 1 in_channel = 256 in_height = 96 in_width = 96 filter_channel = in_channel channel_multiplier = 1 filter_height = 3 filter_width = 3 stride_h = 1 stride_w = 1 padding = 'SAME' # or 'VALID' # Placeholder Input = tvm.placeholder((batch, in_height, in_width, in_channel), name='Input') Filter = tvm.placeholder((filter_height, filter_width,filter_channel, channel_multiplier), name='Filter') Stride = [stride_h, stride_w] Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale') Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift') # Declare DepthwiseConv2d = topi.nn.depthwise_conv2d_nhwc(Input, Filter, Stride, padding) ScaleShift = topi.nn.scale_shift_nhwc(DepthwiseConv2d, Scale, Shift) Relu = topi.nn.relu(ScaleShift) # Schedule s1 = schedule_depthwise_conv2d_nhwc(DepthwiseConv2d) s2 = schedule_depthwise_conv2d_nhwc(ScaleShift) s3 = schedule_depthwise_conv2d_nhwc(Relu) input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype(Input.dtype) filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype(Filter.dtype) scale_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Scale.dtype) shift_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Shift.dtype) def check_device(device): if not tvm.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.context(device, 0) # Build the kernel f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device) f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device) f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device) # Prepare data input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) shift_tvm = tvm.nd.array(shift_np, ctx) depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),dtype=DepthwiseConv2d.dtype), ctx) scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx) relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx) # Measure time cost of kernel 1 (depthwise_conv2d) timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000) tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift) timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000) tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu) timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000) tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean print("Input shape = " + str(get_const_tuple(Input.shape))) print("Filter shape = " + str(get_const_tuple(Filter.shape))) print("Stride = (%d, %d)" % (stride_h, stride_w)) print("padding = %s\n" % padding) print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape))) print("average time cost of 1000 runs (depthwise_conv2d) = %g us" % (tcost_1*1e6)) print("average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" % (tcost_2*1e6)) print("average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" % (tcost_3*1e6)) # correctness depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nhwc(input_np, filter_np, stride=[stride_h, stride_w], padding=padding) scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape)) for c in range(in_channel * channel_multiplier): scale_shift_scipy[:,:,:,c] = depthwise_conv2d_scipy[:,:,:,c] * scale_np[c] + shift_np[c] relu_scipy = np.maximum(scale_shift_scipy, 0) tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5) tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5) tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5) print("success") for device in ['cuda', 'opencl', 'rocm']: with tvm.build_config(auto_unroll_max_step=128, detect_global_barrier=False, restricted_func=True): check_device(device)