Exemplo n.º 1
0
def depthwise_conv2d_with_workload_nchw(batch, in_channel, in_height,
                                        channel_multiplier, filter_height,
                                        stride_h, padding):
    in_width = in_height
    filter_channel = in_channel
    filter_width = filter_height
    stride_w = stride_h
    # placeholder
    Input = tvm.placeholder((batch, in_channel, in_height, in_width),
                            name='Input')
    Filter = tvm.placeholder(
        (filter_channel, channel_multiplier, filter_height, filter_width),
        name='Filter')
    Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale')
    Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift')
    # declare
    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(
        Input, Filter, stride=[stride_h, stride_w], padding=padding)
    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
    Relu = topi.nn.relu(ScaleShift)
    # schedule
    s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
    s2 = schedule_depthwise_conv2d_nchw(ScaleShift)
    s3 = schedule_depthwise_conv2d_nchw(Relu)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        # build the kernels
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)

        # Prepare pod type for test data closure
        dtype = Input.dtype
        input_shape = get_const_tuple(Input.shape)
        filter_shape = get_const_tuple(Filter.shape)
        scale_shape = get_const_tuple(Scale.shape)
        shift_shape = get_const_tuple(Shift.shape)
        scale_shift_shape = get_const_tuple(ScaleShift.shape)

        # Use memoize, pickle the test data for next time use.
        @memoize("topi.tests.test_topi_depthwise_conv2d.nchw")
        def get_ref_data():
            input_np = np.random.uniform(size=input_shape).astype(dtype)
            filter_np = np.random.uniform(size=filter_shape).astype(dtype)
            scale_np = np.random.uniform(size=scale_shape).astype(dtype)
            shift_np = np.random.uniform(size=shift_shape).astype(dtype)
            # correctness with scipy
            depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
                input_np,
                filter_np,
                stride=[stride_h, stride_w],
                padding=padding)
            scale_shift_scipy = np.zeros(shape=scale_shift_shape)
            for c in range(in_channel * channel_multiplier):
                scale_shift_scipy[:,
                                  c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[
                                      c] + shift_np[c]
                relu_scipy = np.maximum(scale_shift_scipy, 0)
            return (input_np, filter_np, scale_np, shift_np,
                    depthwise_conv2d_scipy, scale_shift_scipy, relu_scipy)

        # Get the test data
        (input_np, filter_np, scale_np, shift_np, depthwise_conv2d_scipy,
         scale_shift_scipy, relu_scipy) = get_ref_data()

        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)
        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(ScaleShift.shape),
                     dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # launch kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # launch kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          scale_shift_tvm).mean
        # launch kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          relu_tvm).mean
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   depthwise_conv2d_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                                   scale_shift_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)

    check_device("opencl")
    check_device("cuda")
    check_device("metal")
Exemplo n.º 2
0
def test_depthwise_conv2d_nchw():
    """You may test different settings."""
    batch = 1
    in_channel = 256
    in_height = 96
    in_width = 96

    filter_channel = in_channel
    channel_multiplier = 1
    filter_height = 3
    filter_width = 3

    stride_h = 1
    stride_w = 1

    padding = 'SAME'  # or 'VALID'

    # Placeholder
    Input = tvm.placeholder((batch, in_channel, in_height, in_width),
                            name='Input')
    Filter = tvm.placeholder(
        (filter_channel, channel_multiplier, filter_height, filter_width),
        name='Filter')
    Stride = [stride_h, stride_w]
    Scale = tvm.placeholder((in_channel * channel_multiplier, ), name='Scale')
    Shift = tvm.placeholder((in_channel * channel_multiplier, ), name='Shift')
    # Declare
    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, Stride,
                                                    padding)
    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
    Relu = topi.nn.relu(ScaleShift)
    # Schedule
    s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
    s2 = schedule_depthwise_conv2d_nchw(ScaleShift)
    s3 = schedule_depthwise_conv2d_nchw(Relu)
    input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype(
        Input.dtype)
    filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype(
        Filter.dtype)
    scale_np = np.random.uniform(size=(in_channel *
                                       channel_multiplier)).astype(Scale.dtype)
    shift_np = np.random.uniform(size=(in_channel *
                                       channel_multiplier)).astype(Shift.dtype)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        # Build the kernel
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
        # Prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)

        depthwise_conv2d_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),
                     dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(ScaleShift.shape),
                     dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(
            np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # Measure time cost of kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          scale_shift_tvm).mean
        # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm,
                          relu_tvm).mean
        print("Input shape = " + str(get_const_tuple(Input.shape)))
        print("Filter shape = " + str(get_const_tuple(Filter.shape)))
        print("Stride = (%d, %d)" % (stride_h, stride_w))
        print("padding = %s\n" % padding)
        print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
        print("average time cost of 1000 runs (depthwise_conv2d) = %g us" %
              (tcost_1 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us"
            % (tcost_2 * 1e6))
        print(
            "average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us"
            % (tcost_3 * 1e6))
        # correctness
        depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(
            input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
        for c in range(in_channel * channel_multiplier):
            scale_shift_scipy[:,
                              c, :, :] = depthwise_conv2d_scipy[:, c, :, :] * scale_np[
                                  c] + shift_np[c]
        relu_scipy = np.maximum(scale_shift_scipy, 0)
        np.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(),
                                   depthwise_conv2d_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(scale_shift_tvm.asnumpy(),
                                   scale_shift_scipy,
                                   rtol=1e-5)
        np.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print("success")

    for device in ['cuda', 'opencl', 'rocm']:
        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=device == 'rocm',
                              detect_global_barrier=False,
                              restricted_func=True):
            check_device(device)
Exemplo n.º 3
0
def test_depthwise_conv2d_nchw():
    """You may test different settings."""
    batch = 1
    in_channel = 256
    in_height = 96
    in_width = 96

    filter_channel = in_channel
    channel_multiplier = 1
    filter_height = 3
    filter_width = 3

    stride_h = 1
    stride_w = 1

    padding = 'SAME' # or 'VALID'

    # Placeholder
    Input = tvm.placeholder((batch, in_channel, in_height, in_width), name='Input')
    Filter = tvm.placeholder((filter_channel, channel_multiplier, filter_height, filter_width), name='Filter')
    Stride = [stride_h, stride_w]
    Scale = tvm.placeholder((in_channel * channel_multiplier,), name='Scale')
    Shift = tvm.placeholder((in_channel * channel_multiplier,), name='Shift')
    # Declare
    DepthwiseConv2d = topi.nn.depthwise_conv2d_nchw(Input, Filter, Stride, padding)
    ScaleShift = topi.nn.scale_shift_nchw(DepthwiseConv2d, Scale, Shift)
    Relu = topi.nn.relu(ScaleShift)
    # Schedule
    s1 = schedule_depthwise_conv2d_nchw(DepthwiseConv2d)
    s2 = schedule_depthwise_conv2d_nchw(ScaleShift)
    s3 = schedule_depthwise_conv2d_nchw(Relu)
    input_np = np.random.uniform(size=get_const_tuple(Input.shape)).astype(Input.dtype)
    filter_np = np.random.uniform(size=get_const_tuple(Filter.shape)).astype(Filter.dtype)
    scale_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Scale.dtype)
    shift_np = np.random.uniform(size=(in_channel * channel_multiplier)).astype(Shift.dtype)

    def check_device(device):
        if not tvm.module.enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        ctx = tvm.context(device, 0)
        # Build the kernel
        f1 = tvm.build(s1, [Input, Filter, DepthwiseConv2d], device)
        f2 = tvm.build(s2, [Input, Filter, Scale, Shift, ScaleShift], device)
        f3 = tvm.build(s3, [Input, Filter, Scale, Shift, Relu], device)
        # Prepare data
        input_tvm = tvm.nd.array(input_np, ctx)
        filter_tvm = tvm.nd.array(filter_np, ctx)
        scale_tvm = tvm.nd.array(scale_np, ctx)
        shift_tvm = tvm.nd.array(shift_np, ctx)

        depthwise_conv2d_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(DepthwiseConv2d.shape),dtype=DepthwiseConv2d.dtype), ctx)
        scale_shift_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(ScaleShift.shape), dtype=ScaleShift.dtype), ctx)
        relu_tvm = tvm.nd.array(np.zeros(shape=get_const_tuple(Relu.shape), dtype=Relu.dtype), ctx)
        # Measure time cost of kernel 1 (depthwise_conv2d)
        timer_1 = f1.time_evaluator(f1.entry_name, ctx, number=1000)
        tcost_1 = timer_1(input_tvm, filter_tvm, depthwise_conv2d_tvm).mean
        # Measure time cost of kernel 2 (depthwise_conv2d + scale_shift)
        timer_2 = f2.time_evaluator(f2.entry_name, ctx, number=1000)
        tcost_2 = timer_2(input_tvm, filter_tvm, scale_tvm, shift_tvm, scale_shift_tvm).mean
        # Measure time cost of kernel 3 (depthwise_conv2d + scale_shift + relu)
        timer_3 = f3.time_evaluator(f3.entry_name, ctx, number=1000)
        tcost_3 = timer_3(input_tvm, filter_tvm, scale_tvm, shift_tvm, relu_tvm).mean
        print("Input shape = " + str(get_const_tuple(Input.shape)))
        print("Filter shape = " + str(get_const_tuple(Filter.shape)))
        print("Stride = (%d, %d)" % (stride_h, stride_w))
        print("padding = %s\n" % padding)
        print("Output shape = " + str(get_const_tuple(DepthwiseConv2d.shape)))
        print("average time cost of 1000 runs (depthwise_conv2d) = %g us" % (tcost_1*1e6))
        print("average time cost of 1000 runs (depthwise_conv2d + scale_shift) = %g us" % (tcost_2*1e6))
        print("average time cost of 1000 runs (depthwise_conv2d + scale_shift + relu) = %g us" % (tcost_3*1e6))
        # correctness
        depthwise_conv2d_scipy = topi.testing.depthwise_conv2d_python_nchw(input_np, filter_np, stride=[stride_h, stride_w], padding=padding)
        scale_shift_scipy = np.zeros(shape=get_const_tuple(ScaleShift.shape))
        for c in range(in_channel * channel_multiplier):
            scale_shift_scipy[:,c,:,:] = depthwise_conv2d_scipy[:,c,:,:] * scale_np[c] + shift_np[c]
        relu_scipy = np.maximum(scale_shift_scipy, 0)
        tvm.testing.assert_allclose(depthwise_conv2d_tvm.asnumpy(), depthwise_conv2d_scipy, rtol=1e-5)
        tvm.testing.assert_allclose(scale_shift_tvm.asnumpy(), scale_shift_scipy, rtol=1e-5)
        tvm.testing.assert_allclose(relu_tvm.asnumpy(), relu_scipy, rtol=1e-5)
        print("success")

    for device in ['cuda', 'opencl', 'rocm']:
        with tvm.build_config(auto_unroll_max_step=128,
                              unroll_explicit=device == 'rocm',
                              detect_global_barrier=False,
                              restricted_func=True):
            check_device(device)