Example #1
import numpy as np
import coremltools as cm
from coremltools.models import datatypes
from coremltools.models.neural_network import NeuralNetworkBuilder

import tvm
import tvm.testing
from tvm.topi.testing import conv2d_nchw_python

# run_tvm_graph is a helper defined alongside this function in the CoreML
# frontend tests; it imports the CoreML model into TVM and executes it.
def verify_convolution(input_dim, filter, padding):
    dtype = "float32"
    N, C, H, W = input_dim
    OC, _, KH, KW = filter
    a_np = np.random.uniform(size=input_dim).astype(dtype)
    w_np = np.random.uniform(size=(OC, C, KH, KW)).astype(dtype)
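    # CoreML expects convolution weights in (KH, KW, C_in, C_out) order.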
    w_np_cm = np.transpose(w_np, axes=(2, 3, 1, 0))
    b_np = conv2d_nchw_python(a_np, w_np, [1, 1], padding)
    inputs = [("input1", datatypes.Array(C, H, W))]
    output = [("output", datatypes.Array(*b_np.shape))]
    builder = NeuralNetworkBuilder(inputs, output)
    builder.add_convolution(
        name="conv",
        kernel_channels=C,
        output_channels=OC,
        height=KH,
        width=KW,
        stride_height=1,
        stride_width=1,
        border_mode=padding.lower(),
        groups=1,
        W=w_np_cm,
        b=None,
        has_bias=False,
        is_deconv=False,
        input_name="input1",
        output_name="output",
    )
    model = cm.models.MLModel(builder.spec)
    for target, dev in tvm.testing.enabled_targets():
        out = run_tvm_graph(model, target, dev, [a_np], ["input1"], output_shape=None)
        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
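
A typical driver for this helper, in the style of the TVM CoreML frontend
tests (the shapes here are illustrative, not taken from this snippet):

def test_forward_convolution():
    verify_convolution((1, 3, 224, 224), filter=(32, 3, 3, 3), padding="VALID")
    verify_convolution((1, 3, 224, 224), filter=(32, 3, 3, 3), padding="SAME")
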
Example #2

def check_conv2d_output(
        data_tensor: LabelledTensor, kernel_tensor: LabelledTensor,
        micro_output_tensor: LabelledTensor, strides, padding):
    data_nchw_np = data_tensor.with_layout('NCHW').data
    kernel_oihw_np = kernel_tensor.with_layout('OIHW').data
    micro_output_nchw_np = micro_output_tensor.with_layout('NCHW').data

    topi_output_np = conv2d_nchw_python(data_nchw_np, kernel_oihw_np, strides, padding)
    tvm.testing.assert_allclose(micro_output_nchw_np.shape, topi_output_np.shape)
    for i in range(micro_output_nchw_np.shape[0]):
        tvm.testing.assert_allclose(micro_output_nchw_np[i], topi_output_np[i])
        print('ok', micro_output_nchw_np[i])
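
check_conv2d_output assumes a LabelledTensor helper that pairs a NumPy array
with a layout string and can permute between layouts. That helper is not part
of this snippet; a minimal sketch of the assumed interface:

from dataclasses import dataclass
import numpy as np

@dataclass
class LabelledTensor:
    data: np.ndarray
    layout: str  # e.g. "NCHW", "NHWC", "OIHW"

    def with_layout(self, new_layout):
        # Permute axes so that self.layout becomes new_layout.
        axes = tuple(self.layout.index(dim) for dim in new_layout)
        return LabelledTensor(self.data.transpose(axes), new_layout)
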
Example #3
def evaluate_func(sch, args, N, H, W, CO, CI, KH, KW):
    func = tvm.build(sch, args, target)

    # Check correctness
    data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding)
    out_np = conv_np

    ctx = tvm.gpu()
    data_tvm = tvm.nd.array(data_np, ctx=ctx)
    weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
    out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
    func(data_tvm, weight_tvm, out_tvm)

    # Check results
    np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

    # Evaluate execution time
    evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=50)
    print("Execution time of this operator: %.3f ms" %
          (np.median(evaluator(data_tvm, weight_tvm, out_tvm).results) * 1000))

    return np.median(evaluator(data_tvm, weight_tvm, out_tvm).results)
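
Note that evaluate_func reads target, strides, and padding from the enclosing
script. One way to produce a (sch, args) pair for it is a plain (untuned) TOPI
schedule; a sketch, with illustrative shapes not taken from the source:

import numpy as np
import tvm
from tvm import te, topi

target = tvm.target.Target("cuda")
strides, padding = (1, 1), (1, 1)
N, H, W, CO, CI, KH, KW = 1, 7, 7, 512, 512, 3, 3

data = te.placeholder((N, CI, H, W), name="data")
kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
with target:
    # Fallback (untuned) CUDA schedule for the conv2d_nchw workload.
    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation=1)
    sch = topi.cuda.schedule_conv2d_nchw([conv])
args = [data, kernel, conv]

median_s = evaluate_func(sch, args, N, H, W, CO, CI, KH, KW)
print("Median runtime: %.3f ms" % (median_s * 1000))
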
# Lower the schedule to inspect the IR; the auto-scheduler applies optimizations
# such as multi-level tiling, cooperative fetching, unrolling and operator fusion.

print(tvm.lower(sch, args, simple_mode=True))

######################################################################
# Check correctness and evaluate performance
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We build the binary and check its correctness and performance.

func = tvm.build(sch, args, target)

# Check correctness
data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32)
conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding)
out_np = np.maximum(conv_np + bias_np, 0.0)

ctx = tvm.gpu()
data_tvm = tvm.nd.array(data_np, ctx=ctx)
weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
bias_tvm = tvm.nd.array(bias_np, ctx=ctx)
out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
func(data_tvm, weight_tvm, bias_tvm, out_tvm)

# Check results
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

# Evaluate execution time
evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
print(
    "Execution time of this operator: %.3f ms"
    % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000)
)
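
Every example in this collection validates against conv2d_nchw_python from
tvm.topi.testing. As a reminder of what that reference computes, a minimal
NumPy sketch of a direct NCHW convolution (integer strides/padding only; the
TOPI helper also accepts string paddings such as "SAME" and "VALID"):

import numpy as np

def conv2d_nchw_ref(data, weight, stride, pad):
    # data: (N, C, H, W), weight: (O, C, KH, KW)
    n, _, h, w = data.shape
    oc, _, kh, kw = weight.shape
    sh, sw = (stride, stride) if isinstance(stride, int) else stride
    ph, pw = (pad, pad) if isinstance(pad, int) else pad
    padded = np.pad(data, ((0, 0), (0, 0), (ph, ph), (pw, pw)))
    oh = (h + 2 * ph - kh) // sh + 1
    ow = (w + 2 * pw - kw) // sw + 1
    out = np.zeros((n, oc, oh, ow), dtype=data.dtype)
    for b in range(n):
        for k in range(oc):
            for i in range(oh):
                for j in range(ow):
                    window = padded[b, :, i * sh : i * sh + kh, j * sw : j * sw + kw]
                    out[b, k, i, j] = np.sum(window * weight[k])
    return out
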
Example #5
# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Clear stats
if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                             kernel_np.astype(env.acc_dtype),
                             (stride_h, stride_w),
                             (pad_h, pad_w)).astype(env.acc_dtype)
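# Requantize the reference: scale down, saturate to the input range, narrow dtype.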
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
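# Repack into VTA's blocked layout: (N//BATCH, C//BLOCK_OUT, H, W, BATCH, BLOCK_OUT).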
res_ref = res_ref.reshape((batch_size // env.BATCH,
                           env.BATCH,
                           out_channels // env.BLOCK_OUT,
                           env.BLOCK_OUT,
                           fout_height,
                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())

# Print stats
if env.TARGET in ["sim", "tsim"]:
    sim_stats = simulator.stats()
    print("\nExecution statistics:")
    for k, v in sim_stats.items():
        print("\t{:<16}: {:>16}".format(k, v))
Example #6
# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best("conv2d.log"):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.gpu()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)
Example #7

import numpy as np
import tvm
from tvm.topi import testing

# ref_convolution and ref_depthwise_convolution are reference helpers assumed
# to be defined elsewhere in the same test file.
def validate(workload, target, dev, input_shapes, *args, **kwargs):
    s, placeholders = workload(*input_shapes, *args, **kwargs)
    func = tvm.driver.build(s, [*placeholders],
                            target=target,
                            name="TestFunction")

    args_tvm = []
    args_np = []
    for var in placeholders[:-1]:
        var_np = np.random.uniform(size=[i.value
                                         for i in var.shape]).astype(var.dtype)
        args_np.append(var_np)
        args_tvm.append(tvm.nd.array(var_np, dev))
    args_tvm.append(
        tvm.nd.array(
            np.zeros([i.value for i in placeholders[-1].shape],
                     dtype=placeholders[-1].dtype), dev))
    func(*args_tvm)

    if "plus_one" in workload.__name__:
        np_result = args_np[0] + 1.0
    elif "matmul" in workload.__name__:
        if "inner" in workload.__name__:
            np_result = np.matmul(args_np[0].reshape(32, 256),
                                  args_np[1].reshape(32, 256).transpose(1, 0))
        elif "accum" in workload.__name__:
            np_result = np.matmul(
                args_np[0].transpose((1, 0, 2)).reshape(64, 128),
                args_np[1].reshape(128, 64))
        else:
            np_result = np.matmul(
                args_np[0].transpose((0, 2, 1)).reshape(128, 64),
                args_np[1].transpose(1, 0, 2).reshape(64, 128),
            )
    elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # nchwc -> nchw
        args_np[0] = (args_np[0].transpose((0, 1, 4, 2, 3)).reshape(
            args_np[0].shape[0],
            args_np[0].shape[1] * args_np[0].shape[-1],
            args_np[0].shape[2],
            args_np[0].shape[3],
        ))
        # rsckk -> rsck -> kcrs
        args_np[1] = (args_np[1].reshape(
            args_np[1].shape[0],
            args_np[1].shape[1],
            args_np[1].shape[2],
            args_np[1].shape[3] * args_np[1].shape[4],
        ).transpose((3, 2, 0, 1)))
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
        # nkhw -> nkhwk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(0, 1, 3, 4, 2)
    elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # wchnc -> nchw
        args_np[0] = (args_np[0].transpose((3, 1, 4, 2, 0)).reshape(
            args_np[0].shape[3],
            args_np[0].shape[1] * args_np[0].shape[-1],
            args_np[0].shape[2],
            args_np[0].shape[0],
        ))
        # crskk -> crsk -> kcrs
        args_np[1] = (args_np[1].reshape(
            args_np[1].shape[0],
            args_np[1].shape[1],
            args_np[1].shape[2],
            args_np[1].shape[3] * args_np[1].shape[4],
        ).transpose((3, 0, 1, 2)))
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
        # nkhw -> nkkhw -> wkhnk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(4, 1, 3, 0, 2)
    elif "NCHW_KCRS" in workload.__name__:
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
    elif "NCHWc_KCRSk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # nchwc -> nchw
        args_np[0] = (args_np[0].transpose((0, 1, 4, 2, 3)).reshape(
            args_np[0].shape[0],
            args_np[0].shape[1] * args_np[0].shape[-1],
            args_np[0].shape[2],
            args_np[0].shape[3],
        ))
        # kcrsk/cmrsc -> kcrs/cmrs
        args_np[1] = (args_np[1].transpose((0, 4, 1, 2, 3)).reshape(
            args_np[1].shape[0] * args_np[1].shape[4],
            args_np[1].shape[1],
            args_np[1].shape[2],
            args_np[1].shape[3],
        ))
        if "depthwise" in workload.__name__:
            # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID")
            np_result = ref_depthwise_convolution(args_np[0], args_np[1], [],
                                                  [])
        else:
            # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
            np_result = ref_convolution(args_np[0], args_np[1], [], [])
        # nkhw -> nkhwk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(0, 1, 3, 4, 2)
    np.testing.assert_allclose(args_tvm[-1].asnumpy(),
                               np_result,
                               rtol=1e-2,
                               atol=1e-2)
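
A workload in the shape validate() expects returns a schedule together with
its placeholders. A hypothetical plus_one workload (not from the source)
exercising the first branch:

from tvm import te

def plus_one_workload(shape):
    A = te.placeholder(shape, name="A", dtype="float32")
    B = te.compute(shape, lambda *i: A(*i) + 1.0, name="B")
    return te.create_schedule(B.op), (A, B)

# input_shapes is unpacked positionally, so each entry is one workload argument.
validate(plus_one_workload, "llvm", tvm.cpu(0), input_shapes=[(4, 4)])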