def verify_convolution(input_dim, filter, padding):
    dtype = "float32"
    N, C, H, W = input_dim
    OC, _, KH, KW = filter
    a_np = np.random.uniform(size=input_dim).astype(dtype)
    w_np = np.random.uniform(size=(OC, C, KH, KW)).astype(dtype)
    # CoreML expects the kernel in (KH, KW, C, OC) order.
    w_np_cm = np.transpose(w_np, axes=(2, 3, 1, 0))
    b_np = conv2d_nchw_python(a_np, w_np, [1, 1], padding)
    inputs = [("input1", datatypes.Array(C, H, W))]
    output = [("output", datatypes.Array(*b_np.shape))]
    builder = NeuralNetworkBuilder(inputs, output)
    builder.add_convolution(
        name="conv",
        kernel_channels=C,
        output_channels=OC,
        height=KH,
        width=KW,
        stride_height=1,
        stride_width=1,
        border_mode=padding.lower(),
        groups=1,
        W=w_np_cm,
        b=None,
        has_bias=False,
        is_deconv=False,
        input_name="input1",
        output_name="output",
    )
    model = cm.models.MLModel(builder.spec)
    for target, dev in tvm.testing.enabled_targets():
        out = run_tvm_graph(model, target, dev, [a_np], ["input1"], output_shape=None)
        tvm.testing.assert_allclose(out, b_np, rtol=1e-5)
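# Hedged usage sketch: the shapes and padding below are assumed for illustration
# only; the original test suite drives verify_convolution from its own parameter
# lists.
# verify_convolution(input_dim=(1, 3, 224, 224), filter=(32, 3, 3, 3), padding="VALID")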
def check_conv2d_output(
    data_tensor: LabelledTensor,
    kernel_tensor: LabelledTensor,
    micro_output_tensor: LabelledTensor,
    strides,
    padding,
):
    """Compare a conv2d result produced on the device against the TOPI NumPy reference."""
    data_nchw_np = data_tensor.with_layout('NCHW').data
    kernel_oihw_np = kernel_tensor.with_layout('OIHW').data
    micro_output_nchw_np = micro_output_tensor.with_layout('NCHW').data

    topi_output_np = conv2d_nchw_python(data_nchw_np, kernel_oihw_np, strides, padding)
    # Shapes must match before comparing values batch by batch.
    tvm.testing.assert_allclose(micro_output_nchw_np.shape, topi_output_np.shape)
    for i in range(micro_output_nchw_np.shape[0]):
        tvm.testing.assert_allclose(micro_output_nchw_np[i], topi_output_np[i])
        print('ok', micro_output_nchw_np[i])
def evaluate_func(sch, args, N, H, W, CO, CI, KH, KW):
    # `target`, `strides`, and `padding` are expected to be defined at module scope.
    func = tvm.build(sch, args, target)

    # Check correctness
    data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
    weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
    conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding)
    out_np = conv_np

    ctx = tvm.gpu()
    data_tvm = tvm.nd.array(data_np, ctx=ctx)
    weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
    out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
    func(data_tvm, weight_tvm, out_tvm)

    # Check results
    np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

    # Evaluate execution time: measure once and reuse the median.
    evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=50)
    median_time = np.median(evaluator(data_tvm, weight_tvm, out_tvm).results)
    print("Execution time of this operator: %.3f ms" % (median_time * 1000))
    return median_time
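# Hedged usage sketch: the target, strides, padding, and shapes below are assumed
# for illustration; `sch` and `args` would come from the GPU schedule under test.
# target = tvm.target.Target("cuda")
# strides, padding = (1, 1), (1, 1)
# cost = evaluate_func(sch, args, N=1, H=7, W=7, CO=512, CI=512, KH=3, KW=3)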
# cooperative fetching, unrolling and operator fusion.
print(tvm.lower(sch, args, simple_mode=True))

######################################################################
# Check correctness and evaluate performance
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# We build the binary and check its correctness and performance.

func = tvm.build(sch, args, target)

# Check correctness
data_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
weight_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
bias_np = np.random.uniform(size=(1, CO, 1, 1)).astype(np.float32)
conv_np = conv2d_nchw_python(data_np, weight_np, strides, padding)
out_np = np.maximum(conv_np + bias_np, 0.0)

ctx = tvm.gpu()
data_tvm = tvm.nd.array(data_np, ctx=ctx)
weight_tvm = tvm.nd.array(weight_np, ctx=ctx)
bias_tvm = tvm.nd.array(bias_np, ctx=ctx)
out_tvm = tvm.nd.empty(out_np.shape, ctx=ctx)
func(data_tvm, weight_tvm, bias_tvm, out_tvm)

# Check results
np.testing.assert_allclose(out_np, out_tvm.asnumpy(), rtol=1e-3)

# Evaluate execution time
evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500)
print(
    "Execution time of this operator: %.3f ms"
    % (np.median(evaluator(data_tvm, weight_tvm, bias_tvm, out_tvm).results) * 1000)
)
# Format the input/output arrays with tvm.nd.array to the DLPack standard
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)

# Clear stats
if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()

# Invoke the module to perform the computation
f(data_nd, kernel_nd, res_nd)

# Verify against numpy implementation
res_ref = conv2d_nchw_python(
    data_np.astype(env.acc_dtype),
    kernel_np.astype(env.acc_dtype),
    (stride_h, stride_w),
    (pad_h, pad_w),
).astype(env.acc_dtype)
# Apply the same fixed-point right shift and clipping as the hardware pipeline,
# then repack the NCHW reference into the blocked layout used by VTA.
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape(
    (
        batch_size // env.BATCH,
        env.BATCH,
        out_channels // env.BLOCK_OUT,
        env.BLOCK_OUT,
        fout_height,
        fout_width,
    )
).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.asnumpy())

# Print stats
if env.TARGET in ["sim", "tsim"]:
    sim_stats = simulator.stats()
# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best("conv2d.log"):
    with tvm.target.Target("cuda"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.gpu()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)
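# Optional hedged sketch: convert the measured mean latency into achieved GFLOPS.
# The FLOP count assumes stride 1 and padding that keeps the output at H x W,
# as in the workload tuned above; adjust the output size otherwise.
# flops = 2.0 * N * CO * H * W * CI * KH * KW
# print("Achieved: %.2f GFLOPS" % (flops / evaluator(a_tvm, w_tvm, c_tvm).mean / 1e9))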
def validate(workload, target, dev, input_shapes, *args, **kwargs):
    s, placeholders = workload(*input_shapes, *args, **kwargs)
    func = tvm.driver.build(s, [*placeholders], target=target, name="TestFunction")

    args_tvm = []
    args_np = []
    for var in placeholders[:-1]:
        var_np = np.random.uniform(size=[i.value for i in var.shape]).astype(var.dtype)
        args_np.append(var_np)
        args_tvm.append(tvm.nd.array(var_np, dev))
    args_tvm.append(
        tvm.nd.array(
            np.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype),
            dev,
        )
    )
    func(*args_tvm)

    if "plus_one" in workload.__name__:
        np_result = args_np[0] + 1.0
    elif "matmul" in workload.__name__:
        if "inner" in workload.__name__:
            np_result = np.matmul(
                args_np[0].reshape(32, 256), args_np[1].reshape(32, 256).transpose(1, 0)
            )
        elif "accum" in workload.__name__:
            np_result = np.matmul(
                args_np[0].transpose((1, 0, 2)).reshape(64, 128), args_np[1].reshape(128, 64)
            )
        else:
            np_result = np.matmul(
                args_np[0].transpose((0, 2, 1)).reshape(128, 64),
                args_np[1].transpose(1, 0, 2).reshape(64, 128),
            )
    elif "conv2d_1x1_NCHWc_RSCKk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # nchwc -> nchw
        args_np[0] = (
            args_np[0]
            .transpose((0, 1, 4, 2, 3))
            .reshape(
                args_np[0].shape[0],
                args_np[0].shape[1] * args_np[0].shape[-1],
                args_np[0].shape[2],
                args_np[0].shape[3],
            )
        )
        # rsckk -> rsck -> kcrs
        args_np[1] = (
            args_np[1]
            .reshape(
                args_np[1].shape[0],
                args_np[1].shape[1],
                args_np[1].shape[2],
                args_np[1].shape[3] * args_np[1].shape[4],
            )
            .transpose((3, 2, 0, 1))
        )
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
        # nkhw -> nkhwk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(0, 1, 3, 4, 2)
    elif "conv2d_1x1_WCHNc_CRSKk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # wchnc -> nchw
        args_np[0] = (
            args_np[0]
            .transpose((3, 1, 4, 2, 0))
            .reshape(
                args_np[0].shape[3],
                args_np[0].shape[1] * args_np[0].shape[-1],
                args_np[0].shape[2],
                args_np[0].shape[0],
            )
        )
        # crskk -> crsk -> kcrs
        args_np[1] = (
            args_np[1]
            .reshape(
                args_np[1].shape[0],
                args_np[1].shape[1],
                args_np[1].shape[2],
                args_np[1].shape[3] * args_np[1].shape[4],
            )
            .transpose((3, 0, 1, 2))
        )
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
        # nkhw -> nkkhw -> wkhnk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(4, 1, 3, 0, 2)
    elif "NCHW_KCRS" in workload.__name__:
        np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
    elif "NCHWc_KCRSk" in workload.__name__:
        vec_length = args_np[1].shape[-1]
        # nchwc -> nchw
        args_np[0] = (
            args_np[0]
            .transpose((0, 1, 4, 2, 3))
            .reshape(
                args_np[0].shape[0],
                args_np[0].shape[1] * args_np[0].shape[-1],
                args_np[0].shape[2],
                args_np[0].shape[3],
            )
        )
        # kcrsk/cmrsc -> kcrs/cmrs
        args_np[1] = (
            args_np[1]
            .transpose((0, 4, 1, 2, 3))
            .reshape(
                args_np[1].shape[0] * args_np[1].shape[4],
                args_np[1].shape[1],
                args_np[1].shape[2],
                args_np[1].shape[3],
            )
        )
        if "depthwise" in workload.__name__:
            # np_result = testing.depthwise_conv2d_python_nchw(args_np[0], args_np[1], 1, "VALID")
            np_result = ref_depthwise_convolution(args_np[0], args_np[1], [], [])
        else:
            # np_result = testing.conv2d_nchw_python(args_np[0], args_np[1], 1, 0)
            np_result = ref_convolution(args_np[0], args_np[1], [], [])
        # nkhw -> nkhwk
        np_result = np_result.reshape(
            np_result.shape[0],
            np_result.shape[1] // vec_length,
            vec_length,
            np_result.shape[2],
            np_result.shape[3],
        ).transpose(0, 1, 3, 4, 2)

    np.testing.assert_allclose(args_tvm[-1].asnumpy(), np_result, rtol=1e-2, atol=1e-2)
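# Hedged usage sketch: the workload factory, target, device, and shape below are
# assumed for illustration only; validate() selects the NumPy reference by
# substring-matching the workload's __name__ (e.g. "plus_one", "matmul", ...).
# validate(compute_plus_one, "llvm", tvm.cpu(0), [(1024,)])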