def test_fused_mul_div_rsqrt_mul_isfinite_red(shape, dtype='float32', poly_sch=False):
    """Build, launch, verify and profile the fused mul/div/rsqrt/mul/isfinite-reduce kernel.

    Args:
        shape: shape used for both kernel inputs and for the three tensor outputs.
        dtype: dtype used for both kernel inputs and for the three tensor outputs.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if any of the four outputs does not match the
            expectation within the tolerances.
    """
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands)
    shapes = [shape, shape]
    dtypes = [dtype, dtype]
    if poly_sch:
        mod = utils.op_build(fused_mul_div_rsqrt_mul_isfinite_red_auto, shapes, dtypes,
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_mul_div_rsqrt_mul_isfinite_red_manual, shapes, dtypes)

    # One boolean flag buffer followed by the same NaN-filled buffer three times.
    flag_buf = np.full((1, ), False, 'bool')
    nan_buf = np.full(shape, np.nan, dtype)
    out_bufs = [flag_buf, nan_buf, nan_buf, nan_buf]
    out_indexes = list(range(-len(out_bufs), 0))
    output = utils.mod_launch(mod, [*operands, *out_bufs], outputs=out_indexes, expect=expect)

    # Evaluate every comparison (no short-circuit) before combining them.
    checks = [compare_tensor(output[idx], expect[idx], rtol=5e-03, atol=1.e-08)
              for idx in range(4)]
    ret = all(checks)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data = to_tvm_nd_array(operands)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, *expect, 400)
def test_fused_bn_double_follow_relu(in_shape, in_dtype='float16', layout='NHWC', out_dtype='float16',
                                     poly_sch=False):
    """Build, launch, verify and profile the fused bn-double-follow-relu kernel.

    Args:
        in_shape: forwarded to ``gen_data`` to create the inputs.
        in_dtype: dtype of the 5th and 10th kernel inputs.
        layout: must be ``"NHWC"`` or ``"NCHW"``.
        out_dtype: forwarded to ``gen_data`` and to the kernel as an op attribute.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        NotImplementedError: if ``layout`` is neither ``"NHWC"`` nor ``"NCHW"``.
        AssertionError: if the output does not match the expectation.
    """
    if layout not in ("NHWC", "NCHW"):
        raise NotImplementedError('Layout not supported {} '.format(layout))

    inter_dtype = 'float32'
    inputs, out_buf, expect = gen_data(in_shape, in_dtype, inter_dtype, layout, out_dtype)
    shapes = [tensor.shape for tensor in inputs]
    # Two identical groups: four intermediate-precision tensors, then one in_dtype tensor.
    group_dtypes = [inter_dtype] * 4 + [in_dtype]
    dtypes = group_dtypes + group_dtypes
    op_attrs = [layout, out_dtype]

    if poly_sch:
        mod = utils.op_build(fused_bn_double_follow_relu_auto, shapes, dtypes,
                             op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_bn_double_follow_relu_manual, shapes, dtypes,
                             op_attrs=op_attrs)

    launch_args = inputs + [out_buf]
    result = utils.mod_launch(mod, launch_args, expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
def test_fused_pad(shape, pad_before, pad_after, layout='NHWC', pad_value=0.0, poly_sch=False):
    """Build, launch, verify and profile the fused pad kernel on a float32 input.

    Args:
        shape: shape of the single kernel input.
        pad_before, pad_after, layout, pad_value: passed to the kernel as op
            attributes and to ``gen_data``.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    op_attrs = [pad_before, pad_after, layout, pad_value]
    kernel = fused_pad_auto if poly_sch else fused_pad_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, [shape], ['float32'], op_attrs=op_attrs, **extra)

    data, out_buf, expect = gen_data(shape, pad_before, pad_after, layout, pad_value)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data = to_tvm_nd_array(data)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, data, expect, 400)
def test_fused_bn_reduce(in_shape, in_dtype='float16', layout='NHWC', out_dtype='float32', poly_sch=False):
    """Build, launch, verify and profile the fused bn-reduce kernel.

    Args:
        in_shape: shape of the single kernel input.
        in_dtype: dtype of the single kernel input.
        layout: must be ``"NHWC"`` or ``"NCHW"``.
        out_dtype: passed to the kernel as an op attribute and to ``gen_data``.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``layout`` is unsupported, or if ``poly_sch``
            is False (no module would be built in that case).
        AssertionError: if the outputs do not match the expectation.
    """
    if layout != "NHWC" and layout != "NCHW":
        raise NotImplementedError('Layout not supported {} '.format(layout))
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_fused_bn_reduce only supports poly_sch=True")

    op_attrs = [layout, out_dtype]
    mod = utils.op_build_test(fused_bn_reduce, [in_shape], [in_dtype], kernel_name="fused_bn_reduce",
                              op_attrs=op_attrs, attrs={"target": "cuda"})

    data, outputs, expect = gen_data(in_shape, in_dtype, layout, out_dtype)
    inputs = [data]
    arglist = inputs + outputs
    # All trailing arguments are output buffers, addressed by negative index.
    output = utils.mod_launch(mod, arglist, outputs=tuple(range(-len(outputs), 0)), expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def test_fused_relu_grad_bn_double_reduce_grad(shape, out_shape, dtype="float32", layout="NHWC",
                                               out_dtype="float16", poly_sch=False):
    """Build, launch, verify and profile the fused relu-grad/bn-double-reduce-grad kernel.

    The kernel takes 16 inputs; each one uses either (``shape``, ``dtype``) or
    (``out_shape``, ``out_dtype``), in the pattern 5/1/3/1/3/3.

    Args:
        shape: shape of the ``dtype`` inputs.
        out_shape: shape of the ``out_dtype`` inputs.
        dtype: dtype paired with ``shape``.
        layout: passed to the kernel as an op attribute.
        out_dtype: dtype paired with ``out_shape``; also an op attribute.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the outputs do not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError(
            "test_fused_relu_grad_bn_double_reduce_grad only supports poly_sch=True")

    shape_list = [shape] * 5 + [out_shape] + [shape] * 3 + [out_shape] + [shape] * 3 + [out_shape] * 3
    dtype_list = [dtype] * 5 + [out_dtype] + [dtype] * 3 + [out_dtype] + [dtype] * 3 + [out_dtype] * 3
    op_attrs = [layout, out_dtype]
    mod = utils.op_build_test(
        fused_relu_grad_bn_double_reduce_grad, shape_list, dtype_list, op_attrs=op_attrs,
        kernel_name="fused_relu_grad_bn_double_reduce_grad", attrs={"target": "cuda"})

    inshp_data, outshp_data, output, expect = gen_data(shape, out_shape, dtype, out_dtype)
    # Input tensors follow the same 5/1/3/1/3/3 pattern as shape_list.
    inputs = ([inshp_data] * 5 + [outshp_data] + [inshp_data] * 3 + [outshp_data]
              + [inshp_data] * 3 + [outshp_data] * 3)
    outputs = [output, output]
    arg_list = inputs + outputs
    outputs = utils.mod_launch(mod, arg_list, outputs=tuple(range(-len(outputs), 0)), expect=expect)

    res = np.allclose(outputs, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def test_fused_relu_grad_bn_update_grad(shape, out_shape, dtype="float16", layout="NHWC",
                                        out_dtype="float32", poly_sch=False):
    """Build, launch, verify and profile the fused relu-grad/bn-update-grad kernel.

    Args:
        shape: shape of the last three kernel inputs.
        out_shape: shape of the first kernel input.
        dtype: dtype of the last three kernel inputs.
        layout: passed to the kernel as its only op attribute.
        out_dtype: dtype of the first kernel input.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the outputs do not match the expectation.
    """
    shapes = [out_shape] + [shape] * 3
    dtypes = [out_dtype] + [dtype] * 3
    op_attrs = [layout]
    kernel = fused_relu_grad_bn_update_grad_auto if poly_sch else fused_relu_grad_bn_update_grad_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, shapes, dtypes, op_attrs=op_attrs, **extra)

    head, data_sum, in_bn, in_active, out_buf, expect = gen_data(shape, out_shape, dtype, out_dtype, layout)
    out_bufs = [out_buf, out_buf]
    inputs = [data_sum, in_bn, head, in_active]
    launch_args = inputs + out_bufs
    out_indexes = tuple(range(-len(out_bufs), 0))
    results = utils.mod_launch(mod, launch_args, outputs=out_indexes, expect=expect)

    res = np.allclose(results, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def test_ms_reduce_max(in_shape, in_dtype, axis=None, keepdims=False, poly_sch=False):
    """Build, launch, verify and profile the reduce_max kernel.

    Args:
        in_shape: shape of the single kernel input.
        in_dtype: dtype of the single kernel input.
        axis, keepdims: passed to the kernel as op attributes and to ``gen_data``.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_ms_reduce_max only supports poly_sch=True")

    mod = utils.op_build_test(reduce_max, (in_shape, ), (in_dtype, ), op_attrs=[axis, keepdims],
                              kernel_name="reduce_max",
                              attrs={
                                  "target": "cuda",
                                  "enable_akg_reduce_lib": True,
                                  "enable_atomic_add": True,
                              })

    data, output, expect = gen_data(in_shape, in_dtype, axis, keepdims)
    args = (data, output)
    output = utils.mod_launch(mod, args, expect=expect)

    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_fused_relu_grad(shape, c1=0, poly_sch=False):
    """Build, launch, verify and profile the fused relu-grad kernel on float16 data.

    Args:
        shape: shape of each of the three kernel inputs and of the output.
        c1: passed to ``compute_expect`` and to the kernel as its only op attribute.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    dtype = 'float16'
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands, c1)

    shapes = [shape] * 3
    dtypes = [dtype] * 3
    op_attrs = [c1]
    kernel = fused_relu_grad_auto if poly_sch else fused_relu_grad_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, shapes, dtypes, op_attrs=op_attrs, **extra)

    # NaN-filled buffer handed to the kernel as the output argument.
    out_buf = np.full(shape, np.nan, dtype)
    result = utils.mod_launch(mod, (*operands, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-3, atol=1e-8)
    print("Test {}".format("Pass" if res else "Failed"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data = to_tvm_nd_array(operands)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, expect, 400)
def get_result(desc, poly, attrs=None):
    """Build a composite op from a JSON description, run it and compare with the expectation.

    Args:
        desc: JSON string describing the op; its ``"process"`` field decides
            whether GPU profiling runs.
        poly: forwarded to ``composite.build``; when truthy,
            ``"enable_akg_reduce_lib"`` is added to ``attrs`` if absent.
        attrs: optional build attribute dict. It is modified in place when
            ``poly`` is truthy. ``None`` is treated as an empty dict on the
            poly path.

    Returns:
        True if every output matches its expectation within the "FUSED"
        float32 tolerances, otherwise False.
    """
    if poly:
        # The default attrs=None used to crash on `attrs.keys()`.
        if attrs is None:
            attrs = {}
        reduce_lib_key = "enable_akg_reduce_lib"
        if reduce_lib_key not in attrs:
            attrs[reduce_lib_key] = poly

    if attrs == {}:
        # No attributes supplied: fall back to a fixed "dim" setting.
        mod = composite.build(desc, {'dim': "0 0 9728 9728"}, poly=poly)
    else:
        mod = composite.build(desc, attrs, poly=poly)

    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)

    rtol, atol = get_rtol_atol("FUSED", "float32")
    if len(output_indexes) > 1:
        # Multiple outputs: compare pairwise, stopping at the first mismatch.
        matched = all(map(lambda x, y: compare_tensor(x, y, rtol=rtol, atol=atol), output, expect))
    else:
        matched = compare_tensor(output, expect, rtol=rtol, atol=atol)
    flag = bool(matched)
    if not flag:
        logging.info(mod.imported_modules[0].get_source())

    # Profiling runs for cuda descriptions whether or not the comparison passed.
    desc_d = json.loads(desc)
    if desc_d["process"] == "cuda":
        inputs = to_tvm_nd_array(input_for_mod)
        expect = to_tvm_nd_array(expect)
        gpu_profiling(mod, *inputs, *expect, repeat_time=400)
    return flag
def test_fused_l2loss_grad(shape, layout, fill_data=4e-05, poly_sch=False):
    """Build, launch, verify and profile the fused l2loss-grad kernel.

    Args:
        shape: shape of both kernel inputs (one float16, one float32).
        layout, fill_data: passed to ``compute_py`` and to the kernel as op attributes.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_fused_l2loss_grad only supports poly_sch=True")

    data_1 = gen_data(shape, 'float16')
    data_2 = gen_data(shape, 'float32')
    expect, output = compute_py(data_1, data_2, layout, fill_data)

    input_list = [shape, shape]
    dtype_list = ['float16', 'float32']
    op_attrs = [layout, fill_data]
    mod = utils.op_build_test(fused_l2loss_grad, input_list, dtype_list, kernel_name="fused_l2loss_grad",
                              op_attrs=op_attrs, attrs={"target": "cuda"})

    args = [data_1, data_2, output]
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data = to_tvm_nd_array([data_1, data_2])
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, expect, 400)
def get_result(desc, poly, attrs=None):
    """Build a composite op from a JSON description, run it and compare with the expectation.

    Args:
        desc: JSON string describing the op; the backend is derived from it
            via ``_get_backend``.
        poly: forwarded to ``composite.build``; when truthy,
            ``"enable_akg_reduce_lib"`` is added to ``attrs`` if absent.
        attrs: optional build attribute dict. When it is empty/None and the
            backend is "cuda" it is replaced by the result of
            ``_add_attrs_from_json``. It is modified in place when ``poly`` is
            truthy.

    Returns:
        True if every output matches its expectation, otherwise False. On a
        mismatch the generated source is logged and profiling is skipped.
    """
    backend = _get_backend(desc)
    if backend == "cuda" and not attrs:
        attrs = _add_attrs_from_json(desc, attrs, poly)
    if poly:
        # attrs may still be None here (e.g. non-cuda backend with the default
        # argument); that used to crash on `attrs.keys()`.
        if attrs is None:
            attrs = {}
        reduce_lib_key = "enable_akg_reduce_lib"
        if reduce_lib_key not in attrs:
            attrs[reduce_lib_key] = poly

    # An empty attribute dict is passed to the builder as None.
    build_attrs = attrs if attrs else None
    mod = composite.build(desc, build_attrs, poly=poly)

    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)

    # Normalise single results to one-element lists so both cases compare pairwise.
    output_seq = output if isinstance(output, (list, tuple)) else [output]
    expect_seq = expect if isinstance(expect, (list, tuple)) else [expect]
    if not all(map(_compare_func, output_seq, expect_seq)):
        logging.info(mod.imported_modules[0].get_source())
        return False

    if backend == "cuda":
        inputs = to_tvm_nd_array(input_for_mod)
        expect = to_tvm_nd_array(expect)
        gpu_profiling(mod, *inputs, *expect, repeat_time=400)
    return True
def get_gpu_cycles(mod, *mod_args, device_id=0, save_log=False):
    """Profile ``mod`` on the GPU and return the cycle count as an int.

    Calls the ``GPUProfilerInit`` global function, runs ``gpu_profiling`` with
    ``repeat_time=400``, then returns the integer value produced by the
    ``GPUProfilerStop`` global function.

    Args:
        mod: module to profile.
        *mod_args: positional arguments forwarded to ``gpu_profiling``.
        device_id: forwarded to ``gpu_profiling``.
        save_log: accepted but not used by this function.
    """
    start_profiler = tvm.get_global_func('GPUProfilerInit')
    start_profiler("")

    from akg.utils.result_analysis import gpu_profiling
    gpu_profiling(mod, *mod_args, repeat_time=400, device_id=device_id)

    stop_profiler = tvm.get_global_func('GPUProfilerStop')
    cycles = stop_profiler()
    return int(cycles)
def test_fused_relu_grad_bn_reduce_grad(shape_1, shape_2, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused relu-grad/bn-reduce-grad kernel.

    The kernel takes nine inputs: six float32 tensors of ``shape_1`` followed
    by three float16 tensors of ``shape_2``.

    Args:
        shape_1: shape of the six float32 inputs.
        shape_2: shape of the three float16 inputs.
        layout: passed to ``compute_py`` and to the kernel as its only op attribute.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    # Generation order matches the kernel's argument order.
    f32_inputs = [gen_data(shape_1, 'float32') for _ in range(6)]
    f16_inputs = [gen_data(shape_2, 'float16') for _ in range(3)]
    operands = f32_inputs + f16_inputs
    expect, out_buf = compute_py(*operands, layout)

    input_list = [shape_1] * 6 + [shape_2] * 3
    dtype_list = ['float32'] * 6 + ['float16'] * 3
    op_attrs = [layout]
    if poly_sch:
        mod = utils.op_build_test(fused_relu_grad_bn_reduce_grad_auto, input_list, dtype_list,
                                  kernel_name="fused_relu_grad_bn_reduce_grad_auto",
                                  op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(fused_relu_grad_bn_reduce_grad_manual, input_list, dtype_list,
                                  kernel_name="fused_relu_grad_bn_reduce_grad_manual",
                                  op_attrs=op_attrs)

    launch_args = [*operands, out_buf]
    result = utils.mod_launch(mod, launch_args, expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1e-08)
    print("Test {}".format("Pass" if res else "Failed"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(operands)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
def test_fused_relu_grad_bn_double_update_grad(shape_f16, shape_f32, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused relu-grad/bn-double-update-grad kernel.

    The kernel takes seven inputs with dtypes
    float32, float16, float32, float16, float16, float16, float16 and
    produces three float32 outputs.

    Args:
        shape_f16: shape of every float16 input.
        shape_f32: shape of every float32 input.
        layout: passed to ``compute_py`` and to the kernel as its only op attribute.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if any of the three outputs does not match its expectation.
    """
    dtype_list = ['float32', 'float16', 'float32'] + ['float16'] * 4
    shape_of = {'float32': shape_f32, 'float16': shape_f16}
    shape_list = [shape_of[dt] for dt in dtype_list]
    # Inputs are generated one by one in kernel-argument order.
    data_list = [gen_data(shp, dt) for shp, dt in zip(shape_list, dtype_list)]

    data_tmp7, data_tmp15, data_tmp22, out_shape = compute_py(*data_list, layout)
    expect = [data_tmp7, data_tmp15, data_tmp22]
    # The same NaN-filled buffer is passed for each of the three outputs.
    nan_buf = np.full(out_shape, np.nan, 'float32')
    out_bufs = [nan_buf, nan_buf, nan_buf]

    kernel = (fused_relu_grad_bn_double_update_grad_auto if poly_sch
              else fused_relu_grad_bn_double_update_grad_manual)
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, shape_list, dtype_list, op_attrs=[layout], **extra)

    output = utils.mod_launch(mod, (*data_list, *out_bufs),
                              outputs=tuple(range(-len(out_bufs), 0)), expect=expect)

    # Evaluate all three comparisons before combining them.
    checks = [np.allclose(output[idx], expect[idx], rtol=5e-03, atol=1e-8) for idx in range(3)]
    res = all(checks)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data_list = to_tvm_nd_array(data_list)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data_list, *expect, 400)
def test_ms_add(shape1, shape2, dtype, poly_sch=False):
    """Build, launch, verify and profile the add kernel.

    Args:
        shape1: shape of the left-hand input.
        shape2: shape of the right-hand input.
        dtype: dtype of both inputs.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_ms_add only supports poly_sch=True")

    mod = utils.op_build_test(add, (shape1, shape2), (dtype, dtype), kernel_name="add",
                              attrs={"target": "cuda"})

    lhs, rhs, output, expect = gen_data(shape1, shape2, dtype)
    output = utils.mod_launch(mod, (lhs, rhs, output), expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    lhs, rhs, expect = to_tvm_nd_array([lhs, rhs, expect])
    gpu_profiling(mod, lhs, rhs, expect, 400)
def test_ms_tile(shape, multiples, dtype, poly_sch=False):
    """Build, launch, verify and profile the tile kernel.

    Args:
        shape: shape of the single kernel input.
        multiples: passed to the kernel as its only op attribute and to ``gen_data``.
        dtype: dtype of the single kernel input.
        poly_sch: when True ``tile_auto`` is built with ``{"target": "cuda"}``;
            otherwise ``tile_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation
            (NaNs compare equal).
    """
    if poly_sch:
        mod = utils.op_build_test(tile_auto, [shape], [dtype], op_attrs=[multiples],
                                  kernel_name="tile_auto", attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(tile_manual, [shape], [dtype], op_attrs=[multiples],
                                  kernel_name="tile_manual")

    data, out_buf, expect = gen_data(shape, multiples, dtype)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    ret = compare_tensor(result, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_round(shape, dtype, poly_sch=False):
    """Build, launch, verify and profile the round kernel.

    Args:
        shape: shape of the single kernel input.
        dtype: dtype of the single kernel input.
        poly_sch: when True ``round_auto`` is built with ``{"target": "cuda"}``;
            otherwise ``round_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    if poly_sch:
        mod = utils.op_build_test(round_auto, [shape], [dtype], attrs={"target": "cuda"},
                                  kernel_name="round_auto")
    else:
        mod = utils.op_build_test(round_manual, [shape], [dtype], kernel_name="round_manual")

    data, out_buf, expect = gen_data(shape, dtype)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_log(in_shape, in_dtype, poly_sch=False):
    """Build, launch, verify and profile the log kernel.

    Args:
        in_shape: shape of the single kernel input.
        in_dtype: dtype of the single kernel input.
        poly_sch: when True ``log_auto`` is built with ``{"target":"cuda"}``;
            otherwise ``log_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    if poly_sch:
        mod = utils.op_build_test(log_auto, (in_shape, ), (in_dtype, ), kernel_name="log_auto",
                                  attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(log_manual, (in_shape, ), (in_dtype, ), kernel_name="log_manual")

    data, out_buf, expect = gen_data(in_shape, in_dtype)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    # Absolute tolerance is 1e-7 here (changed from 1e-8).
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-7)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_bmm(shape1, shape2, dtype, out_dtype="float32", layout1="NHDT", layout2="NHDT", layout_out="NHDT",
                shape_bias=None, add_bias=False, tensor_core=True, poly_sch=False, dim="", bind_block="",
                bind_thread=""):
    """Build, launch, verify and profile the batch_matmul kernel.

    Args:
        shape1, shape2: shapes of the two matrix inputs (both of ``dtype``).
        dtype: dtype of the two matrix inputs.
        out_dtype: dtype declared for the bias input; also an op attribute.
        layout1, layout2, layout_out: passed as op attributes and to ``gen_data``.
        shape_bias: shape declared for the third (bias) input.
        add_bias: passed as an op attribute and to ``gen_data``.
        tensor_core: op attribute; also sets the ``pragma_enable_tensor_core``
            build attribute.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.
        dim, bind_block, bind_thread: forwarded as build attributes.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_ms_bmm only supports poly_sch=True")

    op_attrs = [out_dtype, layout1, layout2, layout_out, tensor_core, add_bias]
    build_attrs = {
        "target": "cuda",
        "use_shared_memory": True,
        "pragma_enable_tensor_core": tensor_core,
        "enable_auto_fuse": False,
        "dim": dim,
        "bind_block": bind_block,
        "bind_thread": bind_thread,
        "vector_load_type": "float4",
        "pragma_enable_matmul": True,
    }
    mod = utils.op_build_test(batch_matmul, (shape1, shape2, shape_bias), (dtype, dtype, out_dtype),
                              op_attrs=op_attrs, attrs=build_attrs, kernel_name="batch_matmul")

    lhs, rhs, bias, output, expect = gen_data(shape1, shape2, dtype, out_dtype, layout1, layout2,
                                              layout_out, shape_bias, add_bias)
    args = (lhs, rhs, bias, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    lhs, rhs, bias, expect = to_tvm_nd_array([lhs, rhs, bias, expect])
    gpu_profiling(mod, lhs, rhs, bias, expect, repeat_time=10000)
def test_expand_dims(shape1, axis, dtype, poly_sch=False):
    """Build, launch, verify and profile the expand_dims kernel.

    Args:
        shape1: shape of the single kernel input.
        axis: passed to the kernel as its only op attribute and to ``gen_data``.
        dtype: dtype of the single kernel input.
        poly_sch: when True ``expand_dims_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``expand_dims_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    if poly_sch:
        mod = utils.op_build_test(expand_dims_auto, [shape1], [dtype], op_attrs=[axis],
                                  attrs={"target": "cuda"}, kernel_name="expand_dims_auto")
    else:
        mod = utils.op_build_test(expand_dims_manual, [shape1], [dtype], op_attrs=[axis],
                                  kernel_name="expand_dims_manual")

    expect, data, out_buf = gen_data(axis, dtype, shape1)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_minimum(shape1, shape2, dtype, poly_sch=False):
    """Build, launch, verify and profile the minimum kernel.

    Args:
        shape1: shape of the left-hand input.
        shape2: shape of the right-hand input.
        dtype: dtype of both inputs.
        poly_sch: when True ``minimum_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``minimum_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    shapes = (shape1, shape2)
    dtypes = (dtype, dtype)
    kernel = minimum_auto if poly_sch else minimum_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, shapes, dtypes, **extra)

    lhs, rhs, out_buf, expect = gen_data(shape1, shape2, dtype)
    result = utils.mod_launch(mod, (lhs, rhs, out_buf), expect=expect)
    res = compare_tensor(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    lhs, rhs, expect = to_tvm_nd_array([lhs, rhs, expect])
    gpu_profiling(mod, lhs, rhs, expect, 400)
def test_ms_cast(shape, srcType, dstType, poly_sch=False):
    """Build, launch, verify and profile the cast kernel.

    Args:
        shape: shape of the single kernel input.
        srcType: dtype of the kernel input.
        dstType: passed (as ``[dstType]``) as the fourth positional argument
            of ``utils.op_build_test`` and to ``gen_data``.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation.
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_ms_cast only supports poly_sch=True")

    mod = utils.op_build_test(cast, [shape], [srcType], [dstType], attrs={"target": "cuda"},
                              kernel_name="cast")

    output, expect, inputs = gen_data(shape, srcType, dstType)
    output = utils.mod_launch(mod, (inputs, output), expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs, expect = to_tvm_nd_array([inputs, expect])
    gpu_profiling(mod, inputs, expect, 400)
def test_ms_equal(shapes, dtype, poly_sch=False):
    """Build, launch, verify and profile the equal kernel.

    The kernel is always run on the generated input pair. When both shapes
    are identical it is run a second time with the first input compared
    against itself, and both results must match.

    Args:
        shapes: the two input shapes; ``shapes[0] == shapes[1]`` enables the
            second (self-comparison) run.
        dtype: dtype of both inputs.
        poly_sch: when True ``equal_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``equal_manual`` is built.

    Raises:
        AssertionError: if any checked output does not match its expectation.
    """
    if poly_sch:
        mod = utils.op_build_test(equal_auto, shapes, [dtype, dtype], kernel_name="equal_auto",
                                  attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(equal_manual, shapes, [dtype, dtype], kernel_name="equal_manual")

    inputs1, output1, expect1 = gen_data(shapes, dtype)
    output1 = utils.mod_launch(mod, (*inputs1, output1), expect=expect1)

    same_shape = shapes[0] == shapes[1]
    if same_shape:
        # Second run: compare the first input with itself.
        inputs2 = [inputs1[0], inputs1[0]]
        expect2 = np.equal(inputs2[0], inputs2[1])
        output2 = np.full(expect2.shape, 0, bool)
        # This launch previously received expect1, the expectation of the first run.
        output2 = utils.mod_launch(mod, (*inputs2, output2), expect=expect2)
        res = np.allclose(output1, expect1, rtol=5e-03, atol=1.e-8) and np.allclose(
            output2, expect2, rtol=5e-03, atol=1.e-8)
    else:
        res = np.allclose(output1, expect1, rtol=5e-03, atol=1.e-8)

    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs1 = to_tvm_nd_array(inputs1)
    if same_shape:
        inputs2 = to_tvm_nd_array(inputs2)
        expect1 = to_tvm_nd_array(expect1)
        expect2 = to_tvm_nd_array(expect2)
        gpu_profiling(mod, *inputs1, expect1, *inputs2, expect2, 400)
    else:
        expect1 = to_tvm_nd_array(expect1)
        gpu_profiling(mod, *inputs1, expect1, 400)
def test_ms_trans_data(shape, axes, dtype, poly_sch=False):
    """Build, launch, verify and profile the trans_data kernel.

    Args:
        shape: shape of the single kernel input.
        axes: passed to the kernel as its only op attribute and to ``gen_data``.
        dtype: dtype of the single kernel input.
        poly_sch: must be True; only the poly (``target: cuda``) build exists here.

    Raises:
        NotImplementedError: if ``poly_sch`` is False (no module would be built).
        AssertionError: if the output does not match the expectation
            (NaNs compare equal).
    """
    if not poly_sch:
        # Without this guard `mod` was never bound and the launch below
        # crashed with an opaque UnboundLocalError.
        raise NotImplementedError("test_ms_trans_data only supports poly_sch=True")

    mod = utils.op_build_test(trans_data, [shape], [dtype], op_attrs=[axes], kernel_name="trans_data",
                              attrs={"target": "cuda"})

    data, output, expect = gen_data(shape, axes, dtype)
    output = utils.mod_launch(mod, (data, output), expect=expect)
    ret = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        # A mismatch used to be printed only, so the test could never fail;
        # report and raise like the sibling operator tests.
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_rsqrt(shape1, dtype, poly_sch=False):
    """Build, launch, verify and profile the rsqrt kernel.

    Args:
        shape1: shape of the single kernel input.
        dtype: dtype of the single kernel input.
        poly_sch: when True ``rsqrt_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``rsqrt_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    shapes = (shape1,)
    dtypes = (dtype,)
    kernel = rsqrt_auto if poly_sch else rsqrt_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, shapes, dtypes, **extra)

    expect, data, out_buf = gen_data(dtype, shape1)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_fused_is_finite(shape, layout='NHWC', poly_sch=False):
    """Build, launch, verify and profile the fused is-finite kernel on a float32 input.

    Args:
        shape: shape of the single kernel input.
        layout: passed to the kernel as its only op attribute and to ``gen_data``.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    kernel = fused_is_finite_auto if poly_sch else fused_is_finite_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, [shape], ['float32'], op_attrs=[layout], **extra)

    data, expect, out_buf = gen_data(shape, 'float32', layout)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    res = np.allclose(result, expect, rtol=5e-03, atol=1e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_one_hot(shape, depth, dtype, on_value, off_value, axis, poly_sch=False):
    """Build, launch, verify and profile the one_hot kernel.

    Args:
        shape: shape of the single kernel input.
        depth, on_value, off_value, axis: passed to the kernel as op
            attributes and to ``gen_data``.
        dtype: dtype of the kernel input; also the last op attribute.
        poly_sch: when True ``one_hot_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``one_hot_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation
            (NaNs compare equal).
    """
    op_attrs = [on_value, off_value, depth, axis, dtype]
    kernel = one_hot_auto if poly_sch else one_hot_manual
    extra = {"attrs": {"target": "cuda"}} if poly_sch else {}
    mod = utils.op_build(kernel, [shape], [dtype], op_attrs=op_attrs, **extra)

    # The on/off value tensors returned by gen_data are not used here.
    expect, raw_data, _on_tensor, _off_tensor, out_buf = gen_data(axis, depth, dtype, shape,
                                                                  on_value, off_value)
    data = raw_data.astype(dtype)
    result = utils.mod_launch(mod, (data, out_buf), expect=expect)
    ret = compare_tensor(result, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
def test_ms_divide(shape, dtype, poly_sch=False):
    """Build, launch, verify and profile the divide kernel.

    Args:
        shape: shape of both kernel inputs.
        dtype: dtype of both kernel inputs.
        poly_sch: when True ``divide_auto`` is built with
            ``{"target": "cuda"}``; otherwise ``divide_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation
            (NaNs compare equal).
    """
    if poly_sch:
        mod = utils.op_build(divide_auto, [shape, shape], [dtype, dtype], attrs={"target": "cuda"})
    else:
        mod = utils.op_build(divide_manual, [shape, shape], [dtype, dtype])

    lhs, rhs, output, expect = gen_data(shape, dtype)
    output = utils.mod_launch(mod, (lhs, rhs, output), expect=expect)
    ret = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8, equal_nan=True)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:==========================")
        # Was misspelled `get_soure()`, which raised AttributeError and hid
        # the real failure.
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    lhs, rhs, expect = to_tvm_nd_array([lhs, rhs, expect])
    gpu_profiling(mod, lhs, rhs, expect, 400)
def test_ms_addn(shape, dtype, n, poly_sch=False):
    """Build, launch, verify and profile the addn kernel.

    Args:
        shape: shape of each input.
        dtype: dtype of the inputs.
        n: number of inputs; ``shape`` is repeated ``n`` times.
        poly_sch: when True ``addn_auto`` is built with ``{"target": "cuda"}``;
            otherwise ``addn_manual`` is built.

    Raises:
        AssertionError: if the output does not match the expectation.
    """
    shapes = [shape] * n
    # The repeated shapes are passed as one nested entry of the shape list.
    if poly_sch:
        mod = utils.op_build_test(addn_auto, [shapes], [dtype], attrs={"target": "cuda"},
                                  kernel_name="addn_auto")
    else:
        mod = utils.op_build_test(addn_manual, [shapes], [dtype], kernel_name="addn_manual")

    expect, inputs, out_buf = gen_data(shape, shapes, dtype, n)
    result = utils.mod_launch(mod, (*inputs, out_buf), expect=expect)
    res = compare_tensor(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
def test_fused_bn_update(shape, dtype="float32", c1=(1 / (256 * 7 * 7)), c2=1.001e-05, c3=1.00007975,
                         c4=0.100000024, poly_sch=False):
    """Build, launch, verify and profile the fused bn-update kernel.

    Args:
        shape: forwarded to ``gen_data``; also the shape of the three output buffers.
        dtype: dtype of the four inputs and three outputs; also the first op attribute.
        c1, c2, c3, c4: constants passed to ``compute_expect`` and to the
            kernel as op attributes.
        poly_sch: when True the ``_auto`` kernel is built with
            ``{"target": "cuda"}``; otherwise the ``_manual`` kernel is built.

    Raises:
        AssertionError: if the outputs do not match the expectation.
    """
    operands = gen_data(shape, dtype)
    expect = compute_expect(operands, c1, c2, c3, c4)
    op_attrs = [dtype, c1, c2, c3, c4]
    # All four kernel inputs are declared with the shape of the first generated tensor.
    shapes = [operands[0].shape] * 4
    dtypes = [dtype] * 4
    if poly_sch:
        mod = utils.op_build_test(fused_bn_update_auto, shapes, dtypes, kernel_name="fused_bn_update_auto",
                                  op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(fused_bn_update_manual, shapes, dtypes,
                                  kernel_name="fused_bn_update_manual", op_attrs=op_attrs)

    # The same NaN-filled buffer is passed for each of the three outputs.
    out_bufs = [np.full(shape, np.nan, dtype)] * 3
    launch_args = operands + out_bufs
    out_indexes = range(-len(out_bufs), 0)
    result = utils.mod_launch(mod, launch_args, outputs=out_indexes, expect=expect)

    res = np.allclose(result, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Failed"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")

    data = to_tvm_nd_array(operands)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, *expect, 400)