def test_fused_pad(shape, pad_before, pad_after, layout='NHWC', pad_value=0.0, poly_sch=False):
    op_attrs = [pad_before, pad_after, layout, pad_value]
    if poly_sch:
        mod = utils.op_build(fused_pad_auto, [shape], ['float32'], op_attrs=op_attrs,
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_pad_manual, [shape], ['float32'], op_attrs=op_attrs)
    data, output, expect = gen_data(shape, pad_before, pad_after, layout, pad_value)
    args = (data, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data = to_tvm_nd_array(data)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, data, expect, 400)
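For orientation, the expected result above presumably matches NumPy's constant-mode `np.pad`; a minimal self-contained sketch (the NHWC shape and pad widths are made-up, not taken from `gen_data`):

import numpy as np

data = np.random.rand(1, 7, 7, 16).astype(np.float32)  # illustrative NHWC input
pad_before, pad_after, pad_value = (0, 1, 1, 0), (0, 1, 1, 0), 0.0

# pad_before[i]/pad_after[i] constant elements on each side of axis i
expect = np.pad(data, list(zip(pad_before, pad_after)), mode='constant',
                constant_values=pad_value)
print(expect.shape)  # (1, 9, 9, 16)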
def test_fused_l2loss_grad(shape, layout, fill_data=4e-05, poly_sch=False):
    data_1 = gen_data(shape, 'float16')
    data_2 = gen_data(shape, 'float32')
    expect, output = compute_py(data_1, data_2, layout, fill_data)
    input_list = [shape, shape]
    dtype_list = ['float16', 'float32']
    op_attrs = [layout, fill_data]
    if poly_sch:
        mod = utils.op_build_test(fused_l2loss_grad, input_list, dtype_list,
                                  kernel_name="fused_l2loss_grad", op_attrs=op_attrs,
                                  attrs={"target": "cuda"})
    else:
        # only the polyhedral (auto-schedule) build path is provided for this op
        raise ValueError("test_fused_l2loss_grad requires poly_sch=True")
    args = [data_1, data_2, output]
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data = to_tvm_nd_array([data_1, data_2])
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, expect, 400)
def test_fused_relu_grad(shape, c1=0, poly_sch=False):
    dtype = 'float16'
    input = gen_data(shape, dtype)
    expect = compute_expect(input, c1)
    shapes = [shape] * 3
    dtypes = [dtype] * 3
    attrs = [c1]
    if poly_sch:
        mod = utils.op_build(fused_relu_grad_auto, shapes, dtypes, op_attrs=attrs,
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_relu_grad_manual, shapes, dtypes, op_attrs=attrs)
    output = np.full(shape, np.nan, dtype)
    output = utils.mod_launch(mod, (*input, output), expect=expect)
    res = np.allclose(output, expect, rtol=5e-3, atol=1e-8)
    print("Test {}".format("Pass" if res else "Failed"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data = to_tvm_nd_array(input)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, expect, 400)
def test_fused_mul_div_rsqrt_mul_isfinite_red(shape, dtype='float32', poly_sch=False):
    input = gen_data(shape, dtype)
    expect = compute_expect(input)
    input_shape = [shape, shape]
    input_dtype = [dtype, dtype]
    if poly_sch:
        mod = utils.op_build(fused_mul_div_rsqrt_mul_isfinite_red_auto, input_shape, input_dtype,
                             attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_mul_div_rsqrt_mul_isfinite_red_manual, input_shape, input_dtype)
    outputs = [np.full((1,), False, 'bool')] + [np.full(shape, np.nan, dtype)] * 3
    output = utils.mod_launch(mod, [*input, *outputs], outputs=list(range(-len(outputs), 0)),
                              expect=expect)
    ret = compare_tensor(output[0], expect[0], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[1], expect[1], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[2], expect[2], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[3], expect[3], rtol=5e-03, atol=1.e-08)
    print("Test {}".format("Pass" if ret else "Failed"))
    if not ret:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data = to_tvm_nd_array(input)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data, *expect, 400)
def test_fused_bn_reduce(in_shape, in_dtype='float16', layout='NHWC', out_dtype='float32',
                         poly_sch=False):
    if layout != "NHWC" and layout != "NCHW":
        raise NotImplementedError('Layout not supported {} '.format(layout))
    op_attrs = [layout, out_dtype]
    if poly_sch:
        mod = utils.op_build_test(fused_bn_reduce, [in_shape], [in_dtype],
                                  kernel_name="fused_bn_reduce", op_attrs=op_attrs,
                                  attrs={"target": "cuda"})
    else:
        # only the polyhedral (auto-schedule) build path is provided for this op
        raise ValueError("test_fused_bn_reduce requires poly_sch=True")
    data, outputs, expect = gen_data(in_shape, in_dtype, layout, out_dtype)
    inputs = [data]
    arglist = inputs + outputs
    output = utils.mod_launch(mod, arglist, outputs=tuple(range(-len(outputs), 0)), expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def get_result(desc, poly, attrs=None):
    if poly:
        if attrs is None:  # attrs may be None; normalize before inserting the reduce-lib flag
            attrs = {}
        reduce_lib_key = "enable_akg_reduce_lib"
        if reduce_lib_key not in attrs.keys():
            attrs[reduce_lib_key] = poly
    if attrs == {}:
        mod = composite.build(desc, {'dim': "0 0 9728 9728"}, poly=poly)
    else:
        mod = composite.build(desc, attrs, poly=poly)
    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)
    rtol, atol = get_rtol_atol("FUSED", "float32")
    flag = True
    if len(output_indexes) > 1:
        if not all(map(lambda x, y: compare_tensor(x, y, rtol=rtol, atol=atol), output, expect)):
            logging.info(mod.imported_modules[0].get_source())
            flag = False
    else:
        if not compare_tensor(output, expect, rtol=rtol, atol=atol):
            logging.info(mod.imported_modules[0].get_source())
            flag = False
    desc_d = json.loads(desc)
    if desc_d["process"] == "cuda":
        inputs = to_tvm_nd_array(input_for_mod)
        expect = to_tvm_nd_array(expect)
        gpu_profiling(mod, *inputs, *expect, repeat_time=400)
    return flag
def test_fused_relu_grad_bn_update_grad(shape, out_shape, dtype="float16", layout="NHWC",
                                        out_dtype="float32", poly_sch=False):
    shape_list = [out_shape, shape, shape, shape]
    dtype_list = [out_dtype, dtype, dtype, dtype]
    op_attrs = [layout]
    if poly_sch:
        mod = utils.op_build(fused_relu_grad_bn_update_grad_auto, shape_list, dtype_list,
                             op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_relu_grad_bn_update_grad_manual, shape_list, dtype_list,
                             op_attrs=op_attrs)
    head, data_sum, in_bn, in_active, output, expect = gen_data(shape, out_shape, dtype,
                                                                out_dtype, layout)
    outputs = [output, output]
    inputs = [data_sum, in_bn, head, in_active]
    arg_list = inputs + outputs
    outputs = utils.mod_launch(mod, arg_list, outputs=tuple(range(-len(outputs), 0)), expect=expect)
    res = np.allclose(outputs, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def get_result(desc, poly, attrs=None):
    backend = _get_backend(desc)
    if backend == "cuda" and not attrs:
        attrs = _add_attrs_from_json(desc, attrs, poly)
    if poly:
        if attrs is None:  # attrs may still be None on non-cuda backends
            attrs = {}
        reduce_lib_key = "enable_akg_reduce_lib"
        if reduce_lib_key not in attrs.keys():
            attrs[reduce_lib_key] = poly
    build_attrs = attrs if attrs else None
    mod = composite.build(desc, build_attrs, poly=poly)
    input_for_mod, expect, output_indexes = gen_json_data(desc)
    output = utils.mod_launch(mod, input_for_mod, output_indexes)
    if not all(map(_compare_func,
                   output if isinstance(output, (list, tuple)) else [output],
                   expect if isinstance(expect, (list, tuple)) else [expect])):
        logging.info(mod.imported_modules[0].get_source())
        return False
    if backend == "cuda":
        inputs = to_tvm_nd_array(input_for_mod)
        expect = to_tvm_nd_array(expect)
        gpu_profiling(mod, *inputs, *expect, repeat_time=400)
    return True
def test_fused_relu_grad_bn_double_reduce_grad(shape, out_shape, dtype="float32", layout="NHWC",
                                               out_dtype="float16", poly_sch=False):
    shape_list = [shape] * 5 + [out_shape] + [shape] * 3 + [out_shape] + [shape] * 3 + [out_shape] * 3
    dtype_list = [dtype] * 5 + [out_dtype] + [dtype] * 3 + [out_dtype] + [dtype] * 3 + [out_dtype] * 3
    op_attrs = [layout, out_dtype]
    if poly_sch:
        mod = utils.op_build_test(fused_relu_grad_bn_double_reduce_grad, shape_list, dtype_list,
                                  op_attrs=op_attrs,
                                  kernel_name="fused_relu_grad_bn_double_reduce_grad",
                                  attrs={"target": "cuda"})
    else:
        # only the polyhedral (auto-schedule) build path is provided for this op
        raise ValueError("test_fused_relu_grad_bn_double_reduce_grad requires poly_sch=True")
    inshp_data, outshp_data, output, expect = gen_data(shape, out_shape, dtype, out_dtype)
    inputs = ([inshp_data] * 5 + [outshp_data] + [inshp_data] * 3 + [outshp_data]
              + [inshp_data] * 3 + [outshp_data] * 3)
    outputs = [output, output]
    arg_list = inputs + outputs
    outputs = utils.mod_launch(mod, arg_list, outputs=tuple(range(-len(outputs), 0)), expect=expect)
    res = np.allclose(outputs, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, *expect, 400)
def test_fused_bn_double_follow_relu(in_shape, in_dtype='float16', layout='NHWC',
                                     out_dtype='float16', poly_sch=False):
    if layout != "NHWC" and layout != "NCHW":
        raise NotImplementedError('Layout not supported {} '.format(layout))
    inter_dtype = 'float32'
    inputs, output, expect = gen_data(in_shape, in_dtype, inter_dtype, layout, out_dtype)
    input_shape_list = [i.shape for i in inputs]
    input_dtype_list = [inter_dtype] * 4 + [in_dtype] + [inter_dtype] * 4 + [in_dtype]
    op_attrs = [layout, out_dtype]
    if poly_sch:
        mod = utils.op_build(fused_bn_double_follow_relu_auto, input_shape_list, input_dtype_list,
                             op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_bn_double_follow_relu_manual, input_shape_list,
                             input_dtype_list, op_attrs=op_attrs)
    outputs = [output]
    arglist = inputs + outputs
    output = utils.mod_launch(mod, arglist, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
def test_fused_relu_grad_bn_reduce_grad(shape_1, shape_2, layout='NHWC', poly_sch=False):
    data_1 = gen_data(shape_1, 'float32')
    data_2 = gen_data(shape_1, 'float32')
    data_3 = gen_data(shape_1, 'float32')
    data_4 = gen_data(shape_1, 'float32')
    data_5 = gen_data(shape_1, 'float32')
    data_6 = gen_data(shape_1, 'float32')
    data_7 = gen_data(shape_2, 'float16')
    data_8 = gen_data(shape_2, 'float16')
    data_9 = gen_data(shape_2, 'float16')
    expect, output = compute_py(data_1, data_2, data_3, data_4, data_5, data_6,
                                data_7, data_8, data_9, layout)
    input_list = [shape_1] * 6 + [shape_2] * 3
    dtype_list = ['float32'] * 6 + ['float16'] * 3
    op_attrs = [layout]
    if poly_sch:
        mod = utils.op_build_test(fused_relu_grad_bn_reduce_grad_auto, input_list, dtype_list,
                                  kernel_name="fused_relu_grad_bn_reduce_grad_auto",
                                  op_attrs=op_attrs, attrs={"target": "cuda"})
    else:
        mod = utils.op_build_test(fused_relu_grad_bn_reduce_grad_manual, input_list, dtype_list,
                                  kernel_name="fused_relu_grad_bn_reduce_grad_manual",
                                  op_attrs=op_attrs)
    args = [data_1, data_2, data_3, data_4, data_5, data_6, data_7, data_8, data_9, output]
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-08)
    print("Test {}".format("Pass" if res else "Failed"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array([data_1, data_2, data_3, data_4, data_5, data_6,
                              data_7, data_8, data_9])
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
def test_fused_relu_grad_bn_double_update_grad(shape_f16, shape_f32, layout='NHWC', poly_sch=False):
    data_1 = gen_data(shape_f32, 'float32')
    data_2 = gen_data(shape_f16, 'float16')
    data_3 = gen_data(shape_f32, 'float32')
    data_4 = gen_data(shape_f16, 'float16')
    data_5 = gen_data(shape_f16, 'float16')
    data_6 = gen_data(shape_f16, 'float16')
    data_7 = gen_data(shape_f16, 'float16')
    shape_list = [shape_f32, shape_f16, shape_f32, shape_f16, shape_f16, shape_f16, shape_f16]
    dtype_list = ['float32', 'float16', 'float32', 'float16', 'float16', 'float16', 'float16']
    data_list = [data_1, data_2, data_3, data_4, data_5, data_6, data_7]
    data_tmp7, data_tmp15, data_tmp22, out_shape = compute_py(data_1, data_2, data_3, data_4,
                                                              data_5, data_6, data_7, layout)
    expect = [data_tmp7, data_tmp15, data_tmp22]
    output = np.full(out_shape, np.nan, 'float32')
    output = [output, output, output]
    if poly_sch:
        mod = utils.op_build(fused_relu_grad_bn_double_update_grad_auto, shape_list, dtype_list,
                             op_attrs=[layout], attrs={"target": "cuda"})
    else:
        mod = utils.op_build(fused_relu_grad_bn_double_update_grad_manual, shape_list, dtype_list,
                             op_attrs=[layout])
    output = utils.mod_launch(mod, (data_1, data_2, data_3, data_4, data_5, data_6, data_7,
                                    *output),
                              outputs=tuple(range(-len(output), 0)), expect=expect)
    res = True
    res &= np.allclose(output[0], expect[0], rtol=5e-03, atol=1e-8)
    res &= np.allclose(output[1], expect[1], rtol=5e-03, atol=1e-8)
    res &= np.allclose(output[2], expect[2], rtol=5e-03, atol=1e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data_list = to_tvm_nd_array(data_list)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *data_list, *expect, 400)
def expand_dims_run(shape, axis, dtype, kernel_name="expand_dims", attrs={}):
    op_attr = [axis]
    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(ExpandDims, [shape], [dtype], op_attr, kernel_name=kernel_name,
                                  attrs=attrs, tuning=t)
        if t:
            expect, input, output = gen_data(axis, dtype, shape)
            return mod, expect, (input, output)
        else:
            return mod
    else:
        mod = utils.op_build_test(ExpandDims, [shape], [dtype], op_attr, kernel_name=kernel_name,
                                  attrs=attrs)
        expect, input, output = gen_data(axis, dtype, shape)
        output = utils.mod_launch(mod, (input, output), expect=expect)
        if attrs.get("profiling", False):
            import akg
            target_name = attrs["target"].split()[0]
            args_list = to_tvm_nd_array([input, output], akg.tvm.context(target_name, 0))
            target_profiling(mod, *args_list, target=target_name,
                             repeat_time=attrs["repeat_times"])
        return input, output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
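`ExpandDims` should agree with `np.expand_dims`, which inserts a length-1 axis at `axis`; a quick sketch with made-up shape, dtype, and axis:

import numpy as np

x = np.random.rand(4, 5).astype(np.float16)  # illustrative input
expect = np.expand_dims(x, axis=1)           # new size-1 axis at position 1
print(expect.shape)                          # (4, 1, 5)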
def standard_normal_run(seed, shape, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    mod = utils.op_build_test(standard_normal, [], [], kernel_name="standard_normal",
                              op_attrs=[seed, shape], attrs=attrs)
    output, expect = gen_data(shape)
    output = utils.mod_launch(mod, (output,), expect=expect)
    res = output.shape == expect.shape
    res &= abs(np.mean(output) - 0) < 1e-1
    res &= abs(np.std(output) - 1) < 1e-1
    print("Test {}".format("Pass" if res else "Fail"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        output = to_tvm_nd_array(output, akg.tvm.context(target_name, 0))
        target_profiling(mod, output, target=target_name, repeat_time=attrs["repeat_times"])
    return output, output, expect, res
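The pass criterion above is purely statistical (sample mean near 0, sample standard deviation near 1). The same check against a host-side NumPy sample, as a minimal sketch:

import numpy as np

sample = np.random.standard_normal((1024, 1024))
ok = abs(sample.mean()) < 1e-1 and abs(sample.std() - 1) < 1e-1
print("Pass" if ok else "Fail")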
def csrmv_run(shape1, dtype1, shape2, dtype2, poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    if attrs["target"] == "cuda":
        attrs["enable_akg_reduce_lib"] = True
        attrs["enable_atomic_add"] = True
    data, indices, indptr, weight, expect = gen_data(shape1, dtype1, shape2, dtype2)
    attrs["is_csr"] = True
    mod = utils.op_build_test(csr_mv, [data.shape, indices.shape, indptr.shape, weight.shape],
                              ["float32", "int32", "int32", "float32"], polyhedral=poly_sch,
                              attrs=attrs, kernel_name='csrmv')
    output_shape = expect.shape
    output = np.zeros(output_shape, dtype="float32")
    output = utils.mod_launch(mod, (data, indices, indptr, weight, output), expect=expect)
    res = compare_tensor(output, expect, rtol=5e-3, atol=1e-8)
    print("Test {}".format("Pass" if res else "Failed"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        args_list = to_tvm_nd_array([data, indices, indptr, weight, output],
                                    akg.tvm.context(target_name, 0))
        target_profiling(mod, *args_list, target=target_name, repeat_time=attrs["repeat_times"])
    return (data, indices, indptr, weight), output, expect, res
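For reference, standard CSR matrix-vector semantics that the kernel is presumably checked against (`csr_mv_reference` is a hypothetical helper, not the repo's `gen_data`; it assumes a 1-D dense vector):

import numpy as np

def csr_mv_reference(data, indices, indptr, weight):
    # y[i] = sum of data[k] * weight[indices[k]] over row i's nonzeros
    num_rows = len(indptr) - 1
    y = np.zeros(num_rows, dtype=data.dtype)
    for i in range(num_rows):
        start, end = indptr[i], indptr[i + 1]
        y[i] = np.dot(data[start:end], weight[indices[start:end]])
    return y

# 2x3 sparse matrix [[1, 0, 2], [0, 3, 0]] times [1, 1, 1] -> [3, 3]
data = np.array([1., 2., 3.], dtype=np.float32)
indices = np.array([0, 2, 1], dtype=np.int32)
indptr = np.array([0, 2, 3], dtype=np.int32)
print(csr_mv_reference(data, indices, indptr, np.ones(3, np.float32)))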
def select_run(shape_cond, shape_x, dtype_cond, dtype_x, attrs=None):
    """select_run implementation"""
    if attrs is None:
        attrs = {}
    mod = utils.op_build_test(select, [shape_cond, shape_x, shape_x],
                              [dtype_cond, dtype_x, dtype_x], kernel_name='select',
                              op_attrs=[], attrs=attrs)
    args, exp_output, cond, x1, x2 = gen_data(shape_cond, shape_x, dtype_cond, dtype_x)
    acu_output = utils.mod_launch(mod, args, expect=exp_output)
    if attrs.get("profiling", False):
        import akg
        target_name = attrs["target"].split()[0]
        args_list = to_tvm_nd_array(args, akg.tvm.context(target_name, 0))
        target_profiling(mod, *args_list, target=target_name, repeat_time=attrs["repeat_times"])
    # compare result
    rtol, atol = get_rtol_atol("select", dtype_x)
    testcase_result = compare_tensor(acu_output, exp_output, rtol=rtol, atol=atol, equal_nan=True)
    return [cond, x1, x2], acu_output, exp_output, testcase_result
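`select` follows the usual ternary semantics, so the expected output should reduce to `np.where`; a one-line sketch with made-up inputs:

import numpy as np

cond = np.array([True, False, True])
x1, x2 = np.array([1., 2., 3.]), np.array([10., 20., 30.])
print(np.where(cond, x1, x2))  # [ 1. 20.  3.]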
def fused_is_finite_run(shape, layout='NHWC', poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    attrs.update({"enable_akg_reduce_lib": True, "enable_atomic_add": True})
    dtype = "float32"
    mod = utils.op_build_test(fused_is_finite, [shape], [dtype], op_attrs=[layout],
                              kernel_name="fused_is_finite", polyhedral=poly_sch, attrs=attrs)
    data, expect, output = gen_data(shape, dtype, layout)
    args = (data, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        data, output = to_tvm_nd_array([data, output], akg.tvm.context(target_name, 0))
        target_profiling(mod, data, output, target=target_name, repeat_time=attrs["repeat_times"])
    return data, output, expect, res
def fused_relu_grad_run(shape, c1=0, poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    dtype = 'float16'
    input = gen_data(shape, dtype)
    expect = compute_expect(input, c1)
    shapes = [shape] * 3
    dtypes = [dtype] * 3
    op_attrs = [c1]
    mod = utils.op_build_test(fused_relu_grad, shapes, dtypes, op_attrs=op_attrs,
                              kernel_name="fused_relu_grad", polyhedral=poly_sch, attrs=attrs)
    output = np.full(shape, np.nan, dtype)
    output = utils.mod_launch(mod, (*input, output), expect=expect)
    res = np.allclose(output, expect, rtol=5e-3, atol=1e-8)
    print("Test {}".format("Pass" if res else "Failed"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        data = to_tvm_nd_array([*input, output], akg.tvm.context(target_name, 0))
        target_profiling(mod, *data, target=target_name, repeat_time=attrs["repeat_times"])
    return input, output, expect, res
def fused_mul_div_rsqrt_mul_isfinite_red_run(shape, dtype='float32', poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    attrs.update({"enable_akg_reduce_lib": True, "enable_atomic_add": True})
    inputs = gen_data(shape, dtype)
    expect = compute_expect(inputs)
    input_shape = [shape, shape]
    input_dtype = [dtype, dtype]
    mod = utils.op_build_test(fused_mul_div_rsqrt_mul_isfinite_red, input_shape, input_dtype,
                              kernel_name="fused_mul_div_rsqrt_mul_isfinite_red",
                              polyhedral=poly_sch, attrs=attrs)
    outputs = [np.full((1,), False, 'bool')] + [np.full(shape, np.nan, dtype)] * 3
    output = utils.mod_launch(mod, [*inputs, *outputs], outputs=list(range(-len(outputs), 0)),
                              expect=expect)
    ret = compare_tensor(output[0], expect[0], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[1], expect[1], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[2], expect[2], rtol=5e-03, atol=1.e-08)
    ret &= compare_tensor(output[3], expect[3], rtol=5e-03, atol=1.e-08)
    print("Test {}".format("Pass" if ret else "Failed"))
    target_name = attrs["target"].split()[0]
    if not ret:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        data = to_tvm_nd_array([*inputs, *outputs], akg.tvm.context(target_name, 0))
        target_profiling(mod, *data, target=target_name, repeat_time=attrs["repeat_times"])
    return inputs, outputs, expect, ret
def fused_bn_update_grad_run(shape, out_shape, dtype="float16", out_dtype="float32",
                             layout="NHWC", poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    attrs.update({"enable_akg_reduce_lib": True, "enable_atomic_add": True})
    shape_list = [shape, out_shape, shape]
    dtype_list = [dtype, out_dtype, dtype]
    op_attrs = [layout]
    mod = utils.op_build_test(fused_bn_update_grad, shape_list, dtype_list, op_attrs=op_attrs,
                              kernel_name="fused_bn_update_grad", polyhedral=poly_sch, attrs=attrs)
    head, data_sum, in_bn, output, expect = gen_data(shape, out_shape, dtype, out_dtype, layout)
    outputs = [output, output]
    inputs = [head, data_sum, in_bn]
    arg_list = inputs + outputs
    outputs = utils.mod_launch(mod, arg_list, outputs=tuple(range(-len(outputs), 0)), expect=expect)
    res = np.allclose(outputs, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        arg_list = to_tvm_nd_array(arg_list, akg.tvm.context(target_name, 0))
        target_profiling(mod, *arg_list, target=target_name, repeat_time=attrs["repeat_times"])
    return inputs, outputs, expect, res
def sqrt_run(shape, dtype, attrs):
    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(sqrt, [shape], [dtype], kernel_name=kernel_name, attrs=attrs,
                                  tuning=t)
        if t:
            expect, input, output = gen_data(dtype, shape)
            return mod, expect, (input, output)
        else:
            return mod
    else:
        expect, input, output = gen_data(dtype, shape)
        mod = utils.op_build_test(sqrt, [shape], [dtype], kernel_name='sqrt', attrs=attrs)
        output = utils.mod_launch(mod, (input, output), expect=expect)
        if attrs.get("profiling", False):
            target_name = attrs["target"].split()[0]
            args_list = to_tvm_nd_array([input, output], akg.tvm.context(target_name, 0))
            target_profiling(mod, *args_list, target=target_name,
                             repeat_time=attrs["repeat_times"])
        return input, output, expect, compare_tensor(output, expect, rtol=5e-03, equal_nan=True)
def tensor_scatter_add_run(data_shape, data_type, indices_shape, indices_type, axis,
                           poly_sch=True, attrs=None):
    op_attrs = [axis]
    default_attrs = {"target": "cuda"}
    if attrs:
        default_attrs.update(attrs)
    if len(indices_shape) > 1:
        updates_shape = indices_shape[:-1] + data_shape[indices_shape[-1]:]
    else:
        updates_shape = indices_shape + data_shape[1:]
    mod = utils.op_build_test(tensor_scatter_add, [data_shape, indices_shape, updates_shape],
                              [data_type, indices_type, data_type], attrs=default_attrs,
                              kernel_name="tensor_scatter_add", polyhedral=poly_sch)
    # gen data
    indices_shape = indices_shape + (1,) if len(indices_shape) == 1 else indices_shape
    params, indices, updates, expect = gen_data(data_shape, data_type, indices_shape, indices_type)
    output_shape = expect.shape
    if len(expect.shape) == 0:
        output_shape = (1,)
    output = np.zeros(output_shape, expect.dtype)
    output = utils.mod_launch(mod, (params, indices, updates, output), expect=expect)
    rtol, atol = get_rtol_atol("tensor_scatter_add", data_type)
    res = compare_tensor(output, expect, rtol=rtol, atol=atol)
    print("Test {}".format("Pass" if res else "Failed"))
    target_name = default_attrs["target"].split()[0]  # attrs may be None, so consult default_attrs
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if default_attrs.get("profiling", False):
        params, indices, updates, output = to_tvm_nd_array(
            [params, indices, updates, output], akg.tvm.context(target_name, 0))
        target_profiling(mod, params, indices, updates, output, target=target_name,
                         repeat_time=default_attrs["repeat_times"])
    return (params, indices, updates), output, expect, res
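With 1-D indices, the expected `tensor_scatter_add` result can be reproduced with `np.add.at`, which accumulates duplicate indices rather than overwriting them; a sketch with made-up data:

import numpy as np

params = np.zeros((4, 2), dtype=np.float32)
indices = np.array([0, 2, 0], dtype=np.int32)  # note the duplicate row 0
updates = np.ones((3, 2), dtype=np.float32)

expect = params.copy()
np.add.at(expect, indices, updates)            # row 0 accumulates twice
print(expect)  # [[2. 2.] [0. 0.] [1. 1.] [0. 0.]]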
def reciprocal_run(shape, dtype, attrs):
    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = reciprocal_compile(shape, dtype, attrs, kernel_name=kernel_name, tuning=t)
        if t:
            expect, input1, output = gen_data(dtype, shape)
            return mod, expect, (input1, output)
        else:
            return mod
    else:
        mod = reciprocal_compile(shape, dtype, attrs)
        expect, input1, output = gen_data(dtype, shape)
        output = utils.mod_launch(mod, (input1, output), expect=expect)
        if attrs.get("profiling", False):
            target_name = attrs["target"].split()[0]
            args_list = to_tvm_nd_array([input1, output], akg.tvm.context(target_name, 0))
            target_profiling(mod, *args_list, target=target_name,
                             repeat_time=attrs["repeat_times"])
        rtol, atol = get_rtol_atol("reciprocal", dtype)
        return (input1,), output, expect, compare_tensor(output, expect, rtol=rtol, atol=atol,
                                                         equal_nan=True)
def test_ms_reduce_max(in_shape, in_dtype, axis=None, keepdims=False, poly_sch=False):
    if poly_sch:
        mod = utils.op_build_test(reduce_max, (in_shape,), (in_dtype,),
                                  op_attrs=[axis, keepdims], kernel_name="reduce_max",
                                  attrs={"target": "cuda", "enable_akg_reduce_lib": True,
                                         "enable_atomic_add": True})
    else:
        # only the polyhedral (auto-schedule) build path is provided for this op
        raise ValueError("test_ms_reduce_max requires poly_sch=True")
    data, output, expect = gen_data(in_shape, in_dtype, axis, keepdims)
    args = (data, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    data, expect = to_tvm_nd_array([data, expect])
    gpu_profiling(mod, data, expect, 400)
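The expected tensor here is just the NumPy reduction over `axis` with `keepdims`, e.g. (illustrative shape and axis):

import numpy as np

data = np.random.rand(8, 16).astype(np.float32)
expect = np.max(data, axis=1, keepdims=False)  # shape (8,)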
def batch_matmul_run(shape1, shape2, dtype, out_dtype="float32", layout1="NHDT", layout2="NHDT",
                     layout_out="NHDT", shape_bias=None, add_bias=False, tensor_core=True,
                     poly_sch=True, attrs=None):
    op_attrs = [out_dtype, layout1, layout2, layout_out, tensor_core, add_bias]
    default_attrs = attrs
    if not attrs:
        default_attrs = {"target": "cuda"}
    if default_attrs["target"] == "cuda" and tensor_core:
        default_attrs.update({
            "pragma_enable_matmul": True,
            "enable_auto_inline": False
        })
    elif default_attrs["target"] == "llvm":
        if "pragma_enable_matmul" not in default_attrs.keys():
            default_attrs["pragma_enable_matmul"] = True
        if "feature" not in default_attrs.keys():
            default_attrs["feature"] = "avx"
    mod = utils.op_build_test(BatchMatMul, (shape1, shape2, shape_bias),
                              (dtype, dtype, out_dtype), op_attrs=op_attrs, attrs=default_attrs,
                              polyhedral=poly_sch, kernel_name="batch_matmul")
    lhs, rhs, bias, output, expect = gen_data(shape1, shape2, dtype, out_dtype, layout1, layout2,
                                              layout_out, shape_bias, add_bias)
    args = (lhs, rhs, bias, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    target_name = default_attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if default_attrs.get("profiling", False):  # attrs may be None, so consult default_attrs
        args = to_tvm_nd_array(args, akg.tvm.context(target_name, 0))
        target_profiling(mod, *args, target=target_name,
                         repeat_time=default_attrs["repeat_times"])
    return (lhs, rhs, bias), output, expect, res
def fused_gather_gather_add_mul_max_exp_scatter_add_run(input1_shape, input2_shape, input3_shape,
                                                        input4_shape, data_dtype, indices_type,
                                                        axis, poly_sch=True, attrs=None):
    op_attrs = [axis]
    default_attrs = {"target": "cuda"}
    if attrs:
        default_attrs.update(attrs)
    mod = utils.op_build_test(
        fused_gather_gather_add_mul_max_exp_scatter_add,
        [input1_shape, input2_shape, input3_shape, input4_shape],
        [data_dtype, indices_type, data_dtype, indices_type],
        op_attrs=op_attrs, attrs=default_attrs, polyhedral=poly_sch,
        kernel_name="fused_gather_gather_add_mul_max_exp_scatter_add")
    # gen data
    input1, input2, input3, input4, expect1, expect2 = gen_data(
        input1_shape, input2_shape, input3_shape, input4_shape, data_dtype, indices_type, axis)
    output1 = np.zeros(expect1.shape, expect1.dtype)
    output2 = deepcopy(input1)
    output1, output2 = utils.mod_launch(mod, (input1, input2, input3, input4, output1, output2),
                                        outputs=(-2, -1))
    rtol, atol = get_rtol_atol("fused_gather_gather_add_mul_max_exp_scatter_add", data_dtype)
    res = compare_tensor(output1, expect1, rtol=rtol, atol=atol)
    res &= compare_tensor(output2, expect2, rtol=rtol, atol=atol)
    print("Test {}".format("Pass" if res else "Failed"))
    target_name = default_attrs["target"].split()[0]  # attrs may be None, so consult default_attrs
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if default_attrs.get("profiling", False):
        inputs = to_tvm_nd_array([input1, input2, input3, input4, output1, output2],
                                 akg.tvm.context(target_name, 0))
        target_profiling(mod, *inputs, target=target_name,
                         repeat_time=default_attrs["repeat_times"])
    return (input1, input2, input3, input4), (output1, output2), (expect1, expect2), res
def conv_fusion_run(shape_data, shape_filter1, shape_filter2, stride1, stride2, padding1,
                    padding2, dilation1, dilation2, dtype, out_dtype="float32", poly_sch=True,
                    attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    op_attrs = [stride1, stride2, padding1, padding2, dilation1, dilation2]
    attrs.update({
        "enable_auto_fuse": False,
        "shared_memory_tensors": "out input_1 input_2 input_3",
        "pragma_disable_loop_fusion": True,
        "dim": "3 0 1 1 3 1 1 1 3 2 4 4 3 3 52 52 3 4 64 64"
    })
    mod = utils.op_build_test(ConvFusion, (shape_data, shape_filter1, shape_filter2),
                              (dtype, dtype, dtype), op_attrs=op_attrs, attrs=attrs,
                              polyhedral=poly_sch, kernel_name="conv_fusion_auto")
    data, weight1, weight2, output, expect = fusion_gen_data(
        shape_data, shape_filter1, shape_filter2, stride1, stride2, padding1, padding2,
        dilation1, dilation2, dtype, out_dtype)
    args = (data, weight1, weight2, output)
    output = utils.mod_launch(mod, args, expect=expect)
    res = np.allclose(output, expect, rtol=5e-3, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        data, weight1, weight2, output = to_tvm_nd_array(
            [data, weight1, weight2, output], akg.tvm.context(target_name, 0))
        target_profiling(mod, data, weight1, weight2, output, target=target_name,
                         repeat_time=attrs["repeat_times"])
    return (data, weight1, weight2), output, expect, res
def cast_run(shape, srcType, dstType, attrs={}):
    op_attrs = [dstType]
    if attrs.get("dynamic"):
        attrs["enable_double_buffer"] = False
        var_shape = []
        for i in range(len(shape)):
            var_shape.append(tvm.var("I" + str(i)))
        build_shape = var_shape
    else:
        build_shape = shape
    if 'tuning' in attrs.keys():
        t = attrs.get("tuning", False)
        kernel_name = attrs.get("kernel_name", False)
        mod = utils.op_build_test(Cast, [build_shape], [srcType], op_attrs,
                                  kernel_name=kernel_name, attrs=attrs, tuning=t)
        if t:
            args, exp_output, input = gen_data(dstType, shape, srcType)
            return mod, exp_output, args
        else:
            return mod
    else:
        mod = utils.op_build_test(Cast, [build_shape], [srcType], op_attrs, kernel_name='cast',
                                  attrs=attrs)
        args, exp_output, input = gen_data(dstType, shape, srcType)
        if attrs.get("dynamic"):
            for i in range(len(shape)):
                args.append(shape[i])
            block_dim = compute_blockdim(shape)
            args.append(block_dim)
        acu_output = utils.mod_launch(mod, args, outputs=(1,), expect=exp_output)
        # compare result
        rtol, atol = get_rtol_atol("cast", dstType)
        TestCase_Result = compare_tensor(acu_output, exp_output, rtol=rtol, atol=atol,
                                         equal_nan=True)
        if attrs.get("profiling", False):
            target_name = attrs["target"].split()[0]
            args_list = to_tvm_nd_array(args, akg.tvm.context(target_name, 0))
            target_profiling(mod, *args_list, target=target_name,
                             repeat_time=attrs["repeat_times"])
        return input, acu_output, exp_output, TestCase_Result
def csr_reduce_sum_run(shape, dtype1, dtype2, axis, nnz=-1, poly_sch=True, attrs=None):
    if not attrs:
        attrs = {"target": "cuda"}
    if attrs["target"] == "cuda":
        attrs["enable_akg_reduce_lib"] = True
        attrs["enable_atomic_add"] = True
    op_attrs = [axis, shape]
    # gen data
    data, col_idx, row_idx, expect = gen_data(shape, dtype1, dtype2, axis, nnz=nnz)
    output_shape = expect.shape
    attrs["is_csr"] = True
    mod = utils.op_build_test(csr_reduce_sum, [data.shape, col_idx.shape, row_idx.shape],
                              [dtype1, dtype2, dtype2], op_attrs=op_attrs, polyhedral=poly_sch,
                              attrs=attrs, kernel_name="csr_reduce_sum")
    if len(expect.shape) == 0:
        output_shape = (1,)
    output = np.zeros(output_shape, expect.dtype)
    output = utils.mod_launch(mod, (data, col_idx, row_idx, output), expect=expect)
    rtol, atol = get_rtol_atol("csr_reduce_sum", dtype1)
    res = compare_tensor(output, expect, rtol=rtol, atol=atol)
    print("Test {}".format("Pass" if res else "Failed"))
    target_name = attrs["target"].split()[0]
    if not res:
        mod_source = mod
        if target_name != "llvm":
            mod_source = mod.imported_modules[0]
        print("Error {}:========================".format(target_name))
        print(mod_source.get_source())
        raise AssertionError("Test fail")
    if attrs.get("profiling", False):  # default attrs carry no "profiling" key
        args_list = to_tvm_nd_array([data, col_idx, row_idx, output],
                                    akg.tvm.context(target_name, 0))
        target_profiling(mod, *args_list, target=target_name, repeat_time=attrs["repeat_times"])
    return (data, col_idx, row_idx), output, expect, res
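A NumPy sketch of a row-wise CSR sum, which is presumably what `gen_data` computes when reducing over the nonzero axis (`csr_row_sum_reference` is a hypothetical helper; the actual handling of `axis` may differ):

import numpy as np

def csr_row_sum_reference(data, indptr):
    # out[i] = sum(data[indptr[i]:indptr[i + 1]]), the sum of row i's nonzeros
    num_rows = len(indptr) - 1
    out = np.zeros(num_rows, dtype=data.dtype)
    for i in range(num_rows):
        out[i] = data[indptr[i]:indptr[i + 1]].sum()
    return out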
def test_ms_addn(shape, dtype, n, poly_sch=False):
    shapes = [shape] * n
    if poly_sch:
        mod = utils.op_build_test(addn_auto, [shapes], [dtype], attrs={"target": "cuda"},
                                  kernel_name="addn_auto")
    else:
        mod = utils.op_build_test(addn_manual, [shapes], [dtype], kernel_name="addn_manual")
    expect, inputs, output = gen_data(shape, shapes, dtype, n)
    output = utils.mod_launch(mod, (*inputs, output), expect=expect)
    res = compare_tensor(output, expect, rtol=5e-03, atol=1.e-8)
    print("Test {}".format("Pass" if res else "Fail"))
    if not res:
        print("Error cuda:========================")
        print(mod.imported_modules[0].get_source())
        raise AssertionError("Test fail")
    inputs = to_tvm_nd_array(inputs)
    expect = to_tvm_nd_array(expect)
    gpu_profiling(mod, *inputs, expect, 400)
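`addn` is an element-wise sum of `n` same-shape tensors, so the expected value reduces to a NumPy sum over the stacked inputs (shapes below are illustrative):

import numpy as np

inputs = [np.random.rand(4, 4).astype(np.float32) for _ in range(3)]
expect = np.sum(inputs, axis=0)  # element-wise sum of all n tensors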