def maximum_compute(input_x, input_y, output_z, kernel_name="maximum"):
    """
    calculating the element-wise maximum of the two inputs

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of the first input
    input_y: TVM tensor
        the placeholder of the second input
    output_z: dict
        shape and dtype of output, should be the broadcast shape and type
        of the inputs
    kernel_name: str
        cce kernel name, default value is "maximum"

    Returns
    -------
    res: TVM tensor
        the result of maximum
    """
    shape1 = te.lang.cce.util.shape_to_list(input_x.shape)
    shape2 = te.lang.cce.util.shape_to_list(input_y.shape)
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)
    shape1, shape2, shape_max = broadcast_shapes(
        shape1, shape2,
        param_name_input1="input_x",
        param_name_input2="input_y")

    data1_tmp1 = te.lang.cce.broadcast(input_x, shape_max)
    data2_tmp1 = te.lang.cce.broadcast(input_y, shape_max)
    res = te.lang.cce.vmax(data1_tmp1, data2_tmp1)
    return res
def _mul_check_format(x, y):
    format_pattern = 0
    shape1 = x.get("shape")
    shape2 = y.get("shape")
    list_format = [x.get("format"), y.get("format")]
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)
    # FRACTAL_NZ paired with a non-scalar ND/NHWC/NCHW tensor needs the
    # special Nz-and-vector shape inference; everything else is pattern 0
    nz_first = [["FRACTAL_NZ", "ND"], ["FRACTAL_NZ", "NHWC"],
                ["FRACTAL_NZ", "NCHW"]]
    nz_second = [["ND", "FRACTAL_NZ"], ["NHWC", "FRACTAL_NZ"],
                 ["NCHW", "FRACTAL_NZ"]]
    if list_format in nz_first \
            and (len(shape2) != 1 or shape2[0] != 1):
        format_pattern = 1
    elif list_format in nz_second \
            and (len(shape1) != 1 or shape1[0] != 1):
        format_pattern = 2

    return format_pattern
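# Illustration only: what _mul_check_format returns for a few representative
# input dicts, assuming util.scalar2tensor_one leaves non-scalar shapes
# unchanged. Not called by the op; handy as a quick sanity check:
def _demo_mul_check_format():
    def check(fmt_x, shape_x, fmt_y, shape_y):
        return _mul_check_format({"shape": shape_x, "format": fmt_x},
                                 {"shape": shape_y, "format": fmt_y})
    return (
        check("FRACTAL_NZ", (3, 2, 16, 16), "ND", (32,)),    # 1: Nz and vector
        check("NHWC", (32,), "FRACTAL_NZ", (3, 2, 16, 16)),  # 2: vector and Nz
        check("FRACTAL_NZ", (3, 2, 16, 16), "ND", (1,)),     # 0: Nz and scalar
        check("ND", (32, 48), "ND", (32, 48)),               # 0: ND and ND
    )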
def op_select_format(grad, x1, x2, y, axis, keepdims,
                     kernel_name="softmax_grad_ext"):
    """
    select format dynamically
    """
    origin_shape0 = util.scalar2tensor_one(grad.get("ori_shape"))
    origin_shape1 = util.scalar2tensor_one(x1.get("ori_shape"))
    origin_shape2 = util.scalar2tensor_one(x2.get("ori_shape"))

    condition_0 = len(origin_shape2) == 1 and origin_shape2[0] == 1
    condition_1 = _division_sixteen(origin_shape0)
    condition_2 = _division_sixteen(origin_shape1)

    if condition_0 and condition_1 and condition_2:
        # NZ + NZ + scalar
        input0 = gen_param(classify="input0", name="grad",
                           datatype="float16,float",
                           format="FRACTAL_NZ,FRACTAL_NZ")
        input1 = gen_param(classify="input1", name="x1",
                           datatype="float16,float",
                           format="FRACTAL_NZ,FRACTAL_NZ")
        input2 = gen_param(classify="input2", name="x2",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0", name="y",
                            datatype="float16,float",
                            format="FRACTAL_NZ,FRACTAL_NZ")
    else:
        # ND + ND + ND
        input0 = gen_param(classify="input0", name="grad",
                           datatype="float16,float",
                           format="ND,ND")
        input1 = gen_param(classify="input1", name="x1",
                           datatype="float16,float",
                           format="ND,ND")
        input2 = gen_param(classify="input2", name="x2",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0", name="y",
                            datatype="float16,float",
                            format="ND,ND")

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict. shape, dtype of input x
    y : dict. shape, dtype of input y
    output : dict. shape, dtype of output
    kernel_name : str. cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1: Nz and vector
    # format_pattern = 2: vector and Nz
    # format_pattern = 0: Nz and scalar, Nz and Nz, ND and ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but it is not supported on this platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
def op_select_format(input_tensor, input_mask, input_keep_prob, output,
                     kernel_name="dropout_do_mask"):
    """
    select format dynamically

    _division_sixteen: judge whether the last two dimensions are divisible by 16
    scalar2tensor_one: convert a scalar shape to the tensor shape [1]
    """
    shape_0 = input_tensor.get("ori_shape")
    shape_1 = input_mask.get("ori_shape")
    shape_2 = input_keep_prob.get("ori_shape")

    shape_0 = util.scalar2tensor_one(shape_0)
    shape_1 = util.scalar2tensor_one(shape_1)
    shape_2 = util.scalar2tensor_one(shape_2)

    if _division_sixteen(shape_0) and not _division_sixteen(shape_1) \
            and not _division_sixteen(shape_2):
        # x and y support ND and FRACTAL_NZ; mask and keep_prob stay ND
        input0 = gen_param(classify="input0", name="x",
                           datatype="float16,float16,float,float",
                           format="ND,FRACTAL_NZ,ND,FRACTAL_NZ")
        input1 = gen_param(classify="input1", name="mask",
                           datatype="uint8,uint8,uint8,uint8",
                           format="ND,ND,ND,ND")
        input2 = gen_param(classify="input2", name="keep_prob",
                           datatype="float16,float16,float,float",
                           format="ND,ND,ND,ND")
        output0 = gen_param(classify="output0", name="y",
                            datatype="float16,float16,float,float",
                            format="ND,FRACTAL_NZ,ND,FRACTAL_NZ")
    else:
        # ND + ND + ND
        input0 = gen_param(classify="input0", name="x",
                           datatype="float16,float",
                           format="ND,ND")
        input1 = gen_param(classify="input1", name="mask",
                           datatype="uint8,uint8",
                           format="ND,ND")
        input2 = gen_param(classify="input2", name="keep_prob",
                           datatype="float16,float",
                           format="ND,ND")
        output0 = gen_param(classify="output0", name="y",
                            datatype="float16,float",
                            format="ND,ND")

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
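# _division_sixteen is used by the op_select_format functions in this file
# but its definition is outside this section. A plausible minimal sketch,
# inferred from how it is used above (FRACTAL_NZ is only offered when the
# last two dimensions exist and are multiples of 16); the exact rules are
# an assumption, not the library's implementation:
def _division_sixteen_sketch(shape):
    if len(shape) < 2:
        return False
    if shape[-1] == 0 or shape[-2] == 0:
        raise RuntimeError("value of shape is illegal")
    return shape[-1] % 16 == 0 and shape[-2] % 16 == 0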
def fake_learned_scale_quant_perlayer(
        input_x, alpha, quant_max, out, neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    # the op is element-wise, so the input is flattened to one dimension
    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape, name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(
        input_data, alpha_data, quant_max_data, neg_trunc, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list,
              "bool_storage_as_1bit": False}

    te.lang.cce.cce_build_code(sch, config)
def add(input_x, input_y, output_z, kernel_name="add"):
    """
    algorithm: add
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x : dict
        shape and dtype of first input, only support float16, float32, int32
    input_y : dict
        shape and dtype of second input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is add

    Returns
    -------
    None
    """
    # format_pattern = 1: Nz and vector
    # format_pattern = 2: vector and Nz
    # format_pattern = 0: Nz and scalar, Nz and Nz, ND and ND
    format_pattern = _add_check_format(input_x, input_y)
    shape_x, shape_y = _infer_shape(format_pattern, input_x, input_y)
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")
    # a trailing axis that is 1 in both inputs carries no data; drop it
    # unless that would leave an empty shape
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    res = add_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(schedule, config)
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        # x is FRACTAL_NZ, y is a vector: pad y to four dimensions that
        # line up with x's fractal axes
        ori_shape_x, shape_y, shape_max = broadcast_shapes(
            ori_shape_x, shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
    elif format_pattern == 2:
        # y is FRACTAL_NZ, x is a vector: the mirrored case
        shape_x, ori_shape_y, shape_max = broadcast_shapes(
            shape_x, ori_shape_y,
            param_name_input1="input_x",
            param_name_input2="input_y")
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx,
                                 num_bits, quant_delay, symmetric,
                                 narrow_range,
                                 kernel_name="fake_quant_with_min_max_grad"):
    """FakeQuantWithMinMaxGrad"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_with_min_max_grad_compute(dout_data, input_data,
                                               min_data, max_data,
                                               quant_min, quant_max,
                                               kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
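# Illustration only: the quant_min/quant_max computation above, extracted as
# a standalone helper so the common num_bits = 8 cases can be checked. This
# mirrors the code above; it is not a separate API:
def _quant_range(num_bits, symmetric, narrow_range):
    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1
    return quant_min, quant_max
# _quant_range(8, symmetric=True, narrow_range=False)  -> (-128, 127)
# _quant_range(8, symmetric=True, narrow_range=True)   -> (-127, 127)
# _quant_range(8, symmetric=False, narrow_range=False) -> (0, 255)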
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
            ori_shape_x, shape_y,
            param_name_input1="x",
            param_name_input2="y")
        if shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == ori_shape_x[-1]:
            raise RuntimeError("the input shape of y is illegal")
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
    elif format_pattern == 2:
        shape_x, ori_shape_y, shape_max = op_utils.broadcast_shapes(
            shape_x, ori_shape_y,
            param_name_input1="x",
            param_name_input2="y")
        if shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == ori_shape_y[-1]:
            raise RuntimeError("the input shape of x is illegal")
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32
    x2 : dict
        shape and dtype of second input, only support float16, float32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(x1.get("shape"))
    shape_y = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("float16", "float32")
    input_data_type = x1.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="x1")
    input_data_type_x2 = x2.get("dtype").lower()
    check_dtype(input_data_type_x2, check_tuple, param_name="x2")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="x1",
        param_name_input2="x2")
    # drop a trailing axis that is 1 everywhere (see the note after this
    # function) unless that would leave an empty shape
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=input_data_type)

    res = real_div_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": (data_x, data_y, res)}

    te.lang.cce.cce_build_code(schedule, config)
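# Illustration only: the trailing-axis trimming used by real_div (and
# identically by add). When the last axis is 1 in both inputs and in the
# broadcast shape it carries no data, so dropping it gives the schedule one
# fewer axis to tile; a rank-1 shape is left alone so the kernel never sees
# an empty shape. Extracted here purely as a worked example:
def _trim_trailing_one(shape_x, shape_y, shape_max):
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]
    return shape_x, shape_y, shape_max
# _trim_trailing_one([2, 3, 1], [1, 3, 1], [2, 3, 1])
#     -> ([2, 3], [1, 3], [2, 3])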
def fake_quant_minmax_update(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, symmetric, narrow_range,
                             training, num_bits,
                             kernel_name="fake_quant_minmax_update"):
    """FakeQuantMinMaxUpdate op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_minmax_update_compute(input_data, min_data,
                                                max_data, ema, ema_decay,
                                                quant_min, quant_max,
                                                training, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def minmax_update_perlayer(x, min_val, max_val, min_up, max_up,
                           ema, ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data,
                                              max_data, ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def check_ori_shape(input0, input1, input2):
    """
    check whether the ND original shapes can be broadcast together
    """
    shape_0 = list(util.scalar2tensor_one(input0.get("ori_shape")))
    shape_1 = list(util.scalar2tensor_one(input1.get("ori_shape")))
    shape_2 = list(util.scalar2tensor_one(input2.get("ori_shape")))
    # broadcast_shapes raises if the shapes are incompatible; the resulting
    # shapes themselves are not needed here
    broadcast_shapes(shape_0, shape_1,
                     param_name_input1="input0",
                     param_name_input2="input1")
    broadcast_shapes(shape_0, shape_2,
                     param_name_input1="input0",
                     param_name_input2="input2")
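# check_ori_shape validates by attempting the broadcasts and letting
# broadcast_shapes raise on failure. The compatibility rule is the standard
# NumPy one, so the same check can be reproduced with numpy.broadcast_shapes
# (NumPy >= 1.20). Illustration only, not used by the kernel:
def _demo_broadcast_check():
    import numpy as np
    # compatible: per-axis sizes are equal or one of them is 1
    assert np.broadcast_shapes((16, 1), (1, 16)) == (16, 16)
    try:
        np.broadcast_shapes((16, 2), (16, 3))  # incompatible
    except ValueError:
        pass  # raises, as broadcast_shapes would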
def minimum(x1, x2, y, kernel_name="minimum"):
    """
    do element-wise minimum operation between two input tensors

    Parameters:
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be the broadcast shape and type
        as input
    kernel_name : str
        cce kernel name, default value is minimum

    Returns
    -------
    None
    """
    shape1 = util.scalar2tensor_one(x1.get("shape"))
    shape2 = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape1, param_name="x1")
    check_shape(shape2, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()
    check_dtype(dtype, check_list, param_name="x1")
    check_dtype(dtype_x2, check_list, param_name="x2")

    shape1, shape2, _ = broadcast_shapes(shape1, shape2,
                                         param_name_input1="x1",
                                         param_name_input2="x2")

    data1 = tvm.placeholder(shape1, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape2, dtype=dtype, name="data2")

    res = minimum_compute(data1, data2, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data1, data2, res]}
    te.lang.cce.cce_build_code(sch, config)
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)

    if format_pattern == 1:
        ori_shape_x, shape_y, _ = util.produce_shapes(ori_shape_x, shape_y)
        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]
        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]
        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
    elif format_pattern == 2:
        shape_x, ori_shape_y, _ = util.produce_shapes(shape_x, ori_shape_y)
        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]
        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]
        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
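# Worked example for the format_pattern == 1 branch of the _infer_shape
# variants above (illustration; the FRACTAL_NZ layout details follow the
# usual (N1, M1, m0, n0) convention and are stated here as an assumption):
#   x: FRACTAL_NZ with ori_shape (32, 48), device shape
#      (48/16, 32/16, 16, 16) = (3, 2, 16, 16); y: ND row vector (1, 48)
#   - broadcasting (32, 48) against (1, 48) leaves shape_y as [1, 48]
#   - the "shape_y[-2] == 1" branch pads it to [1, 48, 1, 1] and then
#     rewrites it to [3, 1, 1, 16]: axis -4 takes x's N1 = 3 and axis -1
#     takes n0 = 16, so the vector lies along the same fractal axes as x
#     before the element-wise broadcast runs.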
def less(input_x, input_y, output_z, kernel_name="less"):
    """
    do element-wise less operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of first input, support float16, float32, int32,
        int8, uint8
    input_y : dict
        shape and dtype of second input, support float16, float32, int32,
        int8, uint8
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is less

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(input_x.get("shape"))
    shape_y = util.scalar2tensor_one(input_y.get("shape"))
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=input_dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=input_dtype, name="data_y")

    res = less_compute(data_x, data_y, output_z, kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def _check_para_and_getplaceholder(scalar_input, tensor_input, input_dict):
    check_list = ("float32",)
    var_shape = input_dict["var"].get("shape")
    var_dtype = input_dict["var"].get("dtype")
    list_placeholder = []
    for key, value in input_dict.items():
        shape = util.scalar2tensor_one(value.get("shape"))
        op_utils.check_shape(shape)
        if value in scalar_input:
            if not util.is_scalar(shape):
                raise RuntimeError("The shape of %s must be scalar" % key)
        if value in tensor_input:
            if shape != var_shape:
                raise RuntimeError(
                    "The shape of %s must be the same as the var" % key)

        dtype = value.get("dtype").lower()
        op_utils.check_dtype(dtype, check_list, param_name=key)
        if dtype != var_dtype:
            raise RuntimeError(
                "The dtype of %s must be the same as the var" % key)

        shape_refine = (functools_reduce(operator.mul, shape),)
        list_placeholder.append(
            tvm.placeholder(shape=shape_refine, name=key, dtype=dtype))
    return list_placeholder
def _condition(x, perm, shape, transpose_first):
    shape_x = util.scalar2tensor_one(x.get("ori_shape"))
    if transpose_first:
        shape_reshapein = _shape_after_transpose(shape_x, perm)
    else:
        shape_reshapein = shape_x

    if not _division_sixteen(_shape_after_transpose(shape, perm)):
        return False

    if len(perm) == 4 and _division_sixteen(shape_x) and perm[3] == 3:
        # 2-D <-> 4-D reshape that only merges or splits the two halves
        if len(shape_reshapein) == 2 and len(shape) == 4:
            if (shape[0] * shape[1] == shape_reshapein[0]
                    and shape[2] * shape[3] == shape_reshapein[1]):
                return True
        if len(shape_reshapein) == 4 and len(shape) == 2:
            if (shape_reshapein[0] * shape_reshapein[1] == shape[0]
                    and shape_reshapein[2] * shape_reshapein[3] == shape[1]):
                return True
        # 3-D <-> 4-D reshape that keeps the outer and inner axes intact
        if len(shape_reshapein) == 3 and len(shape) == 4:
            if (shape[1] * shape[2] == shape_reshapein[1]
                    and shape[0] == shape_reshapein[0]
                    and shape[3] == shape_reshapein[2]):
                return True
        if len(shape_reshapein) == 4 and len(shape) == 3:
            if (shape_reshapein[1] * shape_reshapein[2] == shape[1]
                    and shape_reshapein[0] == shape[0]
                    and shape_reshapein[3] == shape[2]):
                return True
    return False
def sub(input_x, input_y, output_z, kernel_name="sub"):
    """
    do element-wise sub operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    input_y : dict
        shape and dtype of input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sub"

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(input_x.get("shape"))
    shape_y = util.scalar2tensor_one(input_y.get("shape"))
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ["float16", "float32", "int32"]
    dtype = input_x.get("dtype").lower()
    if dtype not in check_list:
        raise RuntimeError("sub only supports float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    res = sub_compute(data1, data2, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data1, data2, res]}
    te.lang.cce.cce_build_code(sch, config)
def greater(x, y, z, kernel_name="greater"):
    """
    do element-wise greater operation between two input tensors

    Parameters:
    ----------
    x : dict
        shape and dtype of input data_x
    y : dict
        shape and dtype of input data_y
    z : dict
        shape and dtype of output data_z
    kernel_name : str
        cce kernel name, default value is "greater"

    Returns
    -------
    None
    """
    shape_input_x = util.scalar2tensor_one(x.get("shape"))
    dtype_input_x = x.get("dtype").lower()
    shape_input_y = util.scalar2tensor_one(y.get("shape"))
    dtype_input_y = y.get("dtype").lower()

    check_shape(shape_input_x, param_name="x")
    check_shape(shape_input_y, param_name="y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(dtype_input_x, check_list, param_name="x")

    shape_list = broadcast_shapes(shape_input_x, shape_input_y,
                                  param_name_input1="x",
                                  param_name_input2="y")

    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_list[0],
                                                       shape_list[1])
    data_x = tvm.placeholder(reshape_x, dtype=dtype_input_x, name="data_x")
    data_y = tvm.placeholder(reshape_y, dtype=dtype_input_y, name="data_y")

    res = greater_compute(data_x, data_y, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def _add_check_format(x, y):
    shape1 = x.get("shape")
    shape2 = y.get("shape")
    list_format = [x.get("format"), y.get("format")]
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)
    format_list = ("ND", "NCHW", "NHWC")
    if list_format[0] == "FRACTAL_NZ" and list_format[1] in format_list \
            and (len(shape2) != 1 or shape2[0] != 1):
        format_pattern = 1
    elif list_format[0] in format_list and list_format[1] == "FRACTAL_NZ" \
            and (len(shape1) != 1 or shape1[0] != 1):
        format_pattern = 2
    else:
        format_pattern = 0
    return format_pattern
def maximum(input_x, input_y, output_z, kernel_name="maximum"):
    """
    do element-wise maximum operation between two input tensors
    """
    shape1 = te.lang.cce.util.shape_to_list(input_x.shape)
    shape2 = te.lang.cce.util.shape_to_list(input_y.shape)
    shape1 = util.scalar2tensor_one(shape1)
    shape2 = util.scalar2tensor_one(shape2)

    shape1, shape2, shape_max = broadcast_shapes(
        shape1, shape2,
        param_name_input1="input_x",
        param_name_input2="input_y")

    data1_tmp1 = te.lang.cce.broadcast(input_x, shape_max)
    data2_tmp1 = te.lang.cce.broadcast(input_y, shape_max)
    res = te.lang.cce.vmax(data1_tmp1, data2_tmp1)
    return res
def relu6_d(input_x, output_y, scale=1.0, kernel_name="relu6_d"):
    """
    algorithm: relu6
        f(x) = 6,  x >= 6
        f(x) = 0,  x <= 0
        f(x) = x,  0 < x < 6

    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x
    output_y : dict
        shape and dtype of output_y, should be same shape and type as input
    scale : float
        scale factor passed through to the compute, default value is 1.0
    kernel_name : str
        cce kernel name, default value is "relu6_d"

    Returns
    ------
    None
    """
    input_shape = util.scalar2tensor_one(input_x.get("shape"))
    input_dtype = input_x.get("dtype").lower()
    op_utils.check_shape(input_shape, param_name="input_x")

    vmaxs_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmaxs", "float32")
    if input_dtype == "float32" and not vmaxs_support:
        raise RuntimeError(
            "Input dtype is float32, but it is not supported on this platform")

    # check input tensor data type
    check_list = ("int32", "float16", "float32")
    op_utils.check_dtype(input_dtype, check_list, param_name="input_x")

    input_shape = [reduce_ins(lambda x, y: x * y, input_shape[:])]
    input_data = tvm.placeholder(input_shape, name="input_data",
                                 dtype=input_dtype)

    final_res = relu6_d_compute(input_data, output_y, scale,
                                kernel_name=kernel_name)
    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(final_res)

    config = {"name": kernel_name,
              "tensor_list": (input_data, final_res)}
    te.lang.cce.cce_build_code(auto_sch, config)
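# Reference curve for the piecewise definition above (illustration only;
# the kernel itself is built from te.lang.cce primitives inside
# relu6_d_compute, which is outside this section, and how the scale argument
# enters the formula is not shown here):
def _relu6_reference(values):
    import numpy as np
    x = np.asarray(values, dtype="float32")
    return np.minimum(np.maximum(x, 0.0), 6.0)
# _relu6_reference([-1.0, 3.0, 7.5]) -> [0.0, 3.0, 6.0]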
def reciprocal(input_x, output_y, kernel_name="reciprocal"):
    """
    algorithm: reciprocal
    calculating data's reciprocal, y = 1 / x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is reciprocal

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    check_shape(shape, param_name="input_x")

    check_list = ["float16", "float32"]
    inp_dtype = input_x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="input_x")
    shape = util.shape_refine(shape)

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=inp_dtype)

    res = reciprocal_compute(data, output_y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}

    te.lang.cce.cce_build_code(sch, config)
def fused_mul_add(input0, input1, input2, output, kernel_name="fused_mul_add"):
    """
    function: fused for mul+add

    Parameters
    ----------
    input0: dict
        the dict of input of mul, support float16,float32,int32
    input1: dict
        the dict of input of mul, support float16,float32,int32
    input2: dict
        the dict of input of add, support float16,float32,int32
    output: dict
        the dict of output of add, support float16,float32,int32
    kernel_name: str
        cce kernel name, default value is fused_mul_add

    Returns
    -------
    None
    """
    shape_input0 = list(util.scalar2tensor_one(input0.get("shape")))
    shape_input1 = list(util.scalar2tensor_one(input1.get("shape")))
    shape_input2 = list(util.scalar2tensor_one(input2.get("shape")))

    dtype_input0 = input0.get("dtype").lower()
    dtype_input1 = input1.get("dtype").lower()
    dtype_input2 = input2.get("dtype").lower()

    format_input0 = input0.get("format").upper()
    format_input1 = input1.get("format").upper()
    format_input2 = input2.get("format").upper()

    check_ori_shape(input0, input1, input2)
    format_pattern = check_format(format_input0, format_input1, format_input2)

    if format_pattern in [1, 2, 3]:
        shape_input0, shape_input1, shape_input2 = \
            _infer_shape_one(shape_input0, shape_input1,
                             shape_input2, format_pattern)
    elif format_pattern == 4:
        shape_input0, shape_input1, shape_input2 = \
            _infer_shape_two(shape_input0, shape_input1,
                             shape_input2, format_pattern)
    else:
        shape_input0, shape_input1, shape_max_mul = \
            broadcast_shapes(shape_input0, shape_input1,
                             param_name_input1="input0",
                             param_name_input2="input1")
        shape_input2, shape_max_mul, shape_max_add0 = \
            broadcast_shapes(shape_input2, shape_max_mul,
                             param_name_input1="input2",
                             param_name_input2="shape_max_mul")

    data_input0 = tvm.placeholder(shape_input0,
                                  name="data_input0", dtype=dtype_input0)
    data_input1 = tvm.placeholder(shape_input1,
                                  name="data_input1", dtype=dtype_input1)
    data_input2 = tvm.placeholder(shape_input2,
                                  name="data_input2", dtype=dtype_input2)

    res = fused_mul_add_compute(data_input0, data_input1, data_input2,
                                output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data_input0, data_input1, data_input2, res)}
    te.lang.cce.cce_build_code(sch, config)
def op_select_format(input0, input1, input2, output,
                     kernel_name="fused_mul_add"):
    """
    select format dynamically

    _division_sixteen: judge whether the last two dimensions are divisible by 16
    scalar2tensor_one: convert a scalar shape to the tensor shape [1]
    """
    shape_0 = util.scalar2tensor_one(input0.get("ori_shape"))
    shape_1 = util.scalar2tensor_one(input1.get("ori_shape"))
    shape_2 = util.scalar2tensor_one(input2.get("ori_shape"))

    # every branch offers the same dtype list; only the formats differ,
    # so the repeated gen_param blocks are folded into shared strings
    dtype_nz_case = ("float16,float16,float16,float16,float16,"
                     "float,float,float,float,float,"
                     "int32,int32,int32,int32,int32")
    format_nz = ("NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,"
                 "NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ,"
                 "NCHW,NC1HWC0,NHWC,ND,FRACTAL_NZ")
    format_nd = ("NCHW,NC1HWC0,NHWC,ND,ND,"
                 "NCHW,NC1HWC0,NHWC,ND,ND,"
                 "NCHW,NC1HWC0,NHWC,ND,ND")
    dtype_nd_case = ("float16,float16,float16,float16,"
                     "float,float,float,float,"
                     "int32,int32,int32,int32")
    format_nd_case = ("NCHW,NC1HWC0,NHWC,ND,"
                      "NCHW,NC1HWC0,NHWC,ND,"
                      "NCHW,NC1HWC0,NHWC,ND")

    div_0 = _division_sixteen(shape_0)
    div_1 = _division_sixteen(shape_1)
    div_2 = _division_sixteen(shape_2)

    if div_0 and not div_1 and not div_2:
        # Nz + ND + ND
        datatype = dtype_nz_case
        formats = (format_nz, format_nd, format_nd, format_nz)
    elif div_0 and not div_1 and div_2:
        # Nz + ND + Nz
        datatype = dtype_nz_case
        formats = (format_nz, format_nd, format_nz, format_nz)
    elif not div_0 and div_1 and not div_2:
        # ND + Nz + ND
        datatype = dtype_nz_case
        formats = (format_nd, format_nz, format_nd, format_nz)
    elif not div_0 and not div_1 and div_2:
        # ND + ND + Nz
        datatype = dtype_nz_case
        formats = (format_nd, format_nd, format_nz, format_nz)
    else:
        # ND + ND + ND
        datatype = dtype_nd_case
        formats = (format_nd_case, format_nd_case,
                   format_nd_case, format_nd_case)

    input0 = gen_param(classify="input0", name="x1",
                       datatype=datatype, format=formats[0])
    input1 = gen_param(classify="input1", name="x2",
                       datatype=datatype, format=formats[1])
    input2 = gen_param(classify="input2", name="x3",
                       datatype=datatype, format=formats[2])
    output0 = gen_param(classify="output0", name="y",
                        datatype=datatype, format=formats[3])

    param_list = [input0, input1, input2, output0]
    param_dynamic_in_json = get_dynamic_param_in_json(param_list)
    return param_dynamic_in_json
def lamb_next_m_v(input_mul3, input_mul2, input_realdiv1, input_mul1,
                  input_mul0, input_realdiv0, input_mul4, mul0_x, mul1_sub,
                  mul2_x, mul3_sub1, mul4_x, add2_y, y1, y2, y3, y4,
                  kernel_name="lamb_next_m_v"):
    """
    function: For bert lamb fuse

    Parameters
    ----------
    input_mul3: dict
        the dict of input of mul_3, and dtype supports 'float16', 'float32'
    input_mul2: dict
        the dict of input of mul_2, and dtype supports 'float16', 'float32'
    input_realdiv1: dict
        the dict of input of truediv_1, and dtype supports 'float16', 'float32'
    input_mul1: dict
        the dict of input of mul_1, and dtype supports 'float16', 'float32'
    input_mul0: dict
        the dict of input of mul, and dtype supports 'float16', 'float32'
    input_realdiv0: dict
        the dict of input of truediv, and dtype supports 'float16', 'float32'
    input_mul4: dict
        the dict of input of mul_4, and dtype supports 'float16', 'float32'
    mul0_x: dict
        the dict of input of mul, and dtype supports 'float16', 'float32'
    mul1_sub: dict
        the dict of input of mul_1, and dtype supports 'float16', 'float32'
    mul2_x: dict
        the dict of input of mul_2, and dtype supports 'float16', 'float32'
    mul3_sub1: dict
        the dict of input of mul_3, and dtype supports 'float16', 'float32'
    mul4_x: dict
        the dict of input of mul_4, and dtype supports 'float16', 'float32'
    add2_y: dict
        the dict of input of add_2 and add_4, and dtype supports 'float16',
        'float32'
    y1: dict
        the dict of output of add_3, and dtype supports 'float16', 'float32'
    y2: dict
        the dict of output of add, and dtype supports 'float16', 'float32'
    y3: dict
        the dict of output of add_1, and dtype supports 'float16', 'float32'
    y4: dict
        the dict of output of truediv_4, and dtype supports 'float16',
        'float32'
    kernel_name: str
        cce kernel name, default value is lamb_next_m_v

    Returns
    -------
    None
    """
    shape_input_mul3 = util.scalar2tensor_one(input_mul3.get("shape"))
    shape_input_mul2 = util.scalar2tensor_one(input_mul2.get("shape"))
    shape_input_realdiv1 = util.scalar2tensor_one(input_realdiv1.get("shape"))
    shape_input_mul1 = util.scalar2tensor_one(input_mul1.get("shape"))
    shape_input_mul0 = util.scalar2tensor_one(input_mul0.get("shape"))
    shape_input_realdiv0 = util.scalar2tensor_one(input_realdiv0.get("shape"))
    shape_input_mul4 = util.scalar2tensor_one(input_mul4.get("shape"))
    shape_mul0_x = util.scalar2tensor_one(mul0_x.get("shape"))
    shape_mul1_sub = util.scalar2tensor_one(mul1_sub.get("shape"))
    shape_mul2_x = util.scalar2tensor_one(mul2_x.get("shape"))
    shape_mul3_sub1 = util.scalar2tensor_one(mul3_sub1.get("shape"))
    shape_mul4_x = util.scalar2tensor_one(mul4_x.get("shape"))
    shape_add2_y = util.scalar2tensor_one(add2_y.get("shape"))

    input_dtype = input_mul3.get("dtype").lower()

    # broadcast along the v (mul_2/mul_3) chain
    shape_input_mul3, shape_mul3_sub1, shape_max_mul3 = \
        broadcast_shapes(shape_input_mul3, shape_mul3_sub1,
                         param_name_input1="input_mul3",
                         param_name_input2="mul3_sub1")
    shape_input_mul2, shape_mul2_x, shape_max_mul2 = \
        broadcast_shapes(shape_input_mul2, shape_mul2_x,
                         param_name_input1="input_mul2",
                         param_name_input2="mul2_x")
    shape_max_mul2, shape_max_mul3, shape_max_add1 = \
        broadcast_shapes(shape_max_mul2, shape_max_mul3,
                         param_name_input1="shape_max_mul2",
                         param_name_input2="shape_max_mul3")
    shape_input_realdiv1, shape_max_add1, shape_max_truediv1 = \
        broadcast_shapes(shape_input_realdiv1, shape_max_add1,
                         param_name_input1="input_realdiv1",
                         param_name_input2="shape_max_add1")
    shape_max_truediv1, shape_add2_y, shape_max_add2 = \
        broadcast_shapes(shape_max_truediv1, shape_add2_y,
                         param_name_input1="shape_max_truediv1",
                         param_name_input2="add2_y")
    # broadcast along the m (mul/mul_1) chain
    shape_input_mul1, shape_mul1_sub, shape_max_mul1 = \
        broadcast_shapes(shape_input_mul1, shape_mul1_sub,
                         param_name_input1="input_mul1",
                         param_name_input2="mul1_sub")
    shape_input_mul0, shape_mul0_x, shape_max_mul0 = \
        broadcast_shapes(shape_input_mul0, shape_mul0_x,
                         param_name_input1="input_mul0",
                         param_name_input2="mul0_x")
    shape_max_mul0, shape_max_mul1, shape_max_add0 = \
        broadcast_shapes(shape_max_mul0, shape_max_mul1,
                         param_name_input1="shape_max_mul0",
                         param_name_input2="shape_max_mul1")
    shape_max_add0, shape_input_realdiv0, shape_max_truediv0 = \
        broadcast_shapes(shape_max_add0, shape_input_realdiv0,
                         param_name_input1="shape_max_add0",
                         param_name_input2="input_realdiv0")
    shape_input_mul4, shape_mul4_x, shape_max_mul4 = \
        broadcast_shapes(shape_input_mul4, shape_mul4_x,
                         param_name_input1="input_mul4",
                         param_name_input2="mul4_x")

    data_input_mul3 = tvm.placeholder(shape_input_mul3,
                                      name="data_input_mul3",
                                      dtype=input_dtype)
    data_input_mul2 = tvm.placeholder(shape_input_mul2,
                                      name="data_input_mul2",
                                      dtype=input_dtype)
    data_input_realdiv1 = tvm.placeholder(shape_input_realdiv1,
                                          name="data_input_realdiv1",
                                          dtype=input_dtype)
    data_input_mul1 = tvm.placeholder(shape_input_mul1,
                                      name="data_input_mul1",
                                      dtype=input_dtype)
    data_input_mul0 = tvm.placeholder(shape_input_mul0,
                                      name="data_input_mul0",
                                      dtype=input_dtype)
    data_input_realdiv0 = tvm.placeholder(shape_input_realdiv0,
                                          name="data_input_realdiv0",
                                          dtype=input_dtype)
    data_input_mul4 = tvm.placeholder(shape_input_mul4,
                                      name="data_input_mul4",
                                      dtype=input_dtype)
    data_mul0_x = tvm.placeholder(shape_mul0_x, name="data_mul0_x",
                                  dtype=input_dtype)
    data_mul1_sub = tvm.placeholder(shape_mul1_sub, name="data_mul1_sub",
                                    dtype=input_dtype)
    data_mul2_x = tvm.placeholder(shape_mul2_x, name="data_mul2_x",
                                  dtype=input_dtype)
    data_mul3_sub1 = tvm.placeholder(shape_mul3_sub1, name="data_mul3_sub1",
                                     dtype=input_dtype)
    data_mul4_x = tvm.placeholder(shape_mul4_x, name="data_mul4_x",
                                  dtype=input_dtype)
    data_add2_y = tvm.placeholder(shape_add2_y, name="data_add2_y",
                                  dtype=input_dtype)

    res = lamb_next_m_v_compute(data_input_mul3, data_input_mul2,
                                data_input_realdiv1, data_input_mul1,
                                data_input_mul0, data_input_realdiv0,
                                data_input_mul4, data_mul0_x, data_mul1_sub,
                                data_mul2_x, data_mul3_sub1, data_mul4_x,
                                data_add2_y, y1, y2, y3, y4, kernel_name)

    inputlist = [data_input_mul3, data_input_mul2, data_input_realdiv1,
                 data_input_mul1, data_input_mul0, data_input_realdiv0,
                 data_input_mul4, data_mul0_x, data_mul1_sub, data_mul2_x,
                 data_mul3_sub1, data_mul4_x, data_add2_y]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": list(inputlist) + list(res)}

    te.lang.cce.cce_build_code(sch, config)
def cast(input_x, output_y, dst_type, kernel_name="cast"):
    """
    cast a tensor/scalar with input shape from src data type to dst data type.

    restrictions of input algorithms are as follows:
    only the type groups below are supported.

    tensor process:
        float16->float32
        float16->int32
        float32->float16
        float32->int32
        int8->float32
        uint8->float32
        int8->float16
        uint8->float16
        int8->int32
        uint8->int32
        int32->uint8   // numbers out of [0,255] can get unexpected results
        int32->int8    // numbers out of [-128,127] can get unexpected results
        int32->float32 // when the cast goes through fp16, only numbers in
                       // [-1023,1023] are guaranteed a correct result
        int32->float16 // only numbers in [-1023,1023] are guaranteed a
                       // correct result

    scalar conversion support (only shape [1,] is supported):
        int64->int32
        int64->float32

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y: dict
        shape and dtype of output, should be same shape as input, and the
        dtype is the dst dtype to cast to
    kernel_name : str
        cce kernel name, default value is cast

    Returns
    -------
    None
    """
    shape = util.scalar2tensor_one(input_x.get("shape"))
    src_type = input_x.get("dtype").lower()
    check_shape(shape, param_name="input_x")

    if src_type == "bool":
        src_type = "int8"
    dst_type = _cast_dsttype_conversion(dst_type)

    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=src_type)

    if src_type == "int64":
        # int64 is handled by a hand-written IR kernel instead of the DSL
        check_dtype(dst_type, ("float32", "int32"), param_name="dst_type")
        res = tvm.extern(
            [fuseshape], [data],
            lambda ins, outs: _kernel_ir(outs, ins, dst_type, "int64"),
            name="res", dtype=dst_type)
        tensor_list = [data, res]
        schedule = tvm.create_schedule(res.op)
        with build_config:
            tvm.build(schedule, tensor_list, "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            res = cast_compute(data, output_y, dst_type, kernel_name)
            sch = generic.auto_schedule(res)
        config = {"print_ir": False,
                  "name": kernel_name,
                  "tensor_list": [data, res]}
        te.lang.cce.cce_build_code(sch, config)
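# Illustration only: like reciprocal above, cast flattens the input to one
# dimension ("fuseshape") before building, since element-wise ops do not
# depend on the original layout. The flattened length is just the product
# of the dimensions:
def _demo_fuseshape():
    from functools import reduce
    shape = (2, 3, 4)
    fuseshape = [reduce(lambda a, b: a * b, shape)]
    return fuseshape  # [24]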
def fake_quant_with_min_max_vars(x, min, max, y, num_bits, narrow_range,
                                 kernel_name="fake_quant_with_min_max_vars"):
    """
    algorithm: calculate the fake quant value of input tensor
    calculating data's fake quant

    Parameters
    ----------
    x: dict
        shape and dtype of input data
    min: dict
        shape and dtype of min
    max: dict
        shape and dtype of max
    y: dict
        shape and dtype of fake quant output
    num_bits: int
        define the range of quant max
    narrow_range: bool
        define the range of quant min
    kernel_name : string
        cce kernel name, default value is "fake_quant_with_min_max_vars"

    Returns
    -------
    None
    """
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min.get("shape")
    min_dtype = min.get("dtype")
    max_shape = max.get("shape")
    max_dtype = max.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    check_shape(input_shape, param_name="x")
    check_shape(min_shape, min_rank=1, max_rank=1, param_name="min")
    check_shape(max_shape, min_rank=1, max_rank=1, param_name="max")

    if num_bits > 16 or num_bits < 2:
        raise RuntimeError("The value of num_bits must be between 2 and 16")

    check_tuple = ("float32",)
    x_type = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    check_dtype(x_type, check_tuple, param_name="x")
    check_dtype(min_dtype, check_tuple, param_name="min")
    check_dtype(max_dtype, check_tuple, param_name="max")

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, shape_max, shape_broadcast = broadcast_shapes(
        min_shape, input_shape,
        param_name_input1="min",
        param_name_input2="x")

    data = tvm.placeholder(input_shape, dtype=x_type, name="data_input")
    data_min = tvm.placeholder(shape_min, dtype=min_dtype, name="data_min")
    data_max = tvm.placeholder(shape_min, dtype=max_dtype, name="data_max")
    res = fake_quant_with_min_max_vars_compute(data, data_min, data_max, y,
                                               num_bits, narrow_range,
                                               kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data, data_min, data_max, res)}
    te.lang.cce.cce_build_code(schedule, config)