def _get_data_gm(shapes, dtype):
    """
    get placeholders of data_dy, data_x, data_variance, data_mean
    and data_gamma

    Parameters
    ----------
    shapes: dict
        {"shape_dy": shape_dy, "shape_x": shape_x,
         "shape_var": shape_variance, "shape_mean": shape_mean,
         "shape_gamma": shape_gamma}
    dtype: str
        the data type

    Returns
    -------
    data_gm: tuple
        (data_dy, data_x, data_variance, data_mean, data_gamma)
    """
    data_dy = tvm.placeholder(shapes.get("shape_dy"),
                              name="data_dy", dtype=dtype)
    data_x = tvm.placeholder(shapes.get("shape_x"),
                             name="data_x", dtype=dtype)
    data_variance = tvm.placeholder(shapes.get("shape_var"),
                                    name="data_variance", dtype=dtype)
    data_mean = tvm.placeholder(shapes.get("shape_mean"),
                                name="data_mean", dtype=dtype)
    data_gamma = tvm.placeholder(shapes.get("shape_gamma"),
                                 name="data_gamma", dtype=dtype)
    data_gm = (data_dy, data_x, data_variance, data_mean, data_gamma)
    return data_gm

def correction_mul(x, batch_std, running_std, y, channel,
                   kernel_name="correction_mul"):
    """CorrectionMul op"""
    shape = x.get("shape")
    data_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")
    # shape = util.shape_refine(shape)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    shape_c = [1] * len(shape)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std",
                                    dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, batch_std_t, running_std_t, res]}
    te.lang.cce.cce_build_code(sch, config)

def optional_weight(tensor_list, predict_shape, dtype_list, weight,
                    pos_weight):
    weight_data = None
    pos_weight_data = None
    if weight is not None:
        weight_shape = weight.get("shape")
        weight_dtype = weight.get("dtype").lower()
        op_utils.check_dtype(weight_dtype, dtype_list)
        _broadcast_shape_check(weight_shape, predict_shape)
        weight_shape = tuple([1] * (len(predict_shape) - len(weight_shape))) \
            + tuple(weight_shape)
        weight_data = tvm.placeholder(weight_shape, weight_dtype,
                                      name="weight_data")
        tensor_list.append(weight_data)
    if pos_weight is not None:
        pos_weight_shape = pos_weight.get("shape")
        pos_weight_dtype = pos_weight.get("dtype").lower()
        op_utils.check_dtype(pos_weight_dtype, dtype_list)
        _broadcast_shape_check(pos_weight_shape, predict_shape)
        pos_weight_shape = \
            tuple([1] * (len(predict_shape) - len(pos_weight_shape))) \
            + tuple(pos_weight_shape)
        pos_weight_data = tvm.placeholder(pos_weight_shape, pos_weight_dtype,
                                          name="pos_weight_data")
        tensor_list.append(pos_weight_data)
    return weight_data, pos_weight_data

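# Illustrative sketch (plain Python; the helper name is hypothetical): the
# leading-1 padding used above to align a weight shape with predict_shape for
# broadcasting, e.g. (5,) against (4, 3, 5) becomes (1, 1, 5).
def _pad_to_rank_example(weight_shape, predict_shape):
    return tuple([1] * (len(predict_shape) - len(weight_shape))) \
        + tuple(weight_shape)
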
def squared_difference(x1, x2, y, kernel_name="squared_difference"):
    """
    algorithm: squared_difference
    calculating y = (x1 - x2) * (x1 - x2)

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is squared_difference

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")

    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        # recompute shape_max so it matches the refined shapes
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                       param_name_input1="x1",
                                                       param_name_input2="x2")
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)

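# Reference sketch (assumption: NumPy is available; this helper is
# illustrative and not part of the op): y = (x1 - x2) * (x1 - x2), with NumPy
# broadcasting standing in for te.lang.cce.broadcast.
def _squared_difference_reference(x1, x2):
    import numpy as np
    diff = np.asarray(x1) - np.asarray(x2)
    return diff * diff
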
def diag_part_d(x, assist, y, kernel_name="diag_part_d"):
    """
    Returns the batched diagonal part of a batched tensor

    Parameters
    ----------
    x: dict
        dict of x, include keys(shape and dtype)
    assist: dict
        dict of help Matrix, its diagonal values are 1, the rest are 0
    y: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "diag_part_d"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    shape_assist = assist.get("shape")
    dtype_assist = assist.get("dtype")
    shape_y = y.get("shape")

    check_shape(shape_x, param_name="x")
    check_shape(shape_assist, param_name="assist")
    if len(shape_x) not in (2, 4, 6, 8):
        raise RuntimeError("Only input tensors of rank 2, 4, 6 or 8 "
                           "are supported!")
    if list(shape_x) != list(shape_assist):
        raise RuntimeError("the shapes of x and assist must be equal!")
    len_shape_out = len(shape_x) // VALUE_TWO
    for i in range(len_shape_out):
        if shape_x[i] != shape_x[i + len_shape_out]:
            raise RuntimeError("the shape of input is not supported!")
    if list(shape_x) != list(shape_y + shape_y):
        raise RuntimeError("the shape of output is not supported!")

    check_list = ("float16", "float32", "int32")
    dtype_x = dtype_x.lower()
    check_dtype(dtype_x, check_list, param_name="x")
    dtype_assist = dtype_assist.lower()
    check_dtype(dtype_assist, check_list, param_name="assist")
    if dtype_assist != dtype_x:
        raise RuntimeError("the dtypes of x and assist must be equal!")

    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    data_assist = tvm.placeholder(shape_assist, name="data_assist",
                                  dtype=dtype_assist)

    res = diag_part_d_compute(data_x, data_assist, y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x, data_assist, res]}
    te.lang.cce.cce_build_code(sch, config)

def minmax_update_perchannel(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1],
    # channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape[0] != min_shape[0] \
            and x_shape[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data,
                                                min_data, max_data,
                                                ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

def _conv3dbp_input_achieve_with_tvm():
    # nested helper: shape_dedy, out_backprop_dtype, shape_filter_frac and the
    # other free names below are taken from the enclosing scope
    dedy = tvm.placeholder(shape_dedy, name="dedy", dtype=out_backprop_dtype)
    shape_filter_ncdhw = [filter_batch, filter_channel,
                          filter_depth, filter_h, filter_w]
    filters = tvm.placeholder(shape_filter_frac,
                              name="filter", dtype=filter_dtype)

    dedx = te.lang.cce.conv3d_backprop_input_compute(
        filters=filters,
        out_backprop=dedy,
        filter_sizes=shape_filter_ncdhw,
        input_sizes=input_sizes,
        strides=strides,
        padding=pads,
        dilations=dilations,
        res_dtype=res_dtype,
        kernel_name=kernel_name)
    tensor_list = [filters, dedy, dedx]

    with tvm.target.cce():
        sch = generic.auto_schedule(dedx)

    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

def fake_quant_per_layer(x, min_val, max_val, y,
                         symmetric, narrow_range, num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

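# Illustrative sketch (plain Python; the helper name is hypothetical): the
# quantization range derived above. For num_bits=8 this gives [0, 255], or
# [1, 255] when narrow_range is set.
def _quant_range_example(num_bits, narrow_range):
    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max
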
def bn_infer_grad(grads, scale, batch_variance, x_backprop,
                  epsilon=0.0001, kernel_name="bn_infer_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    bn_infer_grad.

    Parameters
    ----------
    grads: dict
        dict of grads, A 5D Tensor for input grads.
    scale: dict
        dict of scale, A 5D Tensor for input scale.
    batch_variance: dict
        dict of batch_variance, A 5D Tensor for input batch_variance.
    x_backprop: dict
        dict of x_backprop, A 5D Tensor for output x_backprop.
    epsilon: float
        A small float number added to the variance of x. Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_infer_grad"

    Returns
    -------
    None
    """
    shape_grads = grads.get("shape")
    shape_scale = scale.get("shape")
    shape_batch_variance = batch_variance.get("shape")

    input_grads_dtype = grads.get("dtype").lower()
    input_scale_dtype = scale.get("dtype").lower()
    batch_variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(input_grads_dtype, ("float32", "float16"), param_name="grads")
    check_dtype(input_scale_dtype, ("float32",), param_name="scale")
    check_dtype(batch_variance_dtype, ("float32",),
                param_name="batch_variance")

    _check_shape(shape_grads, shape_batch_variance)
    util.compare_tensor_dict_key(scale, batch_variance, "shape")

    grads_input = tvm.placeholder(shape_grads, name="grads_input",
                                  dtype=input_grads_dtype)
    scale_input = tvm.placeholder(shape_scale, name="x_input",
                                  dtype=input_scale_dtype)
    batch_variance_input = tvm.placeholder(shape_batch_variance,
                                           name="batch_variance_input",
                                           dtype=batch_variance_dtype)

    res = bn_infer_grad_compute(grads_input, scale_input,
                                batch_variance_input, x_backprop,
                                epsilon, kernel_name=kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [grads_input, scale_input, batch_variance_input, res]
    config = {"name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

def logical_or(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the dict of x1, include shape and dtype, dtype support int8,
         the value only support 0, 1
    x2 : the dict of x2, include shape and dtype, dtype support int8,
         the value only support 0, 1
    y : the dict of y, include shape and dtype
    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")
    if dtype_x1 == "bool" or dtype_x2 == "bool":
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")

    check_tuple = ("int8",)
    check_dtype(dtype_x1, check_tuple, param_name="x1")
    check_dtype(dtype_x2, check_tuple, param_name="x2")

    shape_x1, shape_x2, shape_max = broadcast_shapes(shape_x1, shape_x2,
                                                     param_name_input1="x1",
                                                     param_name_input2="x2")
    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "need_build": False,
              "name": kernel_name,
              "tensor_list": (data_x1, data_x2, res)}
    te.lang.cce.cce_build_code(schedule, config)

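# Reference sketch (assumption: NumPy is available; illustrative only): the
# element-wise OR of 0/1 int8 tensors that the device compute realizes.
def _logical_or_reference(x1, x2):
    import numpy as np
    x1 = np.asarray(x1, dtype="int8")
    x2 = np.asarray(x2, dtype="int8")
    return np.logical_or(x1 != 0, x2 != 0).astype("int8")
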
def custom_subtract(shape_x, shape_y, dtype,
                    kernel_name="cce_subtract",
                    need_build=True, need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_subtract"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_subtract_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)
    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": [data1, data2, res]}
    te.lang.cce.cce_build_code(sch, config)

def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict
        shape, dtype of input x
    y : dict
        shape, dtype of input y
    output : dict
        shape, dtype of output
    kernel_name : str
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar, Nz Nz, ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)

def atan_grad(y, dy, z, kernel_name="atan_grad"):
    """
    Gradient calculation for atan(x)

    Parameters:
    ----------
    y : dict of y, include shape and dtype, dtype support float16, float32
    dy : dict of dy, include shape and dtype, dtype support float16, float32
    z : dict of output, include shape and dtype
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
    forward :
        y = atan(x)
    backward gradient :
        de/dx = dy/dx * de/dy = 1/(1+x^2) * grad

    Returns
    ----------
    None
    """
    # get the shape and dtype
    shape = y.get("shape")
    shape_grad = dy.get("shape")
    dtype = y.get("dtype")
    dtype_grad = dy.get("dtype")

    # check whether kernel name is unique
    # check whether the shape is right
    check_shape(shape, param_name="y")
    check_shape(shape_grad, param_name="dy")
    if not operator.eq(shape, shape_grad):
        raise RuntimeError("all input shape must be the same")
    shape, _ = refine_shape_axes(shape, [])

    # check whether dtypes are fp16, fp32 and whether they are the same
    check_list = ("float16", "float32")
    check_dtype(dtype, check_list, param_name="y")
    check_dtype(dtype_grad, check_list, param_name="dy")
    dtype = dtype.lower()
    if dtype != dtype_grad.lower():
        raise RuntimeError("all input dtype must be same")

    # get 2 input placeholders: data_input, grad
    data_input = tvm.placeholder(shape, name="input_data", dtype=dtype)
    grad = tvm.placeholder(shape, name="input_grad", dtype=dtype)

    # compute the backward gradient
    res = atan_grad_compute(data_input, grad, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input, grad, res]}
    te.lang.cce.cce_build_code(sch, config)

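# Reference sketch (assumption: NumPy is available; illustrative only): the
# backward rule from the docstring, de/dx = grad / (1 + x^2), where x denotes
# the forward input.
def _atan_grad_reference(x, grad):
    import numpy as np
    return np.asarray(grad) / (1.0 + np.asarray(x, dtype="float64") ** 2)
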
def custom_equal(shape_x, shape_y, dtype,
                 kernel_name="cce_tf_equal",
                 need_build=False, need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16, float32, int32, int8,
            uint8, bool

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i),
                      name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)

def gelu_grad(input_dy, input_x, input_y, output_z,
              kernel_name="gelu_grad"):
    """
    algorithm: gelu_grad
    calculating: dy * res'
    res' = res / x + x * 0.5 * (1 - tanh(math_four) * tanh(math_four)) *
           np.sqrt(2 / np.pi) * (1 + 3 * 0.044715 * x^2)
    math_four = np.sqrt(2 / np.pi) * (x + 0.044715 * x^3)

    Parameters
    ----------
    input_dy : dict
        shape and dtype of dy input, only support float16, float32
    input_x : dict
        shape and dtype of x input, only support float16, float32
    input_y : dict
        shape and dtype of y input, only support float16, float32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is gelu_grad

    Returns
    -------
    None
    """
    shape_dy = input_dy.get("shape")
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    check_shape(shape_dy, param_name="input_dy")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    input_dtype = input_dy.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_dy")

    shape_dy = list(shape_dy)
    shape_x = list(shape_x)
    shape_y = list(shape_y)
    if not (operator.eq(shape_dy, shape_x)
            and operator.eq(shape_dy, shape_y)):
        raise RuntimeError("all input shape must be equal")

    # fuse all axes into one dimension
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape_dy)
    data_dy = tvm.placeholder(fuseshape, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=input_dtype)
    data_gelu = tvm.placeholder(fuseshape, name="data_gelu",
                                dtype=input_dtype)
    res = gelu_grad_compute(data_dy, data_x, data_gelu, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_dy, data_x, data_gelu, res]}
    te.lang.cce.cce_build_code(sch, config)

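# Reference sketch (assumption: NumPy is available; illustrative only, and
# undefined at x = 0 because of the res / x term, exactly as in the docstring
# formula). res is the forward gelu(x) output, matching data_gelu above.
def _gelu_grad_reference(dy, x, res):
    import numpy as np
    x = np.asarray(x, dtype="float64")
    coef = np.sqrt(2.0 / np.pi)
    math_four = coef * (x + 0.044715 * np.power(x, 3))
    tanh_val = np.tanh(math_four)
    res_grad = (np.asarray(res) / x
                + x * 0.5 * (1.0 - tanh_val * tanh_val)
                * coef * (1.0 + 3.0 * 0.044715 * x * x))
    return np.asarray(dy) * res_grad
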
def softplus_grad(input_gradients, input_features, output_backprops,
                  kernel_name="softplus_grad"):
    """
    Computes softplus gradients for a softplus operation.
    The gradients: "dy * exp(x) / (1 + exp(x))".

    Parameters
    ----------
    input_gradients: dict
        The backpropagated gradients to the corresponding softplus operation.
    input_features: dict
        The input_features passed as input to the corresponding softplus
        operation. source data type support "float16", "float32", "int32",
        "int8", "uint8".
    output_backprops: dict
        data of output.
    kernel_name: str
        kernel name, default value is "softplus_grad".

    Returns
    -------
    None
    """
    shape_dy = input_gradients.get("shape")
    dtype_dy = input_gradients.get("dtype")
    shape_x = input_features.get("shape")
    dtype_x = input_features.get("dtype")

    if dtype_dy.lower() != dtype_x.lower():
        raise RuntimeError("dtype of input_gradients and input_features "
                           "must be the same")
    dtype = dtype_dy

    check_shape(shape_dy, param_name="input_gradients")
    check_shape(shape_x, param_name="input_features")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_gradients")

    shape_dy, shape_x, shape_max = broadcast_shapes(
        shape_dy, shape_x,
        param_name_input1="input_gradients",
        param_name_input2="input_features")
    reshape_dy, reshape_x = refine_shapes_for_broadcast(shape_dy, shape_x)

    data_dy = tvm.placeholder(reshape_dy, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)

    res = softplus_grad_compute(data_dy, data_x, output_backprops,
                                kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_dy, data_x, res]}
    te.lang.cce.cce_build_code(sch, config)

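# Reference sketch (assumption: NumPy is available; illustrative only): the
# gradient quoted in the docstring, dy * exp(x) / (1 + exp(x)), i.e. dy times
# the sigmoid of x.
def _softplus_grad_reference(dy, x):
    import numpy as np
    x = np.asarray(x, dtype="float64")
    return np.asarray(dy) * np.exp(x) / (1.0 + np.exp(x))
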
def floor_mod(x1, x2, y, kernel_name="floor_mod"):
    """
    calculate the remainder of division, support fp16, fp32, int32
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: dict
        dict{"shape": tuple or list, "dtype": str}
        shape of data
        the data type, src_dtype equals dst_dtype,
        support fp16, fp32, int32
    x2: dict
        dict{"shape": tuple or list, "dtype": str}
        shape of data
        the data type, src_dtype equals dst_dtype,
        support fp16, fp32, int32
    y: dict, reserved field
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "floor_mod"

    Returns
    -------
    None
    """
    # get dtype and shape attributes
    dtype_x = x1.get("dtype").lower()
    shape_x = x1.get("shape")
    dtype_y = x2.get("dtype").lower()
    shape_y = x2.get("shape")

    # check kernel_name & shape
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    # check input tensor data type
    check_list = ("float16", "float32", "int32")
    check_dtype(dtype_x, check_list, param_name="x1")
    check_dtype(dtype_y, check_list, param_name="x2")
    if dtype_x != dtype_y:
        raise RuntimeError("the dtypes of x1 and x2 must be the same")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    input_data_x = tvm.placeholder(shape_x, name="input_data_x",
                                   dtype=dtype_x)
    input_data_y = tvm.placeholder(shape_y, name="input_data_y",
                                   dtype=dtype_y)

    res = floor_mod_compute(input_data_x, input_data_y, y, kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [input_data_x, input_data_y, res]}
    te.lang.cce.cce_build_code(auto_sch, config)

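# Reference sketch (assumption: NumPy is available; illustrative only): the
# docstring formula res = x1 - floor(x1 / x2) * x2; e.g. x1 = -3, x2 = 2
# yields 1, matching floor semantics rather than C-style truncation.
def _floor_mod_reference(x1, x2):
    import numpy as np
    x1 = np.asarray(x1, dtype="float64")
    x2 = np.asarray(x2, dtype="float64")
    return x1 - np.floor(x1 / x2) * x2
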
def kl_div(input_x, input_target, output_y, reduction, kernel_name="kl_div"):
    """
    Calculate Kullback-Leibler divergence.

    output_pos = input_target * (log(input_target) - input_x)
    output = where(input_target > 0, output_pos, zeros)
    reduced = reduce_sum_all(output)
    if reduction == "batchmean":
        final_res = reduced / batch_size
    else:
        final_res = reduced

    Parameters
    ----------
    input_x : dict
        shape and dtype of input_x, dtype only support fp16 and fp32.
    input_target : dict
        shape and dtype of input_target. Shape and dtype must be same
        as input_x
    output_y : dict
        shape and dtype of output. Dtype must be same as input_x
    reduction: str
        Specifies the reduction to apply to the output:
        reduction="batchmean" or reduction="sum".
        "batchmean": the sum of the output will be divided by the batch size
        "sum": the output will be summed
    kernel_name : str
        cce kernel name, default value is "kl_div"

    Returns
    -------
    None
    """
    # check input parameter
    _check_parameter(input_x, input_target)

    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    batch_size = shape_x[0]

    # flatten all axes into one dimension
    shape_one_dim = [reduce_one_dim(lambda x, y: x * y, shape_x[:])]
    data_x = tvm.placeholder(shape_one_dim, name="data_x", dtype=dtype_x)
    data_target = tvm.placeholder(shape_one_dim, name="data_target",
                                  dtype=dtype_x)

    final_res = kl_div_compute(data_x, data_target, output_y, reduction,
                               batch_size, kernel_name=kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(final_res)

    config = {"name": kernel_name,
              "tensor_list": (data_x, data_target, final_res)}
    te.lang.cce.cce_build_code(auto_sch, config)

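# Reference sketch (assumption: NumPy is available; this helper is
# illustrative only and not part of the op). It follows the docstring recipe
# literally, including the "batchmean" and "sum" reductions.
def _kl_div_reference(x, target, reduction, batch_size):
    import numpy as np
    x = np.asarray(x, dtype="float64")
    target = np.asarray(target, dtype="float64")
    # guard the masked-out branch so log(0) never evaluates
    safe_target = np.where(target > 0, target, 1.0)
    output = np.where(target > 0, target * (np.log(safe_target) - x), 0.0)
    reduced = np.sum(output)
    return reduced / batch_size if reduction == "batchmean" else reduced
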
def fake_learned_scale_quant_perlayer(
        input_x, alpha, quant_max, out, neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape, name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data,
                                                    quant_max_data, neg_trunc,
                                                    kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list,
              "bool_storage_as_1bit": False}
    te.lang.cce.cce_build_code(sch, config)

def matrix_diag_part_d(input_diagonal, input_help, output_diagonal,
                       kernel_name="matrix_diag_part_d"):
    """
    Returns the batched diagonal part of a batched tensor

    Parameters
    ----------
    input_diagonal: dict
        dict of input_diagonal, include keys(shape and dtype)
    input_help: dict
        dict of help Matrix, its diagonal values are 1, the rest are 0
    output_diagonal: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "matrix_diag_part_d"

    Returns
    -------
    None
    """
    shape_input_diagonal = input_diagonal.get("shape")
    dtype_input_diagonal = input_diagonal.get("dtype")
    shape_input_help = input_help.get("shape")
    dtype_input_help = input_help.get("dtype")

    check_shape(shape_input_diagonal, param_name="input_diagonal")
    check_shape(shape_input_help, param_name="input_help")
    if len(shape_input_diagonal) < 2:
        raise RuntimeError("Only input tensors of rank >= 2 are supported!")
    if list(shape_input_diagonal) != list(shape_input_help):
        raise RuntimeError("the shapes of input_diagonal and input_help "
                           "must be equal!")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    dtype_input_diagonal = dtype_input_diagonal.lower()
    check_dtype(dtype_input_diagonal, check_list, param_name="input_diagonal")
    dtype_input_help = dtype_input_help.lower()
    check_dtype(dtype_input_help, check_list, param_name="input_help")

    data_input_diagonal = tvm.placeholder(shape_input_diagonal,
                                          name="data_input_diagonal",
                                          dtype=dtype_input_diagonal)
    data_input_help = tvm.placeholder(shape_input_help,
                                      name="data_input_help",
                                      dtype=dtype_input_help)

    res = matrix_diag_part_d_compute(data_input_diagonal, data_input_help,
                                     output_diagonal, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_input_diagonal, data_input_help, res]}
    te.lang.cce.cce_build_code(sch, config)

def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
    algorithm: floordiv
    calculating data's floordiv, res = floor(x / y)

    Parameters
    ----------
    input_x: dict
    input_y: dict
    output_z: dict
    kernel_name: str, default value is "floor_div"

    Returns
    -------
    None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")
    check_elewise_shape_range([input_x, input_y], support_broadcast=True)
    if input_dtype_x != input_dtype_y:
        error_info = {}
        error_info['errCode'] = OP_ERROR_CODE_018
        error_info['op_name'] = 'floor_div'
        error_info['param_name1'] = 'input_dtype_x'
        error_info['param_name2'] = 'input_dtype_y'
        error_info['param1_dtype'] = str(input_dtype_x)
        error_info['param2_dtype'] = str(input_dtype_y)
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s][%s] are not equal in "
                           "dtype with dtype[%s][%s]." % (
                               error_info['op_name'],
                               error_info['param_name1'],
                               error_info['param_name2'],
                               error_info['param1_dtype'],
                               error_info['param2_dtype']))

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (input_x, input_y) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([input_x, input_y],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, input_dtype_x, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, input_dtype_y, "tensor_y")
            res = floor_div_compute(tensor_x, tensor_y, output_z, kernel_name)
            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)

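# Reference sketch (assumption: NumPy is available; illustrative only): the
# docstring formula res = floor(x / y).
def _floor_div_reference(x, y):
    import numpy as np
    return np.floor(np.asarray(x, dtype="float64") /
                    np.asarray(y, dtype="float64"))
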
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1],
    # channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] \
            and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric,
                                        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

def inv_grad(input_y, input_dy, output_z, kernel_name="inv_grad"):
    """
    algorithm: inv_grad
    calculating data's reciprocal grad, dx = -1 * dy * y * y,
    where `y = 1/x`, and `dy` is the corresponding input gradient.

    Parameters
    ----------
    input_y: dict
        shape and dtype of input_y, only support float16, float32,
        int32, int8
    input_dy: dict
        shape and dtype of input_dy, should be same shape and type as input_y
    output_z: dict
        shape and dtype of output, should be same shape and type as input_y
    kernel_name: str
        kernel name, default value is "inv_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    shape_input_dy = input_dy.get("shape")
    dtype_input_y = input_y.get("dtype")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")

    shape_input_y = util.shape_refine(shape_input_y)
    shape_input_dy = util.shape_refine(shape_input_dy)

    if list(shape_input_y) != list(shape_input_dy):
        raise RuntimeError("the shape of input must be equal!")
    dtype_input_y = dtype_input_y.lower()
    dtype_input_dy = dtype_input_dy.lower()

    if dtype_input_dy != dtype_input_y:
        raise RuntimeError("the dtype of input must be equal!")
    check_list = ("float16", "float32", "int32", "int8")
    check_dtype(dtype_input_y, check_list, param_name="input_y")

    shape_input_dy, shape_input_y = \
        refine_shapes_for_broadcast(shape_input_dy, shape_input_y)
    data_dy = tvm.placeholder(shape_input_dy, name="data_dy",
                              dtype=dtype_input_dy)
    data_y = tvm.placeholder(shape_input_y, name="data_y",
                             dtype=dtype_input_y)

    res = inv_grad_compute(data_y, data_dy, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name, "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)

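# Reference sketch (assumption: NumPy is available; illustrative only): the
# docstring rule dx = -1 * dy * y * y, i.e. the gradient of y = 1/x written
# in terms of the forward output y.
def _inv_grad_reference(y, dy):
    import numpy as np
    y = np.asarray(y, dtype="float64")
    return -1.0 * np.asarray(dy) * y * y
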
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x > 0) or negative_slope * gradients(x <= 0).

    support dtype: float16, float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as output of leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """
    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)
    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_g)

    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope,
                                  kernel_name)
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_g, data_x, res]}
    te.lang.cce.cce_build_code(schedule, config)

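# Reference sketch (assumption: NumPy is available; illustrative only): the
# docstring rule, pass the gradient through where x > 0 and scale it by
# negative_slope elsewhere.
def _leaky_relu_grad_reference(g, x, negative_slope=0):
    import numpy as np
    g = np.asarray(g)
    x = np.asarray(x)
    return np.where(x > 0, g, negative_slope * g)
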
def add(input_x, input_y, output_z, kernel_name="add"):
    """
    algorithm: add
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x : dict
        shape and dtype of first input, only support float16, float32, int32
    input_y : dict
        shape and dtype of second input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is add

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar, Nz Nz, ND ND
    format_pattern = _add_check_format(input_x, input_y)
    shape_x, shape_y = _infer_shape(format_pattern, input_x, input_y)
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_tuple = ("float16", "float32", "int32")
    input_data_type = input_x.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        # drop the trailing axis when it is 1 everywhere
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    data_x = tvm.placeholder(shape_x, name="data_1", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_2", dtype=input_data_type)

    res = add_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(schedule, config)

def assign_sub(var, value, out, kernel_name='assign_sub'):
    """
    Update var by subtracting value from it.

    Parameters:
    ----------
    var : dict
        dict of input_var, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32
    value : dict
        dict of input_value, include shape and dtype,
        dtype support int8, uint8, int32, float16, float32.
        Must have the same shape and dtype as input_var
    out : dict
        dict of out
    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape_var = var.get("shape")
    shape_value = value.get("shape")
    dtype_var = var.get("dtype")
    dtype_value = value.get("dtype")

    # kernel name check: should be unique
    # check whether the shape is right
    check_shape(shape_var, param_name="var")
    check_shape(shape_value, param_name="value")
    if not operator.eq(shape_var, shape_value):
        raise RuntimeError("all input shapes must be equal")

    # check whether dtypes are fp16, fp32, int8, uint8, int32
    # and whether they are the same
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_var, check_list, param_name="var")
    check_dtype(dtype_value, check_list, param_name="value")
    dtype_var = dtype_var.lower()
    dtype_value = dtype_value.lower()
    if dtype_var != dtype_value:
        raise RuntimeError("all input dtypes must be the same")

    shape, _ = refine_shape_axes(shape_var, [])
    data_var = tvm.placeholder(shape, dtype=dtype_var, name='data_var')
    data_value = tvm.placeholder(shape, dtype=dtype_value, name='data_value')
    sch, res = _assign_sub_compute(data_var, data_value, out, kernel_name)

    with set_bool_storage_config():
        tvm.build(sch, [data_var, data_value, res], "cce", name=kernel_name)

def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel,
                        kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = x.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")

    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()

    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_running_std, ("float16", "float32"))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(dx, mul_dx, "shape")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format: {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std",
                                  dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std",
                                    dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t,
                                           running_std_t, channel,
                                           data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)

def rsqrt_grad(input_y, input_dy, output_z, kernel_name="rsqrt_grad"):
    """
    calculate the backpropagation of rsqrt operation
    rsqrt: y = 1 / sqrt(x)
    rsqrt_grad: -1/2 * y**3 * dy

    Parameters
    ----------
    input_y: dict
        dict of input_y, include keys(shape and dtype)
    input_dy: dict
        dict of input_dy, include keys(shape and dtype)
    output_z: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "rsqrt_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    dtype_input_y = input_y.get("dtype")
    shape_input_dy = input_dy.get("shape")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "shape")

    check_list = ("float16", "float32", "int32", "int8")
    dtype_input_y = dtype_input_y.lower()
    check_dtype(dtype_input_y, check_list, param_name="input_y")
    dtype_input_dy = dtype_input_dy.lower()
    check_dtype(dtype_input_dy, check_list, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "dtype")

    reshape_y, reshape_dy = refine_shapes_for_broadcast(shape_input_y,
                                                        shape_input_dy)

    data_input_y = tvm.placeholder(reshape_y, name="data_input_y",
                                   dtype=dtype_input_y)
    data_input_dy = tvm.placeholder(reshape_dy, name="data_input_dy",
                                    dtype=dtype_input_dy)

    res = rsqrt_grad_compute(data_input_y, data_input_dy, output_z,
                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name,
              "tensor_list": [data_input_y, data_input_dy, res]}
    te.lang.cce.cce_build_code(sch, config)

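# Reference sketch (assumption: NumPy is available; illustrative only): the
# docstring rule rsqrt_grad = -1/2 * y**3 * dy, with y = 1/sqrt(x) the
# forward output.
def _rsqrt_grad_reference(y, dy):
    import numpy as np
    y = np.asarray(y, dtype="float64")
    return -0.5 * y ** 3 * np.asarray(dy)
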
def fused_mul_add_n(input_x, input_y, input_z, output,
                    kernel_name="fused_mul_add_n"):
    """
    algorithm: fused mul+add_n
    calculating output = input_x * input_z + input_y

    Parameters
    ----------
    input_x : dict of input_x, tensor
    input_y : dict of input_y, tensor
    input_z : dict of input_z, scalar
    output : dict of output
    kernel_name : string
        cce kernel name, default value is fused_mul_add_n

    Returns
    -------
    None
    """
    check_list = ("float16", "float32", "int32", "int16")

    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    op_utils.check_shape(shape_x, param_name="input_x")
    op_utils.check_dtype(dtype_x, check_list, param_name="input_x")

    shape_y = input_y.get("shape")
    dtype_y = input_y.get("dtype")
    op_utils.check_shape(shape_y, param_name="input_y")
    op_utils.check_dtype(dtype_y, check_list, param_name="input_y")

    dtype_z = input_z.get("dtype")
    shape_z = [1 for _ in range(len(shape_x))]
    op_utils.check_shape(shape_z, param_name="input_z")
    op_utils.check_dtype(dtype_z, check_list, param_name="input_z")

    data_x = tvm.placeholder(shape_x, name="input_x", dtype=dtype_x)
    data_y = tvm.placeholder(shape_y, name="input_y", dtype=dtype_y)
    data_z = tvm.placeholder(shape_z, name="input_z", dtype=dtype_z)

    res = mul_add_n_compute(data_x, data_y, data_z)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_x, data_y, data_z, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)

def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32
    x2 : dict
        shape and dtype of second input, only support float16, float32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(x1.get("shape"))
    shape_y = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("float16", "float32")
    input_data_type = x1.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="x1")
    input_data_type_x2 = x2.get("dtype").lower()
    check_dtype(input_data_type_x2, check_tuple, param_name="x2")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        # drop the trailing axis when it is 1 everywhere
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=input_data_type)

    res = real_div_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(schedule, config)