def squared_difference(x1, x2, y, kernel_name="squared_difference"):
    """
    algorithm: squared_difference
    calculating data's squared_difference, y = (x1 - x2) * (x1 - x2)

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32, int32
    x2 : dict
        shape and dtype of second input, only support float16, float32, int32
    y : dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is squared_difference

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")

    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_list = ["float16", "float32", "int32"]
    dtype = x1.get("dtype").lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_squared_difference_cce only support float16, float32, int32")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype, name="data_y")

    with tvm.target.cce():
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                       param_name_input1="x1",
                                                       param_name_input2="x2")
        data_x_tmp = te.lang.cce.broadcast(data_x, shape_max)
        data_y_tmp = te.lang.cce.broadcast(data_y, shape_max)
        data_sub = te.lang.cce.vsub(data_x_tmp, data_y_tmp)
        res = te.lang.cce.vmul(data_sub, data_sub)
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }

    te.lang.cce.cce_build_code(sch, config)
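# Hedged usage sketch (illustration only, not from the source): building the
# squared_difference kernel with hypothetical broadcastable float16 inputs.
# Assumes a configured Ascend/TBE environment where te, tvm and the op_utils
# helpers used above are importable.
def _demo_squared_difference():
    x1 = {"shape": (16, 32), "dtype": "float16"}
    x2 = {"shape": (1, 32), "dtype": "float16"}  # broadcast along axis 0
    y = {"shape": (16, 32), "dtype": "float16"}
    squared_difference(x1, x2, y, kernel_name="squared_difference_demo")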
def floor_mod(x1, x2, y, kernel_name="floor_mod"):
    """
    calculate the remainder of division, support fp16, fp32, int32
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    x2: dict
        dict{"shape":tuple or list,"dtype":str}
        shape of data
        the data type, src_dtype equals dst_dtype, support fp16,fp32,int32
    y: dict, reserved field
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "floor_mod"

    Returns
    -------
    None
    """
    # get dtype and shape attributes
    dtype_x = x1.get("dtype").lower()
    shape_x = x1.get("shape")
    dtype_y = x2.get("dtype").lower()
    shape_y = x2.get("shape")

    # check shape
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    # check input tensor data type
    check_list = ("float16", "float32", "int32")
    check_dtype(dtype_x, check_list, param_name="x1")
    check_dtype(dtype_y, check_list, param_name="x2")

    if dtype_x != dtype_y:
        raise RuntimeError("the dtypes of the two inputs are not the same")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    input_data_x = tvm.placeholder(shape_x, name="input_data_x", dtype=dtype_x)
    input_data_y = tvm.placeholder(shape_y, name="input_data_y", dtype=dtype_y)
    res = floor_mod_compute(input_data_x, input_data_y, y, kernel_name)
    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_data_x, input_data_y, res]
    }
    te.lang.cce.cce_build_code(auto_sch, config)
def softplus_grad(input_gradients, input_features, output_backprops,
                  kernel_name="softplus_grad"):
    """
    Computes softplus gradients for a softplus operation.
    The gradients: "dy * exp(x) / (1 + exp(x))".

    Parameters
    ----------
    input_gradients: dict
        The backpropagated gradients to the corresponding softplus operation.
    input_features: dict
        The input_features passed as input to the corresponding softplus
        operation. source data type support "float16", "float32", "int32",
        "int8", "uint8".
    output_backprops: dict
        data of output.
    kernel_name: str
        kernel name, default value is "softplus_grad".

    Returns
    -------
    None
    """
    shape_dy = input_gradients.get("shape")
    dtype_dy = input_gradients.get("dtype")
    shape_x = input_features.get("shape")
    dtype_x = input_features.get("dtype")

    if dtype_dy.lower() != dtype_x.lower():
        raise RuntimeError("type of dy and type of x must be same, "
                           "while the types are different")
    dtype = dtype_dy

    check_shape(shape_dy, param_name="input_gradients")
    check_shape(shape_x, param_name="input_features")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = dtype.lower()
    check_dtype(input_dtype, check_list, param_name="input_gradients")

    shape_dy, shape_x, shape_max = broadcast_shapes(
        shape_dy, shape_x,
        param_name_input1="input_gradients",
        param_name_input2="input_features")
    reshape_dy, reshape_x = refine_shapes_for_broadcast(shape_dy, shape_x)

    data_dy = tvm.placeholder(reshape_dy, name="data_dy", dtype=input_dtype)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)

    res = softplus_grad_compute(data_dy, data_x, output_backprops,
                                kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_dy, data_x, res]}

    te.lang.cce.cce_build_code(sch, config)
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict.
        shape, dtype of input x
    y : dict.
        shape, dtype of input y
    output : dict.
        shape, dtype of output
    kernel_name : str.
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar, Nz Nz, ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
    algorithm: floordiv
    calculating data's floordiv, res = floor(x / y)

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str, default value is "floor_div"

    Returns
    -------
    None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")
    check_elewise_shape_range([input_x, input_y], support_broadcast=True)
    if input_dtype_x != input_dtype_y:
        error_info = {}
        error_info['errCode'] = OP_ERROR_CODE_018
        error_info['op_name'] = 'floor_div'
        error_info['param_name1'] = 'input_dtype_x'
        error_info['param_name2'] = 'input_dtype_y'
        error_info['param1_dtype'] = str(input_dtype_x)
        error_info['param2_dtype'] = str(input_dtype_y)
        raise RuntimeError(error_info,
                           "In op[%s], the parameter[%s][%s] are not equal in "
                           "dtype with dtype[%s][%s]." % (
                               error_info['op_name'],
                               error_info['param_name1'],
                               error_info['param_name2'],
                               error_info['param1_dtype'],
                               error_info['param2_dtype']))

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (input_x, input_y) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([input_x, input_y],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, input_dtype_x, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, input_dtype_y, "tensor_y")
            res = floor_div_compute(tensor_x, tensor_y, output_z, kernel_name)

            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
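# Hedged sketch of calling the dynamic-shape entry above: unknown dimensions
# are written as -1 together with a "range" hint, which classify() and
# variable_shape() consume to emit one schedule per broadcast pattern. The
# -1/"range" convention follows common TBE dynamic-op usage as I understand
# it, and the concrete shapes are hypothetical; treat the call as an
# assumption, not the library's documented API.
def _demo_floor_div_dynamic():
    input_x = {"shape": (-1, 16), "dtype": "float16",
               "range": [(1, None), (16, 16)]}
    input_y = {"shape": (1, 16), "dtype": "float16",
               "range": [(1, 1), (16, 16)]}
    output_z = {"shape": (-1, 16), "dtype": "float16",
                "range": [(1, None), (16, 16)]}
    floor_div(input_x, input_y, output_z, kernel_name="floor_div_dyn_demo")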
def inv_grad(input_y, input_dy, output_z, kernel_name="inv_grad"):
    """
    algorithm: inv_grad
    calculating data's reciprocal grad, dx = -1 * dy * y * y,
    where `y = 1/x`, and `dy` is the corresponding input gradient.

    Parameters
    ----------
    input_y: dict
        shape and dtype of input_y, only support float16, float32, int32, int8
    input_dy: dict
        shape and dtype of input_dy, should be same shape and type as input_y
    output_z: dict
        shape and dtype of output, should be same shape and type as input_y
    kernel_name: str
        kernel name, default value is "inv_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    shape_input_dy = input_dy.get("shape")
    dtype_input_y = input_y.get("dtype")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")

    shape_input_y = util.shape_refine(shape_input_y)
    shape_input_dy = util.shape_refine(shape_input_dy)

    if list(shape_input_y) != list(shape_input_dy):
        raise RuntimeError("the shape of input must be equal!")

    dtype_input_y = dtype_input_y.lower()
    dtype_input_dy = dtype_input_dy.lower()

    if dtype_input_dy != dtype_input_y:
        raise RuntimeError("the dtype of input must be equal!")

    check_list = ("float16", "float32", "int32", "int8")
    check_dtype(dtype_input_y, check_list, param_name="input_y")

    shape_input_dy, shape_input_y = \
        refine_shapes_for_broadcast(shape_input_dy, shape_input_y)
    data_dy = tvm.placeholder(shape_input_dy,
                              name="data_dy",
                              dtype=dtype_input_dy)
    data_y = tvm.placeholder(shape_input_y,
                             name="data_y",
                             dtype=dtype_input_y)

    res = inv_grad_compute(data_y, data_dy, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name, "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"): """ calculate the backpropagation of leaky_relu operation y = gradients(x>0) or negative_slope*gradients(x<=0). support dtype:float16,float32 Parameters ---------- g : dict the backpropagated gradients to the corresponding leaky_relu operation x : dict the x passed as output of leaky_relu operation y : dict the output of leaky_relu back propagation negative_slope : float or int allow non-zero slope for negative inputs to speed up optimization kernel_name : str kernel name, default value is "leaky_relu_grad" Returns ------- None """ shape_g = g.get("shape") shape_x = x.get("shape") dtype_g = g.get("dtype").lower() dtype_x = x.get("dtype").lower() util.check_kernel_name(kernel_name) util.check_shape_rule(shape_g) util.check_shape_rule(shape_x) util.check_tensor_shape_size(shape_g) util.check_tensor_shape_size(shape_x) shape_list = util.produce_shapes(shape_g, shape_x) util.check_tensor_shape_size(shape_list[2]) # check input tensor data_type check_list = ["float16", "float32"] util.check_dtype_rule(dtype_g, check_list) util.check_dtype_rule(dtype_x, check_list) util.compare_tensor_dict_key(g, x, "dtype") shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0], shape_list[1]) data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g) data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_g) res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope, kernel_name) with tvm.target.cce(): schedule = generic.auto_schedule(res) config = {"name": kernel_name, "tensor_list": [data_g, data_x, res]} te.lang.cce.cce_build_code(schedule, config)
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32
    x2 : dict
        shape and dtype of second input, only support float16, float32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(x1.get("shape"))
    shape_y = util.scalar2tensor_one(x2.get("shape"))
    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("float16", "float32")
    input_data_type = x1.get("dtype").lower()
    check_dtype(input_data_type, check_tuple, param_name="x1")
    input_data_type_x2 = x2.get("dtype").lower()
    check_dtype(input_data_type_x2, check_tuple, param_name="x2")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    if shape_x[-1] == 1 and shape_y[-1] == 1 and shape_max[-1] == 1:
        shape_x = shape_x if len(shape_x) == 1 else shape_x[:-1]
        shape_y = shape_y if len(shape_y) == 1 else shape_y[:-1]
        shape_max = shape_max if len(shape_max) == 1 else shape_max[:-1]

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_data_type)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=input_data_type)

    res = real_div_compute(data_x, data_y, y, kernel_name)
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": (data_x, data_y, res)
    }

    te.lang.cce.cce_build_code(schedule, config)
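# Hedged illustration (hypothetical shapes): with inputs like (4, 3, 1) and
# (1, 3, 1) the branch above drops the shared trailing axis of length 1
# before refinement, so the kernel is effectively built on (4, 3) / (1, 3).
def _demo_real_div():
    x1 = {"shape": (4, 3, 1), "dtype": "float32"}
    x2 = {"shape": (1, 3, 1), "dtype": "float32"}
    y = {"shape": (4, 3, 1), "dtype": "float32"}
    real_div(x1, x2, y, kernel_name="real_div_demo")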
def rsqrt_grad(input_y, input_dy, output_z, kernel_name="rsqrt_grad"):
    """
    calculate the backpropagation of rsqrt operation
    rsqrt: y = 1 / sqrt(x)
    rsqrt_grad: dx = -1/2 * y**3 * dy

    Parameters
    ----------
    input_y: dict
        dict of input_y, include keys(shape and dtype)
    input_dy: dict
        dict of input_dy, include keys(shape and dtype)
    output_z: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "rsqrt_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    dtype_input_y = input_y.get("dtype")
    shape_input_dy = input_dy.get("shape")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "shape")

    check_list = ("float16", "float32", "int32", "int8")
    dtype_input_y = dtype_input_y.lower()
    check_dtype(dtype_input_y, check_list, param_name="input_y")
    dtype_input_dy = dtype_input_dy.lower()
    check_dtype(dtype_input_dy, check_list, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "dtype")

    reshape_y, reshape_dy = refine_shapes_for_broadcast(
        shape_input_y, shape_input_dy)

    data_input_y = tvm.placeholder(reshape_y,
                                   name="data_input_y",
                                   dtype=dtype_input_y)
    data_input_dy = tvm.placeholder(reshape_dy,
                                    name="data_input_dy",
                                    dtype=dtype_input_dy)

    res = rsqrt_grad_compute(data_input_y, data_input_dy, output_z,
                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {
        "name": kernel_name,
        "tensor_list": [data_input_y, data_input_dy, res]
    }
    te.lang.cce.cce_build_code(sch, config)
def sigmoid_cross_entropy_with_logits(
        predict, target, loss,
        kernel_name="sigmoid_cross_entropy_with_logits"):
    """
    calculating the sigmoid cross entropy loss between predict and target

    Parameters
    ----------
    predict : dict
        shape and dtype of predict
    target : dict
        shape and dtype of target
    loss : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sigmoid_cross_entropy_with_logits"

    Returns
    -------
    None
    """
    shape_predict = predict.get("shape")
    dtype_predict = predict.get("dtype")
    input_dtype_predict = dtype_predict.lower()
    check_shape(shape_predict, param_name="predict")

    shape_target = target.get("shape")
    dtype_target = target.get("dtype")
    input_dtype_target = dtype_target.lower()
    check_shape(shape_target, param_name="target")

    check_list = ("float16", "float32")
    check_dtype(input_dtype_predict, check_list, param_name="predict")
    check_dtype(input_dtype_target, check_list, param_name="target")

    shape_predict, shape_target = \
        refine_shapes_for_broadcast(shape_predict, shape_target)
    data_predict = tvm.placeholder(shape_predict,
                                   name="data_predict",
                                   dtype=input_dtype_predict)
    data_target = tvm.placeholder(shape_target,
                                  name="data_target",
                                  dtype=input_dtype_target)
    loss = sigmoid_cross_entropy_with_logits_compute(data_predict,
                                                     data_target,
                                                     loss,
                                                     kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(loss)

    config = {
        "name": kernel_name,
        "tensor_list": [data_predict, data_target, loss]
    }
    te.lang.cce.cce_build_code(sch, config)
def bitwise_xor(x1, x2, y, kernel_name="bitwise_xor"):
    """
    algorithm: bitwise_xor
    calculating: the bitwise xor of x1 and x2

    Parameters
    ----------
    x1 : dict
        the shape and dtype of the tensor x1, only support int16, uint16, int32
    x2 : dict
        the shape and dtype of the tensor x2, only support int16, uint16, int32
    y : dict
        the shape and dtype of the tensor y
    kernel_name : string
        cce kernel name, default value is "bitwise_xor"

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")
    dtype_x = x1.get("dtype").lower()
    dtype_y = x2.get("dtype").lower()

    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    check_tuple = ("int16", "uint16", "int32")
    input_data_type = dtype_x
    check_dtype(input_data_type, check_tuple, param_name="x1")
    if dtype_x != dtype_y:
        raise RuntimeError("two input type must be the same")

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    if input_data_type == "int32":
        input_data_type = "int16"
        shape_x.append(2)
        shape_y.append(2)

    data_x = tvm.placeholder(shape_x, dtype=input_data_type, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=input_data_type, name="data_y")

    result = bitwise_xor_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)
    config = {"name": kernel_name, "tensor_list": [data_x, data_y, result]}
    te.lang.cce.cce_build_code(sch, config)
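# Hedged sketch of the int32 path above: XOR operates on each bit
# independently, so an int32 element can be treated as two int16 lanes
# (hence the trailing axis of 2 appended to the shapes) and XOR-ed lane by
# lane with a bit-identical result. The shapes below are hypothetical.
def _demo_bitwise_xor_int32():
    x1 = {"shape": (8, 128), "dtype": "int32"}
    x2 = {"shape": (8, 128), "dtype": "int32"}
    y = {"shape": (8, 128), "dtype": "int32"}
    bitwise_xor(x1, x2, y, kernel_name="bitwise_xor_int32_demo")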
def equal(input_x, input_y, output_z, kernel_name="equal"):
    """
    Returns the truth value of (x == y) element-wise

    Parameters
    ----------
    input_x: dict
        dict of input_x, include keys(shape and dtype)
    input_y: dict
        dict of input_y, include keys(shape and dtype)
    output_z: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "equal"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype")
    shape_y = input_y.get("shape")
    dtype_y = input_y.get("dtype")

    shape_x, shape_y, shape_broadcast = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")

    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    dtype_x = dtype_x.lower()
    check_dtype(dtype_x, check_list, param_name="input_x")
    dtype_y = dtype_y.lower()
    check_dtype(dtype_y, check_list, param_name="input_y")
    util.compare_tensor_dict_key(input_x, input_y, "dtype")

    shape_x = list(shape_x)
    shape_y = list(shape_y)
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    data_input_x = tvm.placeholder(shape_x, name="data_input_x", dtype=dtype_x)
    data_input_y = tvm.placeholder(shape_y, name="data_input_y", dtype=dtype_y)

    res = equal_compute(data_input_x, data_input_y, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [data_input_x, data_input_y, res]
    }
    te.lang.cce.cce_build_code(sch, config)
def xlogy(input_x, input_y, output_z, kernel_name="xlogy"): """ algorithm: xlogy calculating data's xlogy, res = 0 if x == 0 else x*log(y) Parameters ---------- input_x: dict dict of input_x, include keys(shape and dtype) input_y: dict dict of input_y, include keys(shape and dtype) output_z: dict dict info of output_z kernel_name: str kernel name, default value is "xlogy" Returns ------- None """ shape_x = input_x.get("shape") shape_y = input_y.get("shape") dtype = input_x.get("dtype") dtype_y = input_y.get("dtype") util.compare_tensor_dict_key(input_x, input_y, "dtype") check_shape(shape_x, param_name="input_x") check_shape(shape_y, param_name="input_y") input_dtype = dtype.lower() input_dtype_y = dtype_y.lower() check_list = ("float16", "float32") check_dtype(input_dtype, check_list, param_name="input_x") check_dtype(input_dtype_y, check_list, param_name="input_y") shape_list = broadcast_shapes(shape_x, shape_y, param_name_input1="input_x", param_name_input2="input_y") shape_x, shape_y = refine_shapes_for_broadcast(shape_list[0], shape_list[1]) data1 = tvm.placeholder(shape_x, name="data1", dtype=input_dtype) data2 = tvm.placeholder(shape_y, name="data2", dtype=input_dtype) res = xlogy_compute(data1, data2, output_z, kernel_name) with tvm.target.cce(): sch = generic.auto_schedule(res) config = { "name": kernel_name, "tensor_list": [data1, data2, res], "bool_storage_as_1bit": False } te.lang.cce.cce_build_code(sch, config)
def atan2(x1, x2, y, kernel_name="atan2"): """ Algorithm: arctan2 arctan2(y, x) = arctan(y/x) ---------------------------------- Parameters: x1: the dict of input data x1, only support float16, float32. x2: the dict of input data x2, only support float16, float32. y: the dict of output kernel_name: default value is "atan2". ---------------------------------- Returns: None """ y_shape = x1.get("shape") x_shape = x2.get("shape") y_dtype = x1.get("dtype") x_dtype = x2.get("dtype") check_shape(y_shape, param_name="x1") check_shape(x_shape, param_name="x2") shape_y, shape_x, shape_max = broadcast_shapes( y_shape, x_shape, param_name_input1="x1", param_name_input2="x2") check_list = ("float16", "float32") check_dtype(y_dtype, check_list, param_name="x1") check_dtype(x_dtype, check_list, param_name="x2") if y_dtype.lower() != x_dtype.lower(): raise RuntimeError("The input tensor must have identical dtype!") shape_y, shape_x = refine_shapes_for_broadcast(shape_y, shape_x) input_y = tvm.placeholder(shape_y, dtype=y_dtype.lower(), name="input_y") input_x = tvm.placeholder(shape_x, dtype=x_dtype.lower(), name="input_x") res = atan2_compute(input_y, input_x, y, kernel_name) res = te.lang.cce.cast_to(res, x_dtype.lower()) with tvm.target.cce(): auto_sch = topi.generic.auto_schedule(res) config = { "name": kernel_name, "tensor_list": (input_y, input_x, res), "print_ir": False, "bool_storage_as_1bit": False } te.lang.cce.cce_build_code(auto_sch, config)
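# Hedged usage sketch: atan2 broadcasts x1 (the "y" coordinate) against x2
# (the "x" coordinate) and casts the result back to the input dtype. The
# shapes and dtype below are hypothetical illustration values.
def _demo_atan2():
    x1 = {"shape": (32, 1), "dtype": "float32"}
    x2 = {"shape": (1, 64), "dtype": "float32"}
    y = {"shape": (32, 64), "dtype": "float32"}
    atan2(x1, x2, y, kernel_name="atan2_demo")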
def logical_and(x1, x2, y, kernel_name="logical_and"):
    """
    calculating the element-wise logical and of x1 and x2

    Parameters
    ----------
    x1 : dict
        shape and dtype of input, only support int8
    x2 : dict
        shape and dtype of input, only support int8
    y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "logical_and"

    Returns
    -------
    None
    """
    shape_x = x1.get("shape")
    shape_y = x2.get("shape")
    dtype_x = x1.get("dtype")
    dtype_y = x2.get("dtype")

    check_shape(shape_x, param_name="x1")
    check_shape(shape_y, param_name="x2")

    if dtype_x != dtype_y:
        raise RuntimeError("The type of input must be the same")

    input_data_type = dtype_x.lower()
    check_tuple = ("int8", )
    check_dtype(input_data_type, check_tuple, param_name="x1")

    shape_x, shape_y, _ = broadcast_shapes(shape_x, shape_y,
                                           param_name_input1="x1",
                                           param_name_input2="x2")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=dtype_x, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype_y, name="data_y")

    res = logical_and_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(sch, config)
def less(input_x, input_y, output_z, kernel_name="less"):
    """
    do element-wise less operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of first input, support float16, float32, int32,
        int8, uint8
    input_y : dict
        shape and dtype of second input, support float16, float32, int32,
        int8, uint8
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is less

    Returns
    -------
    None
    """
    shape_x = util.scalar2tensor_one(input_x.get("shape"))
    shape_y = util.scalar2tensor_one(input_y.get("shape"))
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(shape_x, dtype=input_dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=input_dtype, name="data_y")

    res = less_compute(data_x, data_y, output_z, kernel_name="less")

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": [data_x, data_y, res]
    }
    te.lang.cce.cce_build_code(sch, config)
def pow(input_x, input_y, output_z, kernel_name="pow"):
    """
    algorithm: pow
    calculating data pow, res = x ** y

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "pow"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")
    if len(shape_x) == 0:
        shape_x = (1,)
    if len(shape_y) == 0:
        shape_y = (1,)
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")
    shape_list = broadcast_shapes(shape_x, shape_y,
                                  param_name_input1="input_x",
                                  param_name_input2="input_y")

    input_x_dtype = input_x.get("dtype").lower()
    input_y_dtype = input_y.get("dtype").lower()
    if input_x_dtype != input_y_dtype:
        raise RuntimeError("Dtype of input_x and input_y must be the same.")
    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(input_x_dtype, check_list, param_name="input_x")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_x = tvm.placeholder(shape_x, dtype=input_x_dtype, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=input_y_dtype, name="data_y")

    res = pow_compute(data_x, data_y, output_z, kernel_name="pow")
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x, data_y, res],
              "bool_storage_as_1bit": False}
    te.lang.cce.cce_build_code(sch, config)
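# Hedged usage sketch: pow accepts rank-0 ("scalar") inputs by promoting an
# empty shape to (1,) before the broadcast, as the guard above shows. The
# shapes and dtype here are hypothetical illustration values.
def _demo_pow():
    input_x = {"shape": (16, 16), "dtype": "float32"}
    input_y = {"shape": (), "dtype": "float32"}  # rank-0, promoted to (1,)
    output_z = {"shape": (16, 16), "dtype": "float32"}
    pow(input_x, input_y, output_z, kernel_name="pow_demo")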
def floor_div(input_x, input_y, output_z, kernel_name="floor_div"):
    """
    algorithm: floordiv
    calculating data's floordiv, res = floor(x / y)

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "floor_div"

    Returns
    -------
    None
    """
    # check dtype of input_x/input_y
    input_dtype_x = input_x.get("dtype").lower()
    input_dtype_y = input_y.get("dtype").lower()
    check_list = ('int8', 'uint8', 'int32', 'float16', 'float32')
    check_dtype(input_dtype_x, check_list, param_name="input_x")
    if input_dtype_x != input_dtype_y:
        raise RuntimeError("The dtype of input_x and input_y must be the same")

    # check shape of input_x/input_y
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")
    shape_list = broadcast_shapes(shape_x, shape_y,
                                  param_name_input1="input_x",
                                  param_name_input2="input_y")

    # compute result for floordiv() with floor_div_compute()
    shape_x, shape_y = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_x = tvm.placeholder(shape_x, dtype=input_dtype_x, name='data_x')
    data_y = tvm.placeholder(shape_y, dtype=input_dtype_y, name='data_y')
    res = floor_div_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def mod(input_x, input_y, output_z, kernel_name="mod"): """ Returns element-wise remainder of division. Parameters ---------- input_x: dict input tensor contains shape and dtype attributes. source data type support "float16", "float32", "int8", "uint8", "int32". input_y: dict input tensor contains shape and dtype attributes. Must have the same type as 'input_x'. output_z: dict data of output. Must have the same type as 'input_x'. kernel_name: str kernel name, default value is "mod" Returns: None """ shape_x = input_x.get("shape") shape_y = input_y.get("shape") util.compare_tensor_dict_key(input_x, input_y, "dtype") check_shape(shape_x, param_name="input_x") check_shape(shape_y, param_name="input_y") check_list = ("float16", "float32", "int8", "uint8", "int32") input_dtype = input_x.get("dtype").lower() check_dtype(input_dtype, check_list, param_name="input_x") shape_x, shape_y, shape_broadcast = broadcast_shapes( shape_x, shape_y, param_name_input1="input_x", param_name_input2="input_y") reshape_x, reshape_y = refine_shapes_for_broadcast(shape_x, shape_y) data_x = tvm.placeholder(reshape_x, dtype=input_dtype, name="data_x") data_y = tvm.placeholder(reshape_y, dtype=input_dtype, name="data_y") res = mod_compute(data_x, data_y, output_z, kernel_name="mod") with tvm.target.cce(): sch = generic.auto_schedule(res) config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]} te.lang.cce.cce_build_code(sch, config)
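# Hedged usage sketch with integer inputs (hypothetical shapes): mod keeps
# the common dtype of both operands and broadcasts them before the compute.
def _demo_mod():
    input_x = {"shape": (8, 16), "dtype": "int32"}
    input_y = {"shape": (8, 1), "dtype": "int32"}  # broadcast along axis 1
    output_z = {"shape": (8, 16), "dtype": "int32"}
    mod(input_x, input_y, output_z, kernel_name="mod_demo")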
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"): """ calculate the backpropagation of leaky_relu operation y = gradients(x>0) or negative_slope*gradients(x<=0). support dtype:float16,float32 Parameters ---------- g : dict the backpropagated gradients to the corresponding leaky_relu operation x : dict the x passed as output of leaky_relu operation y : dict the output of leaky_relu back propagation negative_slope : float or int allow non-zero slope for negative inputs to speed up optimization kernel_name : str kernel name, default value is "leaky_relu_grad" Returns ------- None """ g_dtype = g.get("dtype").lower() x_dtype = x.get("dtype").lower() check_list = ("float16", "float32") check_dtype(g_dtype, check_list, param_name="input_g") check_dtype(x_dtype, check_list, param_name="input_x") check_elewise_shape_range([g, x], support_broadcast=True) if g_dtype != x_dtype: error_manager_vector.raise_err_inputs_dtype_not_equal( kernel_name, "g", "x", g_dtype, x_dtype) ins = classify([g, x], Mode.ELEWISE_WITH_BROADCAST) schedules, tensors = [], [] for (g, x) in ins: with te.op.compute(): g_shape, x_shape = variable_shape([g, x], support_broadcast=True) g_shape, x_shape = refine_shapes_for_broadcast(g_shape, x_shape) tensor_g = tvm.placeholder(g_shape, g_dtype, "tensor_g") tensor_x = tvm.placeholder(x_shape, x_dtype, "tensor_x") res = leaky_relu_grad_compute(tensor_g, tensor_x, y, negative_slope, kernel_name) tensors.append((tensor_g, tensor_x, res)) with tvm.target.cce(): sch = generic.auto_schedule(res) schedules.append(sch) config = {"name": kernel_name, "tensor_list": tensors} te.lang.dynamic.build(schedules, config)
def xdivy(input_x, input_y, output_z, kernel_name="xdivy"):
    """
    algorithm: xdivy
    calculating data's xdivy, res = 0 if x == 0 else x / y, element-wise

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name : str
        kernel name, default value is "xdivy"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    dtype = input_x.get("dtype")
    shape_y = input_y.get("shape")
    dtype_y = input_y.get("dtype")

    util.compare_tensor_dict_key(input_x, input_y, "dtype")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    shape_list = broadcast_shapes(shape_x, shape_y,
                                  param_name_input1="input_x",
                                  param_name_input2="input_y")

    input_dtype = dtype.lower()
    input_dtype_y = dtype_y.lower()
    check_list = ("float16", "float32")
    check_dtype(input_dtype, check_list, param_name="input_x")
    check_dtype(input_dtype_y, check_list, param_name="input_y")

    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_list[0],
                                                       shape_list[1])
    data_x = tvm.placeholder(reshape_x, dtype=input_dtype, name="data_x")
    data_y = tvm.placeholder(reshape_y, dtype=input_dtype, name="data_y")

    res = xdivy_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def truncate_div(input_x, input_y, output_x, kernel_name="truncate_div"):
    """
    algorithm: truncate_div
    calculating data's truncate_div, res = floor(x / y) if x / y > 0
    else ceil(x / y)

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype); only support
        {float16, float32, int8, uint8 (on mini)},
        {float16, float32 (on cloud)}
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_x: dict
        dict with keys(shape and dtype) of output_x
    kernel_name: str
        kernel name, default value is "truncate_div"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")
    dtype = input_x.get("dtype")

    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    input_dtype = dtype.lower()
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(input_dtype, check_list, param_name="input_x")

    shape_list = broadcast_shapes(shape_x, shape_y,
                                  param_name_input1="input_x",
                                  param_name_input2="input_y")
    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_list[0],
                                                       shape_list[1])
    data1 = tvm.placeholder(reshape_x, dtype=input_dtype, name="data1")
    data2 = tvm.placeholder(reshape_y, dtype=input_dtype, name="data2")
    res = truncate_div_compute(data1, data2, output_x, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data1, data2, res]}
    te.lang.cce.cce_build_code(sch, config)
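# Hedged usage sketch: truncate_div rounds the quotient toward zero, i.e.
# floor(x / y) for positive quotients and ceil(x / y) for negative ones.
# The int8 inputs below are hypothetical illustration values.
def _demo_truncate_div():
    input_x = {"shape": (64,), "dtype": "int8"}
    input_y = {"shape": (64,), "dtype": "int8"}
    output_x = {"shape": (64,), "dtype": "int8"}
    truncate_div(input_x, input_y, output_x, kernel_name="truncate_div_demo")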
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    algorithm: real_div
    calculating data's real_div, c = a / b

    Parameters
    ----------
    x1 : dict
        shape and dtype of first input, only support float16, float32
    x2 : dict
        shape and dtype of second input, only support float16, float32
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name : str
        cce kernel name, default value is real_div

    Returns
    -------
    None
    """
    x_dtype = x1.get("dtype").lower()
    y_dtype = x2.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(x_dtype, check_list, param_name="x1")
    check_dtype(y_dtype, check_list, param_name="x2")
    check_elewise_shape_range([x1, x2], support_broadcast=True)
    if x_dtype != y_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "x1", "x2", x_dtype, y_dtype)
    ins = classify([x1, x2], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (x1, x2) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([x1, x2],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            tensor_x = tvm.placeholder(x_shape, x_dtype, "tensor_x")
            tensor_y = tvm.placeholder(y_shape, y_dtype, "tensor_y")

            res = real_div_compute(tensor_x, tensor_y, y, kernel_name)
            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    # build
    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
def truncate_mod(input_x, input_y, output_z, kernel_name="truncate_mod"):
    """
    algorithm: truncate_mod
    calculating data's truncate_mod, res = x - truncate(x / y) * y

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "truncate_mod"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    dtype_x = input_x.get("dtype").lower()
    shape_y = input_y.get("shape")
    dtype_y = input_y.get("dtype").lower()

    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")
    shape_list = broadcast_shapes(shape_x, shape_y,
                                  param_name_input1="input_x",
                                  param_name_input2="input_y")

    check_list = ("float16", "float32", "int8", "uint8", "int32")
    check_dtype(dtype_x, check_list, param_name="input_x")
    check_dtype(dtype_y, check_list, param_name="input_y")

    shape_x, shape_y = refine_shapes_for_broadcast(shape_list[0],
                                                   shape_list[1])
    data_x = tvm.placeholder(shape_x, dtype=dtype_x, name="data_x")
    data_y = tvm.placeholder(shape_y, dtype=dtype_y, name="data_y")
    res = truncate_mod_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def sigmoid_grad(x, dx, out, kernel_name="sigmoid_grad"): """ do sigmoid grad sigmoid_grad = (sigmoid - sigmoid*sigmoid)*grad Parameters: ---------- x : dictionary shape of sigmoid input dx : dictionary shape of grad out: dictionary output kernel_name : cce kernel name, default value is "sigmoid_grad_cce" Returns ------- None """ x_dtype = x.get("dtype").lower() dx_dtype = dx.get("dtype").lower() check_list = ("float16", "float32") check_dtype(x_dtype, check_list, param_name="input_x") check_dtype(dx_dtype, check_list, param_name="input_dx") check_elewise_shape_range([x, dx], support_broadcast=False) if x_dtype != dx_dtype: error_manager_vector.raise_err_inputs_dtype_not_equal( kernel_name, "x", "dx", x_dtype, dx_dtype) ins = classify([x, dx], Mode.ELEWISE) schedules, tensors = [], [] for (sig, dx) in ins: with te.op.compute(): shape_sig, shape_dx = variable_shape([sig, dx], support_broadcast=False) shape_sig, shape_dx = refine_shapes_for_broadcast( shape_sig, shape_dx) tensor_sig = tvm.placeholder(shape_sig, x_dtype, "tensor_x") tensor_dx = tvm.placeholder(shape_dx, dx_dtype, "tensor_dx") res = sigmoid_grad_compute(tensor_sig, tensor_dx, out, kernel_name) tensors.append([tensor_sig, tensor_dx, res]) with tvm.target.cce(): sch = generic.auto_schedule(res) schedules.append(sch) config = {"name": kernel_name, "tensor_list": tensors} te.lang.dynamic.build(schedules, config)
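# Hedged sketch of the pure elementwise dynamic pattern above: Mode.ELEWISE
# requires both inputs to share one shape (no broadcast), so a single -1
# dimension with matching "range" hints is given to both tensors. Both the
# -1/"range" convention and the shapes are assumptions for illustration.
def _demo_sigmoid_grad_dynamic():
    x = {"shape": (-1, 128), "dtype": "float16",
         "range": [(1, None), (128, 128)]}
    dx = {"shape": (-1, 128), "dtype": "float16",
          "range": [(1, None), (128, 128)]}
    out = {"shape": (-1, 128), "dtype": "float16",
           "range": [(1, None), (128, 128)]}
    sigmoid_grad(x, dx, out, kernel_name="sigmoid_grad_dyn_demo")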
def greater(x, y, z, kernel_name="greater"):
    """
    do element-wise greater operation between two input tensors

    Parameters:
    ----------
    x : dict
        shape and dtype of input data_x
    y : dict
        shape and dtype of input data_y
    z : dict
        shape and dtype of output data_z
    kernel_name : str
        cce kernel name, default value is "greater"

    Returns
    -------
    None
    """
    shape_input_x = util.scalar2tensor_one(x.get("shape"))
    dtype_input_x = x.get("dtype").lower()
    shape_input_y = util.scalar2tensor_one(y.get("shape"))
    dtype_input_y = y.get("dtype").lower()

    check_shape(shape_input_x, param_name="x")
    check_shape(shape_input_y, param_name="y")

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(dtype_input_x, check_list, param_name="x")
    # check the second input as well, since its dtype is used below
    check_dtype(dtype_input_y, check_list, param_name="y")

    shape_list = broadcast_shapes(shape_input_x, shape_input_y,
                                  param_name_input1="x",
                                  param_name_input2="y")

    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_list[0],
                                                       shape_list[1])
    data_x = tvm.placeholder(reshape_x, dtype=dtype_input_x, name="data_x")
    data_y = tvm.placeholder(reshape_y, dtype=dtype_input_y, name="data_y")

    res = greater_compute(data_x, data_y, z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def sqrt_grad(x, dx, out, kernel_name="sqrt_grad"):
    """
    algorithm: sqrt_grad_cce

    Parameters
    ----------
    x : dict
        dict of input data
    dx : dict
        dict of input grad
    out : dict
        dict of output
    kernel_name : str
        cce kernel name, default value is "sqrt_grad"

    Returns
    -------
    None
    """
    x_dtype = x.get("dtype").lower()
    dx_dtype = dx.get("dtype").lower()
    check_list = ("float16", "float32")
    check_dtype(x_dtype, check_list, param_name="x")
    check_dtype(dx_dtype, check_list, param_name="dx")
    check_elewise_shape_range([x, dx], support_broadcast=False)
    if x_dtype != dx_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "x", "dx", x_dtype, dx_dtype)
    ins = classify([x, dx], Mode.ELEWISE)
    schedules, tensors = [], []
    for (x, dx) in ins:
        with te.op.compute():
            x_shape, dx_shape = variable_shape([x, dx],
                                               support_broadcast=False)
            x_shape, dx_shape = refine_shapes_for_broadcast(x_shape, dx_shape)
            tensor_x = tvm.placeholder(x_shape, x_dtype, "tensor_x")
            tensor_dx = tvm.placeholder(dx_shape, dx_dtype, "tensor_dx")
            res = sqrt_grad_compute(tensor_x, tensor_dx, out, kernel_name)
            tensors.append([tensor_x, tensor_dx, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
def div_no_nan(input_x, input_y, output_z, kernel_name="div_no_nan"):
    """
    algorithm: div_no_nan_cce
    Returns 0 if the denominator is zero, else, like Div.
    Supports broadcasting.

    Parameters
    ----------
    input_x: dict
        dict with keys(shape and dtype) of input_x
    input_y: dict
        dict with keys(shape and dtype) of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        cce kernel name, default value is "div_no_nan"

    Returns
    -------
    None
    """
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")
    dtype = input_x.get("dtype")

    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    shape_x, shape_y, shape_max = broadcast_shapes(
        shape_x, shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")
    input_dtype = dtype.lower()
    check_dtype(input_dtype,
                ("float16", "float32", "int32", "int8", "uint8"),
                param_name="input_x")

    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_x, shape_y)
    data_x = tvm.placeholder(reshape_x, name="data_x", dtype=input_dtype)
    data_y = tvm.placeholder(reshape_y, name="data_y", dtype=input_dtype)
    res = div_no_nan_compute(data_x, data_y, output_z, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def sub(input_x, input_y, output_z, kernel_name="sub"):
    """
    do element-wise sub operation between two input tensors

    Parameters:
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    input_y : dict
        shape and dtype of input, only support float16, float32, int32
    output_z: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sub"

    Returns
    -------
    None
    """
    check_list = ["float16", "float32", "int32"]
    x_dtype = input_x.get("dtype").lower()
    y_dtype = input_y.get("dtype").lower()
    if x_dtype not in check_list or y_dtype not in check_list:
        error_detail = "sub only support float16, float32, int32"
        error_manager_vector.raise_err_two_input_dtype_invalid(
            kernel_name, "input_x", "input_y", error_detail)

    ins = classify([input_x, input_y], Mode.ELEWISE_WITH_BROADCAST)
    schedules, tensors = [], []
    for (x1, x2) in ins:
        with te.op.compute():
            x_shape, y_shape = variable_shape([x1, x2],
                                              support_broadcast=True)
            x_shape, y_shape = refine_shapes_for_broadcast(x_shape, y_shape)
            data1 = tvm.placeholder(x_shape, x_dtype, "data1")
            data2 = tvm.placeholder(y_shape, y_dtype, "data2")
            res = sub_compute(data1, data2, output_z, kernel_name)
            tensors.append([data1, data2, res])
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        schedules.append(sch)

    # build
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensors}
    te.lang.dynamic.build(schedules, config)
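# Hedged usage sketch for the dynamic broadcast entry above; the fully
# dynamic first axis and the "range" hints are hypothetical illustration
# values following common TBE dynamic-op usage, not a documented contract.
def _demo_sub_dynamic():
    input_x = {"shape": (-1, 32), "dtype": "float32",
               "range": [(1, None), (32, 32)]}
    input_y = {"shape": (1, 32), "dtype": "float32",
               "range": [(1, 1), (32, 32)]}
    output_z = {"shape": (-1, 32), "dtype": "float32",
                "range": [(1, None), (32, 32)]}
    sub(input_x, input_y, output_z, kernel_name="sub_dyn_demo")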
def bitwise_and(x1, x2, y, kernel_name="bitwise_and"):
    """
    algorithm: bitwise_and
    computes the bitwise and of `x1` and `x2`

    Parameters
    ----------
    x1 : dict
        the shape and dtype of the tensor x1, only support int16, uint16, int32
    x2 : dict
        the shape and dtype of the tensor x2, only support int16, uint16, int32
    y : dict
        the shape and dtype of the tensor y, only support int16, uint16, int32
    kernel_name : string
        cce kernel name, default value is "bitwise_and"

    Returns
    -------
    None
    """
    shape_x, shape_y, dtype = _check_parameters(x1, x2, y, kernel_name)

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="x1",
                                                   param_name_input2="x2")
    shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    if dtype == "int32":
        dtype = "int16"
        shape_x.append(2)
        shape_y.append(2)

    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype)
    data_y = tvm.placeholder(shape_y, name="data_y", dtype=dtype)

    res = bitwise_and_compute(data_x, data_y, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (data_x, data_y, res)}
    te.lang.cce.cce_build_code(schedule, config)