def bn_infer_grad(grads, scale, batch_variance, x_backprop,
                  epsilon=0.0001, kernel_name="bn_infer_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    Entry point of bn_infer_grad: validates the input descriptors, wires
    placeholders into bn_infer_grad_compute and builds the kernel.

    Parameters
    ----------
    grads: dict
        descriptor of the input gradients; "shape" and "dtype" are read,
        dtype is checked against ("float32", "float16").
    scale: dict
        descriptor of scale; "shape" and "dtype" are read,
        dtype is checked against ("float32",).
    batch_variance: dict
        descriptor of batch_variance; dtype is checked against
        ("float32",) and its "shape" entry is compared with scale's.
    x_backprop: dict
        descriptor of the output, handed to the compute function unchanged.
    epsilon: float
        handed to the compute function unchanged. Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_infer_grad"

    Returns
    -------
    None
    """
    grads_shape = grads.get("shape")
    scale_shape = scale.get("shape")
    variance_shape = batch_variance.get("shape")

    grads_dtype = grads.get("dtype").lower()
    scale_dtype = scale.get("dtype").lower()
    variance_dtype = batch_variance.get("dtype").lower()

    # (dtype, supported dtypes, parameter name), validated in this order
    dtype_rules = (
        (grads_dtype, ("float32", "float16"), "grads"),
        (scale_dtype, ("float32",), "scale"),
        (variance_dtype, ("float32",), "batch_variance"),
    )
    for dtype, supported, arg_name in dtype_rules:
        check_dtype(dtype, supported, param_name=arg_name)

    _check_shape(grads_shape, variance_shape)
    util.compare_tensor_dict_key(scale, batch_variance, "shape")

    grads_tensor = tvm.placeholder(grads_shape, name="grads_input",
                                   dtype=grads_dtype)
    # NOTE(review): the scale placeholder is named "x_input"; kept verbatim
    # since the name is part of what gets built.
    scale_tensor = tvm.placeholder(scale_shape, name="x_input",
                                   dtype=scale_dtype)
    variance_tensor = tvm.placeholder(variance_shape,
                                      name="batch_variance_input",
                                      dtype=variance_dtype)

    result = bn_infer_grad_compute(grads_tensor, scale_tensor,
                                   variance_tensor, x_backprop, epsilon,
                                   kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [grads_tensor, scale_tensor, variance_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    Build the kernel for the backpropagation of leaky_relu:
    y = gradients(x>0) or negative_slope*gradients(x<=0).
    Dtypes are checked against float16 and float32.

    Parameters
    ----------
    g : dict
        descriptor of the backpropagated gradients ("shape", "dtype")
    x : dict
        descriptor of x ("shape", "dtype"); its "dtype" entry is compared
        with the one of g
    y : dict
        descriptor of the output, handed to the compute function unchanged
    negative_slope : float or int
        slope for negative inputs, handed to the compute function
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """
    g_shape = g.get("shape")
    x_shape = x.get("shape")
    g_dtype = g.get("dtype").lower()
    x_dtype = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    # rule check on both shapes first, then the size check on both
    for shape_check in (util.check_shape_rule, util.check_tensor_shape_size):
        for shape in (g_shape, x_shape):
            shape_check(shape)

    produced = util.produce_shapes(g_shape, x_shape)
    # the third entry of the produced shapes gets a size check as well
    util.check_tensor_shape_size(produced[2])

    # check input tensor data_type
    supported = ["float16", "float32"]
    for dtype in (g_dtype, x_dtype):
        util.check_dtype_rule(dtype, supported)
    util.compare_tensor_dict_key(g, x, "dtype")

    g_shape, x_shape = refine_shapes_for_broadcast(produced[0], produced[1])

    grad_tensor = tvm.placeholder(g_shape, name="data_g", dtype=g_dtype)
    # both placeholders are created with the dtype of g
    x_tensor = tvm.placeholder(x_shape, name="data_x", dtype=g_dtype)
    result = leaky_relu_grad_compute(grad_tensor, x_tensor, y,
                                     negative_slope, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    te.lang.cce.cce_build_code(
        sch,
        {"name": kernel_name, "tensor_list": [grad_tensor, x_tensor, result]})
def rsqrt_grad(input_y, input_dy, output_z, kernel_name="rsqrt_grad"):
    """
    Build the kernel for the backpropagation of rsqrt
    rsqrt: y = 1 / sqrt(x)
    rsqrt_grad: -1/2 * y**3 *dy

    Parameters
    ----------
    input_y: dict
        descriptor of input_y, "shape" and "dtype" are read
    input_dy: dict
        descriptor of input_dy, "shape" and "dtype" are read; both entries
        are compared with the ones of input_y
    output_z: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name: str
        cce kernel name, default value is "rsqrt_grad"

    Returns
    -------
    None
    """
    y_shape = input_y.get("shape")
    y_dtype = input_y.get("dtype")
    dy_shape = input_dy.get("shape")
    dy_dtype = input_dy.get("dtype")

    check_shape(y_shape, param_name="input_y")
    check_shape(dy_shape, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "shape")

    supported = ("float16", "float32", "int32", "int8")
    y_dtype = y_dtype.lower()
    check_dtype(y_dtype, supported, param_name="input_y")
    dy_dtype = dy_dtype.lower()
    check_dtype(dy_dtype, supported, param_name="input_dy")
    util.compare_tensor_dict_key(input_y, input_dy, "dtype")

    y_shape, dy_shape = refine_shapes_for_broadcast(y_shape, dy_shape)

    y_tensor = tvm.placeholder(y_shape, name="data_input_y", dtype=y_dtype)
    dy_tensor = tvm.placeholder(dy_shape, name="data_input_dy",
                                dtype=dy_dtype)

    result = rsqrt_grad_compute(y_tensor, dy_tensor, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [y_tensor, dy_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel,
                        kernel_name="correction_mul_grad"):
    """
    CorrectionMulGrad op: validate the descriptors and build the kernel.

    Parameters
    ----------
    dout: dict
        descriptor of the incoming gradient; "shape", "dtype", "format"
        and "ori_format" are read
    x: dict
        descriptor of x; "shape" and "dtype" are read and compared with dout
    batch_std: dict
        descriptor of batch_std; "dtype", "shape" and "ori_shape" are read
    running_std: dict
        descriptor of running_std; "dtype" is read, "shape" is compared
        with batch_std
    dx: dict
        descriptor of output dx; only its "shape" entry is compared with x
    mul_dx: dict
        descriptor of output mul_dx; only its "shape" entry is compared
        with dx
    channel: int
        index of the channel axis inside the shape of x
    kernel_name: str
        kernel name, default value is "correction_mul_grad"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        if the format of dout is neither "NC1HWC0" nor "NCHW", or if the
        format is "NCHW" while the original format is not "NCHW".
    """
    shape_dout = dout.get("shape")
    # x's shape is taken from x itself (it used to be read from dout); the
    # two are compared below, so a mismatch is still reported there.
    shape_x = x.get("shape")

    inp_dtype_dout = dout.get("dtype").lower()
    inp_dtype_x = x.get("dtype").lower()
    inp_dtype_batch_std = batch_std.get("dtype").lower()
    inp_dtype_running_std = running_std.get("dtype").lower()

    supported = ("float16", "float32")
    util.check_dtype_rule(inp_dtype_dout, supported)
    util.check_dtype_rule(inp_dtype_x, supported)
    util.check_dtype_rule(inp_dtype_batch_std, supported)
    util.check_dtype_rule(inp_dtype_running_std, supported)

    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(dx, mul_dx, "shape")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    # The original format lives under "ori_format". Reading "format" twice
    # made the NCHW consistency check below impossible to trigger.
    ori_format = dout.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Un supported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must same as ori_format")

    # all-ones shape except on the channel axis
    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        # for NC1HWC0 with channel axis 1, batch_std's own shape is used
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std",
                                  dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std",
                                    dtype=inp_dtype_running_std)

    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t,
                                           running_std_t, channel,
                                           data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
def lp_loss(predict, label, y, p, reduction="mean", kernel_name="lp_loss"):
    """
    Validate the inputs of lp_loss and build the kernel.

    :param predict: dict
        descriptor of the prediction, "shape" and "dtype" are read
    :param label: dict
        descriptor of the label; its "shape" and "dtype" entries are
        compared with the ones of predict
    :param y: dict
        descriptor of the output (not read by this function)
    :param p: int
        selects the loss to compute; only 1 (l1_loss) is accepted
    :param reduction: str
        reduce mode, one of 'none', 'mean', 'sum'
    :param kernel_name:
        kernel name, default value is "lp_loss"
    :return: None
    :raises RuntimeError: if p is not 1 or reduction is not supported
    """
    pred_shape = predict.get("shape")
    pred_dtype = predict.get("dtype").lower()
    lbl_shape = label.get("shape")
    lbl_dtype = label.get("dtype").lower()

    supported_dtypes = ["float16", "float32"]
    supported_reductions = ["none", "mean", "sum"]

    for dtype in (pred_dtype, lbl_dtype):
        op_utils.check_dtype(dtype, supported_dtypes)
    for shape in (pred_shape, lbl_shape):
        op_utils.check_shape(shape)
    for key in ("shape", "dtype"):
        util.compare_tensor_dict_key(predict, label, key)

    if p != 1:
        raise RuntimeError("lp_loss only supports l1_loss")
    if reduction not in supported_reductions:
        raise RuntimeError("reduction should be one of ['none','mean','sum']")

    pred_tensor = tvm.placeholder(pred_shape, dtype=pred_dtype,
                                  name="predict_data")
    lbl_tensor = tvm.placeholder(lbl_shape, dtype=lbl_dtype,
                                 name="label_data")

    result = lp_loss_compute(pred_tensor, lbl_tensor, p, reduction,
                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [pred_tensor, lbl_tensor, result],
    }
    te.lang.cce.cce_build_code(sch, build_config)
def xlogy(input_x, input_y, output_z, kernel_name="xlogy"):
    """
    algorithm: xlogy
    Build the kernel computing res = 0 if x == 0 else x*log(y)

    Parameters
    ----------
    input_x: dict
        descriptor of input_x, "shape" and "dtype" are read
    input_y: dict
        descriptor of input_y, "shape" and "dtype" are read; its "dtype"
        entry is compared with the one of input_x
    output_z: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name: str
        kernel name, default value is "xlogy"

    Returns
    -------
    None
    """
    x_shape = input_x.get("shape")
    y_shape = input_y.get("shape")
    raw_dtype_x = input_x.get("dtype")
    raw_dtype_y = input_y.get("dtype")

    util.compare_tensor_dict_key(input_x, input_y, "dtype")
    check_shape(x_shape, param_name="input_x")
    check_shape(y_shape, param_name="input_y")

    x_dtype = raw_dtype_x.lower()
    y_dtype = raw_dtype_y.lower()
    supported = ("float16", "float32")
    check_dtype(x_dtype, supported, param_name="input_x")
    check_dtype(y_dtype, supported, param_name="input_y")

    broadcast = broadcast_shapes(x_shape, y_shape,
                                 param_name_input1="input_x",
                                 param_name_input2="input_y")
    x_shape, y_shape = refine_shapes_for_broadcast(broadcast[0],
                                                   broadcast[1])

    # both placeholders are created with the dtype of input_x
    x_tensor = tvm.placeholder(x_shape, name="data1", dtype=x_dtype)
    y_tensor = tvm.placeholder(y_shape, name="data2", dtype=x_dtype)
    result = xlogy_compute(x_tensor, y_tensor, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [x_tensor, y_tensor, result],
        "bool_storage_as_1bit": False,
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def equal(input_x, input_y, output_z, kernel_name="equal"):
    """
    Build the kernel returning the truth value of (x = y) element-wise

    Parameters
    ----------
    input_x: dict
        descriptor of input_x, "shape" and "dtype" are read
    input_y: dict
        descriptor of input_y, "shape" and "dtype" are read; its "dtype"
        entry is compared with the one of input_x
    output_z: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name: str
        cce kernel name, default value is "equal"

    Returns
    -------
    None
    """
    x_shape = input_x.get("shape")
    x_dtype = input_x.get("dtype")
    y_shape = input_y.get("shape")
    y_dtype = input_y.get("dtype")

    # the third (broadcast) shape returned here is not needed afterwards
    x_shape, y_shape, _ = broadcast_shapes(x_shape, y_shape,
                                           param_name_input1="input_x",
                                           param_name_input2="input_y")

    check_shape(x_shape, param_name="input_x")
    check_shape(y_shape, param_name="input_y")

    supported = ("float16", "float32", "int32", "int8", "uint8")
    x_dtype = x_dtype.lower()
    check_dtype(x_dtype, supported, param_name="input_x")
    y_dtype = y_dtype.lower()
    check_dtype(y_dtype, supported, param_name="input_y")
    util.compare_tensor_dict_key(input_x, input_y, "dtype")

    x_shape, y_shape = refine_shapes_for_broadcast(list(x_shape),
                                                   list(y_shape))

    x_tensor = tvm.placeholder(x_shape, name="data_input_x", dtype=x_dtype)
    y_tensor = tvm.placeholder(y_shape, name="data_input_y", dtype=y_dtype)

    result = equal_compute(x_tensor, y_tensor, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [x_tensor, y_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def softmax_grad(softmax, grad_softmax, grad_x, kernel_name="softmax_grad"):
    """
    Build the kernel computing the gradients of a softmax operation
    The calculation formula is as follows :
    grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax

    Parameters
    ----------
    softmax: dict
        descriptor of the first input, "shape" and "dtype" are read;
        dtype is checked against float16 and float32
    grad_softmax: dict
        descriptor of the second input, "shape" is read; its "dtype" entry
        is compared with the one of softmax
    grad_x: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name: str
        kernel name, default value is "softmax_grad"

    Returns
    -------
    None
    """
    sm_shape = softmax.get("shape")
    grad_shape = grad_softmax.get("shape")
    raw_dtype = softmax.get("dtype")

    util.compare_tensor_dict_key(softmax, grad_softmax, "dtype")
    check_shape(sm_shape, param_name="softmax")
    check_shape(grad_shape, param_name="grad_softmax")

    supported = ("float16", "float32")
    dtype = raw_dtype.lower()
    check_dtype(dtype, supported, param_name="softmax")

    # only go through broadcast_shapes when the two shapes differ
    if list(sm_shape) != list(grad_shape):
        sm_shape, grad_shape, _ = broadcast_shapes(
            sm_shape, grad_shape,
            param_name_input1="softmax",
            param_name_input2="grad_softmax")

    sm_tensor = tvm.placeholder(sm_shape, name="softmax", dtype=dtype)
    grad_tensor = tvm.placeholder(grad_shape, name="grad_softmaxgrad",
                                  dtype=dtype)

    result = softmax_grad_compute(sm_tensor, grad_tensor, grad_x,
                                  kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [sm_tensor, grad_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def reciprocal_grad(input_y, input_dy, output_data,
                    kernel_name="reciprocal_grad"):
    """
    algorithm: reciprocal_grad
    Build the kernel computing dx = -1*dy*y*y, where `y = 1/x`, and `dy`
    is the corresponding input gradient.

    Parameters
    ----------
    input_y: dict
        descriptor of input_y, "shape" and "dtype" are read; dtype is
        checked against float16, float32, int32, int8
    input_dy: dict
        descriptor of input_dy; its "shape" and "dtype" entries are
        compared with the ones of input_y
    output_data: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name: str
        kernel name, default value is "reciprocal_grad"

    Returns
    -------
    None
    """
    y_shape = input_y.get("shape")
    dy_shape = input_dy.get("shape")
    y_dtype = input_y.get("dtype").lower()
    dy_dtype = input_dy.get("dtype").lower()

    check_shape(y_shape, param_name="input_y")
    check_shape(dy_shape, param_name="input_dy")
    y_shape = util.shape_refine(y_shape)
    dy_shape = util.shape_refine(dy_shape)

    for key in ("shape", "dtype"):
        util.compare_tensor_dict_key(input_y, input_dy, key)

    # only input_y's dtype goes through check_dtype
    supported = ("float16", "float32", "int32", "int8")
    check_dtype(y_dtype, supported, param_name="input_y")

    y_shape, dy_shape = refine_shapes_for_broadcast(y_shape, dy_shape)

    dy_tensor = tvm.placeholder(dy_shape, name="data_dy", dtype=dy_dtype)
    y_tensor = tvm.placeholder(y_shape, name="data_y", dtype=y_dtype)

    result = reciprocal_grad_compute(y_tensor, dy_tensor, output_data,
                                     kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    te.lang.cce.cce_build_code(
        schedule,
        {"name": kernel_name, "tensor_list": [y_tensor, dy_tensor, result]})
def iou(bboxes, gtboxes, overlap, mode="iou", kernel_name="iou"):
    """
    Validate the inputs of the iou operator and delegate to iou_compute.

    Parameters
    ----------
    bboxes : dict
        descriptor of bboxes ("shape", "dtype"); the original documentation
        states the shape must be [n, 4] with coordinates [x1, y1, x2, y2]
    gtboxes : dict
        descriptor of gtboxes ("shape"); the original documentation states
        the shape must be [m, 4] with coordinates [x1, y1, x2, y2].
        Its "dtype" entry is compared with the one of bboxes
    overlap : dict
        descriptor of the result, handed to iou_compute unchanged
    mode : str
        'iou' or 'iof'
    kernel_name : str
        kernel name, default value is "iou"

    Returns
    -------
    whatever iou_compute returns

    Raises
    ------
    RuntimeError
        if mode is neither 'iou' nor 'iof'
    """
    box_shape = bboxes.get("shape")
    gt_shape = gtboxes.get("shape")

    check_shape(box_shape, param_name="bboxes")
    check_shape(gt_shape, param_name="gtboxes")
    _box_shape_check("bboxes", box_shape)
    _box_shape_check("gtboxes", gt_shape)

    box_dtype = bboxes.get("dtype").lower()
    util.compare_tensor_dict_key(bboxes, gtboxes, "dtype")
    supported_dtypes = ("float16", "float32")
    check_dtype(box_dtype, supported_dtypes, param_name="bboxes")

    # check whether mode is valid
    supported_modes = ("iou", "iof")
    if mode not in supported_modes:
        raise RuntimeError("Mode only support iou and iof")

    return iou_compute(bboxes, gtboxes, overlap, mode, kernel_name)
def smooth_l1_loss_grad_v2(predict, label, dout, gradient, sigma=1.0,
                           reduction='mean',
                           kernel_name="smooth_l1_loss_grad_v2"):
    """
    Validate the inputs of smooth_l1_loss_grad_v2 and build the kernel.

    Parameters
    ----------
    predict : dict
        descriptor of predict, "shape" and "dtype" are read
    label : dict
        descriptor of label; its "shape" entry is compared with predict
    dout : dict
        descriptor of dout; its "shape" entry is compared with predict
    gradient : dict
        descriptor of the output (not read by this function)
    sigma : float
        handed to the compute function unchanged
    reduction : str
        lower-cased and checked against "none", "mean", "sum"
    kernel_name : str
        kernel name, default value is "smooth_l1_loss_grad_v2"

    Returns
    -------
    None
    """
    # check input: predict label dout
    supported = ("float16", "float32")

    predict_shape = predict.get("shape")
    predict_dtype = predict.get("dtype").lower()
    util.check_dtype_rule(predict_dtype, supported)

    label_shape = label.get("shape")
    label_dtype = label.get("dtype").lower()
    util.check_dtype_rule(label_dtype, supported)

    dout_shape = dout.get("shape")
    dout_dtype = dout.get("dtype").lower()
    util.check_dtype_rule(dout_dtype, supported)

    for shape in (predict_shape, label_shape, dout_shape):
        util.check_shape_rule(shape)

    for other in (label, dout):
        util.compare_tensor_dict_key(predict, other, "shape")

    # check reduction (the dtype rule helper is reused for the membership
    # test against the allowed reduction modes)
    allowed_reductions = ("none", "mean", "sum")
    reduction_mode = reduction.lower()
    util.check_dtype_rule(reduction_mode, allowed_reductions)

    predict_tensor = tvm.placeholder(predict_shape, name="predict",
                                     dtype=predict_dtype)
    label_tensor = tvm.placeholder(label_shape, name="label",
                                   dtype=label_dtype)
    dout_tensor = tvm.placeholder(dout_shape, name="dout", dtype=dout_dtype)

    result = smooth_l1_loss_grad_v2_compute(predict_tensor, label_tensor,
                                            dout_tensor, sigma,
                                            reduction_mode)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [predict_tensor, label_tensor, dout_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def mod(input_x, input_y, output_z, kernel_name="mod"):
    """
    Build the kernel returning the element-wise remainder of division.

    Parameters
    ----------
    input_x: dict
        input tensor descriptor, "shape" and "dtype" are read.
        dtype is checked against "float16", "float32", "int8", "uint8",
        "int32".
    input_y: dict
        input tensor descriptor, "shape" is read. Its "dtype" entry is
        compared with the one of 'input_x'.
    output_z: dict
        descriptor of the output, handed to the compute function unchanged.
    kernel_name: str
        kernel name, default value is "mod"

    Returns:
    None
    """
    shape_x = input_x.get("shape")
    shape_y = input_y.get("shape")

    util.compare_tensor_dict_key(input_x, input_y, "dtype")
    check_shape(shape_x, param_name="input_x")
    check_shape(shape_y, param_name="input_y")

    check_list = ("float16", "float32", "int8", "uint8", "int32")
    input_dtype = input_x.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_x")

    # the third (broadcast) shape is not needed afterwards
    shape_x, shape_y, _ = broadcast_shapes(shape_x, shape_y,
                                           param_name_input1="input_x",
                                           param_name_input2="input_y")
    reshape_x, reshape_y = refine_shapes_for_broadcast(shape_x, shape_y)

    data_x = tvm.placeholder(reshape_x, dtype=input_dtype, name="data_x")
    data_y = tvm.placeholder(reshape_y, dtype=input_dtype, name="data_y")

    # forward the caller's kernel_name instead of a hard-coded "mod", so the
    # compute stage and the build config agree on the name
    res = mod_compute(data_x, data_y, output_z, kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_x, data_y, res]}
    te.lang.cce.cce_build_code(sch, config)
def xdivy(input_x, input_y, output_z, kernel_name="xdivy"):
    """
    algorithm: xdivy
    Build the kernel returning 0 if x==0 and x/y otherwise, elementwise

    Parameters
    ----------
    input_x: dict
        descriptor of input_x, "shape" and "dtype" are read
    input_y: dict
        descriptor of input_y, "shape" and "dtype" are read; its "dtype"
        entry is compared with the one of input_x
    output_z: dict
        descriptor of the output, handed to the compute function unchanged
    kernel_name : str
        kernel name, default value is "xdivy"

    Returns
    -------
    None
    """
    x_shape = input_x.get("shape")
    raw_dtype_x = input_x.get("dtype")
    y_shape = input_y.get("shape")
    raw_dtype_y = input_y.get("dtype")

    util.compare_tensor_dict_key(input_x, input_y, "dtype")
    check_shape(x_shape, param_name="input_x")
    check_shape(y_shape, param_name="input_y")
    broadcast = broadcast_shapes(x_shape, y_shape,
                                 param_name_input1="input_x",
                                 param_name_input2="input_y")

    x_dtype = raw_dtype_x.lower()
    y_dtype = raw_dtype_y.lower()
    supported = ("float16", "float32")
    check_dtype(x_dtype, supported, param_name="input_x")
    check_dtype(y_dtype, supported, param_name="input_y")

    x_refined, y_refined = refine_shapes_for_broadcast(broadcast[0],
                                                       broadcast[1])

    # both placeholders are created with the dtype of input_x
    x_tensor = tvm.placeholder(x_refined, dtype=x_dtype, name="data_x")
    y_tensor = tvm.placeholder(y_refined, dtype=x_dtype, name="data_y")

    result = xdivy_compute(x_tensor, y_tensor, output_z, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    te.lang.cce.cce_build_code(
        schedule,
        {"name": kernel_name, "tensor_list": [x_tensor, y_tensor, result]})
def _dtype_check(input_x, input_scale, input_offset, input_mean,
                 input_variance, is_training):
    """
    Check the dtypes of the batch-norm style inputs.

    Parameters
    ----------
    input_x: dict
        descriptor of x; its dtype is checked against float16 and float32.
    input_scale: dict
        descriptor of scale; its dtype is checked against float32 and
        float16 and serves as the reference for the comparisons below.
    input_offset: dict
        descriptor of offset; "dtype" entry is compared with scale.
    input_mean: dict
        descriptor of mean; "dtype" entry is compared with scale only
        when is_training is false.
    input_variance: dict
        descriptor of variance; "dtype" entry is compared with scale only
        when is_training is false.
    is_training: bool
        when true, the mean/variance comparisons are skipped.

    Returns
    -------
    None
    """
    raw_dtype_x = input_x.get("dtype")
    raw_dtype_scale = input_scale.get("dtype")

    # offset always has to agree with scale; mean and variance only
    # outside of training
    compared = [input_offset]
    if not is_training:
        compared.extend((input_mean, input_variance))
    for descriptor in compared:
        util.compare_tensor_dict_key(input_scale, descriptor, "dtype")

    check_dtype(raw_dtype_x.lower(), ("float16", "float32"),
                param_name="input_x")
    check_dtype(raw_dtype_scale.lower(), ("float32", "float16"),
                param_name="input_scale")
def prelu_grad(input_gradients, input_features, input_weights,
               output_backprops_dx, output_backprops_da,
               kernel_name="prelu_grad"):
    """
    Build the kernel for the backpropagation of prelu.

    prelu equivalent function:
    prelu(x) = max(0, input_features) + input_weights * min(0, input_features)
    so prelu_grad output_backprops:
    output_backprops_dx = input_features > 0 ? input_gradients
                          : input_weights * input_gradients
    output_backprops_da = input_features > 0 ? 0
                          : input_features * input_gradients
    support dtype: float16, float32

    Parameters
    ----------
    input_gradients : dict
        descriptor of grad; "shape", "dtype" and "format" are read
    input_features : dict
        descriptor of the input tensor; "shape" and "dtype" are read, the
        "dtype" entry is compared with input_gradients
    input_weights : dict
        descriptor of the learned weight; "shape" and "dtype" are read,
        the "dtype" entry is compared with input_gradients
    output_backprops_dx : dict
        descriptor of the first output, handed to the compute function
    output_backprops_da : dict
        descriptor of the second output, handed to the compute function
    kernel_name : str
        kernel name, default value is "prelu_grad"

    Returns
    -------
    None
    """
    shape_input_gradients = input_gradients.get("shape")
    input_gradients_dtype = input_gradients.get("dtype").lower()
    input_format = input_gradients.get("format")

    shape_input_features = input_features.get("shape")
    input_features_dtype = input_features.get("dtype").lower()

    shape_input_weights = input_weights.get("shape")
    input_weights_dtype = input_weights.get("dtype").lower()

    # check dtype: validate the normalized (lower-case) dtypes, i.e. the
    # same values the placeholders below are created with
    check_list = ("float16", "float32")
    util.compare_tensor_dict_key(input_gradients, input_features, "dtype")
    util.compare_tensor_dict_key(input_gradients, input_weights, "dtype")
    check_dtype(input_gradients_dtype, check_list,
                param_name="input_gradients")
    check_dtype(input_features_dtype, check_list,
                param_name="input_features")
    check_dtype(input_weights_dtype, check_list, param_name="input_weights")

    # check shape
    check_shape(shape_input_gradients, param_name="input_gradients")
    check_shape(shape_input_features, param_name="input_features")
    check_shape(shape_input_weights, param_name="input_weights")
    if list(shape_input_gradients) != list(shape_input_features):
        shape_input_gradients, shape_input_features, _ = \
            broadcast_shapes(shape_input_gradients, shape_input_features,
                             param_name_input1="input_gradients",
                             param_name_input2="input_features")
    check_inputs_shape(shape_input_features, shape_input_weights,
                       input_format)

    # expand the weight shape so it lines up with the feature layout
    if len(shape_input_features) == 4:
        shape_input_weights = [1, shape_input_weights[0], 1, 1]
    elif input_format == "NC1HWC0" and len(shape_input_weights) == 5:
        # already a 5D weight shape, used as is
        pass
    elif input_format == "NC1HWC0" and len(shape_input_weights) == 1 \
            and shape_input_weights[0] != 1:
        # 1D weights in NC1HWC0: channel count rounded up to blocks of 16
        weights_c1 = (shape_input_weights[0] + 15) // 16
        shape_input_weights = [1, weights_c1, 1, 1, 16]
    else:
        # all-ones shape of the feature rank with the weights on axis 1
        weights_shape = [1] * len(shape_input_features)
        weights_shape[1] = shape_input_weights[0]
        shape_input_weights = weights_shape

    data_input_gradients = tvm.placeholder(shape_input_gradients,
                                           name="data_input_gradients",
                                           dtype=input_gradients_dtype)
    data_input_features = tvm.placeholder(shape_input_features,
                                          name="data_input_features",
                                          dtype=input_features_dtype)
    data_input_weights = tvm.placeholder(shape_input_weights,
                                         name="data_input_weights",
                                         dtype=input_weights_dtype)

    res_dx, res_da = prelu_grad_compute(
        data_input_gradients, data_input_features, data_input_weights,
        output_backprops_dx, output_backprops_da, input_format, kernel_name)
    res = [res_dx, res_da]
    tensor_list = [
        data_input_gradients, data_input_features, data_input_weights
    ] + list(res)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def smooth_l1_loss_grad(predict, label, dout, gradient, sigma=1.0,
                        kernel_name="smooth_l1_loss_grad"):
    """
    Build the kernel of smooth_l1_loss_grad.

    smooth = x/sigma        if -sigma < x < sigma
             1              if x > sigma
             -1             if x < -sigma
    out = smooth * dout

    Parameters
    ----------
    predict : dict
        descriptor of predict, "shape" and "dtype" are read
    label : dict
        descriptor of label; "shape" and "dtype" entries are compared
        with predict
    dout : dict
        descriptor of dout; "shape" and "dtype" entries are compared
        with predict
    gradient : dict
        descriptor of the output, handed to the compute function unchanged
    sigma : float
        handed to the compute function unchanged
    kernel_name : str
        kernel name, default value is "smooth_l1_loss_grad"

    Returns
    -------
    None
    """
    predict_shape = predict.get("shape")
    raw_predict_dtype = predict.get("dtype")
    label_shape = label.get("shape")
    dout_shape = dout.get("shape")

    dtype = raw_predict_dtype.lower()
    label_dtype = label.get("dtype").lower()
    dout_dtype = dout.get("dtype").lower()

    for key in ("shape", "dtype"):
        for other in (label, dout):
            util.compare_tensor_dict_key(predict, other, key)

    supported = ("float16", "float32")
    check_dtype(dtype, supported, param_name="predict")
    check_dtype(label_dtype, supported, param_name="label")
    check_dtype(dout_dtype, supported, param_name="dout")

    check_shape(predict_shape, param_name="predict")
    check_shape(label_shape, param_name="label")
    check_shape(dout_shape, param_name="dout")

    # all three inputs are flattened to 1D with the element count of predict
    dims = predict_shape[:]
    flat_shape = (functools_reduce(lambda acc, dim: acc * dim, dims), )

    predict_tensor = tvm.placeholder(flat_shape, name="predict_input",
                                     dtype=dtype)
    label_tensor = tvm.placeholder(flat_shape, name="label_input",
                                   dtype=dtype)
    dout_tensor = tvm.placeholder(flat_shape, name="dout_input", dtype=dtype)

    result = smooth_l1_loss_grad_compute(predict_tensor, label_tensor,
                                         dout_tensor, gradient, sigma,
                                         kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [predict_tensor, label_tensor, dout_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def sigmoid_cross_entropy_with_logits_grad(
        predict, target, dout, gradient,
        kernel_name="sigmoid_cross_entropy_with_logits_grad"):
    """
    Validate the inputs of sigmoid_cross_entropy_with_logits_grad and
    build the kernel.

    Parameters
    ----------
    predict : dict
        the output of previous layer; "shape" and "dtype" are read
    target : dict
        label; "shape" and "dtype" are read, "shape" entry is compared
        with predict
    dout : dict
        last gradient; "shape" and "dtype" are read, "shape" entry is
        compared with predict
    gradient : dict
        result descriptor; its "dtype" is checked and the dict is handed
        to the compute function
    kernel_name : str
        kernel name, default value is
        "sigmoid_cross_entropy_with_logits_grad"

    Returns
    -------
    None
    """
    supported = ("float16", "float32")

    predict_shape = predict.get("shape")
    raw_predict_dtype = predict.get("dtype")
    grad_dtype = gradient.get("dtype").lower()
    pred_dtype = raw_predict_dtype.lower()
    check_dtype(grad_dtype, supported, param_name="gradient")
    check_dtype(pred_dtype, supported, param_name="predict")
    check_shape(predict_shape, param_name="predict")

    target_shape = target.get("shape")
    tgt_dtype = target.get("dtype").lower()
    check_dtype(tgt_dtype, supported, param_name="target")
    check_shape(target_shape, param_name="target")

    dout_shape = dout.get("shape")
    dout_dtype = dout.get("dtype").lower()
    check_dtype(dout_dtype, supported, param_name="dout")
    check_shape(dout_shape, param_name="dout")

    for other in (target, dout):
        util.compare_tensor_dict_key(predict, other, "shape")

    # all three inputs are flattened to 1D with the element count of predict
    dims = predict_shape[:]
    flat_shape = (functools_reduce(lambda acc, dim: acc * dim, dims), )

    predict_tensor = tvm.placeholder(flat_shape, name="predict_data_input",
                                     dtype=pred_dtype)
    target_tensor = tvm.placeholder(flat_shape, name="target_data_input",
                                    dtype=tgt_dtype)
    dout_tensor = tvm.placeholder(flat_shape, name="dout_data_input",
                                  dtype=dout_dtype)

    result = sigmoid_cross_entropy_with_logits_grad_compute(
        predict_tensor, target_tensor, dout_tensor, gradient, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    build_config = {
        "name": kernel_name,
        "tensor_list": [predict_tensor, target_tensor, dout_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def smooth_l1_loss_v2(predict, label, loss, sigma=1.0, reduction="mean",
                      kernel_name="smooth_l1_loss_v2"):
    """
    Validate the inputs of smooth_l1_loss_v2 and build the kernel.

    Parameters
    ----------
    predict : dict
        descriptor of predict, "shape" and "dtype" are read
    label : dict
        descriptor of label, "shape" and "dtype" are read; the "shape"
        entry is compared with predict
    loss : dict
        descriptor of the output; only its "dtype" is read
    sigma: float
        handed to the compute function, default value is 1
    reduction: str
        lower-cased and checked against "none", "mean", "sum";
        default value is "mean"
    kernel_name : str
        kernel name, default value is "smooth_l1_loss_v2"

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    supported = ("float16", "float32")

    predict_shape = predict.get("shape")
    predict_dtype = predict.get("dtype").lower()
    util.check_dtype_rule(predict_dtype, supported)

    label_shape = label.get("shape")
    label_dtype = label.get("dtype").lower()
    util.check_dtype_rule(label_dtype, supported)

    # NOTE(review): the shape validated for `loss` is taken from `label`,
    # not from `loss` itself - confirm whether that is intended.
    loss_shape = label.get("shape")
    loss_dtype = loss.get("dtype").lower()
    util.check_dtype_rule(loss_dtype, supported)

    for shape in (predict_shape, label_shape, loss_shape):
        util.check_shape_rule(shape)

    util.compare_tensor_dict_key(predict, label, "shape")

    # the dtype rule helper is reused for the membership test against the
    # allowed reduction modes
    allowed_reductions = ("none", "mean", "sum")
    reduction_mode = reduction.lower()
    util.check_dtype_rule(reduction_mode, allowed_reductions)

    predict_tensor = tvm.placeholder(predict_shape, name="predict",
                                     dtype=predict_dtype)
    label_tensor = tvm.placeholder(label_shape, name="label",
                                   dtype=label_dtype)

    result = smooth_l1_loss_v2_compute(predict_tensor, label_tensor, sigma,
                                       reduction_mode)

    # auto schedule
    with tvm.target.cce():
        schedule = generic.auto_schedule(result)

    # operator build
    build_config = {
        "name": kernel_name,
        "tensor_list": [predict_tensor, label_tensor, result],
    }
    te.lang.cce.cce_build_code(schedule, build_config)
def bn_training_update_grad(grads, x, batch_mean, batch_variance,
                            diff_scale, diff_offset, epsilon=0.0001,
                            kernel_name="bn_training_update_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    bn_training_update_grad.

    Validates the inputs, creates the placeholders and builds the kernel.

    Parameters
    ----------
    grads: dict
        dict of grads; "shape", "dtype", "format" and "ori_format" are
        read. dtype must be float32 or float16.
    x: dict
        dict of x; must agree with `grads` on "dtype" and "shape".
    batch_mean: dict
        dict of batch_mean; dtype must be float32.
    batch_variance: dict
        dict of batch_variance; dtype must be float32 and its shape is
        compared with the shape of `batch_mean`.
    diff_scale: dict
        dict of diff_scale, forwarded to the compute function.
    diff_offset: dict
        dict of diff_offset, forwarded to the compute function.
    epsilon: float
        A small float number added to the variance of x.
        Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_training_update_grad"

    Returns
    -------
    None
    """
    shape_grads = grads.get("shape")
    shape_x = x.get("shape")
    mean_shape = batch_mean.get("shape")
    variance_shape = batch_variance.get("shape")

    grads_dtype = grads.get("dtype").lower()
    x_dtype = x.get("dtype").lower()
    mean_dtype = batch_mean.get("dtype").lower()
    variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(grads_dtype, ("float32", "float16"), param_name="grads")
    check_dtype(x_dtype, ("float32", "float16"), param_name="x")
    check_dtype(mean_dtype, ("float32", ), param_name="batch_mean")
    check_dtype(variance_dtype, ("float32", ),
                param_name="batch_variance")
    util.compare_tensor_dict_key(grads, x, "dtype")

    data_format = grads.get("format")
    _check_format_nd(data_format, grads.get("ori_format"))

    if data_format == "NC1HWC0":
        _check_shape(shape_grads, shape_x, mean_shape, variance_shape)
    else:
        # For every other format the statistics placeholders are given the
        # shape [1, shape_x[1], 1, 1]; both share the same list object.
        per_channel_shape = [1, shape_x[1], 1, 1]
        mean_shape = per_channel_shape
        variance_shape = per_channel_shape

    util.compare_tensor_dict_key(grads, x, "shape")
    util.compare_tensor_dict_key(batch_mean, batch_variance, "shape")

    grads_input = tvm.placeholder(shape_grads, name="grads_input",
                                  dtype=grads_dtype)
    x_input = tvm.placeholder(shape_x, name="x_input", dtype=x_dtype)
    batch_mean_input = tvm.placeholder(mean_shape,
                                       name="batch_mean_input",
                                       dtype=mean_dtype)
    batch_variance_input = tvm.placeholder(variance_shape,
                                           name="batch_variance_input",
                                           dtype=variance_dtype)

    inputs = [grads_input, x_input, batch_mean_input,
              batch_variance_input]
    res_list = bn_training_update_grad_compute(grads_input, x_input,
                                               batch_mean_input,
                                               batch_variance_input,
                                               diff_scale, diff_offset,
                                               epsilon,
                                               kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    config = {"name": kernel_name,
              "tensor_list": inputs + list(res_list)}
    te.lang.cce.cce_build_code(sch, config)
def sigmoid_cross_entropy_with_logits_grad_v2(
        predict,
        target,
        dout,
        weight,
        pos_weight,
        gradient,
        reduction="mean",
        kernel_name="sigmoid_cross_entropy_with_logits_grad_v2"):
    """
    Function: it measures the gradient of Binary Cross Entropy With Logits.
    -----------
    :param predict: dict, shape and dtype of input, required; dtype must
        be float16 or float32
    :param target: dict, shape and dtype of target, compared with predict
        on "shape" and "dtype", required
    :param dout: dict, shape and dtype of dout, compared with predict on
        "shape" and "dtype", required
    :param weight: dict, shape and dtype of weight, optional; handled by
        `optional_weight`
    :param pos_weight: dict, shape and dtype of pos_weight, optional;
        handled by `optional_weight`
    :param gradient: dict, shape and dtype of the output; not read by
        this function
    :param reduction: str, specifies the reduction mode:
        'none' | 'mean' | 'sum', default to 'mean'
    :param kernel_name: str, kernel name,
        default to 'sigmoid_cross_entropy_with_logits_grad_v2'
    :return: None
    :raises RuntimeError: when `reduction` is not one of the three modes
    """
    predict_shape = predict.get("shape")
    predict_dtype = predict.get("dtype").lower()
    target_shape = target.get("shape")
    target_dtype = target.get("dtype").lower()
    dout_shape = dout.get("shape")
    dout_dtype = dout.get("dtype").lower()

    # Shapes are compared first, then dtypes, each time target before dout.
    for key in ("shape", "dtype"):
        util.compare_tensor_dict_key(predict, target, key)
        util.compare_tensor_dict_key(predict, dout, key)

    dtype_list = ["float16", "float32"]
    op_utils.check_dtype(predict_dtype, dtype_list)
    op_utils.check_shape(predict_shape)

    if reduction not in ("none", "mean", "sum"):
        raise RuntimeError("reduction should be one of ['none','mean','sum']")

    util.check_kernel_name(kernel_name)

    predict_data = tvm.placeholder(predict_shape, predict_dtype,
                                   name="predict_data")
    target_data = tvm.placeholder(target_shape, target_dtype,
                                  name="target_data")
    dout_data = tvm.placeholder(dout_shape, dout_dtype, name="dout_data")

    # optional_weight receives this list and is called before the result
    # tensor is appended to it.
    tensor_list = [predict_data, target_data, dout_data]
    weight_data, pos_weight_data = optional_weight(tensor_list,
                                                   predict_shape,
                                                   dtype_list, weight,
                                                   pos_weight)

    res = sigmoid_cross_entropy_with_logits_grad_v2_compute(
        predict_data, target_data, dout_data, weight_data, pos_weight_data,
        reduction)
    tensor_list.append(res)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    te.lang.cce.cce_build_code(
        schedule, {"name": kernel_name, "tensor_list": tensor_list})
def bn_training_reduce_grad(grads, x, diff_scale, diff_offset, scale,
                            batch_mean, batch_variance, y, epsilon=0.0001,
                            kernel_name="bn_training_reduce_grad"):
    """
    algorithm: fused_batch_norm_grad_v2
    bn_training_reduce_grad.

    Validates the inputs, creates the placeholders and builds the kernel.

    Parameters
    ----------
    grads: dict
        dict of grads; "shape", "dtype", "format" and "ori_format" are
        read. source data type, support "float32", "float16".
    x: dict
        dict of x; its shape is compared with the shape of `grads`.
        source data type, support "float32", "float16".
    diff_scale: dict
        dict of diff_scale, the output of bn_training_update_grad.
        source data type, support "float32".
    diff_offset: dict
        dict of diff_offset, the output of bn_training_update_grad.
        source data type, support "float32".
    scale: dict
        dict of scale. source data type, support "float32".
    batch_mean: dict
        dict of batch_mean. source data type, support "float32".
    batch_variance: dict
        dict of batch_variance. source data type, support "float32".
    y: dict
        dict of output, forwarded to the compute function.
    epsilon: float
        A small float number added to the variance of x.
        Defaults to `0.0001`.
    kernel_name: str
        kernel name, default value is "bn_training_reduce_grad"

    Notes
    -----
    The shapes of `diff_offset`, `scale`, `batch_mean` and
    `batch_variance` are each compared with the shape of `diff_scale`.
    When the format of `grads` is not "NC1HWC0", all five of these inputs
    get the placeholder shape [1, shape_x[1], 1, 1].

    Returns
    -------
    None
    """
    shape_grads = grads.get("shape")
    shape_x = x.get("shape")
    shape_diff_scale = diff_scale.get("shape")
    shape_diff_offset = diff_offset.get("shape")
    shape_scale = scale.get("shape")
    shape_batch_mean = batch_mean.get("shape")
    shape_batch_variance = batch_variance.get("shape")

    # Checked once, before the dtype checks. The original code repeated
    # this identical comparison again after them.
    util.compare_tensor_dict_key(grads, x, "shape")

    input_grads_dtype = grads.get("dtype").lower()
    x_dtype = x.get("dtype").lower()
    diff_scale_dtype = diff_scale.get("dtype").lower()
    diff_offset_dtype = diff_offset.get("dtype").lower()
    scale_dtype = scale.get("dtype").lower()
    batch_mean_dtype = batch_mean.get("dtype").lower()
    batch_variance_dtype = batch_variance.get("dtype").lower()

    check_dtype(input_grads_dtype, ("float32", "float16"),
                param_name="grads")
    check_dtype(x_dtype, ("float32", "float16"), param_name="x")
    check_dtype(diff_scale_dtype, ("float32",), param_name="diff_scale")
    check_dtype(diff_offset_dtype, ("float32",), param_name="diff_offset")
    check_dtype(scale_dtype, ("float32",), param_name="scale")
    check_dtype(batch_mean_dtype, ("float32",), param_name="batch_mean")
    check_dtype(batch_variance_dtype, ("float32",),
                param_name="batch_variance")

    util.compare_tensor_dict_key(diff_scale, diff_offset, "shape")
    util.compare_tensor_dict_key(diff_scale, scale, "shape")
    util.compare_tensor_dict_key(diff_scale, batch_mean, "shape")
    util.compare_tensor_dict_key(diff_scale, batch_variance, "shape")

    data_format = grads.get("format").upper()
    ori_format = grads.get("ori_format").upper()
    _check_format_nd(data_format, ori_format)

    if data_format == "NC1HWC0":
        _check_shape(shape_grads, shape_diff_scale)
    else:
        # One list object is shared by all five per-channel placeholders,
        # as in the original implementation.
        shape_list = [1, 1, 1, 1]
        shape_list[1] = shape_x[1]
        shape_diff_scale = shape_list
        shape_diff_offset = shape_list
        shape_scale = shape_list
        shape_batch_mean = shape_list
        shape_batch_variance = shape_list

    grads_input = tvm.placeholder(shape_grads, name="grads_input",
                                  dtype=input_grads_dtype)
    x_input = tvm.placeholder(shape_x, name="x_input", dtype=x_dtype)
    diff_scale_input = tvm.placeholder(shape_diff_scale,
                                       name="diff_scale_input",
                                       dtype=diff_scale_dtype)
    diff_offset_input = tvm.placeholder(shape_diff_offset,
                                        name="diff_offset_input",
                                        dtype=diff_offset_dtype)
    scale_input = tvm.placeholder(shape_scale, name="scale_input",
                                  dtype=scale_dtype)
    batch_mean_input = tvm.placeholder(shape_batch_mean,
                                       name="batch_mean_input",
                                       dtype=batch_mean_dtype)
    batch_variance_input = tvm.placeholder(shape_batch_variance,
                                           name="batch_variance_input",
                                           dtype=batch_variance_dtype)

    res = bn_training_reduce_grad_compute(grads_input, x_input,
                                          diff_scale_input,
                                          diff_offset_input,
                                          scale_input, batch_mean_input,
                                          batch_variance_input, y, epsilon,
                                          kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [grads_input, x_input, diff_scale_input,
                   diff_offset_input, scale_input, batch_mean_input,
                   batch_variance_input, res]
    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def fake_quant_with_min_max_args_gradient(gradients, x, y, min=-6, max=6,
                                          num_bits=8, narrow_range=False,
                                          kernel_name="fake_quant_"
                                                      "with_min_max_args"):
    """
    Compute gradients for a FakeQuantWithMinMaxArgs operation.
    calculating data's :
    y = gradients*(if x>=nudged_min and <=nudged_max 1 else 0)

    Parameters
    ----------
    gradients: dict
        shape and dtype of input gradients, only support float32
    x: dict
        shape and dtype of input x, only support float32
    y: dict
        the dict of output data
    min: scalar float int
        Defaults to -6
    max: scalar float int
        Defaults to 6
        [min; max] define the clamping range for the x data
    num_bits: float int
        Defaults to 8. num_bits is the bitwidth of the quantization,
        between 2 and 16
    narrow_range: bool
        True or False
        if True x values are quantized into the quantization range
        [1; 2^num_bits - 1]
        if False x values are quantized into the quantization range
        [0; 2^num_bits - 1]
    kernel_name: str
        cce kernel name, default value is "fake_quant_with_min_max_args"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        when the two input shapes differ, when `min >= max`, or when
        `num_bits` is outside [2, 16]
    """
    shape_gradients = gradients.get("shape")
    shape_x = x.get("shape")

    # A tuple never compares equal to a list in Python, so (2, 3) and
    # [2, 3] used to be rejected as different shapes. Compare element-wise
    # instead; a missing shape (None) keeps the original comparison result.
    shapes_differ = shape_gradients != shape_x
    if shapes_differ and shape_gradients is not None and shape_x is not None:
        shapes_differ = list(shape_gradients) != list(shape_x)
    if shapes_differ:
        raise RuntimeError("shape of two input must be same")

    util.compare_tensor_dict_key(gradients, x, "dtype")
    check_shape(shape_x, param_name="x")
    input_dtype = x.get("dtype").lower()
    check_dtype(input_dtype, ["float32"], param_name="x")

    if min >= max:
        raise RuntimeError("min must be less than max")
    if num_bits < 2 or num_bits > 16:
        raise RuntimeError("num_bits is between 2 and 16")

    # Both inputs are flattened to one axis holding every element.
    flat_shape = (functools_reduce(lambda total, dim: total * dim,
                                   shape_x[:]), )
    gradients_input = tvm.placeholder(flat_shape, name="gradients",
                                      dtype=input_dtype)
    x_input = tvm.placeholder(flat_shape, name="x", dtype=input_dtype)

    res = fake_quant_with_min_max_args_gradient_compute(
        gradients_input, x_input, y, float(min), float(max), num_bits,
        narrow_range, kernel_name)

    with tvm.target.cce():
        auto_sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [gradients_input, x_input, res]}
    te.lang.cce.cce_build_code(auto_sch, config)
def batch_norm_grad_ext2(y_backprop, x, scale, reserve_space_1,
                         reserve_space_2, x_backprop, scale_backprop,
                         offset_backprop, reserve_space_3, reserve_space_4,
                         epsilon=0.0001, data_format="NHWC",
                         is_training=True,
                         kernel_name="batch_norm_grad_ext2"):
    """
    algorithm: batch_norm_grad_ext2
    Batch normalization grad.

    Validates the inputs, creates the placeholders and builds the kernel.

    Parameters
    ----------
    y_backprop: dict
        dict of y_backprop.
        source data type, support "float16", "float32".
        A two-dimensional shape is extended with two trailing 1 axes.
    x: dict
        dict of x; compared with `y_backprop` on "dtype" and "shape".
        source data type, support "float16", "float32".
        A two-dimensional shape is extended with two trailing 1 axes.
    scale: dict
        dict of scale.
        source data type, support "float32".
    reserve_space_1: dict
        dict of reserve_space_1; its shape is compared with `scale`.
        source data type, support "float32".
    reserve_space_2: dict
        dict of reserve_space_2; its shape is compared with `scale`.
        source data type, support "float32".
    x_backprop: dict
        dict of output, forwarded to the compute function.
    scale_backprop: dict
        dict of scale_backprop, forwarded to the compute function.
    offset_backprop: dict
        dict of offset_backprop, forwarded to the compute function.
    reserve_space_3: dict
        dict of reserve_space_3, forwarded to the compute function.
    reserve_space_4: dict
        dict of reserve_space_4, forwarded to the compute function.
    epsilon: float
        A small float number added to the variance of x.
        Defaults to `0.0001`.
    data_format: str
        An optional `string` from: `"NHWC", "NCHW"`.
        Defaults to `"NHWC"`.
    is_training: bool
        An optional `bool`. Defaults to `True`.
    kernel_name: str
        kernel name, default value is "batch_norm_grad_ext2"

    Returns
    -------
    None
    """
    dy_shape = y_backprop.get("shape")
    if len(dy_shape) == 2:
        dy_shape = list(dy_shape) + [1, 1]
    x_shape = x.get("shape")
    if len(x_shape) == 2:
        x_shape = list(x_shape) + [1, 1]

    scale_shape = scale.get("shape")
    space_1_shape = reserve_space_1.get("shape")
    space_2_shape = reserve_space_2.get("shape")

    dy_dtype = y_backprop.get("dtype").lower()
    x_dtype = x.get("dtype").lower()
    scale_dtype = scale.get("dtype").lower()
    space_1_dtype = reserve_space_1.get("dtype").lower()
    space_2_dtype = reserve_space_2.get("dtype").lower()

    check_dtype(dy_dtype, ("float32", "float16"), param_name="y_backprop")
    check_dtype(x_dtype, ("float32", "float16"), param_name="x")
    check_dtype(scale_dtype, ("float32", ), param_name="scale")
    check_dtype(space_1_dtype, ("float32", ),
                param_name="reserve_space_1")
    check_dtype(space_2_dtype, ("float32", ),
                param_name="reserve_space_2")
    util.compare_tensor_dict_key(y_backprop, x, "dtype")

    _format_check(x, data_format)
    format_data = x.get("format")

    # Both shape validators take the same argument list.
    shape_args = (dy_shape, x_shape, scale_shape, space_1_shape,
                  space_2_shape, format_data)
    _check_shape_len(*shape_args)
    _check_shape(*shape_args)

    util.compare_tensor_dict_key(y_backprop, x, "shape")
    util.compare_tensor_dict_key(scale, reserve_space_1, "shape")
    util.compare_tensor_dict_key(scale, reserve_space_2, "shape")

    changed_shapes = _change_shape(scale_shape, space_1_shape,
                                   space_2_shape, format_data)

    dy_input = tvm.placeholder(dy_shape, name="y_backprop", dtype=dy_dtype)
    x_input = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    scale_input = tvm.placeholder(
        changed_shapes.get("shape_scale_change"),
        name="scale", dtype=scale_dtype)
    space_1_input = tvm.placeholder(
        changed_shapes.get("shape_reserve_space_1_change"),
        name="reserve_space_1", dtype=space_1_dtype)
    space_2_input = tvm.placeholder(
        changed_shapes.get("shape_reserve_space_2_change"),
        name="reserve_space_2", dtype=space_2_dtype)

    inputs = [dy_input, x_input, scale_input, space_1_input,
              space_2_input]
    res_list = batch_norm_grad_ext2_compute(dy_input, x_input, scale_input,
                                            space_1_input, space_2_input,
                                            x_backprop, scale_backprop,
                                            offset_backprop,
                                            reserve_space_3,
                                            reserve_space_4,
                                            epsilon, data_format,
                                            is_training,
                                            kernel_name=kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    config = {"name": kernel_name,
              "tensor_list": inputs + list(res_list)}
    te.lang.cce.cce_build_code(sch, config)
def axpy_v2(x1, x2, alpha, y, kernel_name="axpy_v2"):
    """
    Validate the operator inputs, then schedule and build the kernel.

    Parameters
    ----------
    x1 : dict
        shape and dtype of input_x; dtype must be float16, float32 or
        int32
    x2 : dict
        shape and dtype of input_y; dtype is compared with the dtype of
        `x1`
    alpha : dict
        shape and dtype of alpha, scalar apply to input_y:input_y*alpha;
        dtype must be float16 or float32
    y : dict
        shape and dtype of output, forwarded to the compute function
    kernel_name : str
        kernel name, default value is "axpy_v2"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        when `alpha` has a non-empty shape that `util.is_scalar` rejects
    """
    util.check_kernel_name(kernel_name)

    # The input shapes are derived from the format pattern of x1 and x2.
    pattern = _add_check_format(x1, x2)
    x1_shape, x2_shape = _infer_shape(pattern, x1, x2)

    x1_dtype = x1.get("dtype").lower()
    x2_dtype = x2.get("dtype").lower()
    alpha_dtype = alpha.get("dtype").lower()
    alpha_shape = alpha.get("shape")

    x1_shape = util.scalar2tensor_one(x1_shape)
    x2_shape = util.scalar2tensor_one(x2_shape)
    alpha_shape = util.scalar2tensor_one(alpha_shape)
    for shape in (x1_shape, x2_shape, alpha_shape):
        op_utils.check_shape(shape)

    input_dtypes = ("float16", "float32", "int32")
    check_dtype(x1_dtype, input_dtypes)
    check_dtype(x2_dtype, input_dtypes)
    check_dtype(alpha_dtype, ("float16", "float32"))
    util.compare_tensor_dict_key(x1, x2, "dtype")

    # check alpha is 0D or 1D tensor
    if len(alpha_shape) != 0 and not util.is_scalar(alpha_shape):
        raise RuntimeError("alpha should be 0D or 1D tensor")

    x1_shape, x2_shape, max_shape = util.produce_shapes(x1_shape, x2_shape)
    if x1_shape[-1] == 1 and x2_shape[-1] == 1 and max_shape[-1] == 1:
        # Drop the trailing unit axis from every shape with more than one
        # axis.
        if len(x1_shape) != 1:
            x1_shape = x1_shape[:-1]
        if len(x2_shape) != 1:
            x2_shape = x2_shape[:-1]
        if len(max_shape) != 1:
            max_shape = max_shape[:-1]

    util.check_shape_size(max_shape, SHAPE_SIZE_LIMIT)
    # Called only for its validation effect; the result is discarded.
    util.produce_shapes(max_shape, alpha_shape)

    x1_shape, x2_shape = refine_shapes_for_broadcast(x1_shape, x2_shape)

    data_input_x1 = tvm.placeholder(x1_shape, name="data_input_x1",
                                    dtype=x1_dtype)
    data_input_x2 = tvm.placeholder(x2_shape, name="data_input_x2",
                                    dtype=x2_dtype)

    # Left-pad alpha's shape with 1s up to the rank of the refined x1.
    leading_ones = (1,) * (len(x1_shape) - len(alpha_shape))
    alpha_shape = leading_ones + tuple(alpha_shape)
    alpha_input = tvm.placeholder(alpha_shape, name="alpha_input",
                                  dtype=alpha_dtype)

    res = axpy_v2_compute(data_input_x1, data_input_x2, alpha_input, y,
                          kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    build_tensors = [data_input_x1, data_input_x2, alpha_input, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": build_tensors}
    te.lang.cce.cce_build_code(schedule, config)
def softmax_cross_entropy_with_logits(
        input_features,
        input_labels,
        output_loss,
        output_backprop,
        kernel_name="softmax_cross_entropy_with_logits"):
    """
    Computes softmax cross entropy cost.

    Validates the inputs, creates the placeholders and builds the kernel.
    Four-dimensional features take a dedicated compute path that only
    accepts float32; every other rank goes through the two-dimensional
    path, with a rank-1 input broadcast against the other one.

    Parameters
    ----------
    input_features: dict
        input tensor contains shape and dtype attributes.
        source data type support "float16", "float32".
    input_labels: dict
        input tensor contains shape and dtype attributes.
        Its dtype is compared with the dtype of 'input_features'.
    output_loss: dict
        data of output, forwarded to the compute function.
    output_backprop: dict
        data of output, forwarded to the compute function.
    kernel_name: str
        kernel name, default value is "softmax_cross_entropy_with_logits"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        on a rank mismatch or non-float32 dtype in the four-dimensional
        path, when both inputs have rank 1, or when either input has a
        rank above 2 in the two-dimensional path
    """
    shape_features = input_features.get("shape")
    shape_labels = input_labels.get("shape")

    util.compare_tensor_dict_key(input_features, input_labels, "dtype")
    check_shape(shape_features, param_name="input_features")
    check_shape(shape_labels, param_name="input_labels")

    check_list = ("float16", "float32")
    input_dtype = input_features.get("dtype").lower()
    check_dtype(input_dtype, check_list, param_name="input_features")

    if len(shape_features) == 4:
        if len(shape_features) != len(shape_labels):
            raise RuntimeError("The length of two inputs must be same")
        if input_dtype != "float32":
            raise RuntimeError("Not supported dtype!")
        data_features = tvm.placeholder(shape_features, dtype=input_dtype,
                                        name="data_features")
        data_labels = tvm.placeholder(shape_labels, dtype=input_dtype,
                                      name="data_labels")
        res = softmax_cross_entropy_with_logits_nchw_compute(
            data_features, data_labels, output_loss, output_backprop)
    else:
        # The two messages below were built from adjacent string literals
        # without a separating space ("sametime", "2-dimensional,or").
        if len(shape_features) == 1 and len(shape_labels) == 1:
            raise RuntimeError(
                "The rank of two inputs can not be 1 at the same time")
        if len(shape_features) > 2 or len(shape_labels) > 2:
            raise RuntimeError(
                "logits and labels must be either 2-dimensional, "
                "or broadcasted to 2-dimensional")
        if len(shape_features) == 1 or len(shape_labels) == 1:
            # The broadcast result shape itself is not needed here.
            shape_features, shape_labels, _ = \
                broadcast_shapes(shape_features, shape_labels,
                                 param_name_input1="input_features",
                                 param_name_input2="input_labels")

        data_features = tvm.placeholder(shape_features, dtype=input_dtype,
                                        name="data_features")
        data_labels = tvm.placeholder(shape_labels, dtype=input_dtype,
                                      name="data_labels")
        res = softmax_cross_entropy_with_logits_compute(
            data_features, data_labels, output_loss, output_backprop)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [data_features, data_labels] + list(res)
    config = {"name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def smooth_l1_loss(predict, label, loss, sigma=1.0,
                   kernel_name="smooth_l1_loss"):
    """
    Validate the operator inputs, then schedule and build the kernel.

    Parameters
    ----------
    predict : dict
        shape and dtype of input; dtype must be float16 or float32
    label : dict
        shape and dtype of input; dtype must be float16 or float32 and
        its shape is compared with the shape of `predict`
    loss : dict
        shape and dtype of output; its dtype must be float16 or float32
    sigma: float
        sigma, default value is 1
    kernel_name : str
        kernel name, default value is "smooth_l1_loss"

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")

    shape_predict = predict.get("shape")
    input_predict_dtype = predict.get("dtype").lower()
    # predict's dtype is validated once. The original repeated this same
    # check (and redefined check_list) after the shape checks.
    check_dtype(input_predict_dtype, check_list, param_name="predict")

    shape_label = label.get("shape")
    input_label_dtype = label.get("dtype").lower()
    dtype_loss = loss.get("dtype").lower()
    check_dtype(input_label_dtype, check_list, param_name="label")
    check_dtype(dtype_loss, check_list, param_name="loss")

    util.compare_tensor_dict_key(predict, label, "shape")
    check_shape(shape_predict, param_name="predict")
    check_shape(shape_label, param_name="label")

    shape_predict, shape_label = \
        refine_shapes_for_broadcast(shape_predict, shape_label)

    input_predict = tvm.placeholder(shape_predict,
                                    name="predict",
                                    dtype=input_predict_dtype)
    input_label = tvm.placeholder(shape_label,
                                  name="label",
                                  dtype=input_label_dtype)
    res = smooth_l1_loss_compute(input_predict, input_label, loss, sigma,
                                 kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_predict, input_label, res]
    }
    te.lang.cce.cce_build_code(sch, config)
def relu_grad(input_gradients, input_features, output_backprops,
              kernel_name="relu_grad"):
    """
    calculate the backpropagation of relu operation
    output_backprops = input_gradients*1(input_features>0) or 0(input_features<=0).
    support dtype:float16,float32,int32,int8,uint8

    Validates the inputs, creates the placeholders and builds the kernel.
    When the two shapes differ they are passed through `broadcast_shapes`
    before the placeholders are created.

    Parameters
    ----------
    input_gradients: dict
        the backpropagated gradients to the corresponding relu operation
    input_features: dict
        the features passed as output of relu operation; its dtype is
        compared with the dtype of `input_gradients`
    output_backprops: dict
        the output of relu back propagation, forwarded to the compute
        function
    kernel_name: str
        cce kernel name, default value is "relu_grad"

    Returns
    -------
    None
    """
    grads_shape = input_gradients.get("shape")
    feats_shape = input_features.get("shape")

    util.compare_tensor_dict_key(input_gradients, input_features, "dtype")
    check_shape(grads_shape, param_name="input_gradients")
    check_shape(feats_shape, param_name="input_features")

    shapes_match = list(grads_shape) == list(feats_shape)
    if not shapes_match:
        # The broadcast result shape itself is not used afterwards.
        grads_shape, feats_shape, _ = \
            broadcast_shapes(grads_shape, feats_shape,
                             param_name_input1="input_gradients",
                             param_name_input2="input_features")

    grads_dtype = input_gradients.get("dtype").lower()
    feats_dtype = input_features.get("dtype").lower()
    supported_dtypes = ("float16", "float32", "int32", "int8", "uint8")
    check_dtype(grads_dtype, supported_dtypes,
                param_name="input_gradients")
    check_dtype(feats_dtype, supported_dtypes,
                param_name="input_features")

    grads_shape, feats_shape = \
        refine_shapes_for_broadcast(grads_shape, feats_shape)

    data_input_gradients = tvm.placeholder(grads_shape,
                                           name="data_input_gradients",
                                           dtype=grads_dtype)
    data_input_features = tvm.placeholder(feats_shape,
                                          name="data_input_features",
                                          dtype=feats_dtype)

    res = relu_grad_compute(data_input_gradients, data_input_features,
                            output_backprops, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    build_tensors = [data_input_gradients, data_input_features, res]
    te.lang.cce.cce_build_code(
        sch, {"name": kernel_name, "tensor_list": build_tensors})
def l1_loss_grad(grads, predict, label, y, reduction="mean",
                 kernel_name="l1_loss_grad"):
    """
    Validate the operator inputs, then schedule and build the kernel.

    Parameters
    ----------
    grads : dict
        shape and dtype of grad_out as input; dtype must be float16 or
        float32
    predict : dict
        shape and dtype of predict as input, compared with grads on
        "shape" and "dtype"
    label : dict
        shape and dtype of label as input, compared with grads on
        "shape" and "dtype"
    y : dict
        shape and dtype of output, forwarded to the compute function
    reduction: string
        reduction name, one of "none", "mean" or "sum",
        default value is "mean"
    kernel_name : str
        kernel name, default value is "l1_loss_grad"

    Returns
    -------
    None

    Raises
    ------
    RuntimeError
        when `reduction` is not one of the supported modes
    """
    dtype_list = ["float16", "float32"]
    reduction_list = ["none", "mean", "sum"]

    grads_data_type = grads.get("dtype").lower()
    grads_shape = grads.get("shape")
    predict_data_type = predict.get("dtype").lower()
    predict_shape = predict.get("shape")
    label_data_type = label.get("dtype").lower()
    label_shape = label.get("shape")

    op_utils.check_dtype(grads_data_type, dtype_list)
    op_utils.check_dtype(predict_data_type, dtype_list)
    op_utils.check_dtype(label_data_type, dtype_list)

    op_utils.check_shape(grads_shape)
    op_utils.check_shape(predict_shape)
    op_utils.check_shape(label_shape)

    util.compare_tensor_dict_key(grads, predict, "shape")
    util.compare_tensor_dict_key(grads, label, "shape")
    util.compare_tensor_dict_key(grads, predict, "dtype")
    util.compare_tensor_dict_key(grads, label, "dtype")

    if reduction not in reduction_list:
        raise RuntimeError("reduction should be one of ['none','mean','sum']")

    grads_input = tvm.placeholder(grads_shape, dtype=grads_data_type,
                                  name="grads")
    predict_input = tvm.placeholder(predict_shape,
                                    dtype=predict_data_type,
                                    name="predict")
    label_input = tvm.placeholder(label_shape, dtype=label_data_type,
                                  name="label")

    # Forward the caller's kernel_name; the original passed the literal
    # "l1_loss_grad" here and silently ignored a custom name.
    res = l1_loss_grad_compute(grads_input, predict_input, label_input, y,
                               reduction=reduction,
                               kernel_name=kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [grads_input, predict_input, label_input,
                              res]}
    te.lang.cce.cce_build_code(schedule, config)
def histogram_fixed_width_d(x, range, y, nbins, dtype="int32",
                            kernel_name='histogram_fixed_width_d'):
    """this operation returns a rank 1 histogram counting
    the number of entries in `values` that fell into every bin.
    The bins are equal width and determined by the arguments
    `value_range` and `nbins`.

    Validates the inputs, creates one-dimensional placeholders and builds
    the kernel with `tvm.build`.

    Parameters
    ----------
    x: dict
        dict info of input value, must include the keys(shape and dtype).
        dtype must be float16, float32 or int32.
    range: dict
        dict info of input value_range, must include the keys(shape and
        dtype); its dtype is compared with the dtype of `x`.
        the shape must be (2,) or [2]
    y: dict
        dict info of output, forwarded to the compute function
    nbins: int
        number of histogram bins, must be greater than 0.
    dtype: str
        data type for returned histogram; not read by this function.
    kernel_name: str
        cce kernel name, default value is "histogram_fixed_width_d"

    returns
    -------
    None

    Raises
    ------
    RuntimeError
        when the size computed for `range` is not 2 or `nbins` <= 0
    """
    x_shape = x.get("shape")
    range_shape = range.get("shape")
    dtype_input = x.get("dtype").lower()

    check_shape(x_shape, param_name="x")
    check_shape(range_shape, param_name="range")
    util.compare_tensor_dict_key(x, range, "dtype")

    # The values returned by check_tensor_shape_size are used as the
    # lengths of the flat placeholders (presumably the element counts;
    # confirm against util.check_tensor_shape_size).
    x_size = util.check_tensor_shape_size(list(x_shape))
    range_size = util.check_tensor_shape_size(list(range_shape))

    check_dtype(dtype_input, ("float16", "float32", "int32"),
                param_name="x")

    if range_size != 2:
        raise RuntimeError("the shape of range must be (2,) or [2]")
    if nbins <= 0:
        raise RuntimeError("the nbins must be > 0")

    data = tvm.placeholder([x_size], dtype=dtype_input, name="input_data")
    range_data = tvm.placeholder([range_size], dtype=dtype_input,
                                 name="input_range_data")

    res = histogram_fixed_width_d_compute(data, range_data, y, nbins,
                                          kernel_name)
    sch = tvm.create_schedule(res.op)

    build_args = [data, range_data, res]
    with build_config:
        tvm.build(sch, build_args, "cce", name=kernel_name)
def mse_loss_grad(predict, label, dout, grad, reduction="mean",
                  kernel_name="mse_loss_grad"):
    """
    Validate the operator inputs, then schedule and build the kernel.

    Parameters
    ----------
    predict : dict
        shape and dtype of input; dtype must be float16 or float32
    label : dict
        shape and dtype of input, compared with predict on "shape" and
        "dtype"
    dout : dict
        shape and dtype of input, compared with predict on "shape" and
        "dtype"
    grad : dict
        shape and dtype of output, forwarded to the compute function
    reduction : str
        reduce mode, can be 'mean','sum' or 'none'; not validated here,
        passed on to the compute function
    kernel_name : str
        kernel name, default value is "mse_loss_grad"

    Returns
    -------
    None
    """
    predict_shape = predict.get("shape")
    raw_predict_dtype = predict.get("dtype")
    label_shape = label.get("shape")
    dout_shape = dout.get("shape")

    input_dtype = raw_predict_dtype.lower()
    label_dtype = label.get("dtype").lower()
    dout_dtype = dout.get("dtype").lower()

    # Shapes are compared first, then dtypes, each time label before dout.
    for key in ("shape", "dtype"):
        util.compare_tensor_dict_key(predict, label, key)
        util.compare_tensor_dict_key(predict, dout, key)

    supported_dtypes = ("float16", "float32")
    for candidate in (input_dtype, label_dtype, dout_dtype):
        op_utils.check_dtype(candidate, supported_dtypes)
    for shape in (predict_shape, label_shape, dout_shape):
        op_utils.check_shape(shape)

    util.check_kernel_name(kernel_name)

    # All three placeholders use predict's lower-cased dtype.
    predict_input = tvm.placeholder(predict_shape, name="predict_input",
                                    dtype=input_dtype)
    label_input = tvm.placeholder(label_shape, name="label_input",
                                  dtype=input_dtype)
    dout_input = tvm.placeholder(dout_shape, name="dout_input",
                                 dtype=input_dtype)

    res = mse_loss_grad_compute(predict_input, label_input, dout_input,
                                grad, reduction, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    build_tensors = [predict_input, label_input, dout_input, res]
    te.lang.cce.cce_build_code(
        schedule, {"name": kernel_name, "tensor_list": build_tensors})