def _param_check(shape_x, dtype_x, axis, kernel_name):
    """check the input parameters

    Parameters
    ----------
    shape_x: list
        input shape
    dtype_x: str
        input dtype
    axis: int
        axis index
    kernel_name: str
        kernel name string

    Returns
    -------
    None
    """
    util.check_shape_rule(shape_x, max_dim=8)
    util.check_tensor_shape_size(shape_x)
    check_list = ("int32", "float32")
    util.check_dtype_rule(dtype_x.lower(), check_list)
    util.check_kernel_name(kernel_name)
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1],
    # channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] \
            and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric,
                                        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
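
# Illustrative sketch (not part of the op): how the unsigned quantization
# range above is derived from num_bits and narrow_range. The helper name
# _demo_quant_range is hypothetical; it only mirrors the arithmetic used in
# fake_quant_perchannel.
def _demo_quant_range(num_bits, narrow_range):
    """Return (quant_min, quant_max) for an unsigned num_bits range."""
    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1  # drop the lowest code point
    return quant_min, quant_max

# e.g. _demo_quant_range(8, False) == (0, 255)
#      _demo_quant_range(8, True) == (1, 255)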
def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx,
                                 num_bits, quant_delay, symmetric,
                                 narrow_range,
                                 kernel_name="fake_quant_with_min_max_grad"):
    """FakeQuantWithMinMaxGrad"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_with_min_max_grad_compute(dout_data, input_data,
                                               min_data, max_data, quant_min,
                                               quant_max, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
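
# Illustrative sketch (not part of the op): the signed/unsigned range
# selection used by the grad op above. _demo_signed_quant_range is a
# hypothetical helper that mirrors the symmetric branch.
def _demo_signed_quant_range(num_bits, symmetric, narrow_range):
    if symmetric:
        quant_min = -2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max

# e.g. _demo_signed_quant_range(8, True, True) == (-127, 127)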
def check_param_common(self):
    """
    Check parameters

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    util.check_kernel_name(self.kernel_name)
    util.check_shape_rule(self.indices_shape)
    util.check_shape_rule(self.grad_shape)
    util.check_shape_size(self.indices_shape, SHAPE_SIZE_LIMIT)
    util.check_shape_size(self.grad_shape, SHAPE_SIZE_LIMIT)

    check_list_indices_dtype = ("int32", "int64")
    util.check_dtype_rule(self.indices_dtype, check_list_indices_dtype)
    util.check_dtype_rule(self.grad_dtype, ("float32",))

    if self.grad_shape[1:] != self.var_shape[1:]:
        raise RuntimeError("grad's shape must be the same as var's shape"
                           " except the first dimension")
    if len(self.indices_shape) != 1:
        raise RuntimeError("indices must be one-dimensional")
    if self.grad_shape[0] != self.indices_shape[0]:
        raise RuntimeError("grad must have the same first dimension "
                           "as indices")
def fake_learned_scale_quant_perlayer_grad_d_reduce(
        dout_alpha, dalpha,
        kernel_name="fake_learned_scale_quant_perlayer_grad_d_reduce"):
    """FakeLearnedScaleQuantPerLayerGradDReduce"""
    dout_alpha_shape = dout_alpha.get("shape")
    dout_alpha_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(dout_alpha_shape)
    util.check_tensor_shape_size(dout_alpha_shape)

    check_list = ["float32", "float16"]
    dout_alpha_dtype = dout_alpha_dtype.lower()
    util.check_dtype_rule(dout_alpha_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, dout_alpha_shape[:]),)

    dout_alpha_data = tvm.placeholder(input_shape,
                                      name="dout_alpha",
                                      dtype=dout_alpha_dtype)
    res = fake_learned_scale_quant_perlayer_grad_d_reduce_compute(
        dout_alpha_data, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [dout_alpha_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
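
# Illustrative sketch: several ops in this file collapse an arbitrary shape
# into 1-D before building placeholders, exactly as above. functools_reduce
# is assumed to be functools.reduce imported at the top of this module (it
# is already used throughout this file); _demo_flatten_shape is hypothetical.
def _demo_flatten_shape(shape):
    """Collapse a shape like (2, 3, 4) into the 1-D shape (24,)."""
    return (functools_reduce(lambda x, y: x * y, shape[:]),)

# e.g. _demo_flatten_shape((2, 3, 4)) == (24,)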
def fake_quant_minmax_update(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, symmetric, narrow_range,
                             training, num_bits,
                             kernel_name="fake_quant_minmax_update"):
    """FakeQuantMinMaxUpdate op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_minmax_update_compute(input_data,
                                                min_data, max_data,
                                                ema, ema_decay, quant_min,
                                                quant_max, training,
                                                kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)
    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def minmax_update_perchannel(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]

    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data,
                                                min_data, max_data,
                                                ema, ema_decay, channel_axis)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)
    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def minmax_update_perlayer(x, min_val, max_val, min_up, max_up,
                           ema, ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)
    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):
    check_list = ("float16", "float32")
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()
    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)  # check the shape: 1 to 8 dimensions are allowed
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)  # check the size of the first input's shape
    util.check_dtype_rule(dtype_x1, check_list)  # check the input dtype

    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)

    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the dtypes of x1, x2 and x3 must be the same!")

    util.check_kernel_name(kernel_name)  # check the kernel_name

    # take the larger size of each dimension of shape_x1, shape_x2 and
    # shape_x3 as shape_max
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)  # check shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)  # check shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)  # broadcast shape_x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)  # broadcast shape_x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha,
                          kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}
    te.lang.cce.cce_build_code(schedule, config)
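
# Illustrative sketch (assumption: broadcast_shapes pads shapes to equal rank
# and takes the per-dimension maximum, numpy-style). _demo_broadcast_max is a
# hypothetical stand-in for computing shape_max, not the real helper.
def _demo_broadcast_max(shape_a, shape_b):
    rank = max(len(shape_a), len(shape_b))
    a = [1] * (rank - len(shape_a)) + list(shape_a)
    b = [1] * (rank - len(shape_b)) + list(shape_b)
    return [max(dim_a, dim_b) for dim_a, dim_b in zip(a, b)]

# e.g. _demo_broadcast_max((2, 1, 4), (3, 4)) == [2, 3, 4]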
def check_param(x, grad, argmax, y, ksize, strides, padding, dtype, dilation,
                ceil_mode, kernel_name):
    """
    check whether the parameters are valid; raise an error if any is invalid

    Parameters
    ----------
    x: dict, shape and dtype
    grad: dict, shape and dtype
    argmax: dict, shape and dtype
    y: dict, shape and dtype
    ksize: kernel or window size, minimum length is 4,
           like [1, poolingWindowH, poolingWindowW, 1]
    strides: stride, minimum length is 4,
             like [1, poolingStrideH, poolingStrideW, 1]
    padding: pad mode

    Returns
    -------
    None
    """
    y_shape = x.get("shape")
    y_dtype = x.get("dtype").lower()
    y_dtype_arg = y.get("dtype").lower()
    input_grad_shape = grad.get("shape")
    grad_dtype = grad.get("dtype").lower()
    argmax_shape = argmax.get("shape")
    argmax_dtype = argmax.get("dtype").lower()
    util.check_shape_rule(y_shape)
    util.check_shape_rule(input_grad_shape)
    util.check_shape_rule(argmax_shape)
    util.check_kernel_name(kernel_name)
    check_shape_5hd(y_shape)
    check_shape_5hd(input_grad_shape)
    util.check_tensor_shape_size(input_grad_shape)
    util.check_tensor_shape_size(argmax_shape)
    util.check_tensor_shape_size(y_shape)
    util.check_dtype_rule(grad_dtype, ("float16", "float32", "int32"))
    util.check_dtype_rule(argmax_dtype, ("uint16",))
    util.check_dtype_rule(y_dtype, ("float16", "float32", "int32"))
    if y_dtype != grad_dtype or y_dtype_arg != y_dtype:
        raise RuntimeError("The dtypes of the tensors must be the same")
    if dtype != DT_INT32 and dtype != DT_INT64:
        raise RuntimeError(
            "The dtype of the max indices must be int32 or int64")

    check_output_dim_with_ksize_stride(padding, input_grad_shape, y_shape,
                                       ksize, strides, dilation, ceil_mode)
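
# Illustrative sketch: the kind of relation check_output_dim_with_ksize_stride
# is expected to enforce. The formula below is the standard pooling output
# size for VALID padding with ceil_mode disabled; it is an assumption here,
# not a transcription of that helper.
def _demo_pool_out_dim(in_size, ksize, stride):
    return (in_size - ksize) // stride + 1

# e.g. a 112x112 map with a 3x3 window and stride 2:
# _demo_pool_out_dim(112, 3, 2) == 55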
def check_conv3d_dtype(fmp_dtype, w_dtype, res_dtype):
    """
    algorithm: check the input params of conv3d

    Parameters
    ----------
    fmp_dtype: the dtype of feature

    w_dtype: the dtype of filter

    res_dtype: the dtype of output

    Returns
    -------
    None
    """
    util.check_dtype_rule(fmp_dtype, ('float16',))
    util.check_dtype_rule(w_dtype, ('float16',))
    util.check_dtype_rule(res_dtype, ('float16',))
def conv_layer_fast_cce_para_check(shape_in, shape_w, in_dtype, w_dtype,
                                   res_dtype, padh, padw, strideh, stridew,
                                   bias, kernel_name):
    # conv shape check
    util.check_kernel_name(kernel_name)

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16'])
    util.check_dtype_rule(w_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])

    if not isinstance(bias, bool):
        raise RuntimeError("bias should be a bool.")

    if isinstance(padh, list):
        if len(padh) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padh is a list."
                               % PAD_SHAPE_DIM)
        pad_top = padh[0]
        pad_bottom = padh[1]
    else:
        pad_top = padh
        pad_bottom = padh

    if isinstance(padw, list):
        if len(padw) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padw is a list."
                               % PAD_SHAPE_DIM)
        pad_left = padw[0]
        pad_right = padw[1]
    else:
        pad_left = padw
        pad_right = padw

    shape_in, shape_w = te.lang.cce.check_conv_shape(
        shape_in, shape_w, pad_top, pad_bottom, pad_left, pad_right,
        strideh, stridew, in_dtype, w_dtype, res_dtype)

    return shape_in, shape_w
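
# Illustrative sketch of the padh/padw normalization above: a scalar pad is
# duplicated to both sides, while a list must have PAD_SHAPE_DIM (assumed
# here to be 2) entries. _demo_normalize_pad is hypothetical.
def _demo_normalize_pad(pad, pad_shape_dim=2):
    if isinstance(pad, list):
        if len(pad) != pad_shape_dim:
            raise RuntimeError("Dimension must be %d when pad is a list."
                               % pad_shape_dim)
        return pad[0], pad[1]
    return pad, pad

# e.g. _demo_normalize_pad(2) == (2, 2); _demo_normalize_pad([1, 2]) == (1, 2)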
def cheak(x, y1, y2, axis, kernel_name):
    """
    Function: Check parameters (e.g. shape, dtype etc).
    Modify : 2020-08-03
    """
    util.check_kernel_name(kernel_name)

    shape = y1.get("shape")
    dtype = y1.get("dtype").lower()
    util.check_dtype_rule(dtype, ("float16",))
    util.check_shape_rule(shape)

    shape = y2.get("shape")
    dtype = y2.get("dtype").lower()
    util.check_dtype_rule(dtype, ("int32",))
    util.check_shape_rule(shape)

    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    util.check_dtype_rule(dtype, ("float16",))
    util.check_shape_rule(shape)

    if axis == -1:
        axis = len(shape) - 1
    if axis != len(shape) - 1:
        raise RuntimeError("axis must be the last dimension.")

    allnum = functools_reduce(lambda x, y: x * y, shape)
    num = shape[axis]
    if num > MAX_NUM:
        raise RuntimeError("Num in dim is too big (>%d)." % MAX_NUM)

    return shape, dtype, allnum, num
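
# Illustrative sketch of the negative-axis normalization above, where only
# the last dimension is accepted. _demo_normalize_axis is hypothetical.
def _demo_normalize_axis(axis, rank):
    if axis == -1:
        axis = rank - 1
    if axis != rank - 1:
        raise RuntimeError("axis must be the last dimension.")
    return axis

# e.g. _demo_normalize_axis(-1, 4) == 3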
def smooth_l1_loss_v2(predict, label, loss, sigma=1.0, reduction="mean",
                      kernel_name="smooth_l1_loss_v2"):
    """
    calculating data

    Parameters
    ----------
    predict : dict
        shape and dtype of input
    label : dict
        shape and dtype of input
    loss : dict
        shape and dtype of output, should be the same shape and type as input
    sigma : float
        sigma, default value is 1
    reduction : str
        type of result, default value is "mean"
    kernel_name : str
        kernel name, default value is "smooth_l1_loss_v2"

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)

    check_list = ("float16", "float32")

    shape_predict = predict.get("shape")
    dtype_predict = predict.get("dtype").lower()
    util.check_dtype_rule(dtype_predict, check_list)

    shape_label = label.get("shape")
    dtype_label = label.get("dtype").lower()
    util.check_dtype_rule(dtype_label, check_list)

    shape_loss = loss.get("shape")
    dtype_loss = loss.get("dtype").lower()
    util.check_dtype_rule(dtype_loss, check_list)

    util.check_shape_rule(shape_predict)
    util.check_shape_rule(shape_label)
    util.check_shape_rule(shape_loss)

    util.compare_tensor_dict_key(predict, label, "shape")

    check_list_reduction = ("none", "mean", "sum")
    reduction_type = reduction.lower()
    util.check_dtype_rule(reduction_type, check_list_reduction)

    input_predict = tvm.placeholder(shape_predict,
                                    name="predict",
                                    dtype=dtype_predict)
    input_label = tvm.placeholder(shape_label,
                                  name="label",
                                  dtype=dtype_label)
    res = smooth_l1_loss_v2_compute(input_predict, input_label, sigma,
                                    reduction_type)

    # auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # operator build
    config = {"name": kernel_name,
              "tensor_list": [input_predict, input_label, res]}
    te.lang.cce.cce_build_code(sch, config)
def fake_quant_perchannel_grad(dout, x, min_val, max_val, dx,
                               symmetric, narrow_range, num_bits,
                               channel_axis,
                               kernel_name="fake_quant_perchannel_grad"):
    """FakeQuantPerChannelGrad"""
    x_shape = x.get("shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis == 1:
        shape_c = min_val.get("shape")

    dout_data = tvm.placeholder(x_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_grad_compute(dout_data, input_data,
                                             min_data, max_data,
                                             quant_min, quant_max,
                                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def addcmul(input_data, x1, x2, y, value=1.0, kernel_name="addcmul"):
    """
    algorithm: addcmul
    calculating data's addcmul, y = input_data + value * (x1 * x2)

    Parameters
    ----------
    input_data : dict
        shape and dtype of first input, only support float16, float32,
        int32, int8, uint8
    x1 : dict
        shape and dtype of second input, only support float16, float32,
        int32, int8, uint8
    x2 : dict
        shape and dtype of third input, only support float16, float32,
        int32, int8, uint8
    y : dict
        shape and dtype of output, should be broadcast shape and type
        as input
    value : float
        scaling coefficient, default value is 1.0
    kernel_name : str
        cce kernel name, default value is addcmul

    Returns
    -------
    None
    """
    shape_input = input_data.get("shape")
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_input = input_data.get("dtype").lower()
    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_input)
    util.check_shape_size(shape_input, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x1)
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_input, check_list)
    util.check_dtype_rule(dtype_x1, check_list)
    util.check_dtype_rule(dtype_x2, check_list)
    if dtype_input != dtype_x1 or dtype_input != dtype_x2:
        raise RuntimeError("the dtypes of input_data, x1 and x2 must be "
                           "the same")

    shape_x1, shape_x2, shape_max1 = broadcast_shapes(shape_x1, shape_x2)
    util.check_tensor_shape_size(shape_max1)
    shape_input, _, shape_max = broadcast_shapes(shape_input, shape_max1)
    util.check_tensor_shape_size(shape_max)
    shape_x1, _, _ = broadcast_shapes(shape_x1, shape_max)
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)

    data_input = tvm.placeholder(shape_input, dtype=dtype_input,
                                 name="data_input")
    data_x1 = tvm.placeholder(shape_x1, dtype=dtype_x1, name="data_x1")
    data_x2 = tvm.placeholder(shape_x2, dtype=dtype_x2, name="data_x2")

    res = addcmul_compute(data_input, data_x1, data_x2, shape_max, y, value,
                          kernel_name="addcmul")

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_input, data_x1, data_x2, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)
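
# Illustrative sketch of the element-wise formula the compute implements
# (as stated in the docstring above), shown on plain Python scalars.
# _demo_addcmul is hypothetical.
def _demo_addcmul(a, x1, x2, value=1.0):
    return a + value * (x1 * x2)

# e.g. _demo_addcmul(1.0, 2.0, 3.0, value=0.5) == 4.0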
def check_conv3dbp_input_params(shape_filter,  # pylint:disable=R0913,R0914,R0915
                                shape_out_backprop, input_sizes, strides,
                                pads, dilations, filter_dtype,
                                out_backprop_dtype, res_dtype, kernel_name):
    """
    The params check function of conv3d backprop input

    Parameters:
    -------------------------
    shape_filter : The shape of filter.
        5-D with shape [depth, height, width, channels, batch].

    shape_out_backprop : The shape of gradients.
        5-D with shape [batch, depth, height, width, channels].

    input_sizes : The shape of feature map.
        5-D with shape [batch, depth, height, width, channels].

    strides : A list of ints. The stride of the sliding window.

    pads : A list of ints.

    dilations : An optional list of ints. Only support [1, 1, 1, 1, 1] now.

    filter_dtype : The dtype of filter data. Default value is float16.

    out_backprop_dtype : The dtype of gradients data. Default value is
        float16.

    res_dtype : The dtype of result(De/Dx) data. Default value is float16.

    kernel_name : Cce kernel name.
        Default value is "conv3d_backprop_input_cce"

    Returns : All transformed params.
    """
    def _check_attr_range(attr_name, attr_value, attr_min, attr_max):
        if attr_value < attr_min or attr_value > attr_max:
            dict_args = {
                'errCode': 'E60011',
                'range': '[{},{}]'.format(attr_min, attr_max),
                'attr_name': attr_name,
                'value': str(attr_value)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype is None:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        else:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            dict_args = {
                'errCode': 'E60020',
                'attr_name': attr_name,
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_l1_limitation():
        block_size = 16
        w_value = dedy_w * stride_w
        if fmap_w > block_size:
            h_value_max = filter_h_dilation + 1
        elif block_size % fmap_w == 0:
            h_value_max = filter_h_dilation + block_size // fmap_w - 1
        else:
            h_value_max = filter_h_dilation + block_size // fmap_w + 1

        a_l1_size = h_value_max * w_value * \
            ((filter_d_dilation - 2) // stride_d + 2) * block_size * 2
        b_l1_size = filter_h_dilation * filter_w_dilation * \
            filter_d_dilation * block_size * block_size * 2
        l1_size = get_soc_spec("L1_SIZE")
        if (a_l1_size + b_l1_size) > l1_size:
            dict_args = {'errCode': 'E60022'}
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_shape_error():
        fmap_h_padding = fmap_h + pad_up + pad_down
        fmap_w_padding = fmap_w + pad_left + pad_right
        fmap_d_padding = fmap_deep + pad_head + pad_tail

        if fmap_channel != filter_channel:
            dict_args = {
                'errCode': 'E60108',
                'reason': "Shape error: Fmap's C must be equal to "
                          "Filter's C."
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if dedy_channel != filter_batch:
            dict_args = {
                'errCode': 'E60108',
                'reason': "Shape error: Dedy's C must be equal to "
                          "Filter's N."
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if fmap_batch != dedy_batch:
            dict_args = {
                'errCode': 'E62503',
                'backprop_N': str(dedy_batch),
                'forward_shape': str(fmap_batch)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_h_dilation > fmap_h_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'H',
                'filter_dila': str(filter_h_dilation),
                'input_pad': str(fmap_h_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_w_dilation > fmap_w_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'W',
                'filter_dila': str(filter_w_dilation),
                'input_pad': str(fmap_w_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_d_dilation > fmap_d_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'D',
                'filter_dila': str(filter_d_dilation),
                'input_pad': str(fmap_d_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down)
                // stride_h + 1) != dedy_h:
            dict_args = {'errCode': 'E60024'}
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_w - filter_w_dilation + pad_left + pad_right)
                // stride_w + 1) != dedy_w:
            dict_args = {'errCode': 'E60025'}
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_deep - filter_d_dilation + pad_head + pad_tail)
                // stride_d + 1) != dedy_deep:
            dict_args = {'errCode': 'E62508'}
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    # Base check, mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_filter, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(input_sizes, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    # pads check
    if isinstance(pads, (tuple, list)) and \
            len(pads) != CONV_BACKPROP_PAD_SHAPE_DIM:
        dict_args = {
            'errCode': 'E62501',
            'param_name': 'pads',
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))
    if isinstance(pads, str) and pads not in ['SAME', 'VALID']:
        dict_args = {
            'errCode': 'E60000',
            'param_name': 'pads',
            'expected_value': 'SAME or VALID',
            'input_value': str(pads),
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))

    # dilations check
    util.check_shape_rule(dilations, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_d, dilation_h, dilation_w, dilation_c = dilations
    if dilation_n != 1 or dilation_c != 1:
        dict_args = {
            'errCode': 'E60023',
            'dilation_n': str(dilation_n),
            'dilation_c': str(dilation_c),
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))

    # dtype check
    filter_dtype = filter_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(filter_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])

    # the relation limits between shapes
    shape_filter = list(shape_filter)
    shape_out_backprop = list(shape_out_backprop)
    input_sizes = list(input_sizes)
    strides = list(strides)
    fmap_batch, fmap_deep, fmap_h, fmap_w, fmap_channel = input_sizes
    dedy_batch, dedy_deep, dedy_h, dedy_w, dedy_channel = shape_out_backprop
    filter_depth, filter_h, filter_w, filter_channel, filter_batch = \
        shape_filter
    _, stride_d, stride_h, stride_w, _ = strides

    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1
    filter_d_dilation = (filter_depth - 1) * dilation_d + 1

    if pads == 'SAME':
        pad_h = align(fmap_h, stride_h) - stride_h + filter_h - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pad_w = align(fmap_w, stride_w) - stride_w + filter_w - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_d = align(fmap_deep, stride_d) \
            - stride_d + filter_depth - fmap_deep
        pad_d = max(pad_d, 0)
        pad_head = pad_d // 2
        pad_tail = pad_d - pad_head
        pads = [pad_head, pad_tail, pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    # pads compute
    pads = list(pads)
    pad_head, pad_tail, pad_up, pad_down, pad_left, pad_right = pads
    fmap_h_padding = fmap_h + pad_up + pad_down
    fmap_w_padding = fmap_w + pad_left + pad_right

    # special cases
    dey_hw_min, fmap_hw_min = DEDY_HW_MIN, FMAP_HW_MIN
    # limitation by chip:
    # if kernel h,w in [1,11] and fmap h/w after padding equals filter h/w,
    # load3d supports h,w being 1
    if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
        dey_hw_min = 1
        fmap_hw_min = 1

    _check_shape_error()
    _check_l1_limitation()

    # Dedy value limit
    _check_attr_range("Dedy's H after expands", dedy_h * stride_h,
                      dey_hw_min, DEDY_HW_MAX)
    _check_attr_range("Dedy's W after expands", dedy_w * stride_w,
                      dey_hw_min, DEDY_HW_MAX)

    # filter value limit
    _check_attr_range("filter's H", filter_h, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range("filter's W", filter_w, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range("filter's D", filter_depth, FILTER_HW_MIN,
                      FILTER_D_MAX)
    _check_attr_range("filter H*W", filter_h * filter_w, FILTER_HW_MIN,
                      FILTER_HW_SIZE)
    _check_attr_range("filter H*W*D", filter_h * filter_w * filter_depth,
                      FILTER_HW_MIN, KHWD_COEFF)

    # Fmap value limit
    _check_attr_range("Fmap's H", fmap_h, fmap_hw_min, FMAP_HW_MAX)
    _check_attr_range("Fmap's W", fmap_w, fmap_hw_min, FMAP_HW_MAX)

    # stride value limit
    _check_attr_range("stride's H", stride_h, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range("stride's W", stride_w, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range("stride's H*W", stride_h * stride_w, STRIDE_HW_MIN,
                      STRIDE_SIZE_MAX)
    _check_attr_range("stride's H*W*D", stride_h * stride_w * stride_d,
                      STRIDE_HW_MIN, STRIDE_SIZE_HWD_MAX)

    # check shape size, 64 bits limitation
    # ===========================================================
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * align(fmap_channel, c0_size) \
        * fmap_deep * fmap_h * fmap_w
    dedy_size = dedy_batch * align(dedy_channel, c0_size) \
        * dedy_deep * dedy_h * dedy_w
    filter_size = align(filter_batch, c0_size) * \
        align(filter_channel, c0_size) * filter_depth * filter_h * filter_w
    _check_64bits_limitation("input", fmap_size, dtype=res_dtype)
    _check_64bits_limitation("out_backprop", dedy_size,
                             dtype=out_backprop_dtype)
    _check_64bits_limitation("filter", filter_size, dtype=filter_dtype)

    result = (shape_filter, shape_out_backprop, input_sizes, strides, pads,
              dilations, filter_dtype, out_backprop_dtype, res_dtype,
              kernel_name)
    return result
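
# Illustrative sketch of the SAME-padding arithmetic used above: the total
# pad is whatever makes the output grid cover the stride-aligned input, and
# it is split with the smaller half in front. `align(x, a)` is assumed to
# round x up to a multiple of a, as in the helper used above.
def _demo_same_pad(fmap, stride, kernel):
    def align(x, a):
        return (x + a - 1) // a * a
    pad = max(align(fmap, stride) - stride + kernel - fmap, 0)
    return pad // 2, pad - pad // 2  # (front/top, back/bottom)

# e.g. _demo_same_pad(fmap=7, stride=2, kernel=3) == (1, 1)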
def fake_learned_scale_quant_perchannel_grad_d(
        dout, input_x, alpha, quant_max, dx, dalpha, neg_trunc, channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
    """FakeLearnedScaleQuantPerChannelGradD"""
    input_shape = input_x.get("shape")
    input_x_shape_ = input_x.get("ori_shape")
    input_x_format = input_x.get("format")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1],
    # channel_axis_ needs to change to 1.
    if channel_axis == 0 and input_x_shape_[0] != alpha_shape[0] \
            and input_x_shape_[1] == alpha_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, input_x_shape_[channel_axis_])
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    shape_c = [1] * len(input_shape)
    shape_c[channel_axis_] = alpha.get("ori_shape")[0]
    if input_x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = alpha.get("shape")

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=input_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(shape_c, name="alpha_data",
                                 dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perchannel_grad_d_compute(
        dout_data, input_data, alpha_data, quant_max_data, neg_trunc,
        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [dout_data, input_data, alpha_data, quant_max_data] \
        + list(res)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def batchnorm_fold(x, x_sum, x_square_sum, mean, variance,
                   y, batch_mean, batch_std, running_mean, running_std,
                   mean_updated, variance_updated,
                   momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0,
                   data_format="NCHW", kernel_name="batchnorm_fold"):
    """batchnorm_fold TBE op"""
    momentum = 1.0 - momentum
    util.check_kernel_name(kernel_name)
    data_format = data_format.upper()
    if data_format != "NCHW":
        raise RuntimeError("The data_format only supports NCHW")

    shape_x = x.get("shape")
    shape_mean = mean.get("shape")
    shape_variance = variance.get("shape")
    dtype_x = x.get("dtype")
    dtype_mean = mean.get("dtype")
    dtype_variance = variance.get("dtype")
    for shape in (shape_x, shape_mean, shape_variance):
        util.check_shape_rule(shape)
        util.check_tensor_shape_size(shape)
    check_tuple = ("float16", "float32")
    for dtype in (dtype_x, dtype_mean, dtype_variance):
        util.check_dtype_rule(dtype.lower(), check_tuple)

    format_data = x.get("format").upper()
    if format_data not in ("NCHW", "NC1HWC0"):
        raise RuntimeError("Format of input only supports 4D and 5HD")

    if format_data == "NC1HWC0":
        if len(shape_x) != 5:
            raise RuntimeError("batchnorm_fold only supports 5D shape "
                               "when input format is NC1HWC0")
        shape_mean = (1, shape_x[1], 1, 1, shape_x[4])
    elif format_data == "NCHW":
        if len(shape_x) < 2 or len(shape_x) > 4:
            raise RuntimeError("batchnorm_fold only supports 2D to 4D shape")
        if shape_x[1] != shape_mean[0]:
            raise RuntimeError("data_format is NCHW, shape_mean must "
                               "be equal to the second axis of shape_x")
        shape_mean = (1, shape_x[1],)
        for _ in range(2, len(shape_x)):
            shape_mean = shape_mean + (1,)

    x_input = tvm.placeholder(shape_x, name="x_input", dtype=dtype_x.lower())
    x_sum = tvm.placeholder(shape_mean, name="x_sum", dtype=dtype_x.lower())
    x_square_sum = tvm.placeholder(shape_mean, name="x_square_sum",
                                   dtype=dtype_x.lower())
    mean = tvm.placeholder(shape_mean, name="mean", dtype=dtype_mean.lower())
    variance = tvm.placeholder(shape_mean, name="variance",
                               dtype=dtype_variance.lower())

    shape_x = te.lang.cce.util.shape_to_list(x_input.shape)
    num = shape_x[0] * shape_x[2] * shape_x[3]
    num_rec = 1.0 / num

    # compute the mean of x
    batch_mean = te.lang.cce.vmuls(x_sum, num_rec)

    # compute the variance of x
    variance_div = te.lang.cce.vmuls(x_square_sum, num_rec)
    mean_square = te.lang.cce.vmul(batch_mean, batch_mean)
    batch_var_biased = te.lang.cce.vsub(variance_div, mean_square)
    if num == 1:
        batch_var_scaler = 0.0
    else:
        batch_var_scaler = float(num) / (num - 1)
    batch_variance = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler)
    batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_variance, epsilon))

    factor = 1.0 - momentum
    factor_reverse = momentum
    mean_mul = te.lang.cce.vmuls(batch_mean, factor)
    mean_mul_rev = te.lang.cce.vmuls(mean, factor_reverse)
    mean_updated = te.lang.cce.vadd(mean_mul, mean_mul_rev)

    var_mul = te.lang.cce.vmuls(batch_variance, factor)
    var_mul_rev = te.lang.cce.vmuls(variance, factor_reverse)
    variance_updated = te.lang.cce.vadd(var_mul, var_mul_rev)

    y = te.lang.cce.vadds(x_input, 0.0)
    running_mean = te.lang.cce.vadds(mean, 0.0)
    running_std = te.lang.cce.vsqrt(te.lang.cce.vadds(variance, epsilon))
    res = [y, batch_mean, batch_std, running_mean, running_std,
           mean_updated, variance_updated]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name,
              "tensor_list": [x_input, x_sum, x_square_sum, mean,
                              variance] + res}
    te.lang.cce.cce_build_code(sch, config)
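
# Illustrative sketch (plain Python, hypothetical helper) of the statistics
# chain above: batch mean/variance recovered from running sums, the unbiased
# correction num/(num-1) (zeroed when num == 1, as in the code), and the EMA
# update. Note the `momentum = 1.0 - momentum` flip above makes the net
# weighting: momentum * batch_stat + (1 - momentum) * running_stat.
def _demo_bn_fold_stats(x_sum, x_square_sum, num, mean, variance,
                        momentum=0.9):
    batch_mean = x_sum / num
    batch_var_biased = x_square_sum / num - batch_mean ** 2
    scaler = float(num) / (num - 1) if num > 1 else 0.0
    batch_variance = batch_var_biased * scaler
    mean_updated = momentum * batch_mean + (1.0 - momentum) * mean
    variance_updated = momentum * batch_variance \
        + (1.0 - momentum) * variance
    return batch_mean, batch_variance, mean_updated, variance_updated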
def correction_mul_grad(dout, x, batch_std, running_std, dx, d_batch_std,
                        channel, kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = dout.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")
    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()
    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float32",))
    util.check_dtype_rule(inp_dtype_running_std, ("float32",))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(batch_std, d_batch_std, "shape")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std",
                                  dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std",
                                    dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t,
                                           running_std_t, channel,
                                           data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)
    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def check_conv3dbp_filter_params(shape_x, shape_out_backprop, filter_sizes,
                                 strides, pads, dilations, x_dtype,
                                 out_backprop_dtype, res_dtype, kernel_name):
    """
    The params check function of conv3d_backprop_filter

    Parameters:
    ----------
    shape_x : The shape of feature map,
        which is 5-D [batch, depth, channels, height, width].

    shape_out_backprop : The shape of gradients,
        which is 5-D [batch, depth, channels, height, width].

    filter_sizes : The shape of filter,
        which is 5-D [batch, depth, channels, height, width].

    strides : The stride of the sliding window. A list of ints.

    pads : "SAME" or "VALID", indicating the type of pads algorithm to use,
        or a list.

    dilations : An optional list of ints. Default value is [1, 1, 1, 1, 1].

    x_dtype : Feature map data dtype. Default value is float16.

    out_backprop_dtype : Gradients data dtype. Default value is float16.

    res_dtype : Result(De/Dw) data dtype. Default value is float32.

    kernel_name : Kernel name of cce.
        Default value is "conv3d_backprop_filter_cce"

    Returns : All transformed params.
    ----------
    """
    def _check_attr_range_dw(name, value, attr_min=None, attr_max=None):
        if not attr_min and not attr_max:
            return
        if not attr_min:
            if value > attr_max:
                args_dict = {
                    'errCode': 'E60011',
                    'range': '(,{}]'.format(attr_max),
                    'attr_name': name,
                    'value': str(value)
                }
                raise RuntimeError(args_dict,
                                   err_mana.get_error_message(args_dict))
        elif not attr_max:
            if value < attr_min:
                args_dict = {
                    'errCode': 'E60011',
                    'range': '[{},)'.format(attr_min),
                    'attr_name': name,
                    'value': str(value)
                }
                raise RuntimeError(args_dict,
                                   err_mana.get_error_message(args_dict))
        elif value > attr_max or value < attr_min:
            args_dict = {
                'errCode': 'E60011',
                'range': '[{},{}]'.format(attr_min, attr_max),
                'attr_name': name,
                'value': str(value)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        else:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            args_dict = {'errCode': 'E60020', 'attr_name': attr_name}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    # First : Base check, mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(filter_sizes, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    def _check_attr_pads():
        # pads check
        if isinstance(pads, (tuple, list)) and \
                len(pads) != PADDING_SHAPE_DIM:
            args_dict = {'errCode': 'E62501', 'param_name': 'pads'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

        if isinstance(pads, str) and pads not in PADDING_SUPPORT:
            args_dict = {
                'errCode': 'E60021',
                'expected_pad_mode': '[{}]'.format(PADDING_SUPPORT),
                'actual_pad_mode': str(pads)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _check_attr_pads()

    # dilations check
    util.check_shape_rule(dilations, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_d, dilation_c, dilation_h, dilation_w = dilations
    _check_attr_range_dw("dilations's H", dilation_h, DILATION_MIN,
                         DILATION_MAX)
    _check_attr_range_dw("dilations's W", dilation_w, DILATION_MIN,
                         DILATION_MAX)

    if dilation_n != 1 or dilation_c != 1:
        args_dict = {
            'errCode': 'E60023',
            'dilation_n': str(dilation_n),
            'dilation_c': str(dilation_c)
        }
        raise RuntimeError(args_dict,
                           err_mana.get_error_message(args_dict))

    # dtype check
    x_dtype = x_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(x_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float32', 'float16'])

    # Second : Further check, mainly required by SRS
    # ===========================================================
    # the relation limits between shapes
    shape_x = list(shape_x)
    shape_out_backprop = list(shape_out_backprop)
    filter_sizes = list(filter_sizes)
    strides = list(strides)
    fmap_batch, fmap_d, fmap_channel, fmap_h, fmap_w = shape_x
    dedy_batch, dedy_d, dedy_channel, dedy_h, dedy_w = shape_out_backprop
    filter_batch, filter_d, filter_channel, filter_h, filter_w = filter_sizes
    stride_d, stride_h, stride_w = strides

    filter_d_dilation = (filter_d - 1) * dilation_d + 1
    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1

    # pads compute
    if pads == 'SAME':
        pad_d = \
            align(fmap_d, stride_d) - stride_d + filter_d_dilation - fmap_d
        pad_d = max(pad_d, 0)
        pad_front = pad_d // 2
        pad_back = pad_d - pad_front
        pad_w = \
            align(fmap_w, stride_w) - stride_w + filter_w_dilation - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_h = \
            align(fmap_h, stride_h) - stride_h + filter_h_dilation - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pads = [pad_front, pad_back, pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    pads = list(pads)
    pad_front, pad_back, pad_up, pad_down, pad_left, pad_right = pads
    if pad_front >= filter_d_dilation or pad_back >= filter_d_dilation:
        args_dict = {
            'errCode': 'E60013',
            'depth_of_pad': '{}, {}'.format(pad_front, pad_back),
            'depth_of_filter': '{}'.format(filter_d_dilation)
        }
        raise RuntimeError(args_dict,
                           err_mana.get_error_message(args_dict))
    if pad_up >= filter_h_dilation or pad_down >= filter_h_dilation:
        args_dict = {
            'errCode': 'E60016',
            'h_of_filter': '{}'.format(filter_h_dilation),
            'h_of_pad': '{}, {}'.format(pad_up, pad_down)
        }
        raise RuntimeError(args_dict,
                           err_mana.get_error_message(args_dict))
    if pad_left >= filter_w_dilation or pad_right >= filter_w_dilation:
        args_dict = {
            'errCode': 'E60017',
            'w_of_filter': '{}'.format(filter_w_dilation),
            'w_of_pad': '{}, {}'.format(pad_left, pad_right)
        }
        raise RuntimeError(args_dict,
                           err_mana.get_error_message(args_dict))

    fmap_w_padding = fmap_w + pad_left + pad_right
    fmap_h_padding = fmap_h + pad_up + pad_down

    # special cases
    fmap_hw_min, dey_hw_min = FMAP_HW_MIN, DEDY_HW_MIN
    # limitation by chip:
    # if kernel h,w in [1,11] and fmap h/w after padding equals filter h/w,
    # load3d supports h,w being 1
    if (1 <= filter_w <= 11) and (1 <= filter_h <= 11) \
            and (1 <= filter_d <= 11) \
            and (fmap_w_padding == filter_w or fmap_h_padding == filter_h):
        fmap_hw_min = 1
        dey_hw_min = 1

    # Dedy value limit
    _check_attr_range_dw("Dedy's H", dedy_h, dey_hw_min, DEDY_HW_MAX)
    _check_attr_range_dw("Dedy's W", dedy_w, dey_hw_min, DEDY_HW_MAX)

    # filter value limit
    _check_attr_range_dw("filter's H", filter_h, FILTER_HW_MIN,
                         FILTER_HW_MAX)
    _check_attr_range_dw("filter's W", filter_w, FILTER_HW_MIN,
                         FILTER_HW_MAX)

    # Fmap value limit
    _check_attr_range_dw("Fmap's H", fmap_h, fmap_hw_min, FMAP_HW_MAX)
    _check_attr_range_dw("Fmap's W", fmap_w, fmap_hw_min, FMAP_HW_MAX)

    # stride value limit
    _check_attr_range_dw("stride's H", stride_h, STRIDE_HW_MIN,
                         STRIDE_HW_MAX)
    _check_attr_range_dw("stride's W", stride_w, STRIDE_HW_MIN,
                         STRIDE_HW_MAX)

    def _check_axis_hw():
        if fmap_batch != dedy_batch:
            args_dict = {
                'errCode': 'E62503',
                'backprop_N': str(dedy_batch),
                'forward_shape': str(fmap_batch)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if dedy_channel != filter_batch:
            args_dict = {
                'errCode': 'E62504',
                'backprop_C': str(dedy_channel),
                'forward_shape': str(filter_batch)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if fmap_channel != filter_channel:
            args_dict = {
                'errCode': 'E60010',
                'channel_of_x': str(fmap_channel),
                'channel_of_filter': str(filter_channel)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if filter_w_dilation > fmap_w_padding:
            args_dict = {
                'errCode': 'E60015',
                'w_of_x': str(fmap_w_padding),
                'w_of_filter': str(filter_w_dilation)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if filter_h_dilation > fmap_h_padding:
            args_dict = {
                'errCode': 'E60014',
                'h_of_x': str(fmap_h_padding),
                'h_of_filter': str(filter_h_dilation)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

        # Third : value check, mainly required by the convolution rule
        if ((fmap_w - filter_w_dilation + pad_left + pad_right)
                // stride_w + 1) != dedy_w:
            args_dict = {'errCode': 'E60025'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down)
                // stride_h + 1) != dedy_h:
            args_dict = {'errCode': 'E60024'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _check_axis_hw()

    def _min_l1_byte():
        # Fourth : L1 limitation, mainly required by chip
        al1_min_byte = C0 * C0 * 2
        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * fmap_w * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * fmap_w * C0 * 2
        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            args_dict = {'errCode': 'E60022'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _min_l1_byte()

    # Fifth : check shape size, 64 bits limitation
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * fmap_d * align(fmap_channel, c0_size) \
        * fmap_h * fmap_w
    dedy_size = dedy_batch * dedy_d * align(dedy_channel, c0_size) \
        * dedy_h * dedy_w
    filter_size = \
        align(filter_batch, c0_size) * filter_d \
        * align(filter_channel, c0_size) * filter_h * filter_w
    _check_64bits_limitation("fmap_size", fmap_size, dtype=x_dtype)
    _check_64bits_limitation("dedy_size", dedy_size,
                             dtype=out_backprop_dtype)
    _check_64bits_limitation("filter_size", filter_size, dtype=res_dtype)

    result = (shape_x, shape_out_backprop, filter_sizes, strides, pads,
              dilations, x_dtype, out_backprop_dtype, res_dtype,
              kernel_name)
    return result
def check_param(self, var_out):
    """
    Check parameters

    Parameters
    ----------
    var_out: dict
        data of input
        datatype supports float32, float16, int32, int8, uint8

    Returns
    -------
    None
    """
    var_out_shape = var_out.get("shape")
    var_out_dtype = var_out.get("dtype").lower()
    if var_out_dtype == "bool":
        var_out_dtype = "int8"
    util.check_kernel_name(self.kernel_name)
    util.check_shape_rule(self.var_shape)
    util.check_shape_rule(self.indices_shape)
    util.check_shape_rule(self.updates_shape)
    util.check_tensor_shape_size(self.var_shape)
    util.check_tensor_shape_size(self.indices_shape)
    util.check_tensor_shape_size(self.updates_shape)
    util.check_tensor_shape_size(var_out_shape)

    check_list_indices = ("int32",)
    util.check_dtype_rule(self.indices_dtype, check_list_indices)
    check_list_var = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(self.var_dtype, check_list_var)
    util.check_dtype_rule(self.updates_dtype, check_list_var)
    util.check_dtype_rule(var_out_dtype, check_list_var)

    if (self.updates_dtype != self.var_dtype
            or var_out_dtype != self.var_dtype):
        raise RuntimeError(
            "dtype updates:{} var_out:{} must be the same as var:{}".format(
                self.updates_dtype, var_out_dtype, self.var_dtype))

    if var_out_shape != self.var_shape:
        raise RuntimeError(
            "var_out's shape:{} must be the same as var's shape:{}".format(
                var_out_shape, self.var_shape))

    # updates do not support broadcasting to var currently
    if self.var_shape != self.updates_shape:
        raise RuntimeError(
            "var's shape:{} must be the same as updates's shape:{}".format(
                self.updates_shape, self.var_shape))

    if self.axis >= len(self.updates_shape):
        raise RuntimeError(
            "axis:{} must be in range of updates shape:{} len:{}".format(
                self.axis, self.updates_shape, len(self.updates_shape)))

    # indices must not be empty
    if len(self.indices_shape) != 1:
        raise RuntimeError(
            "indices_shape:{} len:{} must be 1".format(
                self.indices_shape, len(self.indices_shape)))
    if self.indices_shape[0] != self.updates_shape[self.axis]:
        raise RuntimeError(
            "indices:{} != updates.size(axis({})):{}".format(
                len(self.indices_shape), self.axis,
                self.updates_shape[self.axis]))

    # indices are currently cut into slices that must fit in UB
    if (self.indices_dtype_bytes_size * self.indices_num) \
            > (self.ub_size_bytes * 8 // 10):
        raise RuntimeError(
            "indices num:{} larger than ub size:{}".format(
                self.indices_num, self.ub_size_bytes))
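
# Illustrative sketch of the UB budget check above: the indices tensor must
# fit in 80% of the unified buffer (the `* 8 // 10` factor). The helper and
# the example values below are hypothetical.
def _demo_indices_fit_ub(indices_num, dtype_bytes, ub_size_bytes):
    return (dtype_bytes * indices_num) <= (ub_size_bytes * 8 // 10)

# e.g. _demo_indices_fit_ub(indices_num=1024, dtype_bytes=4,
#                           ub_size_bytes=256 * 1024) == True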
def fake_quant_min_max_per_channel_update(
        x, min_val, max_val, min_up, max_up,
        ema, ema_decay, symmetric, narrow_range, training, num_bits,
        channel_axis,
        kernel_name="fake_quant_min_max_per_channel_update"):
    """FakeQuantMinMaxPerChannelUpdate op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = fake_quant_min_max_per_channel_update_compute(
        input_data, min_data, max_data,
        ema, ema_decay, quant_min, quant_max, training, channel_axis,
        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)
    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def conv_layer_cce_para_check(shape_in, shape_w, in_dtype, w_dtype, res_dtype,
                              padh, padw, strideh, stridew, quantize_config,
                              scale_sqrt, scale_q_dtype, offset_q_dtype,
                              scale_dq_dtype, scale_rq_dtype, offset_rq_dtype,
                              offset_w_dtype, offset_pad_dtype, bias,
                              kernel_name):
    # conv shape check
    util.check_kernel_name(kernel_name)

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(w_dtype, ['float16', 'int8', 'uint8'])
    res_dtype_list = ['float16', 'int8', 'uint8']
    if is_v200_version():
        res_dtype_list.append('int32')
    util.check_dtype_rule(res_dtype, res_dtype_list)
    util.check_dtype_rule(scale_q_dtype, ['float16'])
    util.check_dtype_rule(offset_q_dtype, ['float16'])
    util.check_dtype_rule(scale_dq_dtype, ['float16'])
    util.check_dtype_rule(scale_rq_dtype, ['float16'])
    util.check_dtype_rule(offset_rq_dtype, ['float16'])
    util.check_dtype_rule(offset_w_dtype, ['int32'])
    util.check_dtype_rule(offset_pad_dtype, ['uint8'])

    if not isinstance(bias, bool):
        raise RuntimeError("bias dtype should be bool.")

    if quantize_config[0] == 0:
        if is_v200_version():
            util.check_dtype_rule(in_dtype, ('int8',))
            util.check_dtype_rule(w_dtype, ('int8',))
            util.check_dtype_rule(res_dtype, ('int32',))
        else:
            util.check_dtype_rule(in_dtype, ['float16'])
            util.check_dtype_rule(w_dtype, ['float16'])
            util.check_dtype_rule(res_dtype, ['float16'])

    if quantize_config[0] == 1:
        util.check_dtype_rule(w_dtype, ['int8'])
        if quantize_config[1] == 0:
            util.check_dtype_rule(in_dtype, ['int8', 'float16'])
            util.check_dtype_rule(res_dtype, ['int8', 'float16'])
        elif quantize_config[1] == 1:
            util.check_dtype_rule(in_dtype, ['uint8', 'float16'])
            util.check_dtype_rule(res_dtype, ['uint8', 'float16'])
        elif quantize_config[1] == 2:
            raise RuntimeError("All offset mode quantize is not supported.")
        else:
            raise RuntimeError("Invalid quantize algorithm.")

    # quantize switch on
    if quantize_config[0] == 1:
        quantize_turn_on = True
        # quantize -> DeQuantize dataflow
        if in_dtype == 'float16' and w_dtype == 'int8' \
                and res_dtype == 'float16':
            pass
        # DeQuantize dataflow
        elif (in_dtype in ['int8', 'uint8'] and w_dtype == 'int8'
              and res_dtype == 'float16'):
            pass
        # quantize -> ReQuantize dataflow
        elif (in_dtype == 'float16' and w_dtype == 'int8'
              and res_dtype in ['int8', 'uint8']):
            pass
        # ReQuantize dataflow
        elif (in_dtype in ['int8', 'uint8'] and w_dtype == 'int8'
              and res_dtype in ['int8', 'uint8']):
            pass
        else:
            raise RuntimeError("Unsupported in/out data type for quantize.")

        if quantize_config not in ([1, 0, 0], [1, 1, 0], [1, 0, 1], [1, 1, 1]):
            raise RuntimeError("Invalid Quantize Config.")
        if scale_sqrt not in ([0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0],
                              [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1]):
            raise RuntimeError("Invalid Quantize Config.")
    # quantize switch off
    elif quantize_config[0] == 0:
        if quantize_config != [0, 0, 0]:
            raise RuntimeError("Invalid Quantize Config.")
        if scale_sqrt != [0, 0, 0]:
            raise RuntimeError("Invalid Quantize Config.")
    else:
        raise RuntimeError("Invalid Quantize Config.")

    if isinstance(padh, list):
        if len(padh) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padh is a list."
                               % PAD_SHAPE_DIM)
        pad_top = padh[0]
        pad_bottom = padh[1]
    else:
        pad_top = padh
        pad_bottom = padh

    if isinstance(padw, list):
        if len(padw) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padw is a list."
                               % PAD_SHAPE_DIM)
        pad_left = padw[0]
        pad_right = padw[1]
    else:
        pad_left = padw
        pad_right = padw

    shape_in, shape_w = te.lang.cce.check_conv_shape(
        shape_in, shape_w, pad_top, pad_bottom, pad_left, pad_right,
        strideh, stridew, in_dtype, w_dtype, res_dtype)

    return shape_in, shape_w
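# A minimal usage sketch for the parameter check above (not part of the
# original source): quantize switched off ([0, 0, 0]), plain float16
# convolution. The NCHW shapes are hypothetical.
def _example_conv_layer_cce_para_check():
    shape_in, shape_w = conv_layer_cce_para_check(
        (1, 32, 28, 28), (64, 32, 3, 3), 'float16', 'float16', 'float16',
        padh=1, padw=1, strideh=1, stridew=1,
        quantize_config=[0, 0, 0], scale_sqrt=[0, 0, 0],
        scale_q_dtype='float16', offset_q_dtype='float16',
        scale_dq_dtype='float16', scale_rq_dtype='float16',
        offset_rq_dtype='float16', offset_w_dtype='int32',
        offset_pad_dtype='uint8', bias=False,
        kernel_name="conv_layer_cce_example")
    return shape_in, shape_w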
def check_conv2dbp_filter_params(shape_x, shape_out_backprop, filter_sizes,
                                 strides, pads, dilations, x_dtype,
                                 out_backprop_dtype, res_dtype, kernel_name):
    """
    The params check function of conv2d_backprop_filter

    Parameters:
    ----------
    shape_x : The shape of feature map,
              which is 4-D [batch, channels, height, width].
    shape_out_backprop : The shape of gradients,
                         which is 4-D [batch, channels, height, width].
    filter_sizes : The shape of filter,
                   which is 4-D [batch, channels, height, width].
    strides : The stride of the sliding window. A list of ints.
    pads : "SAME" or "VALID", indicating the type of pads algorithm to use,
           or a list.
    dilations : An optional list of ints. Default value is [1, 1, 1, 1].
    x_dtype : Feature map data dtype. Default value is float16.
    out_backprop_dtype : Gradients data dtype. Default value is float16.
    res_dtype : Result (dE/dW) data dtype. Default value is float32.
    kernel_name : Kernel name of cce.
                  Default value is "conv2d_backprop_filter_cce".

    Returns
    -------
    All transformed params.
    """
    def _align(input_x, input_y):
        if input_y == 0:
            dict_args = {}
            dict_args['errCode'] = "E60108"
            dict_args['reason'] = "Division by zero"
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        return (input_x + input_y - 1) // input_y * input_y

    def _check_attr_range_dw(name, value, attr_min=None, attr_max=None):
        if not attr_min and not attr_max:
            return
        if not attr_min:
            if (not isinstance(value, int)) or value > attr_max:
                dict_args = {}
                dict_args['errCode'] = "E64001"
                dict_args['range'] = "(, {}]".format(attr_max)
                dict_args['attr_name'] = name
                dict_args["value"] = str(value)
                raise RuntimeError(dict_args,
                                   err_man.get_error_message(dict_args))
        elif not attr_max:
            if (not isinstance(value, int)) or value < attr_min:
                dict_args = {}
                dict_args['errCode'] = "E64001"
                dict_args['range'] = "[{}, )".format(attr_min)
                dict_args['attr_name'] = name
                dict_args["value"] = str(value)
                raise RuntimeError(dict_args,
                                   err_man.get_error_message(dict_args))
        elif (not isinstance(value, int)) or value > attr_max \
                or value < attr_min:
            dict_args = {}
            dict_args['errCode'] = "E64001"
            dict_args['range'] = "[{},{}]".format(attr_min, attr_max)
            dict_args['attr_name'] = name
            dict_args["value"] = str(value)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        else:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            dict_args = {}
            dict_args['errCode'] = "E60020"
            dict_args['attr_name'] = attr_name
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

    def _is_conv1d_situation():
        if fmap_h_padding == 1 and filter_h_dilation == 1 and stride_h == 1:
            return True
        return False

    def _is_load3d_special():
        # limitation by chip: Ascend910
        # load3d is not supported when only fmap w after padding
        # equals filter w
        if get_soc_spec("SOC_VERSION") == 'Ascend910' \
                and fmap_h_padding != filter_h \
                and fmap_w_padding == filter_w:
            return False
        # limitation by chip:
        # if kernel h, w in [1, 11] and fmap h/w after padding equals
        # filter h/w, load3d supports h, w being 1
        if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
                and (fmap_h_padding == filter_h
                     or fmap_w_padding == filter_w):
            return True
        return False

    # First: base check, mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(filter_sizes, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    def _check_attr_pads():
        # pads check
        if isinstance(pads, (tuple, list)) \
                and len(pads) != CONV_BACKPROP_SHAPE_DIM:
            dict_args = dict()
            dict_args["errCode"] = "E60107"
            dict_args["param_name"] = "pads"
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if isinstance(pads, str) and pads not in PADDING_SUPPORT:
            dict_args = {}
            dict_args['errCode'] = "E60021"
            dict_args['expected_pad_mode'] = str(PADDING_SUPPORT)
            dict_args['actual_pad_mode'] = str(pads)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

    _check_attr_pads()

    # dilations check
    util.check_shape_rule(dilations, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_c, dilation_h, dilation_w = dilations
    _check_attr_range_dw("dilations's H", dilation_h,
                         DILATION_MIN, DILATION_MAX)
    _check_attr_range_dw("dilations's W", dilation_w,
                         DILATION_MIN, DILATION_MAX)

    if dilation_n != 1 or dilation_c != 1:
        dict_args = {}
        dict_args["errCode"] = "E60023"
        dict_args["dilation_n"] = str(dilation_n)
        dict_args["dilation_c"] = str(dilation_c)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    # dtype check
    x_dtype = x_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(x_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float32', 'float16'])

    # Second: further check, mainly required by SRS
    # ===========================================================
    # the relation limits between shapes
    shape_x = list(shape_x)
    shape_out_backprop = list(shape_out_backprop)
    filter_sizes = list(filter_sizes)
    strides = list(strides)
    fmap_batch, fmap_channel, fmap_h, fmap_w = shape_x
    dedy_batch, dedy_channel, dedy_h, dedy_w = shape_out_backprop
    filter_batch, filter_channel, filter_h, filter_w = filter_sizes
    stride_h, stride_w = strides

    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1

    # pads compute
    if pads == 'SAME':
        pad_w = _align(fmap_w, stride_w) - stride_w \
            + filter_w_dilation - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_h = _align(fmap_h, stride_h) - stride_h \
            + filter_h_dilation - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pads = [pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    pads = list(pads)
    pad_up, pad_down, pad_left, pad_right = pads

    if pad_up >= filter_h_dilation or pad_down >= filter_h_dilation:
        dict_args = dict()
        dict_args["errCode"] = "E64005"
        dict_args["direction"] = 'H'
        dict_args["pads_dir"] = "pad_up and pad_down"
        dict_args["pads_value"] = "[{}, {}]".format(pad_up, pad_down)
        dict_args["filter_value"] = str(filter_h_dilation)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
    if pad_left >= filter_w_dilation or pad_right >= filter_w_dilation:
        dict_args = dict()
        dict_args["errCode"] = "E64005"
        dict_args["direction"] = 'W'
        dict_args["pads_dir"] = "pad_left and pad_right"
        dict_args["pads_value"] = "[{}, {}]".format(pad_left, pad_right)
        dict_args["filter_value"] = str(filter_w_dilation)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    fmap_w_padding = fmap_w + pad_left + pad_right
    fmap_h_padding = fmap_h + pad_up + pad_down

    # special cases
    fmap_hw_min, dedy_hw_min = FMAP_HW_MIN, DEDY_HW_MIN
    dedy_hw_max, fmap_hw_max = DEDY_HW_MAX, FMAP_HW_MAX

    # exchanging h and w does not change the data in memory
    if fmap_w_padding == 1 and filter_w == 1 and dedy_w == 1:
        shape_x = (fmap_batch, fmap_channel, fmap_w, fmap_h)
        shape_out_backprop = (dedy_batch, dedy_channel, dedy_w, dedy_h)
        filter_sizes = (filter_batch, filter_channel, filter_w, filter_h)
        strides = stride_w, stride_h
        dilations = dilation_n, dilation_c, dilation_w, dilation_h
        fmap_h_padding, fmap_w_padding = fmap_w_padding, fmap_h_padding
        dedy_h, dedy_w = dedy_w, dedy_h
        fmap_h, fmap_w = fmap_w, fmap_h
        filter_h, filter_w = filter_w, filter_h
        filter_h_dilation, filter_w_dilation = \
            filter_w_dilation, filter_h_dilation

    # limitation by chip:
    # if kernel h, w in [1, 11] and fmap h/w after padding equals
    # filter h/w, load3d supports h, w being 1
    if _is_load3d_special():
        fmap_hw_min = 1
        dedy_hw_min = 1

    # in the conv1d situation, make sure w is in [1, 2**31 - 1]
    if _is_conv1d_situation():
        dedy_hw_min = 1
        fmap_hw_min = 1
        dedy_hw_max = CONV1D_MAX_W
        fmap_hw_max = CONV1D_MAX_W

    # Dedy value limit
    _check_attr_range_dw("Dedy's H", dedy_h, dedy_hw_min, dedy_hw_max)
    _check_attr_range_dw("Dedy's W", dedy_w, dedy_hw_min, dedy_hw_max)

    # filter value limit
    _check_attr_range_dw("filter's H", filter_h,
                         FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range_dw("filter's W", filter_w,
                         FILTER_HW_MIN, FILTER_HW_MAX)

    # Fmap value limit
    _check_attr_range_dw("Fmap's H", fmap_h, fmap_hw_min, fmap_hw_max)
    _check_attr_range_dw("Fmap's W", fmap_w, fmap_hw_min, fmap_hw_max)

    # stride value limit
    _check_attr_range_dw("stride's H", stride_h,
                         STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range_dw("stride's W", stride_w,
                         STRIDE_HW_MIN, STRIDE_HW_MAX)

    def _check_axis_hw():
        if fmap_batch != dedy_batch:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Fmap's N"
            dict_args['param2'] = "Dedy's N"
            dict_args['actual_value'] = "{}, {}".format(fmap_batch,
                                                        dedy_batch)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if dedy_channel != filter_batch:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Dedy's C"
            dict_args['param2'] = "Filter's N"
            dict_args['actual_value'] = "{}, {}".format(dedy_channel,
                                                        filter_batch)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if fmap_channel != filter_channel:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Fmap's C"
            dict_args['param2'] = "Filter's C"
            dict_args['actual_value'] = "{}, {}".format(fmap_channel,
                                                        filter_channel)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if filter_w_dilation > fmap_w_padding:
            dict_args = dict()
            dict_args["errCode"] = "E60015"
            dict_args["w_of_x"] = str(fmap_w_padding)
            dict_args["w_of_filter"] = str(filter_w_dilation)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if filter_h_dilation > fmap_h_padding:
            dict_args = dict()
            dict_args["errCode"] = "E60014"
            dict_args["h_of_x"] = str(fmap_h_padding)
            dict_args["h_of_filter"] = str(filter_h_dilation)
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

        # Third: value check, mainly required by the convolution rule
        if ((fmap_w - filter_w_dilation + pad_left + pad_right)
                // stride_w + 1) != dedy_w:
            dict_args = {}
            dict_args["errCode"] = "E60025"
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down)
                // stride_h + 1) != dedy_h:
            dict_args = {}
            dict_args["errCode"] = "E60024"
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

    _check_axis_hw()

    def _min_l1_byte():
        # Fourth: L1 limitation, mainly required by chip
        al1_min_byte = C0 * C0 * 2
        if not _is_conv1d_situation():
            kl1_min = fmap_w
        else:
            kl1_min = (C0 - 1) * stride_w + filter_w_dilation
        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * kl1_min * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * kl1_min * C0 * 2
        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            dict_args = {}
            dict_args["errCode"] = "E60026"
            raise RuntimeError(dict_args,
                               err_man.get_error_message(dict_args))

    _min_l1_byte()

    # Fifth: check shape size, 64-bit limitation
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * _align(fmap_channel, c0_size) * fmap_h * fmap_w
    dedy_size = dedy_batch * _align(dedy_channel, c0_size) * dedy_h * dedy_w
    filter_size = \
        _align(filter_batch, c0_size) * _align(filter_channel, c0_size) \
        * filter_h * filter_w
    _check_64bits_limitation("fmap_size", fmap_size, dtype=x_dtype)
    _check_64bits_limitation("dedy_size", dedy_size,
                             dtype=out_backprop_dtype)
    _check_64bits_limitation("filter_size", filter_size, dtype=res_dtype)

    result = (shape_x, shape_out_backprop, filter_sizes, strides, pads,
              dilations, x_dtype, out_backprop_dtype, res_dtype, kernel_name)
    return result
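# A minimal usage sketch for the check above (not part of the original
# source), using hypothetical NCHW shapes and 'SAME' padding. With a 3x3
# kernel and stride 1, 'SAME' resolves to pads [1, 1, 1, 1] via the _align
# arithmetic in the function, and the convolution rule
# (14 - 3 + 2) // 1 + 1 == 14 matches the gradient shape.
def _example_check_conv2dbp_filter_params():
    return check_conv2dbp_filter_params(
        shape_x=(32, 16, 14, 14),
        shape_out_backprop=(32, 64, 14, 14),
        filter_sizes=(64, 16, 3, 3),
        strides=(1, 1), pads='SAME', dilations=(1, 1, 1, 1),
        x_dtype='float16', out_backprop_dtype='float16',
        res_dtype='float32',
        kernel_name="conv2d_backprop_filter_example")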
def decode_bbox(box_predictions, anchors, decoded_boxes, decode_clip,
                kernel_name="decode_bbox"):
    """
    calculating data

    Parameters
    ----------
    box_predictions : shape and dtype of input
    anchors : shape and dtype of input
    decoded_boxes : shape and dtype of output,
                    should be the same shape and type as input
    decode_clip : decode_clip
    kernel_name : kernel name, default value is "decode_bbox"

    Returns
    -------
    None
    """
    # check param & data
    shape_box_predictions = box_predictions.get("shape")
    shape_anchors = anchors.get("shape")
    shape_decoded_boxes = decoded_boxes.get("shape")
    util.check_kernel_name(kernel_name)
    format_box_predictions = box_predictions.get("format")
    format_anchors = anchors.get("format")
    format_decoded_boxes = decoded_boxes.get("format")
    check_format_shape(format_box_predictions, format_anchors,
                       format_decoded_boxes)
    util.check_shape_rule(shape_box_predictions,
                          CONFIG_THREE, CONFIG_FOUR, None)
    util.check_shape_rule(shape_anchors, CONFIG_THREE, CONFIG_FOUR, None)
    util.check_shape_rule(shape_decoded_boxes, CONFIG_TWO, CONFIG_TWO, None)
    util.check_shape_size(shape_box_predictions, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_anchors, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_decoded_boxes, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(box_predictions.get("dtype").lower(), ("float16",))
    util.check_dtype_rule(anchors.get("dtype").lower(), ("float16",))
    util.check_dtype_rule(decoded_boxes.get("dtype").lower(), ("float16",))
    if shape_box_predictions != shape_anchors:
        raise RuntimeError("the input shapes of box_predictions and anchors "
                           "must be the same")
    if (reduce(lambda x, y: x * y, shape_box_predictions[:])) \
            != (reduce(lambda x, y: x * y, shape_decoded_boxes[:])):
        raise RuntimeError("the input shape (box_predictions and anchors) "
                           "is not equal to the output shape (decoded_boxes)")
    if (shape_box_predictions[-1] == CONFIG_FOUR
            and len(shape_box_predictions) == CONFIG_THREE):
        if shape_decoded_boxes[1] != CONFIG_FOUR:
            raise RuntimeError("the output shape_decoded_boxes must be 4")
    else:
        if (shape_box_predictions[0] == CONFIG_FOUR
                and len(shape_box_predictions) == CONFIG_FOUR):
            if shape_decoded_boxes[0] != CONFIG_FOUR:
                raise RuntimeError("the output shape_decoded_boxes must be 4")
        else:
            raise RuntimeError("the input shape not in {(4,C,H,W), (H,W,4)}")
    if not isinstance(decode_clip, (float, int)):
        raise RuntimeError("input param type of decode_clip should be Float")
    if decode_clip < 0 or decode_clip > 10:
        raise RuntimeError(
            "input param decode_clip can't be negative "
            "and should be in [0, 10]!")
    # init the tiling shape
    shape = TilingFunc(shape_box_predictions)
    # calculate the decode_bbox
    tik_instance = tik.Tik(tik.Dprofile())
    data_tensor = InitTensor(tik_instance, shape)
    if shape.input_shape[-1] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_THREE:
        decode_bbox_compute(tik_instance, shape, data_tensor, decode_clip,
                            kernel_name)
    if shape.input_shape[0] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_FOUR:
        decode_bbox_compute_transpose(tik_instance, shape, data_tensor,
                                      decode_clip, kernel_name)
    return tik_instance
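# A minimal usage sketch for decode_bbox (not part of the original source),
# in the (H, W, 4) layout. The shapes and the "ND" format string are
# hypothetical; the format strings actually accepted depend on
# check_format_shape, defined earlier in this file.
def _example_decode_bbox():
    box_predictions = {"shape": (50, 50, 4), "dtype": "float16",
                       "format": "ND"}
    anchors = {"shape": (50, 50, 4), "dtype": "float16", "format": "ND"}
    decoded_boxes = {"shape": (2500, 4), "dtype": "float16", "format": "ND"}
    return decode_bbox(box_predictions, anchors, decoded_boxes,
                       decode_clip=5.0, kernel_name="decode_bbox_example")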
def select_v2(condition, x1, x2, y, kernel_name="select_v2"):
    """
    Selects elements from `x1` or `x2`, depending on `condition`.

    Parameters
    ----------
    condition: dict
        dict of condition, include keys(shape and dtype), only support bool
    x1: dict
        dict of x1, only support float16, float32, int32, int8, uint8
    x2: dict
        dict of x2, only support float16, float32, int32, int8, uint8
    y: dict
        dict of output
    kernel_name: str
        cce kernel name, default value is "select_v2"

    Returns
    -------
    None
    """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype")
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype")
    bool_dtype = condition.get("dtype")
    con_shape = condition.get("shape")
    shape_x1, con_shape, shape_max_x1 = util.produce_shapes(
        shape_x1, con_shape)
    shape_x2, con_shape, shape_max_x2 = util.produce_shapes(
        shape_x2, con_shape)
    # drop a trailing broadcast axis of size 1 shared by all inputs
    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and con_shape[-1] == 1 \
            and shape_max_x1[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        con_shape = con_shape if len(con_shape) == 1 else con_shape[:-1]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)
    # when all shapes match, flatten to 1-D for a simpler schedule
    if shape_x1 == shape_x2 == con_shape:
        shape_x1 = (functools_reduce(lambda x, y: x * y, shape_x1[:]),)
        shape_x2 = (functools_reduce(lambda x, y: x * y, shape_x2[:]),)
        con_shape = (functools_reduce(lambda x, y: x * y, con_shape[:]),)

    dtype_x1 = dtype_x1.lower()
    dtype_x2 = dtype_x2.lower()
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_x1, check_list)
    if dtype_x1 != dtype_x2:
        raise RuntimeError("Dtype of tensor x1 and x2 must be equal!")
    bool_dtype = bool_dtype.lower()
    bool_check_list = ("bool", "int8", "uint8")
    util.check_dtype_rule(bool_dtype, bool_check_list)

    condition = tvm.placeholder(con_shape, name="condition", dtype=bool_dtype)
    input_then = tvm.placeholder(shape_x1, name="input_then", dtype=dtype_x1)
    input_else = tvm.placeholder(shape_x2, name="input_else", dtype=dtype_x2)

    with tvm.target.cce():
        res = select_v2_compute(condition, input_then, input_else, y,
                                kernel_name)
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [condition, input_then, input_else, res],
              "bool_storage_as_1bit": False}
    te.lang.cce.cce_build_code(sch, config)
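# A minimal usage sketch for select_v2 (not part of the original source),
# assuming the te/tvm build environment is initialised; the shapes and
# dtypes below are hypothetical.
def _example_select_v2():
    condition = {"shape": (2, 16), "dtype": "bool"}
    x1 = {"shape": (2, 16), "dtype": "float16"}
    x2 = {"shape": (2, 16), "dtype": "float16"}
    y = {"shape": (2, 16), "dtype": "float16"}
    select_v2(condition, x1, x2, y, kernel_name="select_v2_example")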
def ascend_dequant_s16(x0, deq_scale, x1, y, relu_flag=False,
                       kernel_name='ascend_dequant_s16'):
    """
    int32 -> int16

    Parameters:
    ----------
    x0 : the dict of input
    deq_scale : the dict of dequant num
    x1 : the input of the add tensor
    y : the dict of output
    relu_flag : the relu mode; when true the result does relu
    kernel_name : cce kernel name, default value is "ascend_dequant_s16"

    Returns:
    -------
    None
    """
    shape_x0 = x0.get("shape")
    format_x0 = x0.get("format")
    dtype_x0 = x0.get("dtype")
    shape_deq = deq_scale.get("shape")
    format_deq = deq_scale.get("format")
    dtype_deq = deq_scale.get("dtype")
    check_list = [("int32",), ("uint64",), ("int16",)]
    format_list = ["NC1HWC0", "FRACTAL_NZ"]
    util.check_dtype_rule(dtype_x0, check_list[0])
    util.check_dtype_rule(dtype_deq, check_list[1])
    if format_x0 not in format_list:
        raise RuntimeError("x0 only support [NC1HWC0, FRACTAL_NZ]")
    if format_x0 == "NC1HWC0":
        if len(shape_x0) != 5:
            raise ValueError(
                "x0 shape must be of length 5 when format is NC1HWC0")
    if format_x0 == "FRACTAL_NZ":
        if len(shape_x0) < 4:
            raise RuntimeError(
                "x0 shape length must be >= 4 when format is FRACTAL_NZ")
    if len(shape_deq) != 5:
        raise ValueError("deq_scale shape must be of length 5")
    if format_deq != "NC1HWC0":
        raise ValueError("deq_scale only support NC1HWC0")
    if shape_deq[0] != 1 or shape_deq[2] != 1 or shape_deq[3] != 1:
        raise RuntimeError("deq_scale shape must be 1 in n, h, w")

    if format_x0 == "NC1HWC0":
        # n, C1, H*W, C0
        shape_x0 = [shape_x0[0], shape_x0[1],
                    shape_x0[2] * shape_x0[3], shape_x0[4]]

    ori_shape_deq = deq_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_deq}
    input_x0 = tvm.placeholder(shape_x0, dtype_x0, "x0")
    input_deq = tvm.placeholder(shape_deq, name="deq_scale",
                                dtype=dtype_deq, attrs=attr)
    input_x1 = None
    if x1:
        shape_bias = x1.get("shape")
        input_x1 = tvm.placeholder(shape_bias, "int16", "x1")

    with tvm.target.cce():
        res = ascend_dequant_s16_compute(input_x0, input_deq, input_x1,
                                         relu_flag, kernel_name)
        generic.auto_schedule(res)
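# A minimal usage sketch for the dequant op above (not part of the original
# source): int32 NC1HWC0 input, per-channel uint64 scale, and no x1
# add-input. Shapes are hypothetical (C = 64 maps to C1 = 4, C0 = 16).
def _example_ascend_dequant_s16():
    x0 = {"shape": (1, 4, 7, 7, 16), "format": "NC1HWC0", "dtype": "int32"}
    deq_scale = {"shape": (1, 4, 1, 1, 16), "format": "NC1HWC0",
                 "dtype": "uint64", "ori_shape": (64,)}
    y = {"shape": (1, 4, 7, 7, 16), "format": "NC1HWC0", "dtype": "int16"}
    ascend_dequant_s16(x0, deq_scale, None, y, relu_flag=True,
                       kernel_name="ascend_dequant_s16_example")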
def conv_layer_cce(shape_in, shape_w, in_dtype, w_dtype, res_dtype,
                   padh, padw, strideh, stridew, bias=0,
                   kernel_name="conv_layer_cce", need_build=0, need_print=0):
    """
    Parameters
    ----------
    shape_in : shape of data_in
    shape_w : shape of filter
    in_dtype : the feature map data type
    w_dtype : the weight data type
    res_dtype : the result data type
    padh : the padding shape in H
    padw : the padding shape in W
    strideh : the stride value in H
    stridew : the stride value in W
    bias : the tag for bias or not
    kernel_name : cce kernel name, default value is "conv_layer_cce"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    The quantize-related parameters below belong to the commented-out
    quantized path and are not in the current signature:
    quantizeConfig : quantize config table, default [0, 0, 0]
        quantizeConfig[0] - quantize function switch
            0: quantize off
            1: quantize on
        quantizeConfig[1] - QuantizeAlgorithm
            0: non offset
            1: half offset
            2: all offset (not supported now)
        quantizeConfig[2] - QuantizeScaleType
            (for Dequantize/Requantize, quantize is always scalar)
            0: scalar
            1: vector
    scaleSqrt : scale mode
        scaleSqrt[0] - Quantize scale mode: 0: non sqrt, 1: sqrt
        scaleSqrt[1] - DeQuantize scale mode: 0: non sqrt, 1: sqrt
        scaleSqrt[2] - ReQuantize scale mode: 0: non sqrt, 1: sqrt
    scaleQ_dtype : Quantize scale data type, default 'float16'
    offsetQ_dtype : Quantize offset data type, default 'float16'
    scaleDq_dtype : DeQuantize scale data type, default 'float16'
    scaleRq_dtype : ReQuantize scale data type, default 'float16'
    offsetRq_dtype : ReQuantize offset data type, default 'float16'
    offsetW_dtype : Weight offset data type, default 'int32'
    offsetPad_dtype : Quantize Cube offset data type, default 'uint8'

    Returns
    -------
    None
    """
    # for pylint, otherwise "Dangerous default value [] as argument"
    # if quantizeConfig is None:
    #     quantizeConfig = [0, 0, 0]
    # if scaleSqrt is None:
    #     scaleSqrt = [0, 0, 0]

    # conv shape check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_in, CONV_SHAPE_DIM, CONV_SHAPE_DIM)
    util.check_shape_rule(shape_w, CONV_SHAPE_DIM, CONV_SHAPE_DIM)

    in_dtype = in_dtype.lower()
    w_dtype = w_dtype.lower()
    res_dtype = res_dtype.lower()
    # scaleQ_dtype = scaleQ_dtype.lower()
    # offsetQ_dtype = offsetQ_dtype.lower()
    # scaleDq_dtype = scaleDq_dtype.lower()
    # scaleRq_dtype = scaleRq_dtype.lower()
    # offsetRq_dtype = offsetRq_dtype.lower()
    # offsetW_dtype = offsetW_dtype.lower()
    # offsetPad_dtype = offsetPad_dtype.lower()

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(w_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(res_dtype, ['float16', 'int8', 'uint8'])
    # util.check_dtype_rule(scaleQ_dtype, ['float16'])
    # util.check_dtype_rule(offsetQ_dtype, ['float16'])
    # util.check_dtype_rule(scaleDq_dtype, ['float16'])
    # util.check_dtype_rule(scaleRq_dtype, ['float16'])
    # util.check_dtype_rule(offsetRq_dtype, ['float16'])
    # util.check_dtype_rule(offsetW_dtype, ['int32'])
    # util.check_dtype_rule(offsetPad_dtype, ['uint8'])

    # if quantizeConfig[0] == 0:
    util.check_dtype_rule(in_dtype, ['float16'])
    util.check_dtype_rule(w_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])
    # if quantizeConfig[0] == 1:
    #     util.check_dtype_rule(w_dtype, ['int8'])

    shape_in = list(shape_in)
    shape_w = list(shape_w)
    # shape_in, shape_w = te.lang.cce.check_conv_shape(
    #     shape_in, shape_w, padh, padw, strideh,
    #     stridew, in_dtype, w_dtype, res_dtype)
    # if shape_in[1] != shape_w[1]:
    #     raise RuntimeError("shape_in[1] must equal to shape_w[1]")

    # align the input channel to the cube mac K block size
    block_size_K = CUBE_MKN[in_dtype]['mac'][1]
    shape_in[1] = ((shape_in[1] + block_size_K - 1)
                   // block_size_K) * block_size_K
    shape_w[1] = shape_in[1]

    hi = shape_in[2]
    wi = shape_in[3]
    hk = shape_w[2]
    wk = shape_w[3]

    h_out = 0
    w_out = 0
    if strideh != 0:
        h_out = (hi + (2 * padh) - hk) // strideh + 1  # calculated by hi
    if stridew != 0:
        w_out = (wi + (2 * padw) - wk) // stridew + 1  # calculated by wi

    if h_out <= 0:
        raise RuntimeError(
            "h_out must be > 0, h_out = (hi + (2 * padh) - hk) / strideh + 1")
    if w_out <= 0:
        raise RuntimeError(
            "w_out must be > 0, w_out = (wi + (2 * padw) - wk) / stridew + 1")
    if padh > hk:
        raise RuntimeError("kernel H must >= Pad H")
    if (shape_in[0] * w_out * h_out * hk * wk
            * CUBE_MKN[w_dtype]['mac'][1]) > (np.int64(2 ** 31) - 1):
        raise RuntimeError("im2col shape exceeds 32-bit limitation")

    conv_check_rule(shape_in, shape_w, in_dtype, w_dtype, padh, padw,
                    strideh, stridew)

    if res_dtype in ['int8', 'uint8']:
        w_block_size_K = CUBE_MKN[w_dtype]['mac'][1]
        shape_w[0] = ((shape_w[0] + w_block_size_K - 1)
                      // w_block_size_K) * w_block_size_K
    else:
        w_block_size_N = CUBE_MKN[w_dtype]['mac'][2]
        shape_w[0] = ((shape_w[0] + w_block_size_N - 1)
                      // w_block_size_N) * w_block_size_N

    # padh, padw check
    if padh < PAD_MIN or padh > PAD_MAX:
        raise RuntimeError("padh must be in [0,255].")
    if padw < PAD_MIN or padw > PAD_MAX:
        raise RuntimeError("padw must be in [0,255].")

    # strideh, stridew check
    if strideh < STRIDE_MIN or strideh > STRIDE_MAX:
        raise RuntimeError("strideh must be in [1,63].")
    if stridew < STRIDE_MIN or stridew > STRIDE_MAX:
        raise RuntimeError("stridew must be in [1,63].")

    # filterH, filterW check
    if shape_w[2] < FILTER_HW_MIN or shape_w[2] > FILTER_HW_MAX:
        raise RuntimeError("filterh must be in [1,255].")
    if shape_w[3] < FILTER_HW_MIN or shape_w[3] > FILTER_HW_MAX:
        raise RuntimeError("filterw must be in [1,255].")

    # tiling check:
    # filterH * inputC * inputW * sizeof(in_dtype) < half of L1_BUFFER size
    SIZE_OF_L1_BUFFER = cce_product.getParams("L1_Buffer")  # bytes
    if in_dtype == 'float16':
        if (shape_w[2]) * (shape_in[1]) * (shape_in[3]) * SIZE_OF_FP16 > (
                SIZE_OF_L1_BUFFER / 2):
            raise RuntimeError("min cut is out of half of L1 memory.")
    if in_dtype == 'int8' or in_dtype == 'uint8':
        if (shape_w[2]) * (shape_in[1]) * (shape_in[3]) * SIZE_OF_8BIT > (
                SIZE_OF_L1_BUFFER / 2):
            raise RuntimeError("min cut is out of half of L1 memory.")

    # quantize switch on
    # if quantizeConfig[0] == 1:
    #     quantizeTurnOn = True
    #     # quantize -> DeQuantize dataflow
    #     if (in_dtype == 'float16' and w_dtype == 'int8'
    #             and res_dtype == 'float16'):
    #         isQuantize = True
    #         isDeQuantize = True
    #         isReQuantize = False
    #     # DeQuantize dataflow
    #     elif ((in_dtype == 'int8' or in_dtype == 'uint8')
    #           and w_dtype == 'int8' and res_dtype == 'float16'):
    #         isQuantize = False
    #         isDeQuantize = True
    #         isReQuantize = False
    #     # quantize -> ReQuantize dataflow
    #     elif (in_dtype == 'float16' and w_dtype == 'int8'
    #           and (res_dtype == 'int8' or res_dtype == 'uint8')):
    #         isQuantize = True
    #         isDeQuantize = False
    #         isReQuantize = True
    #     # ReQuantize dataflow
    #     elif ((in_dtype == 'int8' or in_dtype == 'uint8')
    #           and w_dtype == 'int8'
    #           and (res_dtype == 'int8' or res_dtype == 'uint8')):
    #         isQuantize = False
    #         isDeQuantize = False
    #         isReQuantize = True
    #     else:
    #         raise RuntimeError("Not support in/out data type for quantize.")
    # quantize switch off
    # elif quantizeConfig[0] == 0:
    quantizeTurnOn = False
    isQuantize = False
    isDeQuantize = False
    isReQuantize = False
    # else:
    #     raise RuntimeError("Invalid Quantize Config.")

    # model_config bit layout:
    # ----------------------------------------------------------------
    # | 07    | 06    | 05 04  | 03  | 02  | 01   | 00     |
    # | QSqrt | scale | offset | ReQ | DeQ | Quan | Switch |
    # ----------------------------------------------------------------
    # | 15   | 14   | 13   | 12   | 11          | 10   | 09     | 08     |
    # | Null | Null | Null | Null | in_dsl_flag | bias | RqSqrt | DqSqrt |
    # ----------------------------------------------------------------
    # in_dsl_flag  0: implement conv by ir directly, not preferred
    #              1: implement conv by dsl, the default way
    in_dsl_flag = 1  # 0 for the old conv
    # te.lang.cce.conv_param.tiling = tiling
    model_config = (1 if quantizeTurnOn else 0) \
        | (1 if isQuantize else 0) << 1 \
        | (1 if isDeQuantize else 0) << 2 \
        | (1 if isReQuantize else 0) << 3 \
        | 0 << 4 \
        | 0 << 6 \
        | 0 << 7 \
        | 0 << 8 \
        | 0 << 9 \
        | (1 if bias else 0) << 10 \
        | 1 << 11

    with tvm.target.cce():
        Data = tvm.placeholder(shape_in, name='Fmap', dtype=in_dtype)
        Weight = tvm.placeholder(shape_w, name='Filter', dtype=w_dtype)
        # bias or fusion_bias (half offset)
        if bias or (model_config & 0x31 == 0x11):
            Bias = tvm.placeholder(
                (shape_w[0],), name='Bias',
                dtype="int32" if quantizeTurnOn else "float16")
        # bias or fusion_bias (all offset)
        elif bias or (model_config & 0x31 == 0x21):
            Bias = tvm.placeholder(
                (shape_w[0],), name='Bias',
                dtype="uint32" if quantizeTurnOn else "float16")

        # quantize on
        # NOTE: with quantizeTurnOn hard-wired to False above, this branch
        # is unreachable; it still references the commented-out
        # quantizeConfig/scale*_dtype parameters.
        if quantizeTurnOn:
            QuantizeAlgorithm = quantizeConfig[1]
            if isQuantize:
                scaleQ = tvm.placeholder(
                    (CUBE_MKN[scaleQ_dtype]['mac'][1],),
                    name='scaleQ', dtype=scaleQ_dtype)
                if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                    offsetQ = tvm.placeholder(
                        (CUBE_MKN[offsetQ_dtype]['mac'][1],),
                        name='offsetQ', dtype=offsetQ_dtype)
            if isDeQuantize:
                scaleDq_shape = (CUBE_MKN[scaleDq_dtype]['mac'][1],) \
                    if quantizeConfig[2] == 0 else (shape_w[0],)
                scaleDq = tvm.placeholder(scaleDq_shape, name='scaleDq',
                                          dtype=scaleDq_dtype)
            if isReQuantize:
                scaleRq_shape = (CUBE_MKN[scaleRq_dtype]['mac'][1],) \
                    if quantizeConfig[2] == 0 else (shape_w[0],)
                scaleRq = tvm.placeholder(scaleRq_shape, name='scaleRq',
                                          dtype=scaleRq_dtype)
                if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                    offsetRq_shape = (CUBE_MKN[offsetRq_dtype]['mac'][1],) \
                        if quantizeConfig[2] == 0 else (shape_w[0],)
                    offsetRq = tvm.placeholder(offsetRq_shape,
                                               name='offsetRq',
                                               dtype=offsetRq_dtype)
            # offsetPad is needed for half offset and all offset
            if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                offsetPad = tvm.placeholder(
                    (CUBE_MKN[offsetPad_dtype]['mac'][1],),
                    name='offsetPad', dtype=offsetPad_dtype)

            # non offset
            if QuantizeAlgorithm == 0:
                if bias:
                    if isQuantize:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleQ, scaleDq,
                                res_dtype, padh, padw, strideh, stridew,
                                model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleQ, scaleRq,
                                res_dtype, padh, padw, strideh, stridew,
                                model_config)
                    else:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleDq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleRq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                else:
                    if isQuantize:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleQ, scaleDq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleQ, scaleRq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                    else:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleDq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleRq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
            # half offset
            elif QuantizeAlgorithm == 1:
                if isQuantize:
                    if isDeQuantize:
                        tensor_list = te.lang.cce.conv(
                            Data, Weight, Bias, scaleQ, offsetQ, scaleDq,
                            offsetPad, res_dtype, padh, padw, strideh,
                            stridew, model_config)
                    else:
                        tensor_list = te.lang.cce.conv(
                            Data, Weight, Bias, scaleQ, offsetQ, scaleRq,
                            offsetRq, offsetPad, res_dtype, padh, padw,
                            strideh, stridew, model_config)
                else:
                    if isDeQuantize:
                        tensor_list = te.lang.cce.conv(
                            Data, Weight, Bias, scaleDq, offsetPad,
                            res_dtype, padh, padw, strideh, stridew,
                            model_config)
                    else:
                        tensor_list = te.lang.cce.conv(
                            Data, Weight, Bias, scaleRq, offsetRq,
                            offsetPad, res_dtype, padh, padw, strideh,
                            stridew, model_config)
            # all offset
            elif QuantizeAlgorithm == 2:
                raise RuntimeError("All offset mode quantize is not supported.")
            else:
                raise RuntimeError("Invalid quantize algorithm.")
        # quantize off
        else:
            if bias:
                # Res = Data * Weight + Bias
                tensor_list = te.lang.cce.conv(
                    Data, Weight, Bias, res_dtype,
                    padh, padw, strideh, stridew, model_config)
            else:
                # Res = Data * Weight
                tensor_list = te.lang.cce.conv(
                    Data, Weight, res_dtype,
                    padh, padw, strideh, stridew, model_config)

        tensor_list = list(tensor_list)
        sch = generic.auto_schedule(tensor_list[-1])

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
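# A minimal usage sketch for the conv layer above (not part of the original
# source): the quantize-off float16 path, which is the only path reachable
# with the current signature. The NCHW shapes are hypothetical.
def _example_conv_layer_cce():
    conv_layer_cce((1, 32, 28, 28), (64, 32, 3, 3),
                   'float16', 'float16', 'float16',
                   padh=1, padw=1, strideh=1, stridew=1, bias=0,
                   kernel_name="conv_layer_cce_example")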
def ascend_requant(x, req_scale, y, relu_flag=False,
                   kernel_name='ascend_requant'):
    """
    int32 -> int8

    Parameters:
    ----------
    x : the dict of input
    req_scale : the dict of requant num
    y : the dict of output
    relu_flag : the relu mode; when true the result does relu
    kernel_name : cce kernel name, default value is "ascend_requant"

    Returns:
    -------
    None
    """
    shape_x = x.get("shape")
    format_x = x.get("format")
    shape_req = req_scale.get("shape")
    format_req = req_scale.get("format")
    dtype_x = x.get("dtype")
    dtype_req = req_scale.get("dtype")
    check_list = [("int32",), ("uint64",)]
    format_list = ["NC1HWC0", "FRACTAL_NZ"]
    util.check_dtype_rule(dtype_x, check_list[0])
    util.check_dtype_rule(dtype_req, check_list[1])
    if format_x not in format_list:
        raise RuntimeError("x only support [NC1HWC0, FRACTAL_NZ]")
    if format_x == "NC1HWC0":
        if len(shape_x) != 5:
            raise ValueError(
                "x shape must be of length 5 when format is NC1HWC0")
    if format_x == "FRACTAL_NZ":
        if len(shape_x) < 4:
            raise RuntimeError(
                "x shape length must be >= 4 when format is FRACTAL_NZ")
    if len(shape_req) != 5:
        raise ValueError("req_scale shape must be of length 5")
    if format_req != "NC1HWC0":
        raise ValueError("req_scale only support NC1HWC0")
    if shape_req[0] != 1 or shape_req[2] != 1 or shape_req[3] != 1:
        raise RuntimeError("req_scale shape must be 1 in n, h, w")

    if format_x == "NC1HWC0":
        # n, C1, H*W, C0
        shape_x = [shape_x[0], shape_x[1],
                   shape_x[2] * shape_x[3], shape_x[4]]

    ori_shape_req = req_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_req}
    input_x = tvm.placeholder(shape_x, dtype_x, "x")
    input_req = tvm.placeholder(shape_req, name="req_scale",
                                dtype=dtype_req, attrs=attr)

    with tvm.target.cce():
        res = ascend_requant_compute(input_x, input_req, relu_flag,
                                     kernel_name)
        generic.auto_schedule(res)
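# A minimal usage sketch for the requant op above (not part of the original
# source): int32 NC1HWC0 input and per-channel uint64 requant scale.
# Shapes are hypothetical (C = 64 maps to C1 = 4, C0 = 16).
def _example_ascend_requant():
    x = {"shape": (1, 4, 7, 7, 16), "format": "NC1HWC0", "dtype": "int32"}
    req_scale = {"shape": (1, 4, 1, 1, 16), "format": "NC1HWC0",
                 "dtype": "uint64", "ori_shape": (64,)}
    y = {"shape": (1, 4, 7, 7, 16), "format": "NC1HWC0", "dtype": "int8"}
    ascend_requant(x, req_scale, y, relu_flag=False,
                   kernel_name="ascend_requant_example")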