def correction_mul(x, batch_std, running_std, y, channel, kernel_name="correction_mul"):
    """CorrectionMul op"""
    shape = x.get("shape")
    data_format = x.get("format")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")
    # shape = util.shape_refine(shape)
    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    shape_c = [1] * len(shape)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype)
    res = correction_mul_compute(x_t, batch_std_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, batch_std_t, running_std_t, res]}
    te.lang.cce.cce_build_code(sch, config)
def fake_learned_scale_quant_perchannel_grad_d_reduce(
        dout_alpha, dalpha, channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d_reduce"):
    """FakeLearnedScaleQuantPerChannelGradDReduce"""
    dout_alpha_shape = dout_alpha.get("shape")
    dout_alpha_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(dout_alpha_shape)
    util.check_tensor_shape_size(dout_alpha_shape)

    check_list = ["float32", "float16"]
    dout_alpha_dtype = dout_alpha_dtype.lower()
    util.check_dtype_rule(dout_alpha_dtype, check_list)

    dout_alpha_data = tvm.placeholder(dout_alpha_shape, name="dout_alpha",
                                      dtype=dout_alpha_dtype)
    res = fake_learned_scale_quant_perchannel_grad_d_reduce_compute(
        dout_alpha_data, dout_alpha, channel_axis, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_alpha_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def clip_boxes_d(boxes_input, boxes_output, img_size, kernel_name="clip_boxes"):
    """
    the external interface function

    input:
        boxes_input: a dict with the shape and dtype of the input boxes
        boxes_output: a dict with the shape and dtype of the output boxes
        img_size: [img_h, img_w], the height and width of the image
        kernel_name: the kernel name
    return:
        the tik container
    """
    if len(img_size) != CONFIG_TWO:
        raise RuntimeError("img_size should be [img_h, img_w]!")
    img_h, img_w = img_size

    check_clip_boxes_input_dict(boxes_input, boxes_output)
    check_clip_boxes_input_attr(img_w, img_h)
    if len(kernel_name) > util.MAX_KERNEL_NAEM_LEN:
        raise RuntimeError("the length of kernel_name must be less than 200!")
    util.check_kernel_name(kernel_name)

    tik_instance = clip_boxes_d_compute(boxes_input, img_w, img_h,
                                        kernel_name=kernel_name)
    return tik_instance
def fill_v2_d(y, value, shape, kernel_name="fill_v2_d"):
    """
    interface of fill_v2_d

    :param y: output
    :param value: value to fill the shape with, float32
    :param shape: list of int, the output shape
    :param kernel_name: fill_v2_d
    :return:
    """
    # check kernel name
    util.check_kernel_name(kernel_name)

    # shape to list
    shape = te.lang.cce.util.shape_to_list(shape)
    util.check_shape_rule(shape)

    # pseudo input, won't be used
    data_x = tvm.placeholder(shape, dtype="float32", name="data_x")

    # do compute
    res = fill_v2_compute(data_x, value, shape, y, kernel_name)

    # new schedule
    schedule = [tvm.create_schedule(res.op)]
    elewise_sch = te.lang.cce.te_schedule.cce_schedule.ElewiseSchedule()
    elewise_sch._get_emit_insn_map = types.MethodType(_get_emit_insn_map, elewise_sch)
    elewise_sch._do_buffer_tile = types.MethodType(_do_buffer_tile, elewise_sch)
    elewise_sch.do_schedule([res], schedule, [])
    schedule = schedule[0]
    schedule.cce_special = {"tensor_list": (),
                            "orign_out_tensor": [res],
                            "real_out_tensor": [res]}

    # build operator
    config = {"name": kernel_name, "tensor_list": (data_x, res)}
    te.lang.cce.cce_build_code(schedule, config)
def minmax_update_perchannel(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d [co, ci] -> 4d [1, co, ci, 1],
    # so channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape[0] != min_shape[0] and x_shape[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis_ == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data, min_data, max_data,
                                                ema, ema_decay, channel_axis_)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def fake_quant_per_layer(x, min_val, max_val, y,
                         symmetric, narrow_range, num_bits,
                         kernel_name="fake_quant_per_layer"):
    """FakeQuantPerLayer"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_per_layer_compute(input_data, min_data, max_data, y,
                                       quant_min, quant_max, symmetric,
                                       kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
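# Illustrative sketch (the helper name is ours, not part of the operator):
# how the quantization range above follows from num_bits and narrow_range,
# mirroring the quant_min / quant_max computation in fake_quant_per_layer.
def _quant_range_example(num_bits=8, narrow_range=False):
    """e.g. (0, 255) for 8 bits, or (1, 255) with narrow_range."""
    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max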
def custom_Concat(shapes, dtype, axis, kernel_name="concat",
                  need_build=False, need_print=False):
    """
    concatenate one or more input tensors along the given axis

    Parameters
    ----------
    shapes : input shapes of the data
    dtype : the data type, assume src_dtype equals dst_dtype,
            support uint8, int8, int32, float16, float32
    axis : concat axis
    kernel_name : cce kernel name, default value is "concat"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    for i in range(len(shapes)):
        util.check_shape_rule(shapes[i])

    sum_dim = 0
    for shape in shapes:
        sum_dim += functools_reduce(lambda x, y: x * y, shape)
    if sum_dim > 2 ** 31 - 1:
        raise RuntimeError("shape exceeds 32bit limitation")

    check_list = ["uint8", "int8", "float16", "float32", "int32"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "concat_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    data = []
    for i in range(len(shapes)):
        shape = shapes[i]
        data.append(tvm.placeholder(shape, name="data_%d" % i, dtype=inp_dtype))

    with tvm.target.cce():
        res = te.lang.cce.concat(data, axis)
        sch = generic.auto_schedule(res)

    data.append(res)
    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": data}
    te.lang.cce.cce_build_code(sch, config)
def sqrt(input_x, output_y, kernel_name="sqrt"):
    """
    calculating data

    Parameters
    ----------
    input_x : dict
        shape and dtype of input
    output_y : dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "sqrt"

    Returns
    -------
    None
    """
    print("When you see this message, this custom sqrt operator has been executed.")

    # operator check
    shape = input_x.get("shape")
    dtype = input_x.get("dtype")
    input_dtype = dtype.lower()
    util.check_shape_rule(shape)
    util.check_tensor_shape_size(shape)
    util.check_kernel_name(kernel_name)

    # operator compute, invoke sqrt_compute
    data_input = tvm.placeholder(shape, name="data_input", dtype=input_dtype)
    res = sqrt_compute(data_input, output_y, kernel_name)

    # auto schedule
    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    # operator build
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(schedule, config)
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal",
                 need_build=False, need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x
    shape_y : shape of input y
    dtype : source data type, support float16, float32, int32, int8, uint8, bool
    kernel_name : cce kernel name, default value is "cce_tf_equal"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)
    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name="res")

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
def decode_cornerpoints_target_bg(keypoints_prediction, anchors,
                                  keypoints_decoded,
                                  kernel_name="decode_cornerpoints_target_bg"):
    """
    the external interface function of decode_cornerpoints_target_bg

    Parameters:
    ----------
    keypoints_prediction : dict with the shape and dtype of the predicted keypoints
    anchors : dict with the shape and dtype of the anchors
    keypoints_decoded : dict with the shape and dtype of the decoded output
    kernel_name : the kernel name

    Returns : None
    ----------
    """
    tik_instance = tik.Tik(tik.Dprofile(), True)
    util.check_kernel_name(kernel_name)
    check_decode_cornerpoints_target_bg_params(
        keypoints_prediction, anchors, keypoints_decoded)

    init_shape = InitShape(keypoints_prediction, anchors, keypoints_decoded)
    total_handling_times, last_handling_n = tiling_func(init_shape.shape_x[0])
    init_first_tensor = InitFirstTensor(tik_instance, init_shape)

    with tik_instance.for_range(0, total_handling_times - CONFIG_ONE) as current_handling_times:
        n_x = SINGLE_N_MAX
        init_number = InitNumber(n_x)
        with tik_instance.new_stmt_scope():
            init_second_tensor = InitsecondTensor(tik_instance, init_shape, init_number)
            init_third_tensor = InitThirdTensor(tik_instance, init_shape, init_number)
            calculate_process(tik_instance, init_number, init_first_tensor,
                              init_second_tensor, init_third_tensor,
                              current_handling_times)

    n_x = last_handling_n
    init_number = InitNumber(n_x)
    with tik_instance.new_stmt_scope():
        init_second_tensor = InitsecondTensor(tik_instance, init_shape, init_number)
        init_third_tensor = InitThirdTensor(tik_instance, init_shape, init_number)
        calculate_process(tik_instance, init_number, init_first_tensor,
                          init_second_tensor, init_third_tensor,
                          total_handling_times - CONFIG_ONE)

    tik_instance.BuildCCE(
        kernel_name=kernel_name,
        inputs=[init_first_tensor.data_x, init_first_tensor.data_y],
        outputs=[init_first_tensor.data_z])
def custom_subtract(shape_x, shape_y, dtype, kernel_name="cce_subtract",
                    need_build=True, need_print=True):
    """
    do element-wise subtract operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1
    shape_y : shape of input data2
    dtype : source data type, support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_subtract"
    need_build : if need to build CCEC kernel, default value is True
    need_print : if need to print the ir, default value is True

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_subtract_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    data1 = tvm.placeholder(shape_x, dtype=dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data2_tmp1 = te.lang.cce.broadcast(data2, shape_max)
        res = te.lang.cce.vsub(data1_tmp1, data2_tmp1)
        sch = generic.auto_schedule(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": [data1, data2, res]}
    te.lang.cce.cce_build_code(sch, config)
def check_param(self):
    """
    check the parameters

    :return: None, raises RuntimeError if any parameter is invalid
    """
    var_out_shape = self.var_out.get("shape")
    var_out_dtype = self.var_out.get("dtype").lower()
    if var_out_dtype == "bool":
        var_out_dtype = "int8"
    util.check_kernel_name(self.kernel_name)
    util.check_shape_rule(self.var_shape)
    util.check_shape_rule(self.indices_shape)
    util.check_shape_rule(self.updates_shape)
    util.check_shape_rule(var_out_shape)

    util.check_tensor_shape_size(self.var_shape)
    util.check_tensor_shape_size(self.indices_shape)
    util.check_tensor_shape_size(self.updates_shape)
    util.check_tensor_shape_size(var_out_shape)

    check_list_var = ("float16", "float32", "int32", "int8", "uint8")
    check_list_indices = "int32"
    util.check_dtype_rule(self.var_dtype, check_list_var)
    util.check_dtype_rule(self.indices_dtype, check_list_indices)
    util.check_dtype_rule(self.updates_dtype, check_list_var)
    util.check_dtype_rule(var_out_dtype, check_list_var)

    if var_out_shape != self.var_shape:
        raise RuntimeError(
            "var_out's shape must be the same as var's shape")

    if (self.updates_dtype != self.var_dtype or
            var_out_dtype != self.var_dtype):
        raise RuntimeError(
            "updates's datatype and var_out's datatype must be the"
            " same as var's datatype")

    if self.nd_flag:
        if len(self.indices_shape) < 2:
            raise RuntimeError(
                "the length of indices_shape must not be less than 2")
        k = self.indices_shape[-1]
        updates_len = len(self.indices_shape) - 1 + len(self.var_shape) - k
        if k > len(self.var_shape):
            raise RuntimeError(
                "indices_shape[-1] cannot be greater than var's rank")
        if len(self.updates_shape) != updates_len:
            raise RuntimeError("the length of updates must be len(indices_"
                               "shape)-1+len(var_shape)-indices_shape[-1]")
        updates_true_shape = self.indices_shape[:-1] + self.var_shape[k:]
    else:
        updates_true_shape = (self.var_shape[:self.axis] +
                              self.indices_shape +
                              self.var_shape[self.axis + 1:])

    if self.updates_shape != updates_true_shape:
        raise RuntimeError("updates's shape is illegal")
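# Illustrative sketch (the helper name is ours, not part of the operator): the
# expected updates shape that check_param enforces. In the ND case (nd_flag),
# updates = indices_shape[:-1] + var_shape[k:] with k = indices_shape[-1];
# otherwise the indices shape replaces the var axis being scattered into.
def _expected_updates_shape(var_shape, indices_shape, axis=None, nd_flag=False):
    if nd_flag:
        k = indices_shape[-1]
        return list(indices_shape[:-1]) + list(var_shape[k:])
    return list(var_shape[:axis]) + list(indices_shape) + list(var_shape[axis + 1:])

# e.g. var (5, 4) with nd indices (2, 1) -> updates (2, 4);
#      var (5, 4) with indices (3,) on axis 0 -> updates (3, 4).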
def fake_learned_scale_quant_perlayer(
        input_x, alpha, quant_max, out, neg_trunc,
        kernel_name="fake_learned_scale_quant_perlayer"):
    """FakeLearnedScaleQuantPerLayer"""
    input_shape = input_x.get("shape")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")

    alpha_shape = util.scalar2tensor_one(alpha_shape)
    quant_max_shape = util.scalar2tensor_one(quant_max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, 1)
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )

    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(alpha_shape, name="alpha_data", dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape, name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perlayer_compute(input_data, alpha_data,
                                                    quant_max_data, neg_trunc,
                                                    kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, alpha_data, quant_max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list,
              "bool_storage_as_1bit": False}
    te.lang.cce.cce_build_code(sch, config)
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d [co, ci] -> 4d [1, co, ci, 1],
    # so channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric,
                                        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of the leaky_relu operation
    y = gradients (x > 0) or negative_slope * gradients (x <= 0).

    support dtype: float16, float32

    Parameters
    ----------
    g : dict
        the backpropagated gradients to the corresponding leaky_relu operation
    x : dict
        the x passed as output of leaky_relu operation
    y : dict
        the output of leaky_relu back propagation
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """
    shape_g = g.get("shape")
    shape_x = x.get("shape")
    dtype_g = g.get("dtype").lower()
    dtype_x = x.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_g)
    util.check_shape_rule(shape_x)
    util.check_tensor_shape_size(shape_g)
    util.check_tensor_shape_size(shape_x)

    shape_list = util.produce_shapes(shape_g, shape_x)
    util.check_tensor_shape_size(shape_list[2])

    # check input tensor data type
    check_list = ["float16", "float32"]
    util.check_dtype_rule(dtype_g, check_list)
    util.check_dtype_rule(dtype_x, check_list)
    util.compare_tensor_dict_key(g, x, "dtype")

    shape_g, shape_x = refine_shapes_for_broadcast(shape_list[0], shape_list[1])
    data_g = tvm.placeholder(shape_g, name="data_g", dtype=dtype_g)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x)
    res = leaky_relu_grad_compute(data_g, data_x, y, negative_slope, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_g, data_x, res]}
    te.lang.cce.cce_build_code(schedule, config)
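# A minimal NumPy reference for the semantics documented above (a sketch, not
# the kernel): dx = g where x > 0, and negative_slope * g where x <= 0.
# Assumes numpy is available; the helper name is illustrative.
def _leaky_relu_grad_reference(g, x, negative_slope=0):
    import numpy as np
    return np.where(np.asarray(x) > 0, g, negative_slope * np.asarray(g))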
def custom_l2_loss(shape, dtype, kernel_name="cce_tf_l2_loss",
                   need_build=False, need_print=False):
    """
    Computes half the L2 norm of a tensor without the sqrt:

    output = sum(t ** 2) / 2

    Parameters
    ----------
    shape : shape of data
    dtype : source data type, only support float16, float32
    kernel_name : cce kernel name, default value is "cce_tf_l2_loss"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    util.check_reduce_shape_rule(shape)

    check_list = ["float16", "float32"]
    if dtype.lower() not in check_list:
        raise RuntimeError("tf_l2_loss_cce only supports %s while dtype is %s"
                           % (",".join(check_list), dtype))

    shape, axis = util.simplify_axis_shape(shape, range(len(shape)))

    inp_dtype = dtype.lower()
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # scale by 1/sqrt(2) before squaring, so the summed squares are halved
    coeff_sqrt = tvm.const(1.0 / (2 ** (0.5)), dtype=inp_dtype)
    data_mul = te.lang.cce.vmuls(data_input, coeff_sqrt)
    data_sqr = te.lang.cce.vmul(data_mul, data_mul)
    res = te.lang.cce.sum(data_sqr, axis)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
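# Why the 1/sqrt(2) constant above yields sum(t ** 2) / 2: scaling t by
# 1/sqrt(2) before squaring halves each squared term. A NumPy sanity sketch
# (assumes numpy; the helper name is ours):
def _l2_loss_reference(t):
    import numpy as np
    scaled = np.asarray(t) * (1.0 / 2 ** 0.5)
    return np.sum(scaled * scaled)   # equals np.sum(t ** 2) / 2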
def custom_sign(shape, dtype, kernel_name="cce_custom_sign",
                need_build=False, need_print=False):
    """
    algorithm:
        sign = round(x * 32768 / (2 ** (-15) + |x * 32768|))
    calculating data type is float16

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype,
            only support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_custom_sign"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "custom_sign_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    shape = util.shape_refine(shape)
    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        res = custom_sign_compute([data], shape, dtype, kernel_name,
                                  need_build, need_print)
        sch = generic.auto_schedule(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)
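# A NumPy sketch of the sign approximation in the docstring above,
# sign = round(x * 32768 / (2 ** (-15) + |x * 32768|)). The helper name is
# ours and this is a reference for the formula, not the CCE kernel:
def _sign_reference(x):
    import numpy as np
    big = np.asarray(x, dtype=np.float64) * 32768.0
    return np.round(big / (2.0 ** (-15) + np.abs(big)))  # -1, 0, or 1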
def correction_mul_grad(dout, x, batch_std, running_std, dx, mul_dx, channel,
                        kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = x.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")
    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()
    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_running_std, ("float16", "float32"))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(dx, mul_dx, "shape")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be the same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c, name="running_std",
                                    dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t, running_std_t,
                                           channel, data_format, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + res_list
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def custom_logical_not(shape, dtype, kernel_name="cce_tf_logical_not",
                       need_build=False, need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data
    dtype : the data type, support bool
    kernel_name : cce kernel name, default value is "cce_tf_logical_not"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        # `is True` would be a Python identity test on a tvm expression and
        # never holds; build an equality expression instead.
        result = tvm.compute(
            shape,
            lambda *i: tvm.select(data(*i) == tvm.const(True, inp_dtype),
                                  tvm.const(False, inp_dtype),
                                  tvm.const(True, inp_dtype)),
            name="result")
        schedule = tvm.create_schedule(result.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data, result], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data, result], "cce", name=kernel_name)
def minmax_update_perlayer(x, min_val, max_val, min_up, max_up,
                           ema, ema_decay, kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def hwcn_2_fractal_z_c04(src, dst, src_format, dst_format,
                         kernel_name="hwcn_2_fractal_z_c04"):
    """
    algorithm: hwcn_2_fractal_z_c04

    Parameters
    ----------
    src: dict
        dict with keys(shape, dtype) of src
    dst: dict
        dict with keys(shape, dtype) of dst
    src_format: str
        data format of src
    dst_format: str
        data format of dst
    kernel_name: str
        kernel name, default value is "hwcn_2_fractal_z_c04"

    Returns
    -------
    tik_instance: tik_instance
    """
    src_shape = src.get("shape")
    src_dtype = src.get("dtype").lower()
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(src_shape)
    util.check_tensor_shape_size(src_shape)
    check_list = ("float16",)
    util.check_dtype_rule(src_dtype, check_list)

    if len(src_shape) != 4:
        raise RuntimeError("hwcn_2_fractal_z_c04 only supports 4D "
                           "while src shape is %s"
                           % ", ".join(str(i) for i in src_shape))
    if src_shape[2] > 4:
        raise RuntimeError("hwcn_2_fractal_z_c04 only supports C <= 4 "
                           "while src shape is %s"
                           % ", ".join(str(i) for i in src_shape))
    if src_format.upper() != "HWCN":
        raise RuntimeError("hwcn_2_fractal_z_c04 only supports %s "
                           "while src format is %s" % ("HWCN", src_format))
    if dst_format.upper() != "FRACTAL_Z_C04":
        raise RuntimeError("hwcn_2_fractal_z_c04 only supports %s "
                           "while dst format is %s"
                           % ("FRACTAL_Z_C04", dst_format))

    src_shape = list(src_shape)
    hwcn_2_fractal_z_c04_template = HWCN2FRACTALZC04Compute(src_shape, src_dtype,
                                                            kernel_name)
    return hwcn_2_fractal_z_c04_template.get_tik_instance()
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):

    check_list = ("float16", "float32")

    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()
    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)  # check the shape: 1 to 8 dimensions are allowed
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)  # check the shape size of the first input
    util.check_dtype_rule(dtype_x1, check_list)  # check the input data type

    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)

    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the type of x1, x2, x3 must be the same!")

    util.check_kernel_name(kernel_name)  # check the kernel_name

    # take the larger size of each dimension of shape_x1, shape_x2, shape_x3
    # as shape_max
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)  # check shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)  # check shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)  # broadcast shape_x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)  # broadcast shape_x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}
    te.lang.cce.cce_build_code(schedule, config)
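# Assuming the usual addcdiv definition (y = x1 + alpha * x2 / x3, with
# broadcasting), a NumPy sketch of what addcdiv_compute is expected to
# produce. This is our assumption for orientation only, since the compute
# body is not shown here; the helper name is ours.
def _addcdiv_reference(x1, x2, x3, alpha=1.0):
    import numpy as np
    return np.asarray(x1) + alpha * (np.asarray(x2) / np.asarray(x3))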
def _shape_and_dtype_check(x, y_grad, target, weight, total_weight,
                           reduction, kernel_name):
    x_shape = x.get("shape")
    x_dtype = x.get("dtype").lower()
    y_grad_shape = y_grad.get("shape")
    y_grad_dtype = y_grad.get("dtype").lower()
    target_shape = target.get("shape")
    target_dtype = target.get("dtype").lower()
    total_weight_shape = total_weight.get("shape")
    total_weight_dtype = total_weight.get("dtype").lower()
    weight_shape = weight.get("shape")
    weight_dtype = weight.get("dtype").lower()

    util.check_tensor_shape_size(weight_shape)
    util.check_shape_rule(weight_shape)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(y_grad_shape)
    util.check_shape_rule(target_shape)
    util.check_tensor_shape_size(y_grad_shape)
    util.check_tensor_shape_size(target_shape)
    util.check_kernel_name(kernel_name)

    util.check_dtype_rule(x_dtype, "float32")
    util.check_dtype_rule(y_grad_dtype, "float32")
    util.check_dtype_rule(target_dtype, "int32")
    util.check_dtype_rule(weight_dtype, "float32")
    util.check_dtype_rule(total_weight_dtype, "float32")

    if reduction in ("mean", "sum") and y_grad_shape[0] != 1:
        raise RuntimeError("The shape of y_grad must be (1,)"
                           " while reduction is mean or sum.")
    if len(x_shape) == 1 and y_grad_shape[0] != 1:
        raise RuntimeError("The shape of y_grad must be (1,)"
                           " while input x is 1D.")
    if len(x_shape) > DIM2:
        raise RuntimeError("The dimension of x should be equal to"
                           " or less than two.")
    if len(x_shape) == DIM2 and x_shape[0] != target_shape[0]:
        raise RuntimeError("The first dimension of x and"
                           " target should be equal")
    if x_shape[-1] != weight_shape[0]:
        raise RuntimeError("The last dimension of x and the first dimension"
                           " of weight should be equal")
    if len(y_grad_shape) != 1:
        raise RuntimeError("The dimension of y_grad should be 1D.")
    if len(weight_shape) != 1:
        raise RuntimeError("The dimension of weight should be 1D.")
    if len(target_shape) != 1:
        raise RuntimeError("The dimension of target should be 1D.")
    if total_weight_shape[0] != 1:
        raise RuntimeError("The shape of total_weight must be (1,)")
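# For orientation, shapes that satisfy the checks above (values illustrative):
# x (N, C) float32, target (N,) int32, weight (C,) float32, total_weight (1,)
# float32; y_grad is (1,) when reduction is "mean" or "sum", and a 1D tensor
# (presumably (N,)) when reduction is "none" with a 2D x.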
def batchnorm_fold2(x, beta, gamma, batch_std, batch_mean, running_std, y,
                    kernel_name="batchnorm_fold2"):
    """_BatchNormFold2 op"""
    shape = x.get("shape")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    check_list = ["float16", "float32"]
    inp_dtype = x.get("dtype").lower()
    if inp_dtype not in check_list:
        raise RuntimeError("Dtype of input only supports float16, float32")
    data_format = x.get("format")
    ori_format = x.get("ori_format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Unsupported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must be the same as ori_format")

    shape_c = gamma.get("shape")
    if gamma.get("format").upper() == "NCHW":
        shape_c = 1, gamma.get("shape")[0], 1, 1

    x_t = tvm.placeholder(shape, name="x", dtype=inp_dtype)
    beta_t = tvm.placeholder(shape_c, name="beta", dtype=inp_dtype)
    gamma_t = tvm.placeholder(shape_c, name="gamma", dtype=inp_dtype)
    batch_std_t = tvm.placeholder(shape_c, name="batch_std", dtype=inp_dtype)
    batch_mean_t = tvm.placeholder(shape_c, name="batch_mean", dtype=inp_dtype)
    running_std_t = tvm.placeholder(shape_c, name="running_std", dtype=inp_dtype)

    res = batchnorm_fold2_compute(x_t, beta_t, gamma_t, batch_std_t,
                                  batch_mean_t, running_std_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [x_t, beta_t, gamma_t, batch_std_t, batch_mean_t,
                              running_std_t, res]}
    te.lang.cce.cce_build_code(sch, config)
def decode_cornerpoints_target_wrt_center_v1(
        keypoints_prediction, anchors, keypoints_decoded,
        kernel_name="cce_decode_cornerpoints_target_wrt_center_v1_float16"):
    """
    the external interface function of decode_cornerpoints_target_wrt_center_v1

    Parameters:
    ----------
    keypoints_prediction : dict with the shape and dtype of the predicted keypoints
    anchors : dict with the shape and dtype of the anchors
    keypoints_decoded : dict with the shape and dtype of the decoded output
    kernel_name : the kernel name

    Returns : None
    ----------
    """
    check_decode_cornerpoints_target_wrt_center_v1_shape_params(
        keypoints_prediction, anchors, keypoints_decoded)
    util.check_kernel_name(kernel_name)
    shape_x = keypoints_prediction.get("shape")

    tik_instance = tik.Tik(tik.Dprofile(), True)
    core_num = tik.Dprofile().get_aicore_num()
    tiling = Tiling(shape_x[0], core_num)

    # gm_tensor init
    gm_tensor = InitTensor(tik_instance, shape_x, [shape_x[0], FOUR], 'float16')

    if tiling.factor > 0:
        thread_num = TWO if tiling.factor != ONE else ONE
        with tik_instance.for_range(0, core_num, block_num=core_num) as current_core:
            with tik_instance.for_range(0, tiling.factor,
                                        thread_num=thread_num) as current_factor:
                shape = InitShape(SINGLE_N_MAX)
                current_data_x = EIGHT * SINGLE_N_MAX * (current_core + core_num * current_factor)
                current_data_y = FOUR * SINGLE_N_MAX * (current_core + core_num * current_factor)
                calculate_process(tik_instance, gm_tensor, shape,
                                  current_data_x, current_data_y)
    if tiling.last_core > 0:
        thread_num = TWO if tiling.last_core != ONE else ONE
        with tik_instance.for_range(0, tiling.last_core,
                                    thread_num=thread_num) as current_core:
            shape = InitShape(SINGLE_N_MAX)
            current_data_x = EIGHT * SINGLE_N_MAX * (core_num * tiling.factor + current_core)
            current_data_y = FOUR * SINGLE_N_MAX * (core_num * tiling.factor + current_core)
            calculate_process(tik_instance, gm_tensor, shape,
                              current_data_x, current_data_y)
    if tiling.last_n > 0:
        shape = InitShape(tiling.last_n)
        current_data_x = EIGHT * SINGLE_N_MAX * (core_num * tiling.factor + tiling.last_core)
        current_data_y = FOUR * SINGLE_N_MAX * (core_num * tiling.factor + tiling.last_core)
        calculate_process(tik_instance, gm_tensor, shape,
                          current_data_x, current_data_y)

    # build_cce
    tik_instance.BuildCCE(
        kernel_name=kernel_name,
        inputs=[gm_tensor.data_x, gm_tensor.data_y],
        outputs=[gm_tensor.data_z])
def custom_Tile(shape, dtype, tiles, axis=1, kernel_name="cce_caffe_tile_layer",
                need_build=False, need_print=False):
    """Operation and Schedule for the tile layer: construct an array by
    repeating the input along the given axis.

    Parameters
    ----------
    shape: shape of Tensor
    dtype: the data type, only support float16, float32, int32, int8, uint8
    tiles: the number of copies (tiles) of the tensor to output
    axis: the index of the axis to tile
    kernel_name: cce kernel name, default value is "cce_caffe_tile_layer"
    need_build: if need to build CCEC kernel, default value is False
    need_print: if need to print the ir, default value is False

    Returns
    -------
    None
    """
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "caffe_tile_layer only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if axis >= len(shape) or axis < -len(shape):
        raise RuntimeError(
            "input axis is out of range, axis value can be from %d to %d"
            % (-len(shape), len(shape) - 1))
    if not isinstance(tiles, int):
        raise RuntimeError("type of tiles must be int.")
    if tiles <= 0:
        raise RuntimeError("number of tiles must be positive.")

    multiples = [1] * len(shape)
    multiples[axis] = tiles

    tf_tile.tf_tile_cce(shape, dtype, multiples, kernel_name=kernel_name,
                        need_build=need_build, need_print=need_print)
def check_param(x, grad, argmax, y, ksize, strides, padding, dtype, dilation,
                ceil_mode, kernel_name):
    """
    check that the parameters are valid; if one is invalid, raise an error

    Parameters
    ----------
    x: dict, shape and datatype
    grad: dict, shape and datatype
    argmax: dict, shape and datatype
    y: dict, shape and datatype
    ksize: kernel or window size, minimum length is 4,
           just like [1, poolingWindowH, poolingWindowW, 1]
    strides: stride, minimum length is 4,
             just like [1, poolingStrideH, poolingStrideW, 1]
    padding: pad mode

    Returns
    -------
    None
    """
    y_shape = x.get("shape")
    y_dtype = x.get("dtype").lower()
    y_dtype_arg = y.get("dtype").lower()
    input_grad_shape = grad.get("shape")
    grad_dtype = grad.get("dtype").lower()
    argmax_shape = argmax.get("shape")
    argmax_dtype = argmax.get("dtype").lower()

    util.check_shape_rule(y_shape)
    util.check_shape_rule(input_grad_shape)
    util.check_shape_rule(argmax_shape)
    util.check_kernel_name(kernel_name)
    check_shape_5hd(y_shape)
    check_shape_5hd(input_grad_shape)
    util.check_tensor_shape_size(input_grad_shape)
    util.check_tensor_shape_size(argmax_shape)
    util.check_tensor_shape_size(y_shape)
    util.check_dtype_rule(grad_dtype, ("float16", "float32", "int32"))
    util.check_dtype_rule(argmax_dtype, ("uint16",))
    util.check_dtype_rule(y_dtype, ("float16", "float32", "int32"))

    if y_dtype != grad_dtype or y_dtype_arg != y_dtype:
        raise RuntimeError("The dtype of tensor must be same")

    if dtype != DT_INT32 and dtype != DT_INT64:
        raise RuntimeError(
            "The dtype of input max indice must be int32 or int64")

    check_output_dim_with_ksize_stride(padding, input_grad_shape, y_shape,
                                       ksize, strides, dilation, ceil_mode)
def batchnorm_fold_grad(d_batch_mean, d_batch_std, x, batch_mean, batch_std,
                        dx, epsilon=1e-5, is_training=True, freeze_bn=0,
                        kernel_name="batchnorm_fold_grad"):
    """batchnorm_fold_grad op"""
    util.check_kernel_name(kernel_name)
    for iv in (d_batch_mean, d_batch_std, x, batch_mean, batch_std):
        util.check_shape_rule(iv.get("shape"))
        util.check_tensor_shape_size(iv.get("shape"))
    check_tuple = ("float16", "float32")
    for iv in (d_batch_mean, d_batch_std, x, batch_mean, batch_std):
        util.check_dtype_rule(iv.get("dtype").lower(), check_tuple)

    shape_x = x.get("shape")
    dtype_x = x.get("dtype")
    format_data = x.get("format").upper()
    if format_data not in ("NCHW", "NC1HWC0"):
        raise RuntimeError("Format of input only supports 4D and 5HD")

    shape_mean = d_batch_mean.get("shape")
    dtype_mean = d_batch_mean.get("dtype").lower()
    if format_data == "NC1HWC0":
        if len(shape_x) != 5:
            raise RuntimeError("batchnorm_fold only supports 5D shape "
                               "when input format is NC1HWC0")
        shape_mean = (1, shape_x[1], 1, 1, shape_x[4])
    elif format_data == "NCHW":
        if len(shape_x) < 2 or len(shape_x) > 4:
            raise RuntimeError("batchnorm_fold only supports shape 2D to 4D")
        if shape_x[1] != shape_mean[0]:
            raise RuntimeError("data_format is NCHW, the first dimension of "
                               "shape_mean must be equal to the second axis "
                               "of shape_x")
        shape_mean = (1, shape_x[1],)
        for _ in range(2, len(shape_x)):
            shape_mean = shape_mean + (1,)

    d_batch_mean = tvm.placeholder(shape_mean, name="d_batch_mean", dtype=dtype_mean)
    d_batch_std = tvm.placeholder(shape_mean, name="d_batch_std", dtype=dtype_mean)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=dtype_x.lower())
    batch_mean = tvm.placeholder(shape_mean, name="batch_mean", dtype=dtype_mean)
    batch_std = tvm.placeholder(shape_mean, name="batch_std", dtype=dtype_mean)

    res = _batchnorm_fold_grad_compute(d_batch_mean, d_batch_std, data_x,
                                       batch_mean, batch_std)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    tensor_list = [d_batch_mean, d_batch_std, data_x, batch_mean, batch_std] + res
    config = {"name": kernel_name,
              "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(sch, config)
def segment_max_d(x, y, segment_ids, kernel_name="segment_max_d"):
    """
    Operation and Schedule for segment_max

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y: dict
        shape and dtype of output
    segment_ids : list
        should be the size of the first dimension
    kernel_name: str
        kernel name, default value is "segment_max_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if dtype.lower() not in check_list:
        raise RuntimeError("segment_max only supports float16, float32, int32")

    # when shape[0] > FIRST_DIM_SIZE_THRESHOLD, the default stack space
    # may not be enough, so prompt the user
    if shape[0] > FIRST_DIM_SIZE_THRESHOLD:
        print("Default stack space may not be enough. "
              "You shall increase the stack space.")

    dtype = dtype.lower()
    _check_segment_ids(shape, segment_ids)
    input_data = tvm.placeholder(shape, name="input_data", dtype=dtype)

    with tvm.target.cce():
        res = segment_max_d_compute(input_data, y, segment_ids, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [input_data, res]}
    te.lang.cce.cce_build_code(sch, config)
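# A NumPy reference for the segment_max semantics above (a sketch; the helper
# name is ours and numpy is assumed): output row i is the elementwise max over
# the input rows whose segment id equals i. Empty segments are left at zero
# here, which may differ from the kernel's fill value.
def _segment_max_reference(data, segment_ids):
    import numpy as np
    data = np.asarray(data)
    num_segments = max(segment_ids) + 1
    out = np.zeros((num_segments,) + data.shape[1:], dtype=data.dtype)
    for seg in range(num_segments):
        rows = [row for row, sid in zip(data, segment_ids) if sid == seg]
        if rows:
            out[seg] = np.maximum.reduce(rows)
    return out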
def _check_parameters(src, dst, src_format, dst_format, kernel_name):
    """
    check the parameters including src_shape, dst_shape,
    src_format, dst_format, dtype and kernel_name
    """
    src_shape = src.get("shape")
    dst_shape = dst.get("shape")
    dtype = src.get("dtype")
    dtype_dst = dst.get("dtype")

    if src_format.lower() != "ndhwc":
        raise RuntimeError("src_format must be NDHWC!")
    if dst_format.lower() != "ndc1hwc0":
        raise RuntimeError("dst_format must be NDC1HWC0!")

    util.check_kernel_name(kernel_name)
    check_list = ("float16",)
    util.check_dtype_rule(dtype, check_list)
    if dtype != dtype_dst:
        raise RuntimeError("dtype of src and dst are different!")

    util.check_shape_rule(src_shape, 5, 5)
    util.check_shape_rule(dst_shape, 6, 6)
    util.check_tensor_shape_size(src_shape)
    util.check_tensor_shape_size(dst_shape)

    if dst_shape[5] != 16:
        raise RuntimeError(
            "the last dimension of dst_shape is not 16, c0 must be 16!")

    if dst_shape[0] != src_shape[0] \
            or (dst_shape[1] != src_shape[1] and dst_shape[1] != src_shape[1] + 2) \
            or dst_shape[3] != src_shape[2] or dst_shape[4] != src_shape[3]:
        raise RuntimeError("the shape of src and dst do not match: "
                           "the 1st, 2nd, 4th, 5th dimensions of dst_shape and "
                           "the 1st, 2nd, 3rd, 4th dimensions of src_shape "
                           "must be the same!")

    c_src = src_shape[4]
    c_1 = dst_shape[2]
    c_0 = dst_shape[5]
    if not ((c_src <= c_1 * c_0) and (c_src > (c_1 - 1) * c_0)):
        raise RuntimeError("c must be less than or equal to c1*c0, "
                           "and greater than (c1 - 1)*c0!")
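# Given the constraints above, the dst shape that passes the checks can be
# derived from the src shape (a sketch; the helper name is ours). C1 is the
# smallest value with C <= C1 * 16, i.e. ceil(C / 16), and C0 is fixed at 16.
# Note the checks also allow D + 2 in the second dst dimension.
def _expected_ndc1hwc0_shape(src_shape):
    n, d, h, w, c = src_shape
    c0 = 16
    c1 = (c + c0 - 1) // c0
    return [n, d, c1, h, w, c0]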