def __init__(self, shape, dtype, kernel_name):
    """
    init the parameters

    Parameters
    ----------
    shape: tuple or list
        the shape of input tensor
    dtype: string
        the dtype of input tensor
    kernel_name: str
        kernel name, default value is "reverse_ext2"

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.aicore_num = cce.cce_conf.get_soc_spec(cce.cce_conf.CORE_NUM)
    self.shape = list(shape)
    self.dtype = dtype
    self.kernel_name = kernel_name

    block_byte_size = 32
    dtype_byte_size = cce.cce_intrin.get_bit_len(dtype) // 8
    self.data_each_block = block_byte_size // dtype_byte_size
    ub_byte_size = (cce.cce_conf.get_soc_spec(cce.cce_conf.UB_SIZE) -
                    block_byte_size)
    self.ub_element_number = (ub_byte_size // dtype_byte_size //
                              self.data_each_block * self.data_each_block)

    self.input_total_num = functools_reduce(lambda x, y: x * y, shape)
    self.data_num_each_core = self.input_total_num // self.aicore_num
    self.last_data_num = self.input_total_num % self.aicore_num

    self.input_gm = self.tik_instance.Tensor(self.dtype,
                                             self.shape,
                                             name="input_gm",
                                             scope=tik.scope_gm)
    self.output_gm = self.tik_instance.Tensor(self.dtype,
                                              self.shape,
                                              name="output_gm",
                                              scope=tik.scope_gm)
    self.input_ub = None
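
# A minimal, self-contained sketch of the buffer-sizing arithmetic used in the
# constructor above. The float16 element size and the 248 KB unified-buffer
# size are illustrative assumptions, not values read from a real Dprofile.
def _illustrate_ub_sizing(dtype_byte_size=2, ub_size_bytes=248 * 1024):
    block_byte_size = 32
    # elements that fit in one 32-byte block: 16 for float16, 8 for float32
    data_each_block = block_byte_size // dtype_byte_size
    usable_ub = ub_size_bytes - block_byte_size
    # round the element count down to a whole number of 32-byte blocks
    ub_element_number = (usable_ub // dtype_byte_size //
                         data_each_block * data_each_block)
    return data_each_block, ub_element_number

print(_illustrate_ub_sizing())  # (16, 126960) for the assumed float16 case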
def init_tik_instance(self):
    """
    init the tik_instance

    Parameters
    ----------

    Returns
    -------
    None
    """
    profile = tik.Dprofile()
    self.tik_instance = tik.Tik(profile)
    self.real_core_num = profile.get_aicore_num()
    self.l1_buffer_size = profile.get_l1_buffer_size()
def __init__(self, input_dict):
    """
    init the Crop parameters

    Parameters
    ----------
    input_dict: input_dict is a dict, the keys as follows:
        x1: dict, shape and datatype, datatype supports int8, uint8,
            int16, uint16, int32, uint32, int64, uint64, float16, float32
        x2: dict, shape and datatype, datatype supports int8, uint8,
            int16, uint16, int32, uint32, int64, uint64, float16, float32
        y: dict, shape and datatype, datatype supports int8, uint8,
            int16, uint16, int32, uint32, int64, uint64, float16, float32
        axis: crop start with axis
        offsets: crop start offset of each axis
        kernel_name: cce kernel name, default value is "crop"

    Returns
    -------
    None
    """
    self.instance = tik.Tik(tik.Dprofile())
    self.dtype = input_dict.get("x1").get("dtype").lower()
    self.dsize = common_util.get_data_size(self.dtype)
    total_size = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE)
    ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
    burnest_len = constant.BLOCK_SIZE // self.dsize
    ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
    self.one_max_size = ub_size
    x1_len = get_shape_total_number(input_dict.get("x1").get("shape"))
    x1_len = ((x1_len + burnest_len - 1) // burnest_len) * burnest_len
    mod = input_dict.get("y").get("shape")[-1] % burnest_len
    if mod != 0:
        x1_len = x1_len + burnest_len
    self.x1_gm = self.instance.Tensor(self.dtype, (x1_len, ),
                                      name="x1_gm",
                                      scope=tik.scope_gm)
    self.x2_gm = self.instance.Tensor(self.dtype, (32, ),
                                      name="x2_gm",
                                      scope=tik.scope_gm)
    y_len = get_shape_total_number(input_dict.get("y").get("shape"))
    y_len = ((y_len + burnest_len - 1) // burnest_len) * burnest_len
    if mod != 0:
        y_len = y_len + burnest_len
    self.y_gm = self.instance.Tensor(self.dtype, (y_len, ),
                                     name="y_gm",
                                     scope=tik.scope_gm)
    self.input_dict = input_dict
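
# A standalone sketch of the burst-length rounding used in the constructor
# above: element counts are rounded up to a whole number of 32-byte bursts.
# The float32 element size and the raw count below are assumed values for
# illustration only.
BLOCK_SIZE = 32               # bytes per burst, mirroring constant.BLOCK_SIZE
dsize = 4                     # assumed float32 element size in bytes
burst_len = BLOCK_SIZE // dsize               # 8 elements per burst
x1_len = 100                  # assumed raw element count
aligned = ((x1_len + burst_len - 1) // burst_len) * burst_len
assert aligned == 104         # 100 rounded up to the next multiple of 8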
def __init__(self, input_dict, stride_h, stride_w):
    self.dprofile = tik.Dprofile()
    self.tik_instance = tik.Tik(self.dprofile)
    self.ub_size = self.dprofile.get_unified_buffer_size()
    self.dtype = input_dict.get("x").get("dtype").lower()
    self.x_shape = input_dict.get("x").get("shape")
    self.dsize = get_data_size(self.dtype)
    self.y_shape = cal_out_shape(self.x_shape, stride_h, stride_w)
    self.x_gm = self.tik_instance.Tensor(self.dtype,
                                         self.x_shape,
                                         name="x_gm",
                                         scope=tik.scope_gm)
    self.y_gm = self.tik_instance.Tensor(self.dtype,
                                         self.y_shape,
                                         name="y_gm",
                                         scope=tik.scope_gm)
def map_index(x_dic, data_seq_dic, level_index_dic, y_dic,
              kernel_name="map_index"):
    """
    :param x_dic:
    :param data_seq_dic:
    :param level_index_dic:
    :param y_dic:
    :param kernel_name:
    :return:
    """
    check_list = ["int32"]
    x_shape = x_dic.get("shape")
    x_dtype = x_dic.get("dtype")
    check_dtype(x_dtype.lower(), check_list, param_name="x")

    data_seq_shape = data_seq_dic.get("shape")
    data_seq_dtype = data_seq_dic.get("dtype")
    check_dtype(data_seq_dtype.lower(), check_list, param_name="data_seq")

    y_dtype = y_dic.get("dtype")
    check_dtype(y_dtype.lower(), check_list, param_name="y")

    if x_shape[0] > 8:
        raise RuntimeError("the length of x should "
                           "be less than or equal to 8")
    if data_seq_shape[0] % x_shape[0] != 0:
        raise RuntimeError("the length of data_seq must "
                           "be a multiple of the length of x")

    tik_instance = tik.Tik(tik.Dprofile())

    if level_index_dic:
        level_index_dtype = level_index_dic.get("dtype")
        check_dtype(level_index_dtype.lower(), check_list,
                    param_name="level_index")
        map_index_result = MapIndexProcess((tik_instance, x_dic,
                                            data_seq_dic, y_dic,
                                            level_index_dic))
    else:
        map_index_result = MapIndexProcess((tik_instance, x_dic,
                                            data_seq_dic, y_dic))

    return map_index_result.cce_map_index(kernel_name)
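
# A minimal usage sketch for map_index, assuming hypothetical int32 shapes.
# The dict layout (shape/dtype keys) mirrors the checks above, but the
# concrete sizes and the omitted level_index input are illustrative only.
x_dic = {"shape": (8,), "dtype": "int32"}
data_seq_dic = {"shape": (64,), "dtype": "int32"}   # 64 is a multiple of 8
y_dic = {"shape": (8,), "dtype": "int32"}
tik_instance = map_index(x_dic, data_seq_dic, None, y_dic,
                         kernel_name="map_index")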
def __init__(self, input_dict):
    """
    init the permute parameters
    """
    self.instance = tik.Tik(tik.Dprofile())
    self.dtype = input_dict.get("x").get("dtype").lower()
    self.dsize = 2
    size = get_shape_size(input_dict.get("x").get("shape"))
    self.x_gm = self.instance.Tensor(self.dtype, (size, ),
                                     name="x_gm",
                                     scope=tik.scope_gm)
    self.y_gm = self.instance.Tensor(self.dtype, (size, ),
                                     name="y_gm",
                                     scope=tik.scope_gm)
    ub_size = (UB_SIZE_B - 1024) // 4 // self.dsize // 256 * 256
    self.ub_size = ub_size
    self.input_dict = input_dict
def init_param(self, pooled_hw, dicts, spatial_scale_list, kernel_name):
    """
    init parameters

    Parameters
    ----------
    pooled_hw: (pooled_h, pooled_w)
    dicts: (x_dict, rois_dict, actual_dict, y_dict)
    spatial_scale_list: (spatial_scale_h, spatial_scale_w)
    kernel_name: kernel name

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.pooled_h = pooled_hw[0]
    self.pooled_w = pooled_hw[1]
    self.dtype = dicts[0].get("dtype").lower()
    self.shape = dicts[0].get("shape")
    self.rois_dtype = dicts[1].get("dtype").lower()
    self.rois_shape = dicts[1].get("shape")
    self.output_shape = dicts[3].get("shape")
    self.spatial_scale_h = spatial_scale_list[0]
    self.spatial_scale_w = spatial_scale_list[1]
    self.roi_actual_num_effect = (dicts[2] is not None)
    self.kernel_name = kernel_name

    self.feature_batch = self.shape[0]
    self.fm_c1 = self.shape[1]
    self.fm_h = self.shape[2]
    self.fm_w = self.shape[3]
    self.fm_c0 = self.shape[4]

    self.device_core_num = \
        tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.CORE_NUM)
    self.proposal_num_per_tiling = 128
    self.roi_max_num = self.rois_shape[2]
def clip_boxes_d_compute(boxes_input, img_w, img_h, kernel_name="clip_boxes"): """ the compute process of clip_boxes input: boxes_input:a dict, include shape, and dtype img_w: width of the image img_h: height of the image kernel_name: the kernel name return: the tik container """ const_num = ConstList() tiling_para = TilingFunc(boxes_input.get("shape")) # start the TIK container tik_instance = tik.Tik(tik.Dprofile(), True) anchors = tik_instance.Tensor("float16", (tiling_para.tot_of_blk*const_num.num_d, const_num.num_d), name="anchors", scope=tik.scope_gm) res_anchors = tik_instance.Tensor("float16", (tiling_para.tot_of_blk*const_num.num_d, const_num.num_d), name="res_anchors", scope=tik.scope_gm) with tik_instance.for_range(0, tiling_para.loop_time - CONFIG_ONE, thread_num=tiling_para.thread_num) as loop_i: processing_one_loop(tik_instance, (anchors, res_anchors), tiling_para, (img_h, img_w), loop_i) # the tail processing processing_tail(tik_instance, (anchors, res_anchors), tiling_para, (img_h, img_w)) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[anchors], outputs=[res_anchors]) return tik_instance
def __init__(self, input_data, block_size):
    """
    init space_to_depth base parameters

    Parameters
    ----------
    input_data: shape and data type, data type supports float16, float32,
                int32, uint32, int16, uint16, int8, uint8, int64, uint64
    block_size: must be greater than one. It indicates the block size
    """
    self.input_shape = input_data.get("shape")
    self.dtype = input_data.get("dtype").lower()
    self.dtype_size = common_util.get_data_size(self.dtype)
    self.block_size = block_size
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.output_shape = (self.input_shape[0],
                         self.input_shape[1] // block_size,
                         self.input_shape[2] // block_size,
                         self.input_shape[3] * block_size * block_size)
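
# A worked example of the output-shape rule above, using an assumed NHWC
# input of (1, 4, 6, 3) and block_size=2 (purely illustrative values).
input_shape = (1, 4, 6, 3)
block_size = 2
output_shape = (input_shape[0],
                input_shape[1] // block_size,              # H: 4 -> 2
                input_shape[2] // block_size,              # W: 6 -> 3
                input_shape[3] * block_size * block_size)  # C: 3 -> 12
assert output_shape == (1, 2, 3, 12)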
def check_shape_dtype_format(input_shape, input_dtype, input_format): """ check shape, dtype and format Parameters ---------- input_shape: input dic shape input_dtype: input dtype input_format: input format, NC1HWC0 The common check rule for tensor shape, just for 5hd Returns ------- None """ tik_name = tik.Dprofile().get_product_name() if tik_name == "hisi-es": check_list = ["float16"] else: check_list = ["float16", "float32"] if input_dtype not in check_list: raise RuntimeError("upsample only support %s while dtype is %s" % (",".join(check_list), input_dtype)) util.check_shape_rule(input_shape) if len(input_shape) != DIM_5HD: raise RuntimeError( "The dim of tensor must be %d" ", actual dim is %d" % (DIM_5HD, len(input_shape))) n, c1, h, w, c0 = input_shape shape_c0 = C0 if input_shape[DIM_5HD - 1] != shape_c0: raise RuntimeError( "The value of C0 must be 16") if input_format != "NC1HWC0": raise RuntimeError( "The format must be NC1HWC0, while actual format is %s" %(input_format))
def __init__(self, kernel_name): self.hidden_size = 32 self.feature_size = 32 self.block_size = 16 self.feature_block_num = self.feature_size // self.block_size self.hidden_block_size = self.hidden_size // self.block_size self.batch_size = 32 self.batch_blocks = self.batch_size // self.block_size self.num_step = 16 self.forget_bias = 1.0 self.use_fixpipe = True self.matmul_init_l1out = self.use_fixpipe self.feature_hidden_size = self.feature_size + self.hidden_size self.feature_hidden_block = self.feature_hidden_size // self.block_size self.tik_instance = tik.Tik(tik.Dprofile()) self.fixpipe_workspace = self.tik_instance.Tensor( "float16", (1, 4 * self.hidden_block_size, self.batch_blocks, self.block_size, self.block_size), name="fixpipe_workspace", scope=tik.scope_gm, is_workspace=True) self.declare_gm_tensor() self.init_core() self.tik_instance.BuildCCE( kernel_name, inputs=[ self.gm_x, self.gm_init_h, self.gm_init_c, self.gm_weight, self.gm_b ], outputs=[self.gm_output_h, self.gm_output_c])
def __init__(self, shape_info, param_info):
    classes = param_info['classes']
    coords = param_info['coords']
    boxes = param_info['boxes']
    dtype = param_info['dtype']
    batch = shape_info['batch']
    height = shape_info['height']
    width = shape_info['width']
    dtype_size = 2 if (param_info['dtype'] == "float16") else 4
    self.product_name = ""
    self.total_ub_size = 0
    self.tik_inst = tik.Tik(tik.Dprofile())

    # pad the batch dimension so the last data_move still has a full
    # 32-byte block to read
    batch_padding = 32 // (
        (boxes * (coords + 1 + classes)) * height * width * dtype_size) + 1

    self.yolo_din = \
        self.tik_inst.Tensor(dtype,
                             (batch + batch_padding,
                              boxes * (coords + 1 + classes), height, width),
                             scope=tik.scope_gm, name="yolo_din")

    # output shapes are sized as if fp16, because the dtype cannot be
    # determined at infershape time
    self.crd_dout = \
        self.tik_inst.Tensor(dtype,
                             (batch, boxes * coords,
                              ceil_x(height * width * 2 + 32, 32) // 2),
                             scope=tik.scope_gm, name="crd_dout")
    self.obj_dout = \
        self.tik_inst.Tensor(dtype,
                             (batch,
                              ceil_x(boxes * height * width * 2 + 32,
                                     32) // 2),
                             scope=tik.scope_gm, name="obj_dout")
    self.cls_dout = \
        self.tik_inst.Tensor(dtype,
                             (batch, classes,
                              ceil_x(boxes * height * width * 2 + 32,
                                     32) // 2),
                             scope=tik.scope_gm, name="cls_dout")
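
# A small sketch of the batch_padding formula above, under assumed YOLO-style
# parameters (boxes=3, coords=4, classes=80, height=width=13, float16); the
# numbers are illustrative, not taken from a real network.
boxes, coords, classes = 3, 4, 80
height, width = 13, 13
dtype_size = 2  # float16
elems_per_batch = (boxes * (coords + 1 + classes)) * height * width
bytes_per_batch = elems_per_batch * dtype_size   # 86190 bytes here
# one extra batch is always appended; the division only adds more padding
# when a single batch is smaller than a 32-byte block
batch_padding = 32 // bytes_per_batch + 1
assert batch_padding == 1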
def __init__(self, input_dict):
    """
    init the ShuffleChannel parameters

    Parameters
    ----------
    input_dict: input_dict is a dict, the keys as follows:
        x: dict, shape and datatype, datatype supports int8, uint8, int16,
            uint16, int32, uint32, int64, uint64, float16, float32
        y: dict, shape and datatype, datatype supports int8, uint8, int16,
            uint16, int32, uint32, int64, uint64, float16, float32
        group: 1 channel group
        kernel_name: cce kernel name, default value is "shuffle_channel"

    Returns
    -------
    None
    """
    self.instance = tik.Tik(tik.Dprofile())
    self.dtype = input_dict.get("x").get("dtype").lower()
    self.dsize = common_util.get_data_size(self.dtype)
    total_size = tbe_platform.cce_conf.get_soc_spec(
        tbe_platform.cce_conf.UB_SIZE)
    ub_size = (total_size - RESERVE_SIZE) // (2 * self.dsize)
    burnest_len = constant.BLOCK_SIZE // self.dsize
    ub_size = ((ub_size + burnest_len - 1) // burnest_len) * burnest_len
    self.one_max_size = ub_size
    x_len = get_shape_total_number(input_dict.get("x").get("shape"))
    x_len = ((x_len + burnest_len - 1) // burnest_len) * burnest_len
    hw = input_dict.get("y").get("shape")[2] * \
         input_dict.get("y").get("shape")[3]
    mod = hw % burnest_len
    if mod != 0:
        x_len = x_len + burnest_len
    self.x_gm = self.instance.Tensor(self.dtype, (x_len,),
                                     name="x_gm",
                                     scope=tik.scope_gm)
    self.y_gm = self.instance.Tensor(self.dtype, (x_len,),
                                     name="y_gm",
                                     scope=tik.scope_gm)
    self.input_dict = input_dict
def __init__(self, x, y, split_dim, num_split, kernel_name): """ Init split_d parameters Parameters ---------- x: dict the dict of input tensor. y: list or tuple the list of output tensor. split_dim: int the dimension along which to split_d. num_split: int an integer indicating the number of split_d along `split_dim`. kernel_name: str cce kernel name, default value is "split_d". Returns ------- None """ self.tik_instance = tik.Tik(tik.Dprofile()) self.split_dim = split_dim self.num_split = num_split self.kernel_name = kernel_name self.input_dtype = x.get("dtype").lower() self.output_dtype = y[0].get("dtype").lower() self.input_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len( self.input_dtype) // EIGHT_BIT self.input_data_each_block = BLOCK_BYTES // self.input_dtype_bytes_size self.core_num = tbe_platform.cce_conf.get_soc_spec( tbe_platform.cce_conf.CORE_NUM) self.ub_size = tbe_platform.cce_conf.get_soc_spec( tbe_platform.cce_conf.UB_SIZE) - RESERVED_UB_SIZE self.ub_number = self.ub_size // self.input_dtype_bytes_size self.ub_number = (self.ub_number // self.input_data_each_block ) * self.input_data_each_block self.tiling_gm, self.input_gm, self.outs_gm = self.init_gm_tensor() self.check_input_params() self.ub_number_new = None self.input_ub = None self.temp_ub = None self.tiling_ub = None self.select_mode = None self.input_size_split = None self.output_size_split = None self.act_core_num = None self.loop_each_core = None self.loop_last_core = None self.data_each_core = None self.data_last_core = None self.loop_num = None self.last_num = None self.loop_num_last_core = None self.last_num_last_core = None self.input_num = None self.loop_each = None self.loop_last = None self.loop_each_last_core = None self.loop_last_last_core = None self.loop_burst_len = None
def __init__(self, var, indices, updates, var_out, axis, kernel_name, compute_type): """ Init scatter axis parameters Parameters ---------- var: dict data of input datatype suports float32,float16,int32,int8,uint8 indices: dict data of indices datatype supports int32 updates: dict data of updates datatype supports float32,float16,int32,int8,uint8 var_out: dict data of input axis: bool axis kernel_name: str the name of the operator compute_type: str the compute type of scatter Returns ------- example: var(2, 6, 8, 8) axis=1 process uint is var[axis:] (6,8,8) slice shape small slice shape is var[axis+1:] (8,8) slice num is 2 and divide in each core to proc each proc of slice data(6,8,8) updates_date proc by indices info to copy """ self.tik_instance = tik.Tik(tik.Dprofile()) self.var_shape = var.get("shape") self.var_dtype = var.get("dtype").lower() self.indices_shape = indices.get("shape") self.indices_dtype = indices.get("dtype").lower() self.updates_shape = updates.get("shape") self.updates_dtype = updates.get("dtype").lower() self.var_ele_num = functools_reduce(lambda x, y: x * y, self.var_shape) self.indices_num = functools_reduce(lambda x, y: x * y, self.indices_shape) self.updates_num = functools_reduce(lambda x, y: x * y, self.updates_shape) self.axis = axis self.kernel_name = kernel_name self.compute_type = compute_type self.ub_size_bytes = (tik.Dprofile().get_unified_buffer_size() - UB_RESERVE_SIZE) self.ai_core_num = tik.Dprofile().get_aicore_num() self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(self.var_dtype) // 8 self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(self.indices_dtype) // 8 self.var_data_each_block = 32 // self.var_dtype_bytes_size self.indices_data_each_block = 32 // self.indices_dtype_bytes_size self.check_param(var_out) # indices buf size in ub self.indices_ub_number = 0 # var and updates buf size in ub self.updates_ub_number = 0 # slice is var[axis:], one uint of process if axis == 0: self.slice_num = 1 else: self.slice_num = functools_reduce(lambda x, y: x * y, self.var_shape[0:axis]) self.slice_shape = self.var_shape[axis:] self.slice_data_num = functools_reduce(lambda x, y: x * y, self.var_shape[axis:]) self.small_elem_num = self.slice_data_num // self.var_shape[axis] self.slice_size = self.slice_data_num * self.var_dtype_bytes_size self.max_num_one_repeat = 128 if self.var_dtype in ("float32", "int32"): self.max_num_one_repeat = 64 # decide block num if self.slice_num == 1: self.block_num = 1 self.slice_step = 0 else: self.slice_step = math.ceil(self.slice_num / self.ai_core_num) self.block_num = math.ceil(self.slice_num / self.slice_step) # each loop data buf now is one slice data var[axis:] date self.update_data_num = self.slice_data_num self.vconv_dst_dtype = "float16" self.init_gm_tensor() self.init_ub_tensor_para() self.init_scalar_val()
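
# A plain-Python sketch of the slicing bookkeeping described in the docstring
# example above (var shape (2, 6, 8, 8), axis=1); functools.reduce stands in
# for the functools_reduce helper used in the class, and the shape is the
# docstring's illustrative value.
from functools import reduce

var_shape = (2, 6, 8, 8)
axis = 1
slice_num = 1 if axis == 0 else reduce(lambda x, y: x * y, var_shape[:axis])
slice_shape = var_shape[axis:]                        # one processing unit
slice_data_num = reduce(lambda x, y: x * y, var_shape[axis:])
small_elem_num = slice_data_num // var_shape[axis]    # elements of var[axis+1:]
assert (slice_num, slice_shape) == (2, (6, 8, 8))
assert (slice_data_num, small_elem_num) == (384, 64)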
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="CusMatMulCubeFraczLeftCast"): """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. Parameters: shape_a: list or tuple Shape of the first tensor a with rank > 1 shape_b: list or tuple Shape of the second tensor b with the same type with a, and shape_a, shape_b must be 2 dims src_dtype: str The data type of input, support "float32", "float16" dst_dtype: str The data type of output, support "float32", "float16" trans_a: bool If True, shape_a == transposed before multiplication trans_b: bool If True, shape_b == transposed before multiplication is_fractal: bool If True, the input data format of a and b must be fractal format shape_bias: list or tuple Shape of bias, only support the input data format with ND Returns ------- None """ shape_a = input_x1.get("ori_shape") shape_b = input_x2.get("ori_shape") print("============") print(input_x1.get("format"), input_x2.get("format")) print(shape_a, shape_b) print("============") if input_x2.get("format") == "FRACTAL_Z": n, c, h, w = shape_b c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_b = [n, c1 * h * w * c0] shape_a = [n, n] if input_x1.get("format") == "FRACTAL_Z": n, c, h, w = shape_a c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_a = [n, c1 * h * w * c0] shape_b = [c1 * h * w * c0, c1 * h * w * c0] if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] shape_a = list(shape_a) shape_b = list(shape_b) shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) shape_a = [shape_a[1], shape_a[0]] trans_a = bool(1 - trans_a) shape_b = [shape_b[1], shape_b[0]] trans_b = bool(1 - trans_b) shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) src_dtype = input_x1.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = 
tik_instance.Tensor(input_x1.get("dtype"), shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor(input_x2.get("dtype"), shape_b_temp, name="right_matrix", scope=tik.scope_gm) res_matmul = tik_instance.Tensor(output_y.get("dtype"), output_y.get("shape"), name="output", scope=tik.scope_gm) DIAG_SIZE = 128 mo_tile, ko_tile, no_tile, diag_opt = get_cus_tile_info( input_x1, input_x2, DIAG_SIZE) cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res_matmul, mo_tile=mo_tile, ko_tile=ko_tile, no_tile=no_tile, diag_opt=diag_opt, diag_size=DIAG_SIZE) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul]) return tik_instance
def CusCholeskyTrsm(input_x, output, kernel_name): """CusCholeskyTrsm""" input_x_shape = input_x.get("shape") output_shape = output.get("shape") split_dim = 128 matrix_dim = input_x_shape[0] split_dim = min(matrix_dim, split_dim) vector_repeat_times = int(split_dim // 64) blocks = int(matrix_dim // split_dim) if blocks == 0: blocks = 1 if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm) with tik_instance.for_range(0, blocks, block_num=blocks) as block_index: input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub", scope=tik.scope_ubuf) temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf) assist_1_ub = tik_instance.Tensor("float32", (split_dim, ), name="assist_1_ub", scope=tik.scope_ubuf) assist_2_ub = tik_instance.Tensor("float32", (split_dim, ), name="assist_2_ub", scope=tik.scope_ubuf) with tik_instance.for_range(0, split_dim) as i: tik_instance.data_move( input_x_ub[i, 0], input_x[block_index * split_dim + i, block_index * split_dim], 0, 1, vector_repeat_times * 8, 0, 0) scalar1 = tik_instance.Scalar("float32", init_value=-0.5) with tik_instance.for_range(0, split_dim) as i: scalar2 = tik_instance.Scalar("float32") tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8) tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8) tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8) scalar2.set_as(assist_1_ub[i]) tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8) with tik_instance.for_range(i + 1, split_dim) as j: scalar3 = tik_instance.Scalar("float32") scalar3.set_as(input_x_ub[i, j]) tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8) tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0], (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8) zero = tik_instance.Scalar("float32") zero.set_as(0.0) one = tik_instance.Scalar("float32") one.set_as(1.0) with tik_instance.for_range(0, split_dim) as i: tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8) temp_ub.__setitem__(i * split_dim + i, one) chol_diag_element_final = tik_instance.Scalar("float32") chol_diag_element_final.set_as(input_x_ub[split_dim * split_dim - 1]) trsm_diag_element = tik_instance.Scalar("float32") trsm_diag_element.set_as(1.0 / chol_diag_element_final) temp_ub.__setitem__(split_dim * split_dim - 1, trsm_diag_element) with tik_instance.for_range(1, split_dim) as i: index = split_dim - i - 1 tik_instance.vector_dup(64, assist_1_ub, zero, vector_repeat_times, 1, 8) with tik_instance.for_range(0, i) as j: chol_diag_element_loop = tik_instance.Scalar("float32") chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j]) tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop, vector_repeat_times, 1, 1, 8, 8) tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8) temp_scalar = tik_instance.Scalar("float32") temp_scalar.set_as(input_x_ub[index, index]) chol_diag_element = tik_instance.Scalar("float32") chol_diag_element.set_as(1.0 / 
temp_scalar) tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub, vector_repeat_times, 1, 1, 1, 8, 8, 8) tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element, vector_repeat_times, 1, 1, 8, 8) tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res]) return tik_instance
def __init__(self, params_dict, indices_dict, axis_dict, y_dict, kernel_name): """ constructor of GatherV2 Parameters ---------- params_dict: dict shape and dtype of input params indices_dict: dict shape and dtype of input indices axis_dict: dict shape and dtype of input axis y_dict: dict shape and dtype of output, should be same dtype as input kernel_name: str kernel name, default value is "GatherV2" Returns ------- None """ self.params_dtype = params_dict.get("dtype").lower() self.indices_dtype = indices_dict.get("dtype").lower() self.axis_dtype = axis_dict.get("dtype").lower() self.y_dtype = y_dict.get("dtype").lower() self.tiling_dtype = INT32 dtype_list = ("int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "float16", "float32") indices_support_dtype_list = ("int32", "int64") check_dtype(self.params_dtype, dtype_list, param_name="x") check_dtype(self.indices_dtype, indices_support_dtype_list, param_name="indices") check_dtype(self.axis_dtype, (INT32,), param_name="axis") if self.y_dtype != self.params_dtype: error_manager_vector.raise_err_inputs_dtype_not_equal(kernel_name, "y", "x", self.y_dtype, self.params_dtype) profile = tik.Dprofile() self.ub_size = profile.get_unified_buffer_size() self.l1_size = profile.get_l1_buffer_size() self.core_num = profile.get_aicore_num() self.tik_instance = tik.Tik(profile, disable_debug=True) self.kernel_name = kernel_name self.axis_shape = (1,) self.x_shape = (PARAMS_SIZE,) self.indices_shape = (INDICES_NUM,) self.y_shape = (PARAMS_SIZE,) self.params_dsize = TYPE_LEN_DICT.get(self.params_dtype) self.indices_dsize = TYPE_LEN_DICT.get(self.indices_dtype) self.block_elem = BLOCK_SIZE // self.params_dsize self.x = None self.indices = None self.axis = None self.tiling_gm = None self.y = None self.params_pre = None self.params_axis = None self.params_row = None self.indices_num = None self.cache_params = None self.need_core_num = None self.tail_process_core = None self.indices_num_each_core = None self.indices_num_remaining = None self.indices_loop_num = None self.indices_row_num_once = None self.indices_row_num_last = None self.row_num_once_ub = None self.row_num_once_tail_ub = None self.inner_loop_num = None self.row_num_last_ub = None self.row_num_last_tail_ub = None self.inner_loop_num_last = None
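
# A minimal sketch of the per-block element count computed above, assuming a
# TYPE_LEN_DICT-style byte-size table and a 32-byte BLOCK_SIZE; the table and
# the chosen dtype below are assumptions for illustration, not the module's
# real constants.
BLOCK_SIZE = 32
type_len = {"float16": 2, "float32": 4, "int32": 4, "int64": 8, "int8": 1}
params_dtype = "float16"                  # assumed input dtype
params_dsize = type_len[params_dtype]
block_elem = BLOCK_SIZE // params_dsize   # 16 float16 values per 32-byte block
assert block_elem == 16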
def __init__(self, padding, dtype, kernel_name, tik_obj, fuse_mark): """ Function: store pad_d's parameters of compilation """ self.dtype = dtype.lower() self.ori_padding = padding.copy() self.padding = padding.copy() self.kernel_name = kernel_name self.num_bit = tbe_platform.cce_intrin.get_bit_len(self.dtype) // 8 self.fuse_mark = fuse_mark self.mask = 128 if self.num_bit == 4: self.mask = 64 self.max_ub_size = tik.Dprofile().get_unified_buffer_size() - 1024 self.max_core = tik.Dprofile().get_aicore_num() self.tiling_gm = None self.input_gm = None self.output_gm = None self.tiling_buf = None self.tiling_buf_size = None self.buf = None self.buf_size = None self.help_buf = None self.tik_instance = tik_obj # circulation self.axis_amount = None self.branch = None self.depth = None self.top_vol = None self.top_address = None self.top_div_core = None self.top_total_core = None self.top_core_vol_0 = None self.top_core_vol_1 = None self.top_core_gap_0 = None self.top_core_gap_1 = None self.bottom_vol = None self.bottom_address = None self.bottom_div_core = None self.bottom_total_core = None self.bottom_core_vol_0 = None self.bottom_core_vol_1 = None self.bottom_core_gap_0 = None self.bottom_core_gap_1 = None # recursion self.recur_total_core = None self.recur_div_core = None self.recur_in_vol = None self.recur_loop_0 = None self.recur_loop_1 = None self.recur_gap_0 = None self.recur_gap_1 = None self.recur_cond = None self.recur_start_address = None self.new_in_shape = None self.new_out_shape = None self.new_padding_top = None self.new_padding_bottom = None self.recur_model = None self.recur_dup_mk = None self.recur_gm2buf_mk = None self.prod_new_in = None self.prod_new_out = None self.tiling_arg_kind = None self.tiling_arg_num = None self.tiling_arg_idx = None
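
# A short sketch of the vector-mask choice above: 128 lanes for 2-byte types
# such as float16 and 64 lanes for 4-byte types such as float32/int32,
# consistent with one vector repeat covering 256 bytes (stated here as an
# assumption about the target hardware, mirroring the constructor's logic).
def _vector_mask(num_bit):
    mask = 128          # default, used for 2-byte element types
    if num_bit == 4:    # 4-byte element types
        mask = 64
    return mask

assert _vector_mask(2) == 128 and _vector_mask(4) == 64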
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): """ calculating matrix multiplication with bias, C = A*B + bias, support input data with fractal format. Parameters: shape_a: list or tuple Shape of the first tensor a with rank > 1 shape_b: list or tuple Shape of the second tensor b with the same type with a, and shape_a, shape_b must be 2 dims src_dtype: str The data type of input, support "float32", "float16" dst_dtype: str The data type of output, support "float32", "float16" trans_a: bool If True, shape_a == transposed before multiplication trans_b: bool If True, shape_b == transposed before multiplication is_fractal: bool If True, the input data format of a and b must be fractal format shape_bias: list or tuple Shape of bias, only support the input data format with ND Returns ------- None """ print("!!!!come into zzt~~~~~~~!!!!") shape_a = input_x1.get("ori_shape") shape_b = input_x2.get("ori_shape") shape_output = output_y.get("ori_shape") print("============") print(input_x1.get("format"), input_x2.get("format")) print(shape_a, shape_b) print("============") if input_x2.get("format") == "FRACTAL_Z": n, c, h, w = shape_b c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_b = [n, c1 * h * w * c0] shape_a = [n, n] if input_x1.get("format") == "FRACTAL_Z": n, c, h, w = shape_a c0 = 16 c1 = c // c0 if c1 == 0: c1 = 1 shape_a = [n, c1 * h * w * c0] shape_b = [c1 * h * w * c0, c1 * h * w * c0] if input_x2.get("format") == "FRACTAL_NZ": shape_a = [shape_b[0], shape_b[0]] shape_b = shape_b if input_x1.get("format") == "FRACTAL_NZ": shape_a = shape_a shape_b = [shape_a[1], shape_a[1]] shape_a = list(shape_a) shape_b = list(shape_b) shape_a = _get_input_shape(shape_a) shape_b = _get_input_shape(shape_b) util.check_kernel_name(kernel_name) util.check_shape_rule(shape_a) util.check_shape_rule(shape_b) util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT) shape_a = [shape_a[1], shape_a[0]] trans_a = bool(1 - trans_a) shape_b = [shape_b[1], shape_b[0]] trans_b = bool(1 - trans_b) shape_bias = () if bias is not None and bool(bias): shape_bias = bias.get("shape") shape_bias = list(shape_bias) shape_bias = _get_bias(shape_bias) src_dtype = input_x1.get("dtype").lower() dst_dtype = output_y.get("dtype").lower() _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b) m_shape = shape_a[len(shape_a) - 2] km_shape = shape_a[len(shape_a) - 1] kn_shape = shape_b[len(shape_a) - 2] n_shape = shape_b[len(shape_a) - 1] if src_dtype == "float16": block_reduce = cce.BLOCK_REDUCE block_in = cce.BLOCK_IN block_out = cce.BLOCK_OUT if trans_a and km_shape == 1: block_in = cce.BLOCK_VECTOR if not trans_a and m_shape == 1: block_in = cce.BLOCK_VECTOR if trans_b and kn_shape == 1: block_out = cce.BLOCK_VECTOR if not trans_b and n_shape == 1: block_out = cce.BLOCK_VECTOR if trans_a: shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in) else: shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce) if trans_b: shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out) else: shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce) shape_a_temp = (shape_a_temp[0], shape_a_temp[1], shape_a_temp[2], shape_a_temp[3]) format_a = "FRACTAL_NZ" shape_b_temp = (shape_b_temp[0], shape_b_temp[1], shape_b_temp[2], shape_b_temp[3]) format_b = "FRACTAL_NZ" 
print("=======================================") print(shape_a_temp, shape_b_temp) print(format_a, format_b) print("=======================================") tensor_bias = None tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype) tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype) if shape_bias: tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype) if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[ 0] == 128 and shape_b_temp[1] == 63: if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm) with tik_instance.for_range(0, 32, block_num=32) as block_index: resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256, ), scope=tik.scope_ubuf, name="resMatmul_local_UB") resMatmul_local_UB_local_L0C = tik_instance.Tensor( "float32", (128 * 256, ), scope=tik.scope_cc, name="resMatmul_local_UB") input_1_local_L1_local_L0A = tik_instance.Tensor( "float16", (128 * 128, ), scope=tik.scope_ca, name="input_1_local_L1_local_L0A") input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256, ), scope=tik.scope_cbuf, name="input_2_local_L1") input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128, ), scope=tik.scope_cbuf, name="input_1_local_L1") input_2_local_L1_local_L0B = tik_instance.Tensor( "float16", (128 * 256, ), scope=tik.scope_cb, name="input_2_local_L1_local_L0B") core_m_idx = block_index % 8 core_n_idx = block_index // 8 with tik_instance.if_scope(core_m_idx != 7): tik_instance.data_move( input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 8, 128, 55 * 16, 0) tik_instance.data_move( input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 128, 55 * 16, 0) with tik_instance.for_range(0, 8) as cc12: tik_instance.load2dv1( input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256], 0, 8, 8, 0, False) with tik_instance.for_range(0, 2) as cc6: with tik_instance.for_range(0, 8) as cc121: tik_instance.load2dv1( input_2_local_L1_local_L0B[cc121 * 4096], input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True) tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 128, 128, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1) tik_instance.data_move( resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2) with tik_instance.else_scope(): tik_instance.data_move( input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)], 0, 7, 112, 56 * 16, 0) tik_instance.data_move( input_2_local_L1, input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], 0, 32, 112, 56 * 16, 0) with tik_instance.for_range(0, 7) as cc10: tik_instance.load2dv1( input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256], 0, 7, 7, 0, False) with tik_instance.for_range(0, 2) as cc5: with tik_instance.for_range(0, 7) as cc101: tik_instance.load2dv1( input_2_local_L1_local_L0B[cc101 * 4096], input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True) tik_instance.mmad(resMatmul_local_UB_local_L0C, 
input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 112, 112, 256, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1) tik_instance.data_move( resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256 + core_n_idx * 512 * 1008], resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul]) return tik_instance print("come into tbe, shape is error!") result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, format_b=format_b, dst_dtype=dst_dtype, tensor_bias=tensor_bias) with tvm.target.cce(): schedule = generic.auto_schedule(result) tensor_list = [tensor_a, tensor_b, result] if shape_bias: tensor_list = [tensor_a, tensor_b, tensor_bias, result] config = { "print_ir": False, "name": kernel_name, "tensor_list": tensor_list } te.lang.cce.cce_build_code(schedule, config)
def decode_bbox(box_predictions, anchors, decoded_boxes, decode_clip, kernel_name="decode_bbox"): """ calculating data Parameters ---------- box_predictions : shape and dtype of input anchors : shape and dtype of input decoded_boxes : shape and dtype of output, s hould be same shape and type as input decode_clip : decode_clip kernel_name : kernel name, default value is "decode_bbox" Returns ------- None """ # check param & data shape_box_predictions = box_predictions.get("shape") shape_anchors = anchors.get("shape") shape_decoded_boxes = decoded_boxes.get("shape") util.check_kernel_name(kernel_name) format_box_predictions = box_predictions.get("format") format_anchors = anchors.get("format") format_decoded_boxes = decoded_boxes.get("format") check_format_shape(format_box_predictions, format_anchors, format_decoded_boxes) util.check_shape_rule(shape_box_predictions, CONFIG_THREE, CONFIG_FOUR, None) util.check_shape_rule(shape_anchors, CONFIG_THREE, CONFIG_FOUR, None) util.check_shape_rule(shape_decoded_boxes, CONFIG_TWO, CONFIG_TWO, None) util.check_shape_size(shape_box_predictions, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_anchors, SHAPE_SIZE_LIMIT) util.check_shape_size(shape_decoded_boxes, SHAPE_SIZE_LIMIT) util.check_dtype_rule(box_predictions.get("dtype").lower(), ("float16", )) util.check_dtype_rule(anchors.get("dtype").lower(), ("float16", )) util.check_dtype_rule(decoded_boxes.get("dtype").lower(), ("float16", )) if shape_box_predictions != shape_anchors: raise RuntimeError("the input shape_box_predictions and anchors)" "must be same") if (reduce(lambda x, y: x * y, shape_box_predictions[:])) \ != (reduce(lambda x, y: x * y, shape_decoded_boxes[:])): raise RuntimeError("the input shape (box_predictions and anchors" "is not equal to out shape(decoded_boxes)") if (shape_box_predictions[-1] == CONFIG_FOUR and len(shape_box_predictions) == CONFIG_THREE): if shape_decoded_boxes[1] != CONFIG_FOUR: raise RuntimeError("the output shape_decoded_boxes must be 4") else: if (shape_box_predictions[0] == CONFIG_FOUR and len(shape_box_predictions) == CONFIG_FOUR): if shape_decoded_boxes[0] != CONFIG_FOUR: raise RuntimeError("the output shape_decoded_boxes must be 4") else: raise RuntimeError("the input shape not in {(4,C,H,W), (H,W,4)}") if not isinstance(decode_clip, (float, int)): raise RuntimeError("input param type of decode_clip should be Float") if decode_clip < 0 or decode_clip > 10: raise RuntimeError( "input param decode_clip can't be negtive and shoud be [0,10]! ") # init the tiling shape print("shape_box_predictions", shape_box_predictions) shape = TilingFunc(shape_box_predictions) # calculate the deocede_bbox tik_instance = tik.Tik(tik.Dprofile()) data_tensor = InitTensor(tik_instance, shape) if shape.input_shape[-1] == CONFIG_FOUR \ and len(shape.input_shape) == CONFIG_THREE: decode_bbox_compute(tik_instance, shape, data_tensor, decode_clip, kernel_name) if shape.input_shape[0] == CONFIG_FOUR \ and len(shape.input_shape) == CONFIG_FOUR: decode_bbox_compute_transpose(tik_instance, shape, data_tensor, decode_clip, kernel_name) return tik_instance
def __init__(self, input0, gamma0, beta0, output0, kernel_name="BatchNorm"): self.tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) self.sclar_gamma = self.tik_instance.Scalar("float16") self.sclar_beta = self.tik_instance.Scalar("float16") # self.input_n = self.tik_instance.InputScalar(dtype="int32", name="inputscalar_n") self.input_c = self.tik_instance.InputScalar(dtype="int32", name="inputscalar_c") self.input_h = self.tik_instance.InputScalar(dtype="int32", name="inputscalar_h") self.input_w = self.tik_instance.InputScalar(dtype="int32", name="inputscalar_w") self.inputtype = \ self.tik_instance.InputScalar(dtype="int32", name="inputscalar_dtype") self.output_n = self.tik_instance.InputScalar(dtype="int32", name="outputscalar_n") self.output_c = self.tik_instance.InputScalar(dtype="int32", name="outputscalar_c") self.output_h = self.tik_instance.InputScalar(dtype="int32", name="outputscalar_h") self.output_w = self.tik_instance.InputScalar(dtype="int32", name="outputscalar_w") self.outputtype = \ self.tik_instance.InputScalar(dtype="int32", name="outputscalar_dtype") self.gamma_c = self.tik_instance.InputScalar(dtype="int32", name="gammascalar") self.gammatype = \ self.tik_instance.InputScalar(dtype="int32", name="gammascalar_dtype") self.beta_c = self.tik_instance.InputScalar(dtype="int32", name="betascalar") self.betatype = self.tik_instance.InputScalar(dtype="int32", name="betascalar_dtype") self.param1 = self.tik_instance.InputScalar(dtype="int32", name="param1") self.param2 = self.tik_instance.InputScalar(dtype="int32", name="param2") self.param3 = self.tik_instance.InputScalar(dtype="int32", name="param3") self.param4 = self.tik_instance.InputScalar(dtype="int32", name="param4") self.param5 = self.tik_instance.InputScalar(dtype="int32", name="param5") self.param6 = self.tik_instance.InputScalar(dtype="int32", name="param6") self.param7 = self.tik_instance.InputScalar(dtype="int32", name="param7") self.param8 = self.tik_instance.InputScalar(dtype="int32", name="param8") self.param9 = self.tik_instance.InputScalar(dtype="int32", name="param9") self.param10 = self.tik_instance.InputScalar(dtype="int32", name="param10") self.byte_fp16 = 2 self.input_dtype = "float16" self.kernel_name = kernel_name # gm buffer self.gamma_gm = self.tik_instance.Tensor("float16", (MAX_CHANNEL, ), name="gamma_gm", scope=tik.scope_gm) self.beta_gm = self.tik_instance.Tensor("float16", (MAX_CHANNEL, ), name="beta_gm", scope=tik.scope_gm) self.input_gm = self.tik_instance.\ Tensor("float16", (MAX_BATCH*MAX_CHANNEL*MAX_HEIGHT*MAX_WIDTH,), name="input_gm", scope=tik.scope_gm) self.output_gm = self.tik_instance.\ Tensor("float16", (MAX_BATCH*MAX_CHANNEL*MAX_HEIGHT*MAX_WIDTH,), name="output_gm", scope=tik.scope_gm) self.gamma_ub = self.tik_instance.\ Tensor("float16", (MAX_CHANNEL, ), name="gamma_ub", scope=tik.scope_ubuf) self.beta_ub = self.tik_instance.\ Tensor("float16", (MAX_CHANNEL, ), name="beta_ub", scope=tik.scope_ubuf) align_c = ceil_div_mul(self.input_c, 16) #clear to zero self.tik_instance.vec_muls(128, self.gamma_ub, self.gamma_ub, 0, MAX_CHANNEL // 128, 8, 8) self.tik_instance.vec_muls(128, self.beta_ub, self.beta_ub, 0, MAX_CHANNEL // 128, 8, 8) self.tik_instance.data_move(self.gamma_ub, self.gamma_gm, 0, 1, align_c // 16, 0, 0) self.tik_instance.data_move(self.beta_ub, self.beta_gm, 0, 1, align_c // 16, 0, 0) # 1/var self.tik_instance.vrec(16, self.beta_ub, self.beta_ub, align_c // 16, 1, 1, 1, 1) # -mean self.tik_instance.vec_muls(16, self.gamma_ub, self.gamma_ub, -1.0, align_c // 16, 1, 1)
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"): """CusBatchMatMul""" if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) x1_shape = input_x1.get("shape") dtype = input_x1.get("dtype").lower() x2_shape = input_x2.get("shape") if dtype != input_x2.get("dtype").lower(): raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s" % ( dtype, input_x2.get("dtype").lower())) input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b) support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True), ((36, 128, 128), (36, 128, 128), "float32", False, True), ((5, 128, 128), (5, 128, 128), "float32", False, True), ((18, 128, 128), (18, 128, 128), "float32", False, True), ((16, 128, 128), (16, 128, 128), "float32", False, True), ((9, 128, 128), (9, 128, 128), "float32", False, True), ((1, 64, 64), (1, 64, 64), "float32", False, True), ((1, 128, 128), (1, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), ((2, 128, 128), (2, 128, 128), "float32", False, True)] if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) # if not transpose_a and transpose_b: batch, m, k = x1_shape input1_shape = _get_flattern_shape(x1_shape) input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm) input2_shape = _get_flattern_shape(x2_shape) input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm) output_shape = x1_shape res_shape = _get_flattern_shape(output_shape) res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm) if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 2) as cc0: with tik_instance.for_range(0, 128, thread_num=2) as cc1: input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 input2_index = block_idx * 32768 + cc0 * 16384 res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128 _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True): with tik_instance.for_range(0, 30, block_num=30) as block_idx: with tik_instance.for_range(0, 11) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as thread_idx: with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)): input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB", scope=tik.scope_ubuf) t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input1[ (block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128], 0, 1, 16, 0, 0) with tik_instance.for_range(0, 2) as vec_i: tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64], 0, 64, 1, 1, 16, 0) with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2: input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB", scope=tik.scope_ubuf) t_1_local_UB = input_2_local_UB bisec_last_axis_local_UB = input_2_local_UB matmul_hybrid_f_t_local_UB = tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB", scope=tik.scope_ubuf) matmul_hybrid_f_t_local_UB_dst_tmp = 
tik_instance.Tensor(dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8) tik_instance.data_move(input_2_local_UB, input2[(block_idx // 6) * 16384 + thread_idx2 * 8192], 0, 1, 1024, 0, 0) tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB, 128, 1, 1, 1, 8, 8, 8) tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64], 64, 1, 1, 1, 16, 16, 16) tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8) with tik_instance.for_range(0, 64) as cc6: tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6], bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8) tik_instance.vadd(64, matmul_hybrid_f_t_local_UB, matmul_hybrid_f_t_local_UB_dst_tmp, matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8) tik_instance.data_move( res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256 + thread_idx * 128 + thread_idx2 * 64], matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0) if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True): with tik_instance.for_range(0, 18, block_num=18) as block_idx: with tik_instance.for_range(0, 128, thread_num=2) as cc0: input1_index = block_idx * 16384 + cc0 * 128 input2_index = block_idx * 16384 res_index = block_idx * 16384 + cc0 * 128 _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True): with tik_instance.for_range(0, 27, block_num=27) as block_idx: with tik_instance.for_range(0, 42, thread_num=2) as cc0: input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 input2_index = (block_idx // 3) * 16384 res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128 _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) with tik_instance.if_scope((block_idx % 3) < 2): input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 input2_index = (block_idx // 3) * 16384 res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128 _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 2, thread_num=2) as cc0: input1_index = block_idx * 128 + cc0 * 64 input2_index = 0 res_index = block_idx * 128 + cc0 * 64 _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True), ((2, 128, 128), (2, 128, 128), "float32", False, True), ((4, 128, 128), (4, 128, 128), "float32", False, True), ((8, 128, 128), (8, 128, 128), "float32", False, True), ((16, 128, 128), (16, 128, 128), "float32", False, True) ] if input_shape in input_shape_list: block_num = 32 input1_unit_size = 128 input2_unint_size = 128 * 128 with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx: block_process_ele_num = (batch * m * k) // block_num loop_time = (batch * m * k) // block_num // input1_unit_size thread_num = 2 with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0: input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size if batch > 1: input2_index = block_idx // (block_num // batch) * input2_unint_size else: input2_index = 0 res_index = block_idx * 
block_process_ele_num + cc0 * input1_unit_size _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index) tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res]) return tik_instance
def sort(x, y1, y2, axis=-1, descending=False, kernel_name="sort"): """ Function: Sorts the elements of the input tensor along a given dimension in ascending order by value. Modify : 2020-08-03 Init base parameters Parameters ---------- input(x): dict data of input output(y1): dict data of output indices(y2): dict data of indices dim(axis): int descending: bool kernel_name: str the name of the operator ---------- """ shape, dtype, allnum, num = cheak(x, y1, y2, axis, kernel_name) tik_instance = tik.Tik(tik.Dprofile()) add16 = (16 - (num % 16)) % 16 total = num + add16 big_shape = list(shape) big_shape[-1] = total input_gm = tik_instance.Tensor(dtype, shape, name="x", scope=tik.scope_gm) data_out = tik_instance.Tensor(dtype, big_shape, name="data_out", scope=tik.scope_gm, is_workspace=True) data_indices = tik_instance.Tensor("int32", big_shape, name="data_indices", scope=tik.scope_gm, is_workspace=True) data_out_ = tik_instance.Tensor(dtype, shape, name="data_out_", scope=tik.scope_gm) data_indices_ = tik_instance.Tensor("int32", shape, name="data_indices_", scope=tik.scope_gm) # to figure the index of input_gm L = len(shape) distance = [] big_distance = [] tmp = allnum big_tmp = allnum // num * total for i in range(L - 1): tmp = tmp // shape[i] distance.append(tmp) big_tmp = big_tmp // shape[i] big_distance.append(big_tmp) rounds = allnum // num available_aicore_num = tik.Dprofile().get_aicore_num() used_aicore_num = available_aicore_num if rounds > available_aicore_num else rounds batch_num_per_aicore_process = rounds // used_aicore_num batch_tail = rounds % used_aicore_num with tik_instance.for_range(0, used_aicore_num, block_num=used_aicore_num) as i: with tik_instance.for_range(0, batch_num_per_aicore_process) as k: data_out, data_indices = sort_compute(tik_instance, dtype, total, i + k * used_aicore_num, descending, num, distance, shape, big_distance, data_out, data_indices, input_gm, L) with tik_instance.if_scope(i < batch_tail): data_out, data_indices = sort_compute( tik_instance, dtype, total, batch_num_per_aicore_process * used_aicore_num + i, descending, num, distance, shape, big_distance, data_out, data_indices, input_gm, L) float_ub = tik_instance.Tensor("float16", [total], name="float_ub", scope=tik.scope_ubuf) int_ub = tik_instance.Tensor("int32", [total], name="int_ub", scope=tik.scope_ubuf) with tik_instance.for_range(0, rounds) as i: tik_instance.data_move(float_ub[0], data_out[i * total], 0, 1, total // 16, 0, 0) tik_instance.data_move(data_out_[i * num], float_ub[0], 0, 1, total // 16, 0, 0) tik_instance.data_move(int_ub[0], data_indices[i * total], 0, 1, total // 8, 0, 0) tik_instance.data_move(data_indices_[i * num], int_ub[0], 0, 1, total // 8, 0, 0) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_gm], outputs=[data_out_, data_indices_]) return tik_instance
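
# A worked example of the 16-element padding used at the top of sort(),
# assuming the sorted axis holds num = 100 elements (an illustrative value):
# elements are handled in groups of 16, so the tail is padded up to the next
# multiple before sorting.
num = 100
add16 = (16 - (num % 16)) % 16
total = num + add16
assert (add16, total) == (12, 112)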
def safe_check(dicts, kernel_name): """ check if the inputs are legal Parameters ---------- dicts: (x_dict, rois_dict, actual_dict, y_dict) kernel_name: kernel name Returns ------- None """ x_shape = dicts[0].get("shape") x_dtype = dicts[0].get("dtype").lower() rois_shape = dicts[1].get("shape") rois_dtype = dicts[1].get("dtype").lower() y_dtype = dicts[3].get("dtype").lower() y_shape = dicts[3].get("shape") profile = tik.Dprofile() tik_name_check = tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") if tik_name_check in ("Ascend310", "Ascend910", "Hi3796CV300ES", "Hi3796CV300CS"): op_utils.check_dtype(x_dtype, ["float16"], param_name="input_x") op_utils.check_dtype(rois_dtype, ["float16"], param_name="input_rois") else: op_utils.check_dtype(x_dtype, ["float16", "float32"], param_name="input_x") op_utils.check_dtype(rois_dtype, ["float16", "float32"], param_name="input_rois") if x_dtype != rois_dtype or x_dtype != y_dtype: error_info = {} error_info['errCode'] = 'E81012' error_info['op_name'] = 'roi_pooling' error_info['real_dtypes'] = str((x_dtype, rois_dtype, y_dtype)) raise RuntimeError( error_info, "In op[roi_pooling], the dtype of tensor x, rois and y should be the same, but actually they are [%s]." % error_info['real_dtypes']) op_utils.check_shape(x_shape, min_rank=5, max_rank=5, param_name="input_x") op_utils.check_shape(rois_shape, min_rank=3, max_rank=3, param_name="input_rois") op_utils.check_shape(y_shape, min_rank=5, max_rank=5, param_name="output_y") roi_max_num = rois_shape[2] if roi_max_num > 6000 or roi_max_num % 16 != 0: error_info = {} error_info['errCode'] = 'E81013' error_info['real_rois_shape[2]'] = str(rois_shape[2]) raise RuntimeError( error_info, "In op[roi_pooling], the rois_shape[2] should be less than 6000 and can be divided by 16, but actually is [%s]." % error_info['real_rois_shape[2]'])
def conv2d_tik_compute(params): te_set_l2_mode(1) tik_instance = tik.Tik(tik.Dprofile(params["arch"], params["version"]), err_msg_level=1) n, c1, h, w, c0 = params["fm_shape"] c1, kh, kw, cout, c0 = params["weight_shape"] stride_h, stride_w = params["stride_list"] dilation_h, dilation_w = params["dilation_list"] pad_top, pad_bot, pad_left, pad_right = params["pad_list"] kh_dilation = (kh - 1) * dilation_h + 1 kw_dilation = (kw - 1) * dilation_w + 1 ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h)) wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w)) round_howo = ceil_div(ho * wo, 16) * 16 fm_gm = tik_instance.Tensor(params['fm_dtype'], (n, c1, h, w, c0), name='fm_gm', scope=tik.scope_gm) weight_gm = tik_instance.Tensor(params['weight_type'], (c1, kh, kw, cout, c0), name='weight_gm', scope=tik.scope_gm) if params['dst_gm_type'] in ("int8", "uint8"): dst_gm = tik_instance.Tensor(params['dst_gm_type'], [n, cout // 32, ho, wo, 32], name='dst_gm', scope=tik.scope_gm) else: dst_gm = tik_instance.Tensor(params['dst_gm_type'], [n, cout // 16, ho, wo, 16], name='dst_gm', scope=tik.scope_gm) core_num = 2 pre_core_cout = cout // core_num cout_iter_num = pre_core_cout // params["cout_split_factor"] Cin_blocks = c1 with tik_instance.for_range(0, core_num, block_num=core_num) as cout_o: with tik_instance.for_range(0, cout_iter_num, thread_num=1) as cout_i: weight_L1 = tik_instance.Tensor( params['weight_type'], (Cin_blocks, kh, kw, params["cout_split_factor"], c0), name='weight_l1', scope=tik.scope_cbuf) tik_instance.data_move( weight_L1, weight_gm.flatten()[cout_o * pre_core_cout * c0 + params["cout_split_factor"] * cout_i * c0], 0, Cin_blocks * kh * kw, params["cout_split_factor"], (cout - params["cout_split_factor"]), 0) with tik_instance.for_range(0, n, thread_num=2) as n_index: feature_map_l1 = tik_instance.Tensor(params['fm_dtype'], (c1, h, w, c0), name='feature_map_l1', scope=tik.scope_cbuf) tik_instance.data_move(feature_map_l1, fm_gm[n_index, :, :, :, :], 0, 1, c1 * h * w, 0, 0) dst_l0c = tik_instance.Tensor( params['dst_l0c_type'], [params["cout_split_factor"] // 16, round_howo, 16], name='dst_l0c', scope=tik.scope_cbuf_out) tik_instance.conv2d( dst_l0c, feature_map_l1, weight_L1, (c1, h, w, c0), (Cin_blocks, kh, kw, params["cout_split_factor"], c0), params['stride_list'], params['pad_list'], params['dilation_list'], params['pad_value']) tik_instance.fixpipe( dst_gm[n_index, (cout_o * pre_core_cout + params["cout_split_factor"] * cout_i) // (32 // DTYPE_SIZE[params['dst_gm_type']]), 0, 0, 0], dst_l0c, params["cout_split_factor"] // 16, ho * wo * 16 * DTYPE_SIZE[params['dst_l0c_type']] // 32, 0, 0, extend_params={ "bias": None, "quantize_params": params["quantize_params"] }) tik_instance.BuildCCE(kernel_name=params["kernel_name"], inputs=[fm_gm, weight_gm], outputs=[dst_gm]) return tik_instance
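
# A numeric check of the output-size formula used in conv2d_tik_compute,
# with assumed convolution parameters (h = w = 28, 3x3 kernel, stride 2,
# dilation 1, padding 1); the values are illustrative only.
import numpy as np

h = w = 28
kh = kw = 3
stride_h = stride_w = 2
dilation_h = dilation_w = 1
pad_top = pad_bot = pad_left = pad_right = 1
kh_dilation = (kh - 1) * dilation_h + 1
kw_dilation = (kw - 1) * dilation_w + 1
ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h))
wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w))
assert (ho, wo) == (14, 14)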
def decode_boundaries_target(boundary_predictions, anchors, boundary_encoded, kernel_name="cce_decode_boundaries_target_fpLINE"): """ calculating data Parameters ---------- boundary_predictions : dict shape and dtype of input anchors : dict shape and dtype of input boundary_encoded : dict shape and dtype of output, should be same shape and type as input kernel_name : str kernel name, default value is "decode_boundaries_target" Returns ------- None """ util.check_kernel_name(kernel_name) input_info = InputInfo( shape_boundary_predictions=boundary_predictions.get("shape"), shape_anchors=anchors.get("shape"), dtype_boundary_predictions=boundary_predictions.get("dtype").lower(), dtype_anchors=anchors.get("dtype").lower() ) input_info.set_nmax(n_max=NMAX) output_info = Output() total_handling_times, last_handling_n = check_input( boundary_predictions=boundary_predictions, anchors=anchors, boundary_encoded=boundary_encoded, n_max=input_info.n_max) tik_instance = tik.Tik(tik.Dprofile(), True) # tensor init data_boundary_predictions, data_anchors, \ data_z = get_gm(tik_instance=tik_instance, dtype=input_info.dtype_anchors, shape1=input_info.shape_boundary_predictions, shape2=input_info.shape_anchors, name1="data_boundary_predictions", name2="data_anchors", name3="data_z", scope=tik.scope_gm) if total_handling_times > 0: with tik_instance.for_range(0, total_handling_times) as current_handling_times: # current_handling_times: output_info.set_burst_num(burst_num=input_info.n_max) # number of LINE*LINE output_info.update( n_vector=int_ceil_div(output_info.burst_num, MATRIX_NUM), n_matrix=int_ceil_div(output_info.burst_num * FOUR, MATRIX_NUM) ) output_info.update( shape_vector=(output_info.n_vector, LINE, LINE), shape_matrix=(output_info.n_matrix * FOUR, LINE, LINE) ) # move x_gm to ub times # move y_gm to ub times input_info.update( burst_x=int_ceil_div(output_info.burst_num, LINE), burst_y=int_ceil_div(output_info.burst_num * FOUR, LINE) ) output_info.update( rep=output_info.burst_num // VECTOR, overflow=0 ) process_calculate(tik_instance=tik_instance, input_info=input_info, output_info=output_info, current_handling_times=current_handling_times, data_boundary_predictions=data_boundary_predictions, data_anchors=data_anchors, data_z=data_z) current_handling_times = total_handling_times if last_handling_n > 0: output_info.set_burst_num(burst_num=last_handling_n) # number of LINE*LINE output_info.update( n_vector=int_ceil_div(output_info.burst_num, MATRIX_NUM), n_matrix=int_ceil_div(output_info.burst_num * FOUR, MATRIX_NUM) ) output_info.update( shape_vector=(output_info.n_vector, LINE, LINE), shape_matrix=(output_info.n_matrix * FOUR, LINE, LINE) ) # move x_gm to ub times # move y_gm to ub times input_info.update( burst_x=int_ceil_div(output_info.burst_num, LINE), burst_y=int_ceil_div(output_info.burst_num * FOUR, LINE) ) output_info.update( rep=0, overflow=output_info.burst_num - VECTOR * (output_info.burst_num // VECTOR) ) process_end(tik_instance=tik_instance, input_info=input_info, output_info=output_info, current_handling_times=current_handling_times, data_boundary_predictions=data_boundary_predictions, data_anchors=data_anchors, data_z=data_z) # build_cce tik_instance.BuildCCE( kernel_name=kernel_name, inputs=[data_boundary_predictions, data_anchors], outputs=[data_z]) return tik_instance
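# Hedged helper sketch (an assumption -- int_ceil_div is referenced above but
# defined elsewhere): the tiling in decode_boundaries_target only relies on
# ceiling division plus a tail split, i.e. each full pass handles n_max boxes,
# rep = burst_num // VECTOR full vector repeats are issued, and the last pass
# handles overflow = burst_num - VECTOR * (burst_num // VECTOR) leftover
# elements.
def _int_ceil_div_sketch(value, factor):
    # behaves like math.ceil(value / factor) for positive integers
    return (value + factor - 1) // factor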
def __init__(self, var, indices, updates, var_out, nd_flag, kernel_name,
             compute_type):
    """
    Init scatter base parameters

    Parameters
    ----------
    var: dict
        data of input
        datatype supports float32,float16,int32,int8,uint8
    indices: dict
        data of indices
        datatype supports int32
    updates: dict
        data of updates
        datatype supports float32,float16,int32,int8,uint8
    var_out: dict
        data of input
    nd_flag: bool
        if this op is nd operator
    kernel_name: str
        the name of the operator
    compute_type: str
        the compute type of scatter

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.nd_flag = nd_flag
    self.var_shape = var.get("shape")
    self.var_dtype = var.get("dtype").lower()
    self.indices_shape = indices.get("shape")
    self.indices_dtype = indices.get("dtype").lower()
    self.updates_shape = updates.get("shape")
    self.updates_dtype = updates.get("dtype").lower()
    self.var_ele_num = functools_reduce(lambda x, y: x * y, self.var_shape)
    self.indices_num = functools_reduce(lambda x, y: x * y,
                                        self.indices_shape)
    self.updates_num = functools_reduce(lambda x, y: x * y,
                                        self.updates_shape)
    self.kernel_name = kernel_name

    if self.indices_shape == (1,) and \
            len(self.var_shape) - len(self.updates_shape) == 1:
        if not nd_flag:
            self.updates_shape = (1,) + self.updates_shape

    self.check_param(var_out)

    if nd_flag:
        if self.indices_shape[-1] == len(self.var_shape):
            self.update_data_num = 1
        else:
            self.update_data_num = functools_reduce(
                lambda x, y: x * y, self.var_shape[self.indices_shape[-1]:])
        self.max_indice = functools_reduce(
            lambda x, y: x * y, self.var_shape[0:self.indices_shape[-1]])
        self.index_dims = self.indices_shape[-1]
    else:
        if len(self.var_shape) > 1:
            self.update_data_num = functools_reduce(lambda x, y: x * y,
                                                    self.var_shape[1:])
        else:
            self.update_data_num = 1
        self.max_indice = self.var_shape[0]
        self.index_dims = 1

    self.compute_type = compute_type

    self.ub_size_bytes = (
        tbe_platform.cce_conf.get_soc_spec(tbe_platform.cce_conf.UB_SIZE) -
        8192)
    self.var_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
        self.var_dtype) // 8
    self.indices_dtype_bytes_size = tbe_platform.cce_intrin.get_bit_len(
        self.indices_dtype) // 8
    self.var_data_each_block = 32 // self.var_dtype_bytes_size
    self.indices_data_each_block = 32 // self.indices_dtype_bytes_size
    self.indices_ub_number = 0
    self.updates_ub_number = 0
    self.index_loop_num = 0

    self.max_num_one_repeat = 128
    if self.var_dtype in ("float32", "int32"):
        self.max_num_one_repeat = 64

    if self.update_data_num < self.var_data_each_block:
        self.block_num = 1
    else:
        ai_core_num = tbe_platform.cce_conf.get_soc_spec(
            tbe_platform.cce_conf.CORE_NUM)
        self.indice_step = math.ceil(self.max_indice / ai_core_num)
        self.block_num = math.ceil(self.max_indice / self.indice_step)

    self.var_gm = self.tik_instance.Tensor(self.var_dtype, self.var_shape,
                                           name="var_gm", scope=tik.scope_gm)
    self.indices_gm = self.tik_instance.Tensor(self.indices_dtype,
                                               self.indices_shape,
                                               name="indices_gm",
                                               scope=tik.scope_gm)
    self.updates_gm = self.tik_instance.Tensor(self.updates_dtype,
                                               self.updates_shape,
                                               name="updates_gm",
                                               scope=tik.scope_gm)
    self.out_gm = self.tik_instance.Tensor(self.var_dtype, self.var_shape,
                                           name="out_gm", scope=tik.scope_gm)

    self.vconv_dst_dtype = "float16"

    self.init_ub_tensor_para()
    self.var_vconv_ub = None
    self.updates_vconv_ub = None
    self.var_tile_vconv_ub = None
    self.updates_tile_vconv_ub = None

    self.var_ub = None
    self.updates_ub = None
    self.indices_ub = None
    self.var_tile_ub = None
    self.updates_tile_ub = None

    self.var_read_index = None
    self.updates_read_index = None
    self.indices_loop_index = None
    self.indices_tmp = None
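# Hedged arithmetic sketch (not from the original source): reproduces the
# multi-core split computed in __init__ for one assumed case -- a float32 var
# with max_indice = 1000 outer rows on a hypothetical 32-core SoC.
import math

def _scatter_tiling_sketch(max_indice=1000, update_data_num=256,
                           dtype_bytes=4, ai_core_num=32):
    var_data_each_block = 32 // dtype_bytes   # 8 float32 elements per 32B block
    if update_data_num < var_data_each_block:
        return 1                              # one row smaller than a block: keep a single core
    indice_step = math.ceil(max_indice / ai_core_num)   # 32 rows handled per core
    return math.ceil(max_indice / indice_step)           # -> 32 cores in use

# _scatter_tiling_sketch() == 32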
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False, kernel_name="matmulcube"): """CusMatMulCubeDenseRight""" shape_a_temp = (128, 63, 16, 16) shape_b_temp = (128, 128, 16, 16) shape_output = output_y.get("shape") matrix_max_shape = (1, ) support_shape = [ (shape_a_temp, shape_b_temp, matrix_max_shape), ] shape_a_input = input_x1.get("shape") shape_b_input = input_x2.get("shape") matrix_max_input = input_x3.get("shape") input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input)) if input_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_shape)) if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[ 0] == 128 and shape_b_temp[1] == 128: if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm) input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm) input_x3 = tik_instance.Tensor("float32", [ 1, ], name="matrix_max", scope=tik.scope_gm) resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm) with tik_instance.for_range(0, 32, block_num=32) as block_index: core_m_idx = block_index // 16 core_n_idx = block_index % 16 matrix_max_scalar = tik_instance.Scalar("float32") matrix_max_local_UB = tik_instance.Tensor( "float32", (8, ), scope=tik.scope_ubuf, name="matrix_max_local_UB") tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0) matrix_max_scalar.set_as(matrix_max_local_UB[0]) resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128, ), scope=tik.scope_ubuf, name="resMatmul_local_UB") resMatmul_local_UB1 = tik_instance.Tensor( "float32", (240 * 128, ), scope=tik.scope_ubuf, name="resMatmul_local_UB1") resMatmul_local_UB_local_L0C = tik_instance.Tensor( "float32", (256 * 128, ), scope=tik.scope_cc, name="resMatmul_local_UB_local_L0C") resMatmul_local_UB_local_L0C1 = tik_instance.Tensor( "float32", (240 * 128, ), scope=tik.scope_cc, name="resMatmul_local_UB_local_L0C1") input_1_local_L1_local_L0A = tik_instance.Tensor( "float16", (256 * 128, ), scope=tik.scope_ca, name="input_1_local_L1_local_L0A") input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16, ), scope=tik.scope_cbuf, name="input_2_local_L1") input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16, ), scope=tik.scope_cbuf, name="input_2_local_L11") input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16, ), scope=tik.scope_cbuf, name="input_1_local_L1") input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16, ), scope=tik.scope_cbuf, name="input_1_local_L11") input_2_local_L1_local_L0B = tik_instance.Tensor( "float16", (128 * 128, ), scope=tik.scope_cb, name="input_2_local_L1_local_L0B") input_2_local_L1_local_L0B1 = tik_instance.Tensor( "float16", (128 * 128, ), scope=tik.scope_cb, name="input_2_local_L1_local_L0B1") with tik_instance.if_scope(core_m_idx == 0): with tik_instance.for_range(0, 2) as cc1: tik_instance.data_move( input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) tik_instance.data_move( input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096], 0, 8, 256, 752, 0) with tik_instance.for_range(0, 8) as cc10: tik_instance.load2dv1( input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, 
True) with tik_instance.for_range(0, 16) as cc101: tik_instance.load2dv1( input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar, 255, 1, 1, 8, 8) tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2, 1, 1, 8, 8) tik_instance.data_move( resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) with tik_instance.else_scope(): tik_instance.data_move( input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) tik_instance.data_move( input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096], 0, 8, 256, 752, 0) with tik_instance.for_range(0, 8) as cc10: tik_instance.load2dv1( input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256], 0, 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc101: tik_instance.load2dv1( input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False) tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B, 256, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0) tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8) tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64], matrix_max_scalar, 255, 1, 1, 8, 8) tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64], matrix_max_scalar, 2, 1, 1, 8, 8) tik_instance.data_move( resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB, 0, 8, 512, 0, 1504) tik_instance.data_move( input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048], 0, 8, 128, 1920, 0) tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288], 0, 8, 240, 768, 0) with tik_instance.for_range(0, 8) as cc102: tik_instance.load2dv1( input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256], 0, 8, 8, 0, True) with tik_instance.for_range(0, 16) as cc103: tik_instance.load2dv1( input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256], 0, 8, 15, 0, False) tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A, input_2_local_L1_local_L0B1, 240, 128, 128, 0) tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0) tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar, 255, 1, 1, 8, 8) tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64], matrix_max_scalar, 225, 1, 1, 8, 8) tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1, 0, 8, 480, 0, 1536) tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul]) return tik_instance
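# Hedged usage sketch (not from the original source): CusMatMulCubeDenseRight
# only accepts the single fractal shape combination listed in support_shape,
# so a host-side call looks like the dicts below. The output shape is an
# assumption for illustration; the kernel itself only validates the shapes of
# x1, x2 and the matrix_max scalar tensor.
def _example_matmul_dense_right():
    x1 = {"shape": (128, 63, 16, 16), "dtype": "float16"}
    x2 = {"shape": (128, 128, 16, 16), "dtype": "float16"}
    x3 = {"shape": (1,), "dtype": "float32"}       # matrix_max scaling factor
    out = {"shape": (63, 128, 16, 16), "dtype": "float32"}   # assumed layout
    return CusMatMulCubeDenseRight(x1, x2, x3, None, out,
                                   kernel_name="matmulcube")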
def CusTranspose02314(input_x, output, kernel_name="transpose021354"): """CusTranspose02314""" input_x_shape = input_x.get("shape") output_shape = output.get("shape") perm = (0, 2, 3, 1, 4) input_x_shape = tuple(input_x_shape) support_shape = [(32, 128, 7, 7, 16), (32, 32, 7, 7, 16), (32, 32, 14, 14, 16), (32, 64, 14, 14, 16), (32, 16, 14, 14, 16), (32, 16, 28, 28, 16), (32, 32, 28, 28, 16), (32, 8, 28, 28, 16), (32, 8, 56, 56, 16), (32, 16, 56, 56, 16), (32, 4, 56, 56, 16), (32, 4, 112, 112, 16)] if input_x_shape not in support_shape: raise RuntimeError("input_shape %s is not supported" % str(input_x_shape)) if util.get_product_version() == util.VERSION_MINI: tik_instance = tik.Tik(tik.Dprofile("v100", "mini")) else: tik_instance = tik.Tik(tik.Dprofile("v100", "cloud")) input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm) res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm) dtype = "float16" if tuple(input_x_shape) == (32, 4, 112, 112, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) zero = tik_instance.Scalar(dtype="float16", init_value=0) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 12096, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 4, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx], 0, 4, 448, 2688, 0) with tik_instance.for_range(0, 448) as cc7: with tik_instance.for_range(0, 4) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16], input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0) with tik_instance.for_range(0, 448) as cc72: with tik_instance.for_range(0, 4) as cc82: tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16], input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 16, 56, 56, 16): 
with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 14) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 3024, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 8, 56, 56, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx], 0, 8, 224, 2912, 0) with tik_instance.for_range(0, 224) as cc7: with tik_instance.for_range(0, 16) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 8, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx], 0, 8, 196, 588, 0) with tik_instance.for_range(0, 196) as cc7: with tik_instance.for_range(0, 8) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16], input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) elif tuple(input_x_shape) == (32, 32, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 7) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx], 0, 32, 56, 728, 0) with tik_instance.for_range(0, 56) as cc7: with tik_instance.for_range(0, 32) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16], 
input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 16, 28, 28, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 3) as cc1_db: with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx], 0, 16, 112, 672, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx], T_transpose_local_UB, 0, 1, 1792, 0, 0) input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2", scope=tik.scope_ubuf) T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0) with tik_instance.for_range(0, 112) as cc7: with tik_instance.for_range(0, 16) as cc8: tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16], input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0) elif tuple(input_x_shape) == (32, 16, 14, 14, 16): with tik_instance.for_range(0, 32, block_num=32) as block_idx: zero = tik_instance.Scalar(dtype="float16", init_value=0) with tik_instance.for_range(0, 2, thread_num=2) as db_idx: input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB", scope=tik.scope_ubuf) T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx], 0, 16, 98, 98, 0) with tik_instance.for_range(0, 98) as cc7: with tik_instance.for_range(0, 16) as cc8: tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16], input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB, 0, 1, 1568, 0, 0) elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 7, thread_num=2) as cc1: input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0) with tik_instance.for_range(0, 7) as cc7: with tik_instance.for_range(0, 128) as cc8: tik_instance.vadds(16, transpose_ub[0, 0, cc7, cc8, 0], input_x_ub[0, cc8, 0, cc7, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 100352 + 14336 * cc1], transpose_ub, 0, 1, 896, 0, 0) elif tuple(input_x_shape) == (32, 32, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": with 
tik_instance.for_range(0, 32, block_num=32) as block_idx: input_x_ub = tik_instance.Tensor(dtype, [1, 32, 7, 7, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 7, 7, 32, 16], name="transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0) with tik_instance.for_range(0, 7) as cc1: with tik_instance.for_range(0, 7) as cc2: with tik_instance.for_range(0, 32) as cc3: tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 25088], transpose_ub, 0, 1, 1568, 0, 0) elif tuple(input_x_shape) == (32, 32, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": def _inner_compute(split_index): input_x_ub = tik_instance.Tensor(dtype, [1, 32, 2, 14, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 32, 16], name="transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 32, 28, 168, 0) with tik_instance.for_range(0, 2) as cc2: with tik_instance.for_range(0, 14) as cc3: with tik_instance.for_range(0, 32) as cc4: tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub, 0, 1, 896, 0, 0) with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: _inner_compute(cc1) _inner_compute(6) elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16": def _inner_compute(split_index, block_idx): input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB", scope=tik.scope_ubuf) transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 64, 16], name="transpose_local_UB", scope=tik.scope_ubuf) tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 64, 28, 168, 0) with tik_instance.for_range(0, 2) as cc2: with tik_instance.for_range(0, 14) as cc3: with tik_instance.for_range(0, 64) as cc4: tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0], 0, 1, 1, 1, 0, 0) tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub, 0, 1, 1792, 0, 0) with tik_instance.for_range(0, 32, block_num=32) as block_idx: with tik_instance.for_range(0, 6, thread_num=2) as cc1: _inner_compute(cc1, block_idx) _inner_compute(6, block_idx) tik_instance.BuildCCE(kernel_name, inputs=[input_x], outputs=[res]) return tik_instance
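# Hedged reference sketch (not from the original source): on the host, the
# data movement above amounts to a (0, 2, 3, 1, 4) transpose of an NC1HWC0
# tensor, so a plain numpy transpose can serve as a golden model when
# verifying any of the supported shapes.
import numpy as np

def _transpose02314_golden(x):
    # x: float16 ndarray with one of the shapes in support_shape
    return np.ascontiguousarray(np.transpose(x, (0, 2, 3, 1, 4)))

# x = np.random.randn(32, 4, 56, 56, 16).astype(np.float16)
# _transpose02314_golden(x).shape == (32, 56, 56, 4, 16)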