def __init__(self, var, indices, updates, var_out, use_locking, kernel_name):
    """
    Validate the operator dtypes, query the platform limits and declare the
    global-memory tensors used by the kernel.

    Parameters
    ----------
    var : dict
        description of the var input; only its "dtype" entry is read here
        (must be float32)
    indices : dict
        description of the indices input; only "dtype" is read (must be int32)
    updates : dict
        description of the updates input; its dtype must equal var's dtype
    var_out : dict
        description of the output; its dtype must equal var's dtype
    use_locking :
        accepted for interface compatibility, not read by this constructor
    kernel_name : str
        kernel name, stored and used in the dtype-mismatch error reports

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik(tik.Dprofile())

    # normalised (lower-case) dtype names of every operand
    self.var_dtype = var.get("dtype").lower()
    self.indices_dtype = indices.get("dtype").lower()
    self.updates_dtype = updates.get("dtype").lower()
    self.out_dtype = var_out.get("dtype").lower()

    check_dtype(self.indices_dtype, ("int32", ), param_name="indices")
    check_dtype(self.var_dtype, ("float32", ), param_name="var")

    # updates and the output must both carry the same dtype as var
    for operand_name, operand_dtype in (("updates", self.updates_dtype),
                                        ("out", self.out_dtype)):
        if operand_dtype != self.var_dtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, operand_name, "var", operand_dtype,
                self.var_dtype)

    self.kernel_name = kernel_name

    # platform limits
    soc_conf = tbe_platform.cce_conf
    self.ai_core_num = soc_conf.get_soc_spec(soc_conf.CORE_NUM)
    total_ub_bytes = soc_conf.get_soc_spec(soc_conf.UB_SIZE)
    self.ub_size_bytes = total_ub_bytes - RESERVED_UB_SIZE

    # element sizes in bytes and the number of elements per 32-byte unit
    # (presumably one UB block -- confirm against the platform docs)
    bit_len_of = tbe_platform.cce_intrin.get_bit_len
    self.var_dtype_bytes_size = bit_len_of(self.var_dtype) // 8
    self.indices_dtype_bytes_size = bit_len_of(self.indices_dtype) // 8
    self.var_data_each_block = 32 // self.var_dtype_bytes_size
    self.indices_data_each_block = 32 // self.indices_dtype_bytes_size

    # global-memory tensors; data tensors are declared with MAX_INT32 elements
    new_tensor = self.tik_instance.Tensor
    gm_scope = tik.scope_gm
    data_shape = (MAX_INT32, )
    self.tiling_gm = new_tensor("int32", (TILING_ARG_NUM, ),
                                name="tiling_gm", scope=gm_scope)
    self.var_gm = new_tensor(self.var_dtype, data_shape,
                             name="var_gm", scope=gm_scope)
    self.indices_gm = new_tensor(self.indices_dtype, data_shape,
                                 name="indices_gm", scope=gm_scope)
    self.updates_gm = new_tensor(self.updates_dtype, data_shape,
                                 name="updates_gm", scope=gm_scope)
    self.out_gm = new_tensor(self.var_dtype, data_shape,
                             name="out_gm", scope=gm_scope)

    # placeholders, filled in later by other methods of the class
    self.updates_ub = None
    self.indices_ub = None
    self.var_read_index = None
    self.updates_read_index = None
    self.indices_loop_index = None
def __init__(self, var, indices, updates, var_out, use_locking, kernel_name):
    """
    Validate the operator dtypes and declare the scalars and global-memory
    tensors used by the kernel.

    Parameters
    ----------
    var : dict
        description of the var input; its dtype must equal the updates dtype
    indices : dict
        description of the indices input; dtype must be int32
    updates : dict
        description of the updates input; dtype must be float32
    var_out : dict
        description of the output; its dtype must equal var's dtype
    use_locking :
        accepted for interface compatibility, not read by this constructor
    kernel_name : str
        kernel name, stored and used in the dtype-mismatch error reports

    Returns
    -------
    None
    """
    self.tik_instance = tik.Tik(tik.Dprofile())

    # normalised (lower-case) dtype names of every operand
    self.indicesdtype = indices.get("dtype").lower()
    self.updatesdtype = updates.get("dtype").lower()
    self.vardtype = var.get("dtype").lower()
    self.var_out_dtype = var_out.get("dtype").lower()

    check_dtype(self.indicesdtype, ("int32", ), param_name="indices")
    check_dtype(self.updatesdtype, ("float32", ), param_name="updates")
    self.tiling_dtype = "int32"

    # updates and var_out must both carry the same dtype as var
    for operand_name, operand_dtype in (("updates", self.updatesdtype),
                                        ("var_out", self.var_out_dtype)):
        if operand_dtype != self.vardtype:
            error_manager_vector.raise_err_inputs_dtype_not_equal(
                kernel_name, operand_name, "var", operand_dtype,
                self.vardtype)

    self.kernel_name = kernel_name

    # scalars (creation order kept as in the original implementation)
    new_scalar = self.tik_instance.Scalar
    self.var_read_index = new_scalar("int32")
    self.updates_read_index = new_scalar("int32")
    self.indices_loop_index = new_scalar("int32")
    self.zero_var = new_scalar(dtype=self.updatesdtype, name="zero_var")
    self.zero_var.set_as(0)

    # placeholders, filled in later by other methods of the class
    self.indices_ub = None
    self.updates_ub = None

    self.core_num = self._tik_get_core_num()
    self.ub_size = self._tik_get_ub_size()

    # global-memory tensors
    new_tensor = self.tik_instance.Tensor
    gm_scope = tik.scope_gm
    self.tiling_gm = new_tensor(self.tiling_dtype, (32, ),
                                name="tiling_gm", scope=gm_scope)
    self.input_var = new_tensor(self.updatesdtype, (MAX_ZERO_DIM_VAR, ),
                                name="input_var", scope=gm_scope)
    self.input_indices = new_tensor(self.indicesdtype,
                                    (MAX_ZERO_DIM_INDICE, ),
                                    name="input_indices", scope=gm_scope)
    self.input_updates = new_tensor(self.updatesdtype,
                                    (MAX_ZERO_DIM_INDICE, ),
                                    name="input_updates", scope=gm_scope)
    self.output_var = new_tensor(self.updatesdtype, (MAX_ZERO_DIM_VAR, ),
                                 name="output_var", scope=gm_scope)
def __check_params(input_values, axis):
    """
    Validate the inputs of the concat operator.

    The following rules are checked, in this order:
    1. "shape" and "ori_shape" of the inputs pass `_check_shape`;
    2. `axis` lies in [-rank, rank - 1], rank being the length of the first
       input's "ori_shape";
    3. on every dimension except the concat axis all inputs share the same
       size (a size of -1 is ignored in the comparison);
    4. every input format is one of ND, NHWC, NCHW;
    5. every input dtype equals the dtype of the first input.

    Parameters
    ----------
    input_values : list of dict
        input descriptions; the keys "ori_shape", "format" and "dtype" are
        read here
    axis : int
        concat axis, may be negative

    Returns
    -------
    None

    A violation is reported through the matching `error_manager` helper.
    """
    _check_shape(input_values, "shape")
    _check_shape(input_values, "ori_shape")

    first_input_shape = input_values[0].get("ori_shape")
    dim_num = len(first_input_shape)

    if axis >= dim_num or axis < -dim_num:
        error_manager.raise_err_input_value_invalid(
            "concat", "concat_dim",
            "between " + str(min(-dim_num, dim_num - 1)) + " and " +
            str(max(-dim_num, dim_num - 1)), axis)

    shape_value = [tensor_dict.get("ori_shape") for tensor_dict in input_values]

    # dims must equal except merge axis
    axis_new = axis % dim_num
    for j in range(len(first_input_shape)):
        if j == axis_new:
            continue
        dim_values = {element_shape[j] for element_shape in shape_value}
        # -1 is not compared against the concrete sizes
        dim_values.discard(-1)
        if len(dim_values) > 1:
            error_manager.raise_err_check_params_rules(
                "concat",
                "Dims must be equal except merge concat axis[%s]" % axis,
                "input_values", shape_value)

    # A fixed-order tuple, built once: the text of the format error no
    # longer depends on set iteration order.
    supported_formats = ("ND", "NHWC", "NCHW")
    dtype_lists = []
    for input_value in input_values:
        input_format = input_value.get("format")
        dtype_lists.append(input_value.get("dtype"))
        if input_format not in supported_formats:
            error_manager.raise_err_input_format_invalid(
                'concat', 'input_values', ','.join(supported_formats),
                input_format)

    dtype = dtype_lists[0]
    for index, dtype_ in enumerate(dtype_lists):
        if dtype != dtype_:
            error_manager.raise_err_inputs_dtype_not_equal(
                "concat", "input_values[0]", "input_values[%s]" % index,
                dtype, dtype_)
def __init__(self, indices, x, shape, y, kernel_name):
    """
    Validate the operator dtypes and declare the scalars and global-memory
    tensors used by the kernel.

    Parameters
    ----------
    indices : dict
        description of the indices input; dtype must be int32
    x : dict
        description of the updates input; dtype must be float32
    shape : dict
        description of the shape input; dtype must be int32
    y : dict
        description of the output; its dtype must equal the dtype of x
    kernel_name : str
        kernel name, stored and used in the dtype-mismatch error report

    Returns
    -------
    None
    """
    # normalised (lower-case) dtype names of every operand
    self.indices_dtype = indices.get("dtype").lower()
    self.updates_dtype = x.get("dtype").lower()
    self.shape_dtype = shape.get("dtype").lower()
    self.y_dtype = y.get("dtype").lower()

    check_dtype(self.indices_dtype, ("int32", ), param_name="indices")
    check_dtype(self.updates_dtype, ("float32", ), param_name="updates")
    check_dtype(self.shape_dtype, ("int32", ), param_name="shape")
    if self.y_dtype != self.updates_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "y", "x", self.y_dtype, self.updates_dtype)

    self.tiling_dtype = "int32"
    self.tik_instance = tik.Tik(tik.Dprofile())
    self.kernel_name = kernel_name

    # int32 scalars (creation order kept as in the original implementation)
    new_scalar = self.tik_instance.Scalar
    self.core_start = new_scalar("int32")
    self.core_end = new_scalar("int32")
    self.var_read_index = new_scalar("int32")
    self.updates_read_index = new_scalar("int32")
    self.indices_var = new_scalar("int32")
    self.block_idx = new_scalar("int32")
    self.zero_var = new_scalar(self.updates_dtype)
    self.zero_var.set_as(0)

    # placeholders, filled in later by other methods of the class
    self.var_ub = None
    self.indices_ub = None
    self.updates_ub = None
    self.shape_ub = None
    self.updates_ub_one = None
    self.indices_ub_one = None

    # scalars that carry values of the updates dtype
    self.cur_var = new_scalar(dtype=self.updates_dtype)
    self.cur_update = new_scalar(dtype=self.updates_dtype)
    self.acc_var = new_scalar(dtype=self.updates_dtype)
    self.updates_var = new_scalar(dtype=self.updates_dtype)

    self.aicore_num = self._tik_get_core_num()
    self.ub_size = self._tik_get_ub_size()
    self.tbe_product = self._tik_get_platform()

    # global-memory tensors
    new_tensor = self.tik_instance.Tensor
    gm_scope = tik.scope_gm
    self.tiling_gm = new_tensor(self.tiling_dtype, (32,),
                                name="tiling_gm", scope=gm_scope)
    self.input_indices = new_tensor(self.indices_dtype, (MAX_INPUT_SIZE, ),
                                    name="input_indices", scope=gm_scope)
    self.input_updates = new_tensor(self.updates_dtype, (MAX_INPUT_SIZE, ),
                                    name="input_updates", scope=gm_scope)
    # the shape tensor is declared with the indices dtype; both dtypes were
    # checked to be int32 above
    self.input_shape = new_tensor(self.indices_dtype, (MAX_SHAPE, ),
                                  name="input_shape", scope=gm_scope)

    # check platform: the output is declared with is_atomic_add=True only for
    # float32 data on Ascend910 / Ascend610
    output_options = {}
    if (self.updates_dtype == "float32"
            and self.tbe_product in ("Ascend910", "Ascend610")):
        output_options["is_atomic_add"] = True
    self.output_var = new_tensor(self.updates_dtype, (MAX_SHAPE, ),
                                 name="output_var", scope=gm_scope,
                                 **output_options)
def check_input_params(self):
    """
    Check whether the input parameters are valid.

    Reports an error through `error_manager_vector` when the input and
    output dtypes differ, then validates the input dtype against the list
    of supported dtypes with `check_dtype`.

    Returns
    -------
    None
    """
    if self.input_dtype != self.output_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            "split_d", "self.input_dtype", "self.output_dtype",
            self.input_dtype, self.output_dtype)

    # Must be a tuple of names: a single comma-separated string would make
    # the membership test a substring match, accepting e.g. "int" or "float".
    dtype_list = ("float16", "float32", "int32", "int8", "int16", "int64",
                  "uint8", "uint16", "uint32", "uint64")
    check_dtype(self.input_dtype, dtype_list, param_name="x")
def leaky_relu_grad(g, x, y, negative_slope=0, kernel_name="leaky_relu_grad"):
    """
    Build the kernel for the backward pass of leaky_relu.

    Operator definition: the result is the gradient where x > 0 and
    negative_slope * gradient where x <= 0. The computation itself is
    delegated to `leaky_relu_grad_compute`.
    Supported dtypes: float16, float32 (g and x must share one dtype).

    Parameters
    ----------
    g : dict
        the gradients propagated back to the leaky_relu operation
    x : dict
        the x tensor of the leaky_relu operation
    y : dict
        the output of the leaky_relu backward pass
    negative_slope : float or int
        slope applied for non-positive inputs, default value is 0
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    None
    """
    g_dtype = g.get("dtype").lower()
    x_dtype = x.get("dtype").lower()

    supported_dtypes = ("float16", "float32")
    check_dtype(g_dtype, supported_dtypes, param_name="input_g")
    check_dtype(x_dtype, supported_dtypes, param_name="input_x")
    check_elewise_shape_range([g, x], support_broadcast=True)

    if g_dtype != x_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "g", "x", g_dtype, x_dtype)

    classified_inputs = classify([g, x], Mode.ELEWISE_WITH_BROADCAST)

    schedules = []
    tensors = []
    # one compute + schedule per classified input pair
    for cur_g, cur_x in classified_inputs:
        with te.op.compute():
            shape_g, shape_x = variable_shape([cur_g, cur_x],
                                              support_broadcast=True)
            shape_g, shape_x = refine_shapes_for_broadcast(shape_g, shape_x)
            tensor_g = tvm.placeholder(shape_g, g_dtype, "tensor_g")
            tensor_x = tvm.placeholder(shape_x, x_dtype, "tensor_x")
            res = leaky_relu_grad_compute(tensor_g, tensor_x, y,
                                          negative_slope, kernel_name)
            tensors.append((tensor_g, tensor_x, res))
        with tvm.target.cce():
            schedule = generic.auto_schedule(res)
        schedules.append(schedule)

    build_config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, build_config)
def real_div(x1, x2, y, kernel_name="real_div"):
    """
    Build the kernel for element-wise real division, c = a / b.

    The computation itself is delegated to `real_div_compute`; the two
    inputs may be broadcast against each other.

    Parameters
    ----------
    x1 : dict
        shape and dtype of the first input, float16 or float32
    x2 : dict
        shape and dtype of the second input, must have the dtype of x1
    y : dict
        shape and dtype of the output
    kernel_name : str
        cce kernel name, default value is "real_div"

    Returns
    -------
    None
    """
    x_dtype = x1.get("dtype").lower()
    y_dtype = x2.get("dtype").lower()

    supported_dtypes = ("float16", "float32")
    check_dtype(x_dtype, supported_dtypes, param_name="input_x")
    check_dtype(y_dtype, supported_dtypes, param_name="input_y")
    check_elewise_shape_range([x1, x2], support_broadcast=True)

    if x_dtype != y_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "x1", "x2", x_dtype, y_dtype)

    classified_inputs = classify([x1, x2], Mode.ELEWISE_WITH_BROADCAST)

    schedules = []
    tensors = []
    # one compute + schedule per classified input pair
    for cur_x1, cur_x2 in classified_inputs:
        with te.op.compute():
            shape_x, shape_y = variable_shape([cur_x1, cur_x2],
                                              support_broadcast=True)
            shape_x, shape_y = refine_shapes_for_broadcast(shape_x, shape_y)
            tensor_x = tvm.placeholder(shape_x, x_dtype, "tensor_x")
            tensor_y = tvm.placeholder(shape_y, y_dtype, "tensor_y")
            res = real_div_compute(tensor_x, tensor_y, y, kernel_name)
            tensors.append([tensor_x, tensor_y, res])
        with tvm.target.cce():
            schedule = generic.auto_schedule(res)
        schedules.append(schedule)

    # build
    build_config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, build_config)
def sigmoid_grad(x, dx, out, kernel_name="sigmoid_grad"):
    """
    Build the kernel for the sigmoid gradient.

    Operator definition: sigmoid_grad = (sigmoid - sigmoid*sigmoid)*grad.
    The computation itself is delegated to `sigmoid_grad_compute`;
    broadcasting between the two inputs is not supported.

    Parameters
    ----------
    x : dict
        description of the sigmoid tensor, float16 or float32
    dx : dict
        description of the gradient tensor, must have the dtype of x
    out : dict
        description of the output
    kernel_name : str
        cce kernel name, default value is "sigmoid_grad"

    Returns
    -------
    None
    """
    x_dtype = x.get("dtype").lower()
    dx_dtype = dx.get("dtype").lower()

    supported_dtypes = ("float16", "float32")
    check_dtype(x_dtype, supported_dtypes, param_name="input_x")
    check_dtype(dx_dtype, supported_dtypes, param_name="input_dx")
    check_elewise_shape_range([x, dx], support_broadcast=False)

    if x_dtype != dx_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "x", "dx", x_dtype, dx_dtype)

    classified_inputs = classify([x, dx], Mode.ELEWISE)

    schedules = []
    tensors = []
    # one compute + schedule per classified input pair
    for cur_sig, cur_dx in classified_inputs:
        with te.op.compute():
            sig_shape, grad_shape = variable_shape([cur_sig, cur_dx],
                                                   support_broadcast=False)
            sig_shape, grad_shape = refine_shapes_for_broadcast(sig_shape,
                                                                grad_shape)
            tensor_sig = tvm.placeholder(sig_shape, x_dtype, "tensor_x")
            tensor_dx = tvm.placeholder(grad_shape, dx_dtype, "tensor_dx")
            res = sigmoid_grad_compute(tensor_sig, tensor_dx, out,
                                       kernel_name)
            tensors.append([tensor_sig, tensor_dx, res])
        with tvm.target.cce():
            schedule = generic.auto_schedule(res)
        schedules.append(schedule)

    build_config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, build_config)
def pad_d(input_x, output_x, paddings, kernel_name="pad_d"):
    """
    Validate the arguments of the pad operator and hand over to `pad_compute`.

    Parameters
    ----------
    input_x : dict
        shape and dtype of input; dtype must be float32, float16 or int32
    output_x : dict
        shape and dtype of output; dtype must equal the input dtype
    paddings : list or tuple
        one entry per input dimension D: paddings[D][0] is the number of
        values added before the contents of the tensor in that dimension,
        paddings[D][1] the number of values added after them
    kernel_name : str
        cce kernel name, default value is "pad_d"

    Returns
    -------
    the value returned by `pad_compute` for the initialised pad object
    """
    in_shape = list(input_x.get("shape"))
    pads = [list(pair) for pair in paddings]
    src_dtype = input_x.get("dtype").lower()
    dst_dtype = output_x.get("dtype").lower()

    # one padding pair is required per input dimension
    if len(pads) != len(in_shape):
        error_detail = "Length of input must be as same as paddings"
        error_manager_vector.raise_err_two_input_shpae_invalid(
            "PadD", "input_x", "paddings", error_detail)

    if src_dtype != dst_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            "PadD", "src_dtype", "dst_dtype", src_dtype, dst_dtype)

    if src_dtype not in ("float32", "float16", "int32"):
        error_detail = "Only support float, float16 and int32"
        error_manager_vector.raise_err_two_input_dtype_invalid(
            "PadD", "src_dtype", "dst_dtype", error_detail)

    tik_obj = tik.Tik()
    pad_object = pad_common.PadInit(pads, src_dtype, kernel_name, tik_obj,
                                    True)
    return pad_compute(pad_object)
def sqrt_grad(x, dx, out, kernel_name="sqrt_grad"):
    """
    Build the kernel for the sqrt gradient.

    The computation itself is delegated to `sqrt_grad_compute`; broadcasting
    between the two inputs is not supported.

    Parameters
    ----------
    x : dict
        description of the data tensor, float16 or float32
    dx : dict
        description of the gradient tensor, must have the dtype of x
    out : dict
        description of the output
    kernel_name : str
        cce kernel name, default value is "sqrt_grad"

    Returns
    -------
    None
    """
    x_dtype = x.get("dtype").lower()
    dx_dtype = dx.get("dtype").lower()

    supported_dtypes = ("float16", "float32")
    check_dtype(x_dtype, supported_dtypes, param_name="x")
    check_dtype(dx_dtype, supported_dtypes, param_name="dx")
    check_elewise_shape_range([x, dx], support_broadcast=False)

    if x_dtype != dx_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "x", "dx", x_dtype, dx_dtype)

    classified_inputs = classify([x, dx], Mode.ELEWISE)

    schedules = []
    tensors = []
    # one compute + schedule per classified input pair
    for cur_x, cur_dx in classified_inputs:
        with te.op.compute():
            shape_x, shape_dx = variable_shape([cur_x, cur_dx],
                                               support_broadcast=False)
            shape_x, shape_dx = refine_shapes_for_broadcast(shape_x,
                                                            shape_dx)
            tensor_x = tvm.placeholder(shape_x, x_dtype, "tensor_x")
            tensor_dx = tvm.placeholder(shape_dx, dx_dtype, "tensor_dx")
            res = sqrt_grad_compute(tensor_x, tensor_dx, out, kernel_name)
            tensors.append([tensor_x, tensor_dx, res])
        with tvm.target.cce():
            schedule = generic.auto_schedule(res)
        schedules.append(schedule)

    build_config = {"name": kernel_name, "tensor_list": tensors}
    te.lang.dynamic.build(schedules, build_config)
def __init__(self, params_dict, indices_dict, axis_dict, y_dict, kernel_name):
    """
    constructor of GatherV2: validates the dtypes, reads the platform
    profile and initialises every attribute of the instance

    Parameters
    ----------
    params_dict: dict
        shape and dtype of input params
    indices_dict: dict
        shape and dtype of input indices, dtype must be int32 or int64
    axis_dict: dict
        shape and dtype of input axis, dtype must be int32
    y_dict: dict
        shape and dtype of output, must have the same dtype as params
    kernel_name: str
        kernel name

    Returns
    -------
    None
    """
    self.params_dtype = params_dict.get("dtype").lower()
    self.indices_dtype = indices_dict.get("dtype").lower()
    self.axis_dtype = axis_dict.get("dtype").lower()
    self.y_dtype = y_dict.get("dtype").lower()
    self.tiling_dtype = INT32

    params_support_dtype_list = ("int8", "int16", "int32", "int64",
                                 "uint8", "uint16", "uint32", "uint64",
                                 "float16", "float32")
    check_dtype(self.params_dtype, params_support_dtype_list, param_name="x")
    check_dtype(self.indices_dtype, ("int32", "int64"), param_name="indices")
    check_dtype(self.axis_dtype, (INT32,), param_name="axis")
    if self.y_dtype != self.params_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "y", "x", self.y_dtype, self.params_dtype)

    # platform resources as reported by the tik profile
    profile = tik.Dprofile()
    self.ub_size = profile.get_unified_buffer_size()
    self.l1_size = profile.get_l1_buffer_size()
    self.core_num = profile.get_aicore_num()
    self.tik_instance = tik.Tik(profile, disable_debug=True)
    self.kernel_name = kernel_name

    # shapes stored for the operands
    self.axis_shape = (1,)
    self.x_shape = (PARAMS_SIZE,)
    self.indices_shape = (INDICES_NUM,)
    self.y_shape = (PARAMS_SIZE,)

    # element sizes and the number of params elements per BLOCK_SIZE
    self.params_dsize = TYPE_LEN_DICT.get(self.params_dtype)
    self.indices_dsize = TYPE_LEN_DICT.get(self.indices_dtype)
    self.block_elem = BLOCK_SIZE // self.params_dsize

    # everything below is a placeholder, filled in later by other methods
    # operand tensors
    self.x = self.indices = self.axis = self.tiling_gm = self.y = None

    # values named after the params / indices layout
    self.params_pre = self.params_axis = self.params_row = None
    self.indices_num = None
    self.cache_params = None

    # values named after the per-core work split
    self.need_core_num = self.tail_process_core = None
    self.indices_num_each_core = self.indices_num_remaining = None

    # values named after the loops over indices and rows
    self.indices_loop_num = None
    self.indices_row_num_once = self.indices_row_num_last = None
    self.row_num_once_ub = self.row_num_once_tail_ub = None
    self.inner_loop_num = None
    self.row_num_last_ub = self.row_num_last_tail_ub = None
    self.inner_loop_num_last = None
def _check_equal_bias_dtype(p, name):
    """
    Report an error when the dtype of `p` differs from `bias_dtype`.

    `bias_dtype` is not a parameter; it is taken from the enclosing scope.

    Parameters
    ----------
    p : dict
        tensor description, its "dtype" entry is compared
    name : str
        name of the tensor, used in the error report

    Returns
    -------
    None
    """
    param_dtype = p["dtype"]
    if param_dtype == bias_dtype:
        return
    error_manager_vector.raise_err_inputs_dtype_not_equal(
        "DynamicGRU", 'b', name, bias_dtype, param_dtype)
def sparse_apply_proximal_adagrad_d(var_dict, accum_dict, lr_dict, l1_dict,
                                    l2_dict, grad_dict, indices_dict,
                                    var_out_dict, accum_out_dict,
                                    use_locking=False,
                                    kernel_name="SparseApplyProximalAdagradD"):
    """
    sparse_apply_proximal_adagrad_d op entry interface

    Validates the dtype of every operand, builds the kernel through
    `SparseApplyProximalAdagradD` and registers the compile info
    ("ub_size", "core_num", "ub_tensor_num") under the key "vars".

    Parameters
    ----------
    var_dict: var params shape, dtype and range; dtype must be float32
    accum_dict: accum shape, dtype and range; dtype must be float32
    lr_dict: lr shape, dtype and range; dtype must be float32
    l1_dict: l1 shape, dtype and range; dtype must be float32
    l2_dict: l2 shape, dtype and range; dtype must be float32
    grad_dict: grad shape, dtype and range; dtype must be float32
    indices_dict: indices shape, dtype and range; dtype must be int32
    var_out_dict: var output shape, dtype and range; same dtype as var
    accum_out_dict: accum output shape, dtype and range; same dtype as accum
    use_locking: default value is "False", not read by this function
    kernel_name: kernel name of SparseApplyProximalAdagradD op

    Returns
    -------
    None
    """
    # One-element tuples (note the trailing comma): with a bare string the
    # membership test in check_dtype would be a substring match and accept
    # values such as "float" or "int".
    var_dtype_check_list = ("float32",)
    indices_dtype_check_list = ("int32",)

    def _checked_dtype(tensor_dict, check_list, param_name):
        # Read the lower-cased dtype of one operand and validate it.
        dtype = tensor_dict.get("dtype").lower()
        check_dtype(dtype, check_list, param_name=param_name)
        return dtype

    # validation order is the same as the parameter order
    var_dtype = _checked_dtype(var_dict, var_dtype_check_list, "var_dict")
    accum_dtype = _checked_dtype(accum_dict, var_dtype_check_list,
                                 "accum_dict")
    _checked_dtype(lr_dict, var_dtype_check_list, "lr_dict")
    _checked_dtype(l1_dict, var_dtype_check_list, "l1_dict")
    _checked_dtype(l2_dict, var_dtype_check_list, "l2_dict")
    _checked_dtype(grad_dict, var_dtype_check_list, "grad_dict")
    indices_dtype = _checked_dtype(indices_dict, indices_dtype_check_list,
                                   "indices_dict")
    var_out_dtype = _checked_dtype(var_out_dict, var_dtype_check_list,
                                   "var_out_dict")
    accum_out_dtype = _checked_dtype(accum_out_dict, var_dtype_check_list,
                                     "accum_out_dict")

    # each output must carry the dtype of the matching input
    if var_dtype != var_out_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "var", "var_out", var_dtype, var_out_dtype)
    if accum_dtype != accum_out_dtype:
        error_manager_vector.raise_err_inputs_dtype_not_equal(
            kernel_name, "accum", "accum_out", accum_dtype, accum_out_dtype)

    obj = SparseApplyProximalAdagradD(var_dtype, indices_dtype, kernel_name)
    obj.sparse_apply_proximal_adagrad_d()

    # add compile info
    te.op.add_compile_info(
        "vars", {
            "ub_size": obj.ub_size,
            "core_num": obj.core_num,
            "ub_tensor_num": obj.ub_tensor_num
        })