def inv_grad(input_y, input_dy, output_z, kernel_name="inv_grad"):
    """
    algorithm: inv_grad
    calculating data's reciprocal grad, dx = -1 * dy * y * y, where
    `y = 1/x` and `dy` is the corresponding input gradient.

    Parameters
    ----------
    input_y: dict
        shape and dtype of input_y, only support float16, float32, int32, int8
    input_dy: dict
        shape and dtype of input_dy, should be same shape and type as input_y
    output_z: dict
        shape and dtype of output, should be same shape and type as input_y
    kernel_name: str
        kernel name, default value is "inv_grad"

    Returns
    -------
    None
    """
    shape_input_y = input_y.get("shape")
    shape_input_dy = input_dy.get("shape")
    dtype_input_y = input_y.get("dtype")
    dtype_input_dy = input_dy.get("dtype")

    check_shape(shape_input_y, param_name="input_y")
    check_shape(shape_input_dy, param_name="input_dy")

    shape_input_y = util.shape_refine(shape_input_y)
    shape_input_dy = util.shape_refine(shape_input_dy)

    if list(shape_input_y) != list(shape_input_dy):
        raise RuntimeError("the shape of input must be equal!")

    dtype_input_y = dtype_input_y.lower()
    dtype_input_dy = dtype_input_dy.lower()
    if dtype_input_dy != dtype_input_y:
        raise RuntimeError("the dtype of input must be equal!")

    check_list = ("float16", "float32", "int32", "int8")
    check_dtype(dtype_input_y, check_list, param_name="input_y")

    shape_input_dy, shape_input_y = \
        refine_shapes_for_broadcast(shape_input_dy, shape_input_y)
    data_dy = tvm.placeholder(shape_input_dy, name="data_dy",
                              dtype=dtype_input_dy)
    data_y = tvm.placeholder(shape_input_y, name="data_y",
                             dtype=dtype_input_y)

    res = inv_grad_compute(data_y, data_dy, output_z, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": [data_y, data_dy, res]}
    te.lang.cce.cce_build_code(sch, config)
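
# A minimal scalar sketch of the inv_grad formula above, dx = -1 * dy * y * y
# with y = 1/x. The helper name below is illustrative only and is not part
# of the op interface; e.g. y = 0.5 (x = 2) and dy = 1.0 give dx = -0.25.
def _inv_grad_reference(y, dy):
    """Reference math for one element: dx = -dy * y^2."""
    return -1.0 * dy * y * y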
def CusSquare(input_x, output_y, kernel_name="square"):
    """
    algorithm: square
    calculating data's square, y = x * x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype").lower()

    shape = util.shape_refine(shape)
    data = tvm.placeholder(shape, name="data", dtype=dtype)

    with tvm.target.cce():
        res = square_compute(data, output_y, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}

    te.lang.cce.cce_build_code(sch, config)
def caffe_reduction_layer_compute(placeholders, shape, dtype, axis, op, coeff,
                                  kernel_name="cce_reductionLayer",
                                  need_build=False, need_print=False):
    """
    Since the shape of the placeholder created by caffe_reduce is not the
    same as input_shape, fusion_op cannot fuse two ops that have different
    shapes. So the caffe_reduce op cannot be fused until TVM supports
    reshape in D.
    """
    data = placeholders[0]
    inp_dtype = dtype.lower()
    axis = util.axis_check(len(shape), axis)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    if op == "ASUM":
        data_tmp_input = te.lang.cce.vabs(data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "SUMSQ":
        data_tmp_input = te.lang.cce.vmul(data, data)
        cof = coeff
        tmp = te.lang.cce.vmuls(data_tmp_input, cof)
    elif op == "MEAN":
        size = shape1[-1]
        cof = float(coeff) * (size**(-1))
        if inp_dtype == "int8" or inp_dtype == "uint8":
            data1 = te.lang.cce.vmuls(data, 1.0)
            data_cast = te.lang.cce.cast_to(data1, "float32")
            tmp = te.lang.cce.vmuls(data_cast, cof)
        else:
            tmp = te.lang.cce.vmuls(data, cof)
    elif op == "SUM":
        cof = coeff
        data_tmp_input = te.lang.cce.vmuls(data, cof)
        tmp = data_tmp_input

    res = te.lang.cce.sum(tmp, axis=axis)

    # Although the data type (int8/uint8) has changed, the data values
    # remain integers during the calculation of the other operators
    # (SUM/ASUM/SUMSQ).
    if op != "MEAN":
        res = te.lang.cce.cast_to(res, inp_dtype, f1628IntegerFlag=True)
    return res
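
# A hedged scalar sketch of the four reduction modes dispatched above;
# `vals` stands for the values along the flattened reduce axis, and the
# helper name is illustrative only, not part of the op.
def _reduction_reference(vals, op, coeff):
    """Pure-Python reference for ASUM/SUMSQ/MEAN/SUM with an output scale."""
    if op == "ASUM":
        return coeff * sum(abs(v) for v in vals)
    if op == "SUMSQ":
        return coeff * sum(v * v for v in vals)
    if op == "MEAN":
        return coeff * sum(vals) / len(vals)
    return coeff * sum(vals)  # "SUM"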
def custom_sign(shape, dtype,
                kernel_name="cce_custom_sign",
                need_build=False,
                need_print=False):
    """
    algorithm: sign = round(x * 32768 / (2 ** (-15) + |x * 32768|)),
    calculating data type is float16

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype,
            only support float16, float32, int32

    kernel_name : cce kernel name, default value is "cce_sign"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    if not dtype.lower() in check_list:
        raise RuntimeError("custom_sign_cce only support %s while dtype is %s"
                           % (",".join(check_list), dtype))

    shape = util.shape_refine(shape)
    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        res = custom_sign_compute([data], shape, dtype, kernel_name,
                                  need_build, need_print)
        sch = generic.auto_schedule(res)

    config = {"print_ir": need_print,
              "need_build": need_build,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)
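
# A scalar sketch of the rounding-based sign approximation in the docstring
# above, round(x * 32768 / (2 ** (-15) + |x * 32768|)); the helper name is
# illustrative only. For any x noticeably away from zero the quotient
# saturates near +/-1, so the rounded result is -1, 0 or 1.
def _sign_reference(x):
    """Reference math for one element of custom_sign."""
    scaled = x * 32768.0
    return round(scaled / (2.0 ** (-15) + abs(scaled)))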
def reduce_prod_d(x, y, axes, keep_dims=None, kernel_name="reduce_prod_d"):
    """
    Reduce a tensor on certain axes based on product.

    Parameters:
    ----------
    x : dict
        shape and dtype of input
    y: dict
        shape and dtype of output
    axes : int, list, tuple, NoneType
        The dimensions to reduce. If None (the default), reduces all
        dimensions. Must be in the range [-rank(input_tensor),
        rank(input_tensor)).
    keep_dims : bool, NoneType
        if true, retains reduced dimensions with length 1,
        default value is None.
    kernel_name : str
        cce kernel name, default value is reduce_prod_d

    Returns
    -------
    None
    """
    shape = x.get("shape")
    check_shape(shape, param_name="x")
    inp_dtype = x.get("dtype").lower()
    check_list = ["float16", "float32", "int8", "uint8"]
    check_dtype(inp_dtype, check_list, param_name="x")

    shape_len = len(shape)
    if not axes:
        axes = range(shape_len)
    if hasattr(axes, 'index'):
        axes = list(axes)
    axes = util.axis_check(shape_len, axes)

    util.check_reduce_shape_rule(shape)
    shape, axes = util.shape_refine(list(shape), axes)
    shape, axes = util.simplify_axis_shape(shape, axes)

    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = reduce_prod_d_compute(data_input, y, axes, keep_dims,
                                    kernel_name)
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
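
# A minimal sketch of the product-reduction semantics for the
# reduce-all-axes case, assuming the input has already been flattened;
# math.prod needs Python 3.8+, and the helper name is illustrative only.
def _reduce_prod_all_reference(flat_vals):
    """Product over every element, i.e. axes=None with keep_dims falsy."""
    import math
    return math.prod(flat_vals)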
def abs(x, y, kernel_name="abs"):
    """
    algorithm: abs
    calculating data's abs, y = |x|

    Parameters
    ----------
    x : dict
        shape and dtype of input, only support float16, float32, int32
    y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        cce kernel name, default value is abs

    Returns
    -------
    None
    """
    shape = x.get("shape")
    check_shape(shape, param_name="x")

    check_list = ["float16", "float32", "int32"]
    inp_dtype = x.get("dtype").lower()
    check_dtype(inp_dtype, check_list, param_name="x")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=inp_dtype)

    res = abs_compute(data, y, kernel_name)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)
def square(input_x, output_y, kernel_name="square"):
    """
    algorithm: square
    calculating data's square, y = x * x

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32, int32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name : str
        kernel name, default value is "square"

    Returns
    -------
    None
    """
    shape = input_x.get("shape")
    dtype = input_x.get("dtype").lower()

    check_shape(shape, param_name="input_x")
    check_list = ["float16", "float32", "int32"]
    if dtype not in check_list:
        raise RuntimeError("square only support float16, float32, int32")

    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)
    data = tvm.placeholder(fuseshape, name="data", dtype=dtype)
    with tvm.target.cce():
        res = square_compute(data, output_y, kernel_name)
        sch = generic.auto_schedule(res)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data, res]}

    te.lang.cce.cce_build_code(sch, config)
def erf(input_x, output_y, kernel_name="erf"):
    """
    algorithm: erf
    Computes the Gauss error function of `x` element-wise

    Parameters
    ----------
    input_x: dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    kernel_name: str
        kernel name, default value is "erf"

    Returns
    -------
    None
    """
    shape_input = input_x.get("shape")
    dtype_input = input_x.get("dtype")

    check_shape(shape_input, param_name="input_x")

    dtype_input = dtype_input.lower()
    check_list = ("float16", "float32")
    check_dtype(dtype_input, check_list, param_name="input_x")

    shape_input = util.shape_refine(shape_input)
    reshape_input = (functools_reduce(lambda x, y: x * y, shape_input[:]), )
    data_input = tvm.placeholder(reshape_input,
                                 name="data_input",
                                 dtype=dtype_input)

    erf_result = erf_compute(data_input, output_y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(erf_result)

    config = {"name": kernel_name,
              "tensor_list": [data_input, erf_result]}
    te.lang.cce.cce_build_code(sch, config)
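
# The element-wise math computed by erf_compute can be cross-checked with
# the standard library: erf(x) = 2/sqrt(pi) * integral_0^x exp(-t^2) dt.
# math.erf is a host-side reference only, not what the kernel invokes.
def _erf_reference(x):
    """Scalar reference for the Gauss error function."""
    import math
    return math.erf(x)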
def fills(x, y, value, kernel_name="fills"):
    """
    do fill operation

    Parameters:
    ----------
    x : the dict of input
    y : the dict of output
    value : scalar value to fill with
    kernel_name : cce kernel name, default value is "fills"

    Returns
    -------
    None
    """
    # get the shape and dtype
    shape = x.get("shape")
    dtype = x.get("dtype").lower()

    # check whether the dtype is right
    check_list = ("int32", "float16", "float32")
    check_dtype(dtype, check_list)

    # fuse shapes
    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)

    data_x = tvm.placeholder(fuseshape, name="data_x", dtype=dtype)

    res = fills_compute(data_x, value, dtype)
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": (data_x, res),
              "print_ir": False}
    te.lang.cce.cce_build_code(sch, config)
def softsign(x, y, kernel_name="softsign"):
    """
    Computes for softsign.

    Parameters
    ----------
    x: dict
        data of input.
        source data type, support "float16", "float32".
    y: dict
        data of output.
    kernel_name : str
        kernel name, default value is "softsign".

    Returns
    -------
    None
    """
    shape_input = x.get("shape")
    dtype_input = x.get("dtype")

    check_shape(shape_input, param_name="x")

    check_list = ("float16", "float32")
    check_dtype(dtype_input.lower(), check_list, param_name="x")

    shape = util.shape_refine(shape_input)
    shape_x = (functools_reduce(lambda x, y: x * y, shape[:]), )
    input_dtype = dtype_input.lower()
    data = tvm.placeholder(shape_x, name="data", dtype=input_dtype)

    res = softsign_compute(data, y, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data, res]}
    te.lang.cce.cce_build_code(sch, config)
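
# A scalar sketch of the usual softsign definition, x / (1 + |x|), which is
# assumed here to be the math behind softsign_compute (the compute body is
# not shown in this file); the helper name is illustrative only.
def _softsign_reference(x):
    """Scalar reference: softsign(x) = x / (1 + |x|)."""
    return x / (1.0 + abs(x))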
def custom_Reduction(shape, dtype, axis, op, coeff,
                     kernel_name="cce_reductionLayer",
                     need_build=False,
                     need_print=False):
    """
    Reduce a tensor on a certain axis, and scale output with coeff

    Parameters
    ----------
    shape : shape of data

    dtype : source data type, only support float16, float32, int8, uint8

    axis : the first axis to reduce, may be negative to index from the end
           (e.g., -1 for the last axis).
           If axis == 0, the output Blob always has the empty shape
           (count 1), performing reduction across the entire input.

    op : can only be one of "SUM, ASUM (sum of abs), SUMSQ (sum of sqr),
         MEAN"

    coeff : scale for output

    kernel_name : cce kernel name, default value is "cce_reductionLayer"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    check_list = ["float16", "float32", "int8", "uint8"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "reductionLayer_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    reduction_op = ("SUM", "ASUM", "SUMSQ", "MEAN")

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if op not in reduction_op:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ, MEAN")
    if not isinstance(coeff, int) and not isinstance(coeff, float):
        raise RuntimeError("coeff must be an int or a float")

    axis_origin = axis
    shape_origin = shape
    axis = util.axis_check(len(shape), axis)
    util.check_reduce_shape_rule(shape)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape1, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = caffe_reduction_layer_compute([data], shape_origin, dtype,
                                            axis_origin, op, coeff,
                                            kernel_name, need_build,
                                            need_print)

    if op == "MEAN" and (inp_dtype == "int8" or inp_dtype == "uint8"):
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        res = te.lang.cce.cast_to(res, inp_dtype)
        schedule = tvm.create_schedule(res.op)
        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, res], "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {"print_ir": need_print,
                  "need_build": need_build,
                  "name": kernel_name,
                  "tensor_list": [data, res]}
        te.lang.cce.cce_build_code(sch, config)
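
# A worked example of the shape flattening performed above: everything from
# `axis` onward collapses into a single reduce axis, so shape = [2, 3, 4]
# with axis = 1 yields shape1 = [2, 12]. The helper name is illustrative.
def _flatten_from_axis(shape, axis):
    """Mirror of the shape1 construction in custom_Reduction."""
    from functools import reduce
    return list(shape[:axis]) + [reduce(lambda a, b: a * b, shape[axis:])]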
def clip_by_value(input_t, clip_value_min, clip_value_max, output_t,
                  kernel_name="clip_by_value"):
    """
    algorithm: clip_by_value
    Clips tensor values to a specified min and max.
    Given a tensor t, this operation returns a tensor of the same type and
    shape as t with its values clipped to clip_value_min and clip_value_max.
    Any values less than clip_value_min are set to clip_value_min. Any values
    greater than clip_value_max are set to clip_value_max.

    Parameters
    ----------
    input_t: dict with keys(shape and dtype)
        input tensor
    clip_value_min: dict with keys(shape and dtype) or scalar
        The minimum value to clip by.
    clip_value_max: dict with keys(shape and dtype) or scalar
        The maximum value to clip by.
    output_t: dict
        info of output tensor with the same shape as input.
    kernel_name: str
        kernel name, default value is "clip_by_value"

    Returns
    -------
    None
    """
    shape_x = input_t.get("shape")
    dtype = input_t.get("dtype")
    shape_min = clip_value_min.get("shape")
    shape_max = clip_value_max.get("shape")

    input_dtype = dtype.lower()
    check_dtype(input_dtype, ("float16", "float32", "int32"),
                param_name="input_t")
    if shape_min != 0 and shape_max != 0:
        if len(shape_min) > 1 and list(shape_min) != list(shape_x):
            for i in range(0, len(shape_x)):
                if shape_min[i] != shape_x[i] and shape_min[i] != 1:
                    raise RuntimeError("min/max: A 0-D (scalar) Tensor, "
                                       "or a Tensor with the same shape "
                                       "as t, or a Tensor broadcastable "
                                       "to the shape of t.")
        if len(shape_max) > 1 and list(shape_max) != list(shape_x):
            for i in range(0, len(shape_x)):
                if shape_max[i] != shape_x[i] and shape_max[i] != 1:
                    raise RuntimeError("min/max: A 0-D (scalar) Tensor, "
                                       "or a Tensor with the same shape "
                                       "as t, or a Tensor broadcastable "
                                       "to the shape of t.")
    check_shape(shape_x, param_name="input_t")
    shape_x = util.shape_refine(shape_x)
    data_x = tvm.placeholder(shape_x, name="data_x", dtype=input_dtype)

    data_value = {}
    check_shape(shape_min, param_name="clip_value_min")
    shape_min = util.shape_refine(shape_min)
    if len(shape_min) != len(shape_x) and len(shape_min) == 1:
        list_min = [1] * (len(shape_x) - 1)
        shape_min = shape_min + list_min
    data_value["min"] = tvm.placeholder(shape_min, name="data_min",
                                        dtype=input_dtype)

    check_shape(shape_max, param_name="clip_value_max")
    shape_max = util.shape_refine(shape_max)
    if len(shape_max) != len(shape_x) and len(shape_max) == 1:
        list_max = [1] * (len(shape_x) - 1)
        shape_max = shape_max + list_max
    data_value["max"] = tvm.placeholder(shape_max, name="data_max",
                                        dtype=input_dtype)

    res = clip_by_value_compute(data_x, data_value["min"],
                                data_value["max"], output_t, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x, data_value["min"],
                              data_value["max"], res]}
    te.lang.cce.cce_build_code(sch, config)
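
# A scalar sketch of the clip semantics in the docstring above: values
# below clip_value_min are raised to it and values above clip_value_max are
# lowered to it. The helper name is illustrative only.
def _clip_reference(t, vmin, vmax):
    """Scalar reference: min(max(t, vmin), vmax)."""
    return min(max(t, vmin), vmax)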
def eltwise(x, y, mode=1, coeff=[], kernel_name="eltwise"):
    """
    Compute elementwise modes, such as 0:PRODUCT, 1:SUM and 2:MAX

    Parameters
    ----------
    x : list
        the list of input data; each element is a dict:
        {"shape": [], "dtype": ""}
    y : dict
        the dict of output
    mode : int
        0: product, 1: sum, 2: max; default is 1: sum
    coeff : list
        input_num should be equal to the coeff size
    kernel_name : str
        cce kernel name, default value is "eltwise"

    Returns
    -------
    None
    """
    tensor_num = len(x)
    shapes = [item.get("shape") for item in x]
    shape0 = shapes[0]
    for i in range(1, tensor_num):
        if shapes[i] != shape0:
            error_info = {'errCode': "E81003",
                          'op_name': 'eltwise',
                          'shapes_list': str(shapes)}
            raise RuntimeError(error_info,
                               "In op[%s], the shapes[%s] of inputs should"
                               " be the same."
                               % (error_info['op_name'],
                                  error_info['shapes_list']))
    _eltwise_check_para(x, y, mode=mode, coeff=coeff,
                        kernel_name=kernel_name)
    shape = x[0].get("shape")
    dtype = x[0].get("dtype").lower()
    shape = util.shape_refine(shape)
    fuseshape = [1]
    fuseshape[0] = reduceIns(lambda x, y: x * y, shape)

    tlist = []
    is_l1_depth_fusion = False

    with tvm.target.cce():
        for i in range(0, tensor_num):
            datan_name = 'data%d' % i
            l1_fusion_type = x[i].get("L1_fusion_type", -1)
            if l1_fusion_type == 1:
                raise RuntimeError("eltwise does not support "
                                   "l1 width fusion")
            is_l1_depth_fusion = (l1_fusion_type == 0) or is_l1_depth_fusion
            addr_type = x[i].get("addr_type", 0)
            valid_shape = x[i].get("valid_shape", [])
            slice_offset = x[i].get("slice_offset", [])
            attr_x = {"addr_type": addr_type,
                      "valid_shape": valid_shape,
                      "slice_offset": slice_offset,
                      "L1_fusion_type": l1_fusion_type}
            datan_tmp = tvm.placeholder(fuseshape, name=datan_name,
                                        dtype=dtype, attrs=attr_x)
            tlist.append(datan_tmp)

        res = eltwise_compute(tlist, y, mode, coeff, kernel_name)
        sch = generic.auto_schedule(res)

    tlist.append(res)
    config = {"print_ir": False,
              "need_build": False,
              "name": kernel_name,
              "tensor_list": tlist,
              "l1_fusion_option": is_l1_depth_fusion}

    te.lang.cce.cce_build_code(sch, config)
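
# A per-element sketch of the three eltwise modes listed in the docstring;
# applying coeff as a per-input weight in SUM mode mirrors Caffe's Eltwise
# layer and is an assumption here. The helper name is illustrative only.
def _eltwise_reference(inputs, mode, coeff=None):
    """Scalar reference for mode 0 (product), 1 (sum), 2 (max)."""
    if mode == 0:
        out = 1.0
        for val in inputs:
            out *= val
        return out
    if mode == 2:
        return max(inputs)
    coeff = coeff or [1.0] * len(inputs)
    return sum(c * val for c, val in zip(coeff, inputs))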
def log_softmax_v2(input_x, output_y, axis=-1,
                   kernel_name="log_softmax_v2",
                   impl_mode="high_performance"):
    """
    algorithm: log_softmax
    calculating data's log_softmax, x - log(sum(exp(x)))

    Parameters
    ----------
    input_x : dict
        shape and dtype of input, only support float16, float32
    output_y: dict
        shape and dtype of output, should be same shape and type as input
    axis: int, list or tuple
        the data's axis, range is [-d, d-1]
    kernel_name : str
        cce kernel name, default value is log_softmax_v2

    Returns
    -------
    None
    """
    check_list = ("float16", "float32")
    shape = input_x.get("shape")
    input_dtype = input_x.get("dtype").lower()
    shape_len = len(shape)
    shape_list = list(shape)

    if not isinstance(axis, int):
        axis = list(axis)

    check_shape(shape, param_name="input_x")
    check_dtype(input_dtype, check_list, param_name="input_x")

    axis = util.axis_check(shape_len, axis)

    if not isinstance(axis, int):
        for i in axis:
            if shape_list[i] == 1:
                raise RuntimeError("Cannot reduce on an axis "
                                   "with dimension 1")
    else:
        if shape_list[axis] == 1:
            raise RuntimeError("Cannot reduce on an axis with dimension 1")

    shape, axis = util.shape_refine(list(shape), axis)
    shape, axis = util.simplify_axis_shape(shape, axis)

    data_input = tvm.placeholder(shape, name="data_input",
                                 dtype=input_dtype)
    result = log_softmax_v2_compute(data_input, output_y, axis=axis,
                                    kernel_name=kernel_name,
                                    impl_mode=impl_mode)

    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input, result]}
    te.lang.cce.cce_build_code(sch, config)
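
# A one-axis sketch of x - log(sum(exp(x))), with the usual max subtraction
# for numerical stability (an equivalent rewriting, not necessarily what
# log_softmax_v2_compute does internally). The helper name is illustrative.
def _log_softmax_reference(vec):
    """List reference for log_softmax along a single axis."""
    import math
    vmax = max(vec)
    lse = vmax + math.log(sum(math.exp(v - vmax) for v in vec))
    return [v - lse for v in vec]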
def reduce_min_d(input_min, output_min, axis,
                 keep_dims=None, kernel_name="reduce_min_d"):
    """
    Reduce a tensor on a certain axis based on min

    Parameters:
    ----------
    input_min: dict
        dict of input, which contains shape and dtype
    output_min: dict
        dict of output, which contains shape and dtype
    axis: int or None
        The dimensions to reduce. If None (the default), reduces all
        dimensions. Must be in the range
        (-rank(input_tensor), rank(input_tensor))
    keep_dims: True or False
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name: str
        cce kernel name, default value is "reduce_min_d"

    Returns
    -------
    None
    """
    shape_input = input_min.get("shape")
    dtype_input = input_min.get("dtype")

    check_shape(shape_input, param_name="input_min")
    check_list = ("float16", "float32", "int8", "uint8")
    check_dtype(dtype_input.lower(), check_list, param_name="input_min")

    shape_len = len(shape_input)

    if not axis:
        axis = range(shape_len)

    if hasattr(axis, 'index'):
        axis = list(axis)

    axis = util.axis_check(shape_len, axis)

    is_5hdc = util.check_and_init_5hdc_reduce_support(input_min, axis)
    if not is_5hdc:
        shape_input, axis = util.shape_refine(list(shape_input), axis)
        shape_input, axis = util.simplify_axis_shape(shape_input, axis)

    data_input = tvm.placeholder(shape_input,
                                 name="data_input_" + kernel_name,
                                 dtype=dtype_input.lower())
    shape_len = len(shape_input)
    if dtype_input.lower() in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        input_min["shape"] = tuple(shape_input)
        reduce_min_d_tik.reduce_min_d_tik(input_min, output_min, -1,
                                          kernel_name)
    else:
        res = reduce_min_d_compute(data_input, output_min, axis,
                                   keep_dims, kernel_name)
        if is_5hdc:
            res.ori_shape = input_min["ori_shape"]
            res.ori_format = input_min["ori_format"]
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {"name": kernel_name,
                  "tensor_list": [data_input, res]}
        te.lang.cce.cce_build_code(sch, config)
def reduce_any_d(x, y, axes, keepdims=None, kernel_name="reduce_any_d"):
    """
    Reduce a tensor on certain axes based on max

    Parameters:
    ----------
    x : shape and dtype of input_data, only support int8

    y : shape and dtype of output_res, reserved parameter, not used now

    axes : the first axes to reduce, may be negative to index from the end
           (e.g., -1 for the last axes). axes may be int or list
           (e.g. [1, 2])

    keepdims : if true, retains reduced dimensions with length 1,
               default value is None

    kernel_name : cce kernel name, default value is "reduce_any_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")

    check_shape(shape, param_name="x")

    if dtype == "bool":
        dtype = "int8"

    check_list = ("int8", )
    check_dtype(dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)

    axes = util.axis_check(shape_len, axes)

    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axes)
    if not is_5hdc:
        shape, axes = util.shape_refine(list(shape), axes)
        shape, axes = util.simplify_axis_shape(shape, axes)

    inp_dtype = dtype.lower()
    data_input = tvm.placeholder(shape, name="data_input_" + kernel_name,
                                 dtype=inp_dtype)
    res = reduce_any_d_compute(data_input, y, axes, keepdims, kernel_name)

    if is_5hdc:
        res.ori_shape = x["ori_shape"]
        res.ori_format = x["ori_format"]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name,
              "tensor_list": [data_input, res]}
    te.lang.cce.cce_build_code(sch, config)
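
# A sketch of the bool-as-int8 convention above: bool inputs are relabeled
# int8, any nonzero element counts as True, and reducing "based on max"
# over int8 matches Python's any(). The helper name is illustrative only.
def _reduce_any_reference(int8_vals):
    """Reference for a full reduction: 1 if any element is nonzero."""
    return 1 if any(v != 0 for v in int8_vals) else 0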
def custom_Power(shape, dtype, gamma, alpha, beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, computed as
    exp(gamma * log(alpha * data + beta)).
    When alpha * data + beta < 0, the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype,
            only support float16, float32

    gamma : the exponent in (alpha * data + beta) ** gamma;
            its data type must be the same as dtype

    alpha : the scale in (alpha * data + beta) ** gamma;
            its data type must be the same as dtype

    beta : the shift in (alpha * data + beta) ** gamma;
           its data type must be the same as dtype

    kernel_name : string
        kernel name in the generated CCE kernel,
        default value is "cce_caffe_power"

    need_build : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("power_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0

    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    # scale --> alpha, shift --> beta, power --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            padC0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
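
# A scalar sketch of the docstring formula for custom_Power,
# (alpha * x + beta) ** gamma computed as exp(gamma * log(alpha * x + beta));
# as noted above, the result is only meaningful for alpha * x + beta > 0.
# The helper name is illustrative only.
def _power_reference(x, alpha, beta, gamma):
    """Scalar reference; valid only when alpha * x + beta > 0."""
    import math
    return math.exp(gamma * math.log(alpha * x + beta))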
def reduce_all_d(input_data, output_data, axes,
                 keepdims=None, kernel_name="reduce_all_d"):
    """
    Reduce a tensor on certain axes based on min

    Parameters:
    ----------
    input_data: dict
        shape and dtype of input_data, only support int8
    output_data: dict
        source data type, only support int8
    axes : int, list, tuple or None
        the first axes to reduce, may be negative to index from the end
        (e.g., -1 for the last axes). axes may be int or list (e.g. [1, 2])
    keepdims : bool or None
        if true, retains reduced dimensions with length 1,
        default value is None
    kernel_name : str
        cce kernel name, default value is "reduce_all_d"

    Returns
    -------
    None
    """
    input_shape = input_data.get("shape")
    input_dtype = input_data.get("dtype").lower()
    if input_dtype == "bool":
        input_dtype = "int8"

    check_shape(input_shape, param_name="input_data")
    check_dtype(input_dtype, ("int8", ), param_name="input_data")

    shape_len = len(input_shape)

    if not axes:
        axes = range(shape_len)

    if hasattr(axes, 'index'):
        axes = list(axes)

    axes = util.axis_check(shape_len, axes)

    if not isinstance(axes, int):
        for i in axes:
            if i >= len(input_shape):
                raise RuntimeError("axes should be less than dimension")
    else:
        if axes >= len(input_shape):
            raise RuntimeError("axes should be less than dimension")

    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(input_data, axes)
    if not is_5hdc:
        input_shape, axes = util.shape_refine(list(input_shape), axes)
        input_shape, axes = util.simplify_axis_shape(input_shape, axes)

    data_input = tvm.placeholder(input_shape,
                                 name="data_input_" + kernel_name,
                                 dtype=input_dtype)
    result = reduce_all_d_compute(data_input, output_data, axes,
                                  keepdims, kernel_name)
    if is_5hdc:
        result.ori_shape = input_data["ori_shape"]
        result.ori_format = input_data["ori_format"]
    with tvm.target.cce():
        sch = generic.auto_schedule(result)

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": [data_input, result]}
    te.lang.cce.cce_build_code(sch, config)
def custom_exp(shape, dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp
    calculating data's exp, y = e ** x

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype,
            only support float16, float32

    kernel_name : cce kernel name, default value is "cce_tf_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s"
                           % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0

    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def reduce_max_d(x, y, axis, keepdims=False, kernel_name="reduce_max_d"):
    """
    calculating data

    Parameters
    ----------
    x : dict
        shape and dtype of input
    y : dict
        shape and dtype of output, should be same shape and type as input
    axis: list
        the first axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). axis may be int or list (e.g. [1, 2])
    keepdims: bool
        if true, retains reduced dimensions with length 1,
        default value is False
    kernel_name : str
        kernel name, default value is "reduce_max_d"

    Returns
    -------
    None
    """
    shape = x.get("shape")
    dtype = x.get("dtype")
    input_dtype = dtype.lower()

    check_shape(shape, param_name="x")
    check_list = ["float16", "float32", "int8", "uint8", "int32"]
    check_dtype(input_dtype, check_list, param_name="x")

    shape_len = len(shape)

    if not axis:
        axis = range(shape_len)

    if hasattr(axis, 'index'):
        axis = list(axis)

    axis = util.axis_check(shape_len, axis)

    # Shape should not be modified in 5HD mode
    # 5HD Special param for 5hd schedule
    is_5hdc = util.check_and_init_5hdc_reduce_support(x, axis)
    if not is_5hdc:
        shape, axis = util.shape_refine(list(shape), axis)
        shape, axis = util.simplify_axis_shape(shape, axis)

    shape_len = len(shape)
    x["shape"] = shape
    if input_dtype in ("float32", "int32") and len(axis) == 1 \
            and ((axis[0] == (shape_len - 1)) or (axis[0] == -1)):
        reduce_max_d_tik(x, y, axis[0], kernel_name)
    else:
        data_input = tvm.placeholder(shape,
                                     name="data_input_" + kernel_name,
                                     dtype=input_dtype)
        res = reduce_max_d_compute(data_input, y, axis, keepdims,
                                   kernel_name)

        if is_5hdc:
            res.ori_shape = x["ori_shape"]
            res.ori_format = x["ori_format"]

        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {"name": kernel_name,
                  "tensor_list": [data_input, res]}
        te.lang.cce.cce_build_code(sch, config)