def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal",
                 need_build=False, need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters
    ----------
    shape_x : shape of input x
    shape_y : shape of input y
    dtype : source data type, support float16, float32, int32, int8, uint8, bool
    kernel_name : cce kernel name, default value is "cce_tf_equal"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]
    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    # broadcast both inputs to the common shape before the element-wise compare
    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
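# Usage sketch (hypothetical, not part of the original op file): lowering the
# element-wise equal kernel for two broadcastable shapes. It assumes the
# TE/TVM CCE toolchain and the module-level names used above (tvm, te, util,
# build_config, SHAPE_SIZE_LIMIT) are available; the shapes, dtype and kernel
# name are illustrative only.
def _example_custom_equal():
    custom_equal((4, 1, 16), (4, 8, 16), "float16",
                 kernel_name="cce_tf_equal_demo",
                 need_build=False, need_print=True)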
def custom_logical_not(shape, dtype, kernel_name="cce_tf_logical_not",
                       need_build=False, need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data
    dtype : the data type, support bool
    kernel_name : cce kernel name, default value is "cce_tf_logical_not"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        result = tvm.compute(
            shape,
            lambda *i: tvm.select(data[i] is True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
def custom_Exp(shape, dtype, gamma, alpha, beta, kernel_name="cce_exp",
               need_build=False, need_print=False):
    """
    calculate gamma ** (alpha * data + beta), i.e.
    exp(log(gamma) * alpha * data) * (gamma ** beta)

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    gamma : must have the same type as dtype; the base in
        gamma ** (alpha * data + beta)
    alpha : must have the same type as dtype; the scale in
        gamma ** (alpha * data + beta)
    beta : must have the same type as dtype; the shift in
        gamma ** (alpha * data + beta)
    kernel_name : cce kernel name, default value is "cce_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # the api cc_device_exp_c handles gamma == -1 as e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s"
            % str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale --> alpha, shift --> beta, base --> gamma
    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
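# Usage sketch (hypothetical): with gamma = -1 the device api treats the base
# as e, so this lowers a plain exp(x) kernel. Shapes, dtype and kernel name
# are illustrative; the CCE toolchain and module-level imports are assumed.
def _example_custom_Exp():
    custom_Exp((8, 16), "float16", gamma=-1, alpha=1.0, beta=0.0,
               kernel_name="cce_exp_demo", need_build=False, need_print=True)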
def custom_truncatemod(shape1, shape2, dtype,
                       kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters
    ----------
    shape1 : shape of input data1
    shape2 : shape of input data2
    dtype : source data type, support float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only support up to %d dimensions while the shape's "
            "dimensions are %d, %d" % (max_dim, shape1_len, shape2_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)
    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(
        shape_out,
        [p_xshape, data_input_x, p_yshape, data_input_y, p_out_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_xndim_cnt,
            ins[0].access_ptr("r"),  # shape x
            xpad_c0,
            ins[1].access_ptr("r"),  # input x
            v_yndim_cnt,
            ins[2].access_ptr("r"),  # shape y
            ypad_c0,
            ins[3].access_ptr("r"),  # input y
            v_out_ndim_cnt,
            ins[4].access_ptr("r"),  # shape out
            out_padc0,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
    # compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
def custom_round(shape, dtype, kernel_name="cce_round",
                 need_build=False, need_print=False):
    """
    doing the round operation; the calculating data type is float16, float32
    or int32

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype
    kernel_name : cce kernel name, default value is "cce_round"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    max_dim = 8
    shape_len = len(shape)
    if shape_len > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's "
            "dimension is %d" % (max_dim, shape_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "round_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    device_api = device_api_map[inp_dtype]
    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    padC0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape, [data_input, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_ndim,
            ins[1].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def custom_pow(shape, shape_y, dtype, kernel_name="cce_tf_pow",
               need_build=False, need_print=False):
    """
    calculate x^y; the calculating data type is float16, float32 or int32.
    When x < 0, the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data
    shape_y : shape of the second input y; the implementation uses the same
        shape for both inputs
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_pow"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError(
            "tf_pow_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape, [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_lhs, data_rhs, output],
                            simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output], "cce",
                      name=kernel_name)
def custom_Upsample(shape, dtype, scale, data_format="channels_last",
                    kernel_name="cce_darknet_upsample",
                    need_build=False, need_print=False):
    """
    nearest-neighbour upsampling of the input tensor by an integer factor

    Parameters
    ----------
    shape : input tensor's shape
    dtype : input tensor's dtype, support float16, float32, int32, int8, uint8
    scale : the upsampling factor
    data_format : "channels_last" or "channels_first"
    kernel_name : kernel name, default value is "cce_darknet_upsample"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    inp_dtype = dtype.lower()
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if inp_dtype not in check_list:
        raise RuntimeError(
            "upsample only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    size = (scale, scale)
    shape_size = len(shape)
    if not (shape_size == 4 or shape_size == 5):
        raise RuntimeError(
            "upsample only support 4D or 5D while len(shape):%d" % len(shape))

    input_tensor = tvm.placeholder(shape, name="input_tensor",
                                   dtype=inp_dtype)

    res = None
    if shape_size == 5:
        # shape_size == 5, 5-D special format (N, C1, H, W, C0)
        output_shape = (shape[0], shape[1], shape[2] * size[0],
                        shape[3] * size[1], shape[4])
        res = tvm.compute(
            output_shape,
            lambda n, c0, h, w, c:
            input_tensor[n, c0, h // size[0], w // size[1], c])
    else:
        if data_format == "channels_last":
            output_shape = (shape[0], shape[1] * size[0],
                            shape[2] * size[1], shape[3])
            res = tvm.compute(
                output_shape,
                lambda n, h, w, c:
                input_tensor[n, h // size[0], w // size[1], c])
        elif data_format == "channels_first":
            output_shape = (shape[0], shape[1], shape[2] * size[0],
                            shape[3] * size[1])
            res = tvm.compute(
                output_shape,
                lambda n, c, h, w:
                input_tensor[n, c, h // size[0], w // size[1]])
        else:
            raise RuntimeError(
                "upsample only support channels_last|channels_first "
                "while input type %s" % data_format)

    schedule = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [input_tensor, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [input_tensor, res], "cce", name=kernel_name)
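# Usage sketch (hypothetical): a 4-D NHWC tensor upsampled by a factor of 2
# in both spatial dimensions. Shape, dtype and kernel name are illustrative;
# the CCE toolchain and module-level imports are assumed.
def _example_custom_Upsample():
    custom_Upsample((1, 32, 32, 16), "float16", 2,
                    data_format="channels_last",
                    kernel_name="cce_darknet_upsample_demo",
                    need_build=False, need_print=True)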
def custom_expm1(shape, dtype, kernel_name="cce_tf_expm1",
                 need_build=False, need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1; dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32.
    kernel_name : cce kernel name, default value is "cce_tf_expm1".
    need_build : if need to build CCEC kernel, default value is False.
    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None
    """
    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx,
    #     int32_t dataType, const void *scale, const void *shift,
    #     const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0,
    #     const void *x, void *y);
    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError(
            "tf_expm1_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = exp ** x by aicpu api
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp",
        dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = exp ** x - 1 by tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
def SpatialTransformer(input_shape, out_shape, dtype="float32",
                       kernel_name="SpatialTransformer",
                       need_build=True, need_print=False):
    """Spatial Transformer Layer

    Implements a spatial transformer layer as described in [1]_.
    Based on [2]_.

    Parameters
    ----------
    input_shape : the shape of input tensor
        [num_batch, height, width, num_channels]
    out_shape : tuple of two ints, the height and width of the output tensor
        (out_height, out_width)
    dtype : data type
    kernel_name : kernel name, default value is "SpatialTransformer"
    need_build : if need to build CCEC kernel, default value is True
    need_print : if need to print the ir, default value is False

    Returns
    -------
    tvm.Tensor

    References
    ----------
    .. [1] Spatial Transformer Networks
           Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu
    .. [2] https://github.com/tensorflow/models/tree/master/research/transformer
    """

    def _meshgrid(height, width):
        # normalized sampling grid in [-1, 1] along both axes
        y0 = tvm.compute((height,), lambda i: -1 + i * 2.0 / (height - 1),
                         name='y0')
        x0 = tvm.compute((width,), lambda i: -1 + i * 2.0 / (width - 1),
                         name='x0')

        y = tvm.compute((height * width,), lambda i: y0[i // width], name='y')
        x = tvm.compute((height * width,), lambda i: x0[i % width], name='x')

        y = topi.reshape(y, (1, height * width))
        x = topi.reshape(x, (1, height * width))
        ones = tvm.compute((1, height * width), lambda i, j: 1, name='ones')

        # grid = topi.concatenate((x, y, ones), 0); topi.concatenate can not
        # be used here, so the rows are selected with a quadratic in i
        grid = tvm.compute(
            (3, height * width),
            lambda i, j: 0.5 * (i - 1) * (i - 2) * x[0, j]
                         + i * (2 - i) * y[0, j]
                         + 0.5 * i * (i - 1) * ones[0, j],
            name='grid')
        return grid

    def _interpolate(im, im_shape, x, y, out_size, dtype):
        num_batch = im_shape[0]
        height = im_shape[1]
        width = im_shape[2]
        channels = im_shape[3]

        out_height = out_size[0]
        out_width = out_size[1]
        max_y = int(im_shape[1] - 1)
        max_x = int(im_shape[2] - 1)

        # [-1, 1] -> [0, width - 1]
        x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)),
                          width / tvm.const(2, dtype=dtype))
        y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)),
                          height / tvm.const(2, dtype=dtype))

        # do sampling
        dim3 = out_height * out_width * num_batch

        x0 = topi.cast(topi.floor(x), 'int32')
        y0 = topi.cast(topi.floor(y), 'int32')
        x1 = topi.add(x0, tvm.const(1, dtype="int32"))
        y1 = topi.add(y0, tvm.const(1, dtype="int32"))

        x0 = topi.clip(x0, 0, max_x)
        x1 = topi.clip(x1, 0, max_x)
        y0 = topi.clip(y0, 0, max_y)
        y1 = topi.clip(y1, 0, max_y)

        dim2 = width
        dim1 = width * height

        base = tvm.compute(
            (dim3,),
            lambda i: (i // (out_height * out_width)) * width * height,
            name='base')

        base_y0 = topi.add(base, topi.multiply(y0, dim2))
        base_y1 = topi.add(base, topi.multiply(y1, dim2))

        idx_a = topi.add(base_y0, x0)
        idx_b = topi.add(base_y1, x0)
        idx_c = topi.add(base_y0, x1)
        idx_d = topi.add(base_y1, x1)

        im_flat = topi.reshape(im, (num_batch * height * width, channels))
        im_flat = topi.cast(im_flat, dtype)

        Ia = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_a[i], j],
                         name='Ia')
        Ib = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_b[i], j],
                         name='Ib')
        Ic = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_c[i], j],
                         name='Ic')
        Id = tvm.compute((dim3, channels), lambda i, j: im_flat[idx_d[i], j],
                         name='Id')

        x0_f = topi.cast(x0, dtype)
        x1_f = topi.cast(x1, dtype)
        y0_f = topi.cast(y0, dtype)
        y1_f = topi.cast(y1, dtype)

        # bilinear interpolation weights of the four neighbouring pixels
        wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x),
                                            topi.subtract(y1_f, y)), 1)
        wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x),
                                            topi.subtract(y, y0_f)), 1)
        wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f),
                                            topi.subtract(y1_f, y)), 1)
        wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f),
                                            topi.subtract(y, y0_f)), 1)

        output = topi.add(
            topi.add(topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),
                     topi.multiply(wc, Ic)),
            topi.multiply(wd, Id))
        return output

    def _transform(theta, input_dim, out_size, input_shape, dtype):
        num_batch = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        num_channels = input_shape[3]

        theta = topi.reshape(theta, (num_batch, 2, 3))
        theta = topi.cast(theta, dtype)

        out_height = out_size[0]
        out_width = out_size[1]
        grid = _meshgrid(out_height, out_width)
        grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
        grid = topi.cast(grid, dtype=dtype)

        k = tvm.reduce_axis((0, 3), 'k')
        T_g = tvm.compute(
            (num_batch, 2, out_height * out_width),
            lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis=k),
            name='T_g')

        x_s = tvm.compute((num_batch, 1, out_height * out_width),
                          lambda i, j, k: T_g[i, 0, k], name='x_s')
        y_s = tvm.compute((num_batch, 1, out_height * out_width),
                          lambda i, j, k: T_g[i, 1, k], name='y_s')

        x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
        y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

        input_transformed = _interpolate(input_dim, input_shape,
                                         x_s_flat, y_s_flat, out_size, dtype)
        output = topi.reshape(input_transformed,
                              [num_batch, out_height, out_width, num_channels])
        return output

    num_batch = input_shape[0]
    input_height = input_shape[1]
    input_width = input_shape[2]
    channel = input_shape[3]

    U = tvm.placeholder((num_batch, input_height, input_width, channel),
                        name="U", dtype=dtype)
    theta = tvm.placeholder((num_batch, 6, 1, 1), dtype=dtype)

    output = _transform(theta, U, out_shape, input_shape, dtype)
    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [U, theta, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [U, theta, output], "cce", name=kernel_name)
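# Usage sketch (hypothetical): a single-batch NHWC feature map warped to a
# 32x32 output grid. The grid reshape above only matches element counts when
# num_batch == 1, so a batch of 1 is used here. Shapes and dtype are
# illustrative; the CCE toolchain and module-level imports are assumed.
def _example_SpatialTransformer():
    SpatialTransformer((1, 16, 16, 4), (32, 32), dtype="float32",
                       kernel_name="SpatialTransformer_demo",
                       need_build=False, need_print=True)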
def custom_batch_matmul(shape_x, shape_y, dtype, trans_a=False, trans_b=False,
                        kernel_name="cce_tf_batch_matmul",
                        need_build=False, need_print=False):
    """
    Multiplies slices of two tensors in batches (each slice can be viewed as
    an element of a batch), and the output has the same batch size.

    Each of the individual slices can optionally be transposed before
    multiplication by setting the trans_a or trans_b flag to True, both of
    which default to False. The input tensors are 2-D or higher with the
    shape [..., r_x, c_x] and [..., r_y, c_y].

    The output tensor is 2-D or higher with the shape [..., r_o, c_o], where
        r_o = c_x if trans_a else r_x
        c_o = r_y if trans_b else c_y

    Parameters
    ----------
    shape_x : shape of the first tensor x with rank > 1
    shape_y : shape of the second tensor y with the same type and shape with x
    dtype : the data type, support int8, uint8, float16, float32, int32
    kernel_name : cce kernel name, default value is "cce_tf_batch_matmul"
    trans_a : if True, shape_x is transposed before multiplication
    trans_b : if True, shape_y is transposed before multiplication
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    data_dtype = dtype.lower()
    check_list = ["int8", "uint8", "float16", "float32", "int32"]
    if data_dtype not in check_list:
        raise RuntimeError(
            "batch_matmul_cce only supports %s while dtype is %s"
            % (",".join(check_list), dtype))

    def transpose_tensor(shape, size):
        """Transpose the shape, e.g., the shape [..., r_x, c_x] is transposed
        to [..., c_x, r_x].

        Parameters
        ----------
        shape : shape of a tensor
        size : length of the shape

        Returns
        -------
        shape_ori : the transposed shape
        """
        shape_ori = ()
        if size == 1:
            shape_ori = shape_ori + shape
        elif size == 2:
            shape_ori = shape_ori + (shape[1], ) + (shape[0], )
        else:
            shape_ori = shape_ori + (shape[:(size - 2)]) + (
                shape[size - 1], ) + (shape[size - 2], )
        return shape_ori

    def check_matmul(shape_x, shape_y):
        """Check whether batch_matmul is supported or not.

        Parameters
        ----------
        shape_x : shape of the first tensor x
        shape_y : shape of the second tensor y with the same type and shape
            with x

        Returns
        -------
        None
        """
        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")
        if shape_x[len_x - 1] != shape_y[len_y - 2]:
            raise RuntimeError(
                "Invalid matrix multiplication for the inner 2 dimensions!")
        if (len_x == len_y) and (len_x > 2):
            for i in range(len_x - 2):
                if shape_x[i] != shape_y[i]:
                    raise RuntimeError("Outer dimensions do not match!")
            return
        elif (len_x == len_y) and (len_x == 2):
            return
        else:
            raise RuntimeError("The input tensors are not with the same rank!")

    def _compute(output_shape, x, y, K, trans_a, trans_b, *indices):
        """matmul computation in terms of the output shape and the transposes

        Parameters
        ----------
        output_shape : the final output shape, e.g., shape_x = (2, 6),
            shape_y = (8, 2), trans_a = True, trans_b = True, then
            output_shape = (6, 8).
        x : the first input tensor according to shape_x.
        y : the second input tensor according to shape_y.
        K : the number of the axis for sum, in the above example, K = 2.
        trans_a : if True, x needs to be transposed.
        trans_b : if True, y needs to be transposed.
        *indices : the output shape space for tvm.compute.

        Returns
        -------
        tvm.Tensor
        """
        n_len = len(output_shape)
        k = tvm.reduce_axis((0, K), 'k')
        if trans_a is True and trans_b is False:
            # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is n = 3
            # C = A' * B : (6, 8, 9), A' means the transpose of A
            # indices means the space of (6, 8, 9), k = 7
            # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8)
            # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9)
            x_indices = indices[:(n_len - 2)] + (k, ) + \
                indices[(n_len - 2):(n_len - 1)]
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif not trans_a and trans_b:
            # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9)
            # indices means the space of (6, 7, 9), n=3, k = 8
            # x_indices = indices[:2]+(8, ) = (6, 7, 8)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif trans_a and trans_b:
            # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' :
            # (6, 10, 12)
            # indices means the space of (6, 10, 12), n=3, k = 8
            # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8)
            x_indices = indices[:(n_len - 2)] + (k, ) + \
                indices[(n_len - 2):(n_len - 1)]
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        else:
            # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B :
            # (6, 15, 18)
            # indices means the space of (6, 15, 18), n=3, k = 16
            # x_indices = indices[:2]+(16, ) = (6, 15, 16)
            # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)

    def check_supportted_shape_size(shape_x, shape_y, limit, trans_a, trans_b):
        """check the shape size for the operator

        Parameters
        ----------
        shape_x, shape_y : shapes of the input data
        limit : limit of the product

        Returns
        -------
        None
        """
        # This function is used to check whether the shape is too large to
        # cause a timeout.
        # shape_x = (a,b,c,d,e,k)  shape_y = (a,b,c,d,k,f)
        # t_1 : time consumed by each addition operation
        # t_2 : time consumed by each multiplication operation
        # t_all : time consumed by a complete calculation
        # t_all is approximately equal to (a*b*c*d)*(e*k*f)*(t_1+t_2)
        # As (t_1 + t_2) is a constant, t_all is proportional to
        # (a * b * c * d * e * k * f)
        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")

        shape_x = list(shape_x)
        shape_y = list(shape_y)
        tmp_shape_x = shape_x[:]
        if trans_a:
            tmp_shape_x = shape_x[:-2] + [shape_x[-1], shape_x[-2]]

        tmp_shape_y = shape_y[:]
        if trans_b:
            tmp_shape_y = shape_y[:-2] + [shape_y[-1], shape_y[-2]]

        union_shape = tmp_shape_x + [tmp_shape_y[-1]]
        union_size = reduce(lambda i, j: i * j, union_shape)

        if union_size > limit:
            raise RuntimeError("the shape is too large to calculate")

    if data_dtype in ["float16", "float32", "int32"]:
        type_shape_map = {
            'float16': SHAPE_SIZE_FP16_LIMIT,
            'float32': SHAPE_SIZE_FP32_LIMIT,
            'int32': SHAPE_SIZE_INT32_LIMIT
        }
        check_supportted_shape_size(shape_x, shape_y,
                                    type_shape_map[data_dtype],
                                    trans_a, trans_b)

    x_size = len(shape_x)
    y_size = len(shape_y)
    shape_a = shape_x
    shape_b = shape_y
    if trans_a is True:
        shape_x = transpose_tensor(shape_x, x_size)
    if trans_b is True:
        shape_y = transpose_tensor(shape_y, y_size)

    check_matmul(shape_x, shape_y)
    last_axis = shape_x[x_size - 1]

    x_temp = tvm.placeholder(shape_a, name="input_1", dtype=data_dtype)
    y_temp = tvm.placeholder(shape_b, name="input_2", dtype=data_dtype)

    # output shape
    output_shape = ()
    for i in range(x_size - 1):
        output_shape = output_shape + (shape_x[i], )
    output_shape = output_shape + (shape_y[x_size - 1], )

    result = tvm.compute(
        output_shape,
        lambda *indices: _compute(output_shape, x_temp, y_temp, last_axis,
                                  trans_a, trans_b, *indices),
        name="result")
    schedule = tvm.create_schedule(result.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [x_temp, y_temp, result],
                            simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [x_temp, y_temp, result], "cce",
                      name=kernel_name)
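# Usage sketch (hypothetical): a batched matmul of (2, 3, 4) x (2, 4, 5)
# without transposes, giving a (2, 3, 5) result. Shapes and dtype are
# illustrative; the CCE toolchain and module-level imports (including the
# SHAPE_SIZE_*_LIMIT constants) are assumed.
def _example_custom_batch_matmul():
    custom_batch_matmul((2, 3, 4), (2, 4, 5), "float16",
                        trans_a=False, trans_b=False,
                        kernel_name="cce_tf_batch_matmul_demo",
                        need_build=False, need_print=True)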
def custom_Reduction(shape, dtype, axis, op, coeff,
                     kernel_name="cce_reductionLayer",
                     need_build=False, need_print=False):
    """
    Reduce a tensor on a certain axis, and scale output with coeff

    Parameters
    ----------
    shape : shape of data
    dtype : source data type, only support float16, float32, int8, uint8
    axis : the first axis to reduce, may be negative to index from the end
        (e.g., -1 for the last axis). If axis == 0, the output Blob always
        has the empty shape (count 1), performing reduction across the
        entire input.
    op : can only be one of "SUM, ASUM (sum of abs), SUMSQ (sum of sqr), MEAN"
    coeff : scale for output
    kernel_name : cce kernel name, default value is "cce_reductionLayer"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["float16", "float32", "int8", "uint8"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "reductionLayer_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    reduction_op = ("SUM", "ASUM", "SUMSQ", "MEAN")

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if op not in reduction_op:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ, MEAN")
    if not isinstance(coeff, int) and not isinstance(coeff, float):
        raise RuntimeError("coeff must be a value")

    axis_origin = axis
    shape_origin = shape
    axis = util.axis_check(len(shape), axis)
    util.check_reduce_shape_rule(shape)

    shape = list(shape)
    # collapse all axes from `axis` onwards so the reduction is over one axis
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape1, name="data_input", dtype=inp_dtype)

    with tvm.target.cce():
        res = caffe_reduction_layer_compute([data], shape_origin, dtype,
                                            axis_origin, op, coeff,
                                            kernel_name, need_build,
                                            need_print)

    if op == "MEAN" and (inp_dtype == "int8" or inp_dtype == "uint8"):
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        res = te.lang.cce.cast_to(res, inp_dtype)
        schedule = tvm.create_schedule(res.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, res], "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            sch = generic.auto_schedule(res)
        config = {
            "print_ir": need_print,
            "need_build": need_build,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
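# Usage sketch (hypothetical): sum-reduce everything from axis 1 onwards and
# keep the output unscaled (coeff = 1.0). Shape, dtype and kernel name are
# illustrative; the CCE toolchain and module-level imports are assumed.
def _example_custom_Reduction():
    custom_Reduction((4, 8, 16), "float16", axis=1, op="SUM", coeff=1.0,
                     kernel_name="cce_reductionLayer_demo",
                     need_build=False, need_print=True)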
def custom_logical_and(shape_x, shape_y, dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False, need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters
    ----------
    shape_x : shape of input data1
    shape_y : shape of input data2
    dtype : source data type, support "bool"
    kernel_name : cce kernel name, default value is "cce_tf_logical_and"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "logical_and_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)

        min_value = tvm.const(0, dtype=inp_dtype)
        # an element counts as True when it is non-zero; the result is True
        # only when both inputs are non-zero
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(
                        data1_tmp1(*i) > min_value,
                        data1_tmp1(*i) < -min_value),
                    tvm.any(
                        data1_tmp2(*i) > min_value,
                        data1_tmp2(*i) < -min_value)),
                True, False),
            name="res")

        sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [data1, data2, res], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)
def custom_Power(shape, dtype, gamma, alpha, beta,
                 kernel_name="cce_caffe_power",
                 need_build=False, need_print=False):
    """
    calculate (alpha * data + beta) ** gamma; the calculation method is
    exp(gamma * log(alpha * data + beta)). When alpha * data + beta < 0,
    the output is a meaningless value.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    gamma : the data type must be same with dtype parameter;
        the exponent in (alpha * data + beta) ** gamma
    alpha : the data type must be same with dtype parameter;
        the scale in (alpha * data + beta) ** gamma
    beta : the data type must be same with dtype parameter;
        the shift in (alpha * data + beta) ** gamma
    kernel_name : string
        kernel name in generated CCE kernel, default value is
        "cce_caffe_power"
    need_build : bool
        if need to build CCEC kernel
    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError(
            "power_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0

    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    # scale --> alpha, shift --> beta, power --> gamma
    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            padC0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
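# Usage sketch (hypothetical): with alpha = 1, beta = 0 and gamma = 2 this
# lowers a kernel for x ** 2. Shape, dtype and kernel name are illustrative;
# the CCE toolchain and module-level imports are assumed.
def _example_custom_Power():
    custom_Power((8, 16), "float16", gamma=2.0, alpha=1.0, beta=0.0,
                 kernel_name="cce_caffe_power_demo",
                 need_build=False, need_print=True)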
def custom_tile(shape, multiples, dtype, kernel_name="cce_tile",
                need_build=False, need_print=False):
    """Operation and Schedule for tile: construct an array by repeating the
    input the number of times given by multiples.

    Parameters
    ----------
    shape : shape of the input tensor
    multiples : number of repetitions of the input along each axis
    dtype : the data type, only support float16, float32, int32, int8, uint8
    kernel_name : cce kernel name, default value is "cce_tile"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "tile_cce only support %s while dtype is %s"
            % (",".join(check_list), dtype))

    tensor_l = []
    inp_dtype = dtype.lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    tensor_l.append(tvm.placeholder(shape, name="shape", dtype=inp_dtype))

    for i in range(len(multiples)):
        if not isinstance(multiples[i], int):
            raise RuntimeError("InvalidArgumentError: Expected int value")
        if multiples[i] < 0:
            raise RuntimeError(
                "InvalidArgumentError: Expected multiples[%d] >= 0, "
                "but got %d!" % (i, multiples[i]))

    tensor_l.append(
        tvm.placeholder(multiples, name="multiples", dtype=inp_dtype))

    out_tensor = compute_tile_cce(a_tuple=tensor_l)
    s = schedule_tile_cce(out_tensor)

    if need_print:
        with build_config:
            print(tvm.lower(s, [tensor_l[0], tensor_l[1], out_tensor],
                            simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, tensor_l + [out_tensor], "cce", name=kernel_name)
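# Usage sketch (hypothetical): tile a (2, 3) tensor twice along each axis.
# Shape, multiples, dtype and kernel name are illustrative; the CCE toolchain
# and the compute_tile_cce/schedule_tile_cce helpers referenced above are
# assumed to be importable.
def _example_custom_tile():
    custom_tile((2, 3), (2, 2), "float16", kernel_name="cce_tile_demo",
                need_build=False, need_print=True)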
def custom_exp(shape, dtype, kernel_name="cce_tf_exp",
               need_build=False, need_print=False):
    """
    algorithm: exp

    calculating data's exp, y = e ** x; dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data
    dtype : the data type, assume src_dtype equals dst_dtype, only support
        float16, float32
    kernel_name : cce kernel name, default value is "cce_tf_exp"
    need_build : if need to build CCEC kernel, default value is False
    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError(
            "tf_exp_cce only support %s while dtype is %s"
            % (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale = 1, shift = 0, base = -1 (the device api treats base == -1 as e)
    output = tvm.extern(
        shape, [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)