def _im2col_fractal_indices(indices, A):
    """
    calculate im2col_fractal tvm lambda function

    Parameters
    ----------
    indices : indices in lambda function

    A : feature map

    Returns
    -------
    im2col_fractal tvm lambda function
    """
    block_size = config['mac'][1]
    block_size_M = config['mac'][0]
    n, hw, c1, kernel_h, kernel_w, c0 = A.shape
    batch_size, i1, j1, i0, j0 = indices

    n_index = batch_size
    hw_index = i1 * block_size_M + i0
    c1_index = (((j1 * block_size + j0) // c0.value) //
                kernel_w.value) // kernel_h.value
    kh_index = (((j1 * block_size + j0) // c0.value) //
                kernel_w.value) % kernel_h.value
    kw_index = ((j1 * block_size + j0) // c0.value) % kernel_w.value
    c0_index = (j1 * block_size + j0) % c0.value
    dtype = compute_dtype

    return tvm.select(
        tvm.any(hw_index < 0, hw_index > hw.value - 1),
        tvm.const(0.0, dtype),
        A(n_index, hw_index, c1_index, kh_index, kw_index, c0_index))
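
# Illustrative sketch (not part of the operator above): the index arithmetic
# unflattens a fractal column index j = j1 * block_size + j0 into
# (c1, kh, kw, c0) of a row-major [C1, KH, KW, C0] layout. A plain-Python
# round-trip check with made-up sizes:
def _demo_fractal_index_split(kernel_h=3, kernel_w=3, c0=16, block_size=16):
    for flat in range(kernel_h * kernel_w * c0 * 2):
        c1_index = ((flat // c0) // kernel_w) // kernel_h
        kh_index = ((flat // c0) // kernel_w) % kernel_h
        kw_index = (flat // c0) % kernel_w
        c0_index = flat % c0
        # rebuilding the flat index from the split pieces must round-trip
        rebuilt = ((c1_index * kernel_h + kh_index) * kernel_w
                   + kw_index) * c0 + c0_index
        assert rebuilt == flat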
def lambda_func(*indice):
    new_indice = [0] * 5
    if tensor_flag:
        new_indice[4] = indice[c0_index]
        new_indice[1] = indice[c1_index]

    if tensor_flag:
        return tvm.select(
            indice[c1_index] < x_shape_list[c1_index],
            tvm.vdeq_cast(x(*indice), req_scale(*new_indice),
                          "int8", do_relu=relu_flag),
            tvm.const(0, dtype="int8"))

    return tvm.select(
        indice[c1_index] < x_shape_list[c1_index],
        tvm.deq_cast(x(*indice), req_scale(*new_indice), "int8"),
        tvm.const(0, dtype="int8"))
def _input_compute_generate(x, in_shape, read_shape, c1_dim, c1_index):
    """
    generate lambda func
    """
    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype
    x_slice_offset = _get_input_attr(x, "slice_offset", [], True)
    l1_fusion_flag = _get_input_attr(x, "l1_fusion_flag", -1, False)
    if not x_slice_offset:
        x_slice_offset = [0, 0, 0, 0, 0]

    if l1_fusion_flag != -1:
        x_w = x_shape[3]
        n_offset, _, h_offset, w_offset, _ = x_slice_offset
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(
                in_shape,
                lambda n, c1, m, c0: x(n + n_offset, c1,
                                       (m // x_w) + h_offset,
                                       (m % x_w) + w_offset, c0),
                name="input_ub", attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda n, c1, m, c0: tvm.select(
                    c1 <= in_shape[c1_index] - 1,
                    x(n + n_offset, c1,
                      (m // x_w) + h_offset,
                      (m % x_w) + w_offset, c0),
                    tvm.const(0, dtype=dtype)),
                name='input_ub', attrs={"c_out": c1_dim})
    else:
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(in_shape, lambda *i: x(*i),
                                   name="input_ub",
                                   attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda *indice: tvm.select(
                    indice[c1_index] <= in_shape[c1_index] - 1,
                    x(*indice),
                    tvm.const(0, dtype=dtype)),
                name='input_ub', attrs={"c_out": c1_dim})
    return input_ub
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_tf_logical_not"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "logical_not_cce only supports %s while dtype is %s" % (
                ",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():
        # element-wise invert: output False where the element is True,
        # otherwise True
        result = tvm.compute(
            shape,
            lambda *i: tvm.select(data(*i) == True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
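
# Illustrative sketch (not part of the kernel above): the select is the
# element-wise equivalent of a boolean NOT. NumPy analogue, assuming a
# boolean array stands in for the "bool" placeholder:
import numpy as np

def _demo_logical_not(data):
    # same truth table as ~data
    return np.where(data, False, True)

# _demo_logical_not(np.array([True, False])) -> array([False,  True])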
def lambda_func(*indice):
    new_indice = [0] * 5
    if tensor_flag:
        new_indice[4] = indice[c0_index]
        new_indice[1] = indice[c1_index]

    return tvm.select(
        indice[c1_index] < x_shape_list[c1_index],
        tvm.conv_vdeq(res_s16(*indice),
                      req_scale(*new_indice)).astype("int8"),
        tvm.const(0, dtype="int8"))
def _im2col_row_major_indices(indices, A, kernel_h, kernel_w,
                              padding, stride, dilate):
    """
    calculate im2col_row_major tvm lambda function

    Parameters
    ----------
    indices : indices in lambda function

    A : feature map

    kernel_h : the kernel value in h

    kernel_w : the kernel value in w

    padding : the padding shape

    stride : the stride value

    dilate : the dilation value

    Returns
    -------
    im2col_row_major tvm lambda function
    """
    in_n, in_c1, in_h, in_w, in_c0 = A.shape

    n, hw, c1, kh, kw, c0 = indices
    stride_h, stride_w = stride
    dilate_h, dilate_w = dilate
    padding_top, padding_bottom, padding_left, padding_right = padding

    kernel_dilate_w = (kernel_w - 1) * dilate[1] + 1
    width_out = (in_w.value + padding_left + padding_right -
                 kernel_dilate_w) // stride_w + 1

    n_index = n
    c1_index = c1
    h_index = (hw // width_out) * stride_h + (kh * dilate_h)
    w_index = (hw % width_out) * stride_w + (kw * dilate_w)
    c0_index = c0

    return tvm.select(
        tvm.any(h_index < padding_top,
                h_index > in_h.value + padding_top - 1,
                w_index < padding_left,
                w_index > in_w.value + padding_left - 1),
        tvm.const(0.0, compute_dtype),
        A(n_index, c1_index, h_index - padding_top,
          w_index - padding_left, c0_index))
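
# Illustrative sketch (not part of the operator above): the same h/w index
# math on a single-channel NumPy image, showing how positions that fall in
# the pad region read back zero. Shapes, strides and dilations are made up.
import numpy as np

def _demo_row_major_im2col(img, kernel_h, kernel_w, padding, stride, dilate):
    in_h, in_w = img.shape
    pad_top, pad_bottom, pad_left, pad_right = padding
    stride_h, stride_w = stride
    dilate_h, dilate_w = dilate
    k_d_h = (kernel_h - 1) * dilate_h + 1
    k_d_w = (kernel_w - 1) * dilate_w + 1
    height_out = (in_h + pad_top + pad_bottom - k_d_h) // stride_h + 1
    width_out = (in_w + pad_left + pad_right - k_d_w) // stride_w + 1
    col = np.zeros((height_out * width_out, kernel_h, kernel_w), img.dtype)
    for hw in range(height_out * width_out):
        for kh in range(kernel_h):
            for kw in range(kernel_w):
                # position in the (virtually) padded input
                h = (hw // width_out) * stride_h + kh * dilate_h
                w = (hw % width_out) * stride_w + kw * dilate_w
                inside = (pad_top <= h < in_h + pad_top and
                          pad_left <= w < in_w + pad_left)
                col[hw, kh, kw] = (img[h - pad_top, w - pad_left]
                                   if inside else 0)
    return col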
def _copy_from_input_value(index, shape):
    for idx, _ in enumerate(shape):
        i = len(shape) - idx - 1
        if idx == 0:
            select_result = tvm.select(
                begin[i] <= index[i],
                input_value_ub(*_map_input_value_index(*index)))
            select_result = tvm.select(
                (index[i] - begin[i]) % strides[i] == 0, select_result)
            select_result = tvm.select(end[i] > index[i], select_result)
        else:
            select_result = tvm.select(begin[i] <= index[i], select_result)
            select_result = tvm.select(
                (index[i] - begin[i]) % strides[i] == 0, select_result)
            select_result = tvm.select(end[i] > index[i], select_result)
    return select_result
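
# Illustrative sketch (not part of the kernel above): the chained tvm.select
# calls express, per axis, "begin[i] <= index[i] < end[i] and the offset from
# begin[i] is a multiple of strides[i]". A plain-Python predicate:
def _demo_strided_slice_hit(index, begin, end, strides):
    return all(
        b <= idx < e and (idx - b) % s == 0
        for idx, b, e, s in zip(index, begin, end, strides))

# _demo_strided_slice_hit((2, 4), begin=(0, 0), end=(5, 8), strides=(2, 2))
# -> True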
def avg_pool_grad_compute(input_shape, weight, out, vealuemean,
                          k_sizes, strides, padding):
    """
    Computes the gradients of avg pool, insert input.

    Parameters
    ----------
    input_shape: a list or tuple representing the shape of input,
        6D format [N, C1, 1, H, W, C0]

    weight: a tensor, 5D with shape [C1, Hf*Wf, 1, C0, C0]

    out: a tensor, 6D format [N, Co1, 1, Ho, Wo, C0]

    vealuemean: a tensor of averaging coefficients, multiplied
        element-wise with out

    k_sizes: a list or tuple of two ints, [H, W]

    strides: a list or tuple of two ints, [H, W]

    padding: only support "SAME" yet, the type of padding algorithm to use

    Returns
    -------
    dx_res: compute of the gradients of avg pool grad
    """
    out_type = out.dtype
    _, _, _, input_h, input_w, _ = input_shape
    k_height, k_width = k_sizes
    out_shape = (int(i.value) for i in out.shape)
    out_n, out_cgroup, out_c1, out_h, out_w, out_c0 = out_shape
    out_mul_shape = out_n, out_cgroup, out_c1, out_h, out_w, out_c0
    out_mul = tvm.compute(out_mul_shape,
                          lambda *i: out(*i) * vealuemean(*i),
                          name='out_mul')

    dilated_shape, dilated_pad = calculation_dilation(input_shape, k_sizes,
                                                      strides, padding)
    dilated_strides = (1, 1)

    # compute of out_backprop dilation
    out_dilated = tvm.compute(
        dilated_shape,
        lambda n, cg, c1, h, w, c0: tvm.select(
            tvm.all(h % strides[0] == 0, w % strides[1] == 0),
            out_mul[n, cg, c1, h // strides[0], w // strides[1], c0],
            tvm.const(0, out.dtype)),
        attrs={'strides': strides},
        name='out_dilated')

    # image to column of dilated out_backprop
    out_im2col_row_major_shape = (out_n, out_cgroup, input_h * input_w,
                                  out_c1, k_height, k_width, BLOCK_SIZE)
    out_col = common.im2col_6d(out_dilated, out_im2col_row_major_shape,
                               k_height, k_width, dilated_pad,
                               dilated_strides)
    hiwi_mad = (input_h * input_w + BLOCK_SIZE - 1) // BLOCK_SIZE * BLOCK_SIZE
    dout_im2col_fractal_shape = (out_n, out_cgroup, hiwi_mad // BLOCK_SIZE,
                                 out_c1 * k_height * k_width,
                                 BLOCK_SIZE, BLOCK_SIZE)
    dout_col_pad = common.im2col_fractal_6d(dout_im2col_fractal_shape,
                                            out_col)

    # unused, kept until it can be deleted
    weight_unuse = tvm.compute(weight.shape,
                               lambda *index: weight(*index),
                               name='weight_rotated')

    res_dtype = "float32"
    # matrix multiplication of dilated out_backprop and rotated weight
    mad_shape = (out_n, out_cgroup, out_c1, hiwi_mad, out_c0)
    mad_res = common.mad(mad_shape, dout_col_pad, weight_unuse, res_dtype)

    # cast dX from float32 to float16
    dx_cast = tvm.compute(mad_res.shape,
                          lambda *index: mad_res(*index).astype(out_type),
                          name='dx_cast')

    # remove the padding of dX
    res_shape = (out_n, out_cgroup, out_c1, input_h * input_w, out_c0)
    dx_res = tvm.compute(res_shape,
                         lambda *index: dx_cast(*index).astype(out_type),
                         name='dx_res',
                         attrs={'weight_height': k_height,
                                'weight_width': k_width,
                                'dilated_pad': dilated_pad,
                                'dilated_strides': dilated_strides})
    return dx_res
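
# Illustrative sketch (not part of the operator above): the out_dilated
# compute inserts (stride - 1) zeros between neighbouring gradient values so
# the backward pass can be phrased as a stride-1 convolution. NumPy analogue
# for one 2-D slice:
import numpy as np

def _demo_dilate_grad(grad_2d, strides):
    out_h, out_w = grad_2d.shape
    stride_h, stride_w = strides
    dil = np.zeros(((out_h - 1) * stride_h + 1,
                    (out_w - 1) * stride_w + 1), grad_2d.dtype)
    # keep values only where h % stride_h == 0 and w % stride_w == 0
    dil[::stride_h, ::stride_w] = grad_2d
    return dil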
def tanh_split_input_by_val(shape, input_x, symbol): """ split input into two tensor by 0.5 shape : tensor shape input_x : tensor symbol : tensor symbol return: res, operations, scope """ res = {} operation = {} scope = {} dtype_x = input_x.dtype const_zero = tvm.const(0.0, dtype="float16") const_0 = tvm.const(0.5, dtype="float16") key = "input_abs_" + symbol input_abs = tvm.compute(shape, lambda *i: tvm.abs(input_x(*i)), name=key) res[key] = input_abs operation[key] = "vector_abs" scope[key] = cce.scope_ubuf # vcmp only support fp16 if dtype_x == "float32": key = "cmp_val_fp16_" + symbol cmp_val_fp16 = tvm.compute( shape, lambda *i: topi.cast(input_abs(*i), "float16"), name=key) res[key] = cmp_val_fp16 operation[key] = "vector_conv" scope[key] = cce.scope_ubuf key = "input_val_fp16_" + symbol input_val_fp16 = tvm.compute( shape, lambda *i: topi.cast(input_x(*i), "float16"), name=key) res[key] = input_val_fp16 operation[key] = "vector_conv" scope[key] = cce.scope_ubuf key = "input_gt_fp16_" + symbol input_gt_fp16 = \ tvm.compute(shape, lambda *i: tvm.select(cmp_val_fp16(*i) > const_0, input_val_fp16(*i), const_zero), name=key) res[key] = input_gt_fp16 operation[key] = "vector_select_gt" scope[key] = cce.scope_ubuf key = "input_lt_fp16_" + symbol input_lt_fp16 = \ tvm.compute(shape, lambda *i: tvm.select(cmp_val_fp16(*i) <= const_0, input_val_fp16(*i), const_zero), name=key) res[key] = input_lt_fp16 operation[key] = "vector_select_le" scope[key] = cce.scope_ubuf key = "input_gt_" + symbol input_gt = tvm.compute( shape, lambda *i: topi.cast(input_gt_fp16(*i), "float32"), name=key) res[key] = input_gt operation[key] = "vector_conv" scope[key] = cce.scope_ubuf key = "input_lt_" + symbol input_lt = tvm.compute( shape, lambda *i: topi.cast(input_lt_fp16(*i), "float32"), name=key) res[key] = input_lt operation[key] = "vector_conv" scope[key] = cce.scope_ubuf else: key = "input_gt_" + symbol input_gt = tvm.compute( shape, lambda *i: tvm.select( input_abs(*i) > const_0, input_x(*i), const_zero), name=key) res[key] = input_gt operation[key] = "vector_select_gt" scope[key] = cce.scope_ubuf key = "input_lt_" + symbol input_lt = tvm.compute( shape, lambda *i: tvm.select( input_abs(*i) <= const_0, input_x(*i), const_zero), name=key) res[key] = input_lt operation[key] = "vector_select_le" scope[key] = cce.scope_ubuf return res, operation, scope
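
# Illustrative sketch (not part of the kernel above): the selects split the
# input into a "|x| > 0.5" part and a "|x| <= 0.5" part (the other positions
# are zero-filled), so each half can go through a different tanh
# approximation. NumPy analogue showing the two parts add back to the input:
import numpy as np

def _demo_split_by_half(x):
    gt = np.where(np.abs(x) > 0.5, x, 0.0)
    le = np.where(np.abs(x) <= 0.5, x, 0.0)
    assert np.allclose(gt + le, x)
    return gt, le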
def _max_pool_grad_grad_with_argmax_compute( placeholders, x, argmax, grad, y, ksize, strides, padding="VALID", ori_format_x="NCHW", kernel_name="cce_max_pool_grad_grad_with_argmax"): """ Computes second-order gradients of the maxpooling function. Parameters ---------- x: dict Include info about ori_input, format, ori_format, shape, ori_shape, dtype. grad: dict Include info about grad of ori_input, format, ori_format, shape, ori_shape, dtype. argmax: dict Include info about ori_input, format, ori_format, shape, ori_shape, dtype. y: dict Include info about result of function, format, ori_format, shape, ori_shape, dtype. ksize: list or tuple The size of the window for each dimension of the input tensor. strides: list or tuple The stride of the sliding window of the input tensor. padding: str The type of padding algorithm to use. Only support "VALID" or "SAME" kernel_name: str Cce kernel name, default value is "cce_max_pool_grad_grad_with_argmax" Returns ------- grad_in_l1: process of movement of grad from gm to l1. grad_im2col: process of vm tensor of grad on l1. grad_fractal: process of fractal of grad from l1 to ub. grad_fractal_transp: process of transposition of grad. argmax_ub: process of movement of argmax from gm to ub. tensor_zero_ub: process of movement of zero tensor from gm to ub. grad_grad_col: tensor after selection. grad_grad: tensor after reduce_sum. output_res: output of the calculation. """ argmax_tensor = placeholders[1] grad_tensor = placeholders[2] (grad_n, grad_c1, grad_h, grad_w, grad_c0) = grad.get("shape") if ori_format_x == "NHWC": _, kernel_h, kernel_w, _ = ksize _, stride_h, stride_w, _ = strides else: _, _, kernel_h, kernel_w = ksize _, _, stride_h, stride_w = strides shape_max_pool_h, pad_top, pad_bottom = \ common.tf_get_windowed_output_size_verbose( grad_h, kernel_h, stride_h, padding) shape_max_pool_w, pad_left, pad_right = \ common.tf_get_windowed_output_size_verbose( grad_w, kernel_w, stride_w, padding) pad_list = (pad_top, pad_bottom, pad_left, pad_right) stride = (stride_h, stride_w) # howo must be multiple of 16 howo = _ceil_to(shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE) # copy argmax from ub to gm shape_argmax_ub = (grad_n, grad_c1 * kernel_h * kernel_w, howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE) argmax_ub = tvm.compute(shape_argmax_ub, lambda *i: argmax_tensor(*i), name='argmax_ub') # load3d compute shape_grad = (grad_n, grad_c1, grad_h, grad_w, grad_c0) grad_in_l1 = tvm.compute(shape_grad, lambda *i: grad_tensor[i], name="grad_in_l1") # n howo c1 kh kw c0 shape_grad_vm = (grad_n, shape_max_pool_h * shape_max_pool_w, grad_c1, kernel_h, kernel_w, grad_c0) grad_im2col = common.img2col( grad_in_l1, shape_grad_vm, kernel_h, kernel_w, pad_list, stride, ) # n hw c1 kh kw c0 -> n c1 kh kw hw c0 shape_fractal = (grad_n, howo // BLOCK_SIZE, grad_c1 * kernel_h * kernel_w, BLOCK_SIZE, BLOCK_SIZE) grad_fractal = common.im2col_fractal(shape_fractal, grad_im2col, "ca", tag='') # (n, howo/16, c1khkw, 16, c0) -> (n, c1khkw, howo/16, 16, c0) shape_grad_fratical_transp = (grad_n, grad_c1 * kernel_h * kernel_w, howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE) grad_fractal_transp = tvm.compute( shape_grad_fratical_transp, lambda i, j, k, l, m: grad_fractal[i, k, j, l, m], name='grad_fractal_transp') # declare a zero tensor, and move to ub for vsel dtype_tensor_zero = grad_tensor.dtype shape_tensor_zero = (BLOCK_SIZE, ) tensor_zero_ub = tvm.compute( shape_tensor_zero, lambda *i: tvm.const(0, dtype=dtype_tensor_zero), name='tensor_zero_ub') # vsel compute 
shape_grad_grad_col = (grad_n, grad_c1 * kernel_h * kernel_w, howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE) grad_grad_col = tvm.compute( shape_grad_grad_col, lambda i, j, k, l, m: tvm.select(argmax_ub[ i, j, k, l, m], grad_fractal_transp[i, j, k, l, m], tensor_zero_ub[ m]), name='grad_grad_col') # reduce_sum # (n, c1khkw, howo/16, 16, c0) -> (n, c1, howo/16, 16, c0) m = tvm.reduce_axis((0, kernel_h * kernel_w), "m") shape_grad_grad = (grad_n, grad_c1, howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE) grad_grad = tvm.compute( shape_grad_grad, lambda i, j, n, p, q: tvm.sum( grad_grad_col[i, j * kernel_h * kernel_w + m, n, p, q], axis=[m]), name="grad_grad") extract_params = {} extract_params["padding_mode"] = padding extract_params["shape_max_pool_h"] = shape_max_pool_h extract_params["shape_max_pool_w"] = shape_max_pool_w extract_params["fmap_shape"] = shape_grad extract_params["ksizes"] = ksize extract_params["strides"] = strides extract_params["pad"] = pad_list extract_params["fmap_vm_shape"] = shape_grad_vm extract_params["fractal_shape"] = shape_fractal extract_params["HoWo"] = howo setfmatrix_dict = { "conv_kernel_h": kernel_h, "conv_kernel_w": kernel_w, "conv_padding_top": pad_top, "conv_padding_bottom": pad_bottom, "conv_padding_left": pad_left, "conv_padding_right": pad_right, "conv_stride_h": stride_h, "conv_stride_w": stride_w, "conv_fm_c": grad_c1 * grad_c0, "conv_fm_h": grad_h, "conv_fm_w": grad_w, } # UB to OUT output_res = tvm.compute( (grad_n, grad_c1, shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE), lambda i, j, l, m: grad_grad[i, j, l // 16, l % 16, m], name="ub_to_out", attrs={ 'extract_params': extract_params, 'setfmatrix_dict': setfmatrix_dict }) return grad_in_l1, grad_im2col, grad_fractal, grad_fractal_transp, \ argmax_ub, tensor_zero_ub, grad_grad_col, grad_grad, output_res
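
# Illustrative sketch (not part of the operator above): grad_grad_col keeps
# the grad value only where the argmax mask is set, and grad_grad then sums
# over the kh*kw window axis. NumPy analogue on a [kernel_h*kernel_w, howo]
# slice; names and shapes are simplified for the example.
import numpy as np

def _demo_select_and_reduce(argmax_mask, grad_cols):
    # argmax_mask, grad_cols: shape [kernel_h * kernel_w, howo]
    picked = np.where(argmax_mask, grad_cols, 0.0)
    return picked.sum(axis=0)   # one value per output position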
def custom_logical_and(shape_x,
                       shape_y,
                       dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False,
                       need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support "bool"

    kernel_name : cce kernel name, default value is "cce_tf_logical_and"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "logical_and_cce only supports %s while dtype is %s" % (
                ",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)
        min_value = tvm.const(0, dtype=inp_dtype)
        # an element counts as True when it is non-zero
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(data1_tmp1(*i) > min_value,
                            data1_tmp1(*i) < -min_value),
                    tvm.any(data1_tmp2(*i) > min_value,
                            data1_tmp2(*i) < -min_value)),
                True, False),
            name="res")

        sch = tvm.create_schedule(res.op)

        if need_print:
            with build_config:
                print(tvm.lower(sch, [data1, data2, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)
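
# Illustrative sketch (not part of the kernel above): the nested
# tvm.any/tvm.all treats every non-zero value as True, so the result is
# "x is non-zero AND y is non-zero" on the broadcast shapes. NumPy analogue:
import numpy as np

def _demo_logical_and(x, y):
    x, y = np.broadcast_arrays(x, y)
    return np.where((x != 0) & (y != 0), True, False)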
def dynamic_lstm(input_x, weight, bias, output_h, kernel_name="dynamic_lstm"): """ x : dict A dict object, contains a Tensor 's type and shape and format, the type can be float32, the format can be [FRACTAL_NZ] w : dict A dict object, contains a Tensor 's type and shape and format, the type can be float32, the format can be [FRACTAL_ZN_LSTM] b : dict A dict object, contains a Tensor 's type and shape and format, the type can be float32, the format can be [ND] output_h : dict A dict object, contains a Tensor 's type and shape and format, the type can be float32, the format can be [FRACTAL_NZ] """ check_dtype(input_x, weight, bias, output_h) shape_x_input = input_x.get("shape") shape_w_input = weight.get("shape") shape_b_input = bias.get("shape") shape_output = output_h.get("shape") check(shape_x_input, shape_w_input, shape_b_input, shape_output) scan_one_num = 1 t_size = shape_x_input[0] + scan_one_num m_size = shape_x_input[2] k_size = shape_w_input[0] n_size = shape_w_input[1] hidden_size = shape_output[1] block_size = n_size // hidden_size in_x = k_size - hidden_size shape_b = (1, k_size, block_size, hidden_size, 16, 16) shape_c = (1, block_size, hidden_size, m_size, 16, 16) shape_bias = (1, block_size, hidden_size, 1, 1, 16) shape_x = (t_size, in_x, m_size, 16, 16) shape_h = (1, k_size - in_x, m_size, 16, 16) shape_i = (1, hidden_size, m_size, 16, 16) shape_i_t = (t_size, hidden_size, m_size, 16, 16) core_num = cce.get_soc_spec("CORE_NUM") # one core use 4 int64 that is 32B align shape_sync = (4 * core_num,) k0_size = 16 input_dtype = input_x.get("dtype") data_dtype = 'float16' sync_dtype = 'int64' # define placeholder input_x = tvm.placeholder(shape_x, dtype=input_dtype, name='input_x') weight = tvm.placeholder(shape_b, dtype=input_dtype, name='weight') bias = tvm.placeholder(shape_bias, name='bias', dtype=input_dtype) s_state_h = tvm.placeholder(shape_h, dtype=input_dtype, name='state_h') s_state_c = tvm.placeholder(shape_i, dtype=input_dtype, name='state_c') sync0 = tvm.placeholder(shape_sync, name="sync0", dtype='int64') # compute # weight need first to ub and cast to float16 weight_ub = \ tvm.compute( shape_b, lambda *indices: weight(*indices), name="weight_ub") weight_fp16 = \ tvm.compute(shape_b, lambda *indices: weight_ub(*indices).astype(data_dtype), name="weight_fp16") # input and s_state_h need first to ub and cast to float16 shape_a_z_bigz = (t_size, m_size, k_size, 16, 16) # input and s_start_h is Nz, need trans to zZ # so change axis 1 and 2 a_ub = tvm.compute(shape_a_z_bigz, lambda *indice: tvm.select(indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[3], indice[4]], s_state_h[0, indice[2] - in_x, indice[1], indice[3], indice[4]] ), name="a_ub", tag="concat") shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16) a_ub_fp16 = \ tvm.compute(shape_a_z_bigz_1, lambda *indices: a_ub(*indices).astype(data_dtype), name="a_ub_fp16") a_l1 = tvm.compute(shape_a_z_bigz_1, lambda *indices: a_ub_fp16(*indices), name='a_l1') b_l1 = tvm.compute(shape_b, lambda *indices: weight_fp16(*indices), name='b_l1') # shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16) a_l0a = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1(*indices), name="a_l0a") b_l0b = tvm.compute(shape_b, lambda *indices: b_l1(*indices), name="b_l0b") k1 = tvm.reduce_axis((0, k_size), name='k1') k0 = tvm.reduce_axis((0, k0_size), name='k0') c_l0c = tvm.compute(shape_c, lambda t, nb_0, nb_1, mb, mp, np: tvm.sum((a_l0a[t, mb, k1, mp, k0] * \ b_l0b[t, k1, nb_0, nb_1, np, k0]) \ .astype('float32'), axis=[k1, k0]), 
name='c_l0c') c_ub = tvm.compute(shape_c, lambda *indices: c_l0c(*indices), name="c_ub") bias_ub = tvm.compute(shape_bias, lambda *indices: bias(*indices), name='bias_ub') bias_bc_ub = te.lang.cce.broadcast(bias_ub, shape_c) c_ub_bias = te.lang.cce.vadd(c_ub, bias_bc_ub) # split matmul res i_t_index = 0 j_t_index = 1 f_t_index = 2 o_t_index = 3 i_t = \ tvm.compute(shape_i, lambda t, i, j, k, l: c_ub_bias(t, i_t_index, i, j, k, l), name="i_t") j_t = \ tvm.compute(shape_i, lambda t, i, j, k, l: c_ub_bias(t, j_t_index, i, j, k, l), name="j_t") f_t = \ tvm.compute(shape_i, lambda t, i, j, k, l: c_ub_bias(t, f_t_index, i, j, k, l), name="f_t") o_t = \ tvm.compute(shape_i, lambda t, i, j, k, l: c_ub_bias(t, o_t_index, i, j, k, l), name="o_t") f_t_sigmoid = sigmoid_compute(f_t) i_t_sigmoid = sigmoid_compute(i_t) o_t_sigmoid = sigmoid_compute(o_t) j_t_tanh = tanh_compute(j_t) c_t_tmp1 = te.lang.cce.vmul(s_state_c, f_t_sigmoid) c_t_tmp2 = te.lang.cce.vmul(j_t_tanh, i_t_sigmoid) update_c = te.lang.cce.vadd(c_t_tmp1, c_t_tmp2) update_c_gm = tvm.compute(shape_i_t, lambda t, i, j, k, l: update_c(0, i, j, k, l), name="update_c_gm") c_t_tanh = tanh_compute(update_c) update_h = te.lang.cce.vmul(c_t_tanh, o_t_sigmoid) update_h_gm = tvm.compute(shape_i_t, lambda t, i, j, k, l: update_h(0, i, j, k, l), name="update_h_gm") update_hc_vn = \ tvm.compute( shape_i_t, lambda t, i, j, k, l: update_c_gm(0, i, j, k, l) +\ update_h_gm(t, i, j, k, l), name="update_hc_vn") update_c_gm_vn = \ tvm.compute( shape_i_t, lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l), name="update_c_gm_vn") update_h_gm_vn = \ tvm.compute( shape_i_t, lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l), name="update_h_gm_vn") update_c_ub = \ tvm.compute( shape_i, lambda t, i, j, k, l: update_c_gm_vn(t, i, j, k, l), name="update_c_ub") update_c_gm_2 = \ tvm.compute(shape_i_t, lambda t, i, j, k, l: update_c_ub(0, i, j, k, l), name="update_c_gm_2") update_h_ub = \ tvm.compute( shape_i, lambda t, i, j, k, l: update_h_gm_vn(t, i, j, k, l), name="update_h_ub") update_h_gm_2 = \ tvm.compute( shape_i_t, lambda t, i, j, k, l: update_h_ub(0, i, j, k, l) +\ update_c_gm_2(t, i, j, k, l), name="update_h_gm_2") update_h_gm_2_dummy = \ tvm.compute(shape_i_t, lambda t, i, j, k, l: update_h_gm_2(t, i, j, k, l), name="update_h_gm_2_dummy") # state init init_shape = (1, hidden_size, m_size, 16, 16) s_state_h_ub = \ tvm.compute(shape_h, lambda *indices: tvm.const(0.0, dtype=input_dtype), name='s_state_h_ub') s_state_c_ub = \ tvm.compute(shape_i, lambda *indices: tvm.const(0.0, dtype=input_dtype), name='s_state_c_ub') s_init_h = \ tvm.compute( init_shape, lambda _, i, j, k, l: s_state_h_ub[0, i, j, k, l], name="s_init_h") s_init_c = \ tvm.compute( init_shape, lambda _, i, j, k, l: s_state_c_ub[0, i, j, k, l], name="s_init_c") # scan scan_h, scan_c = tvm.scan( [s_init_h, s_init_c], [update_h_ub, update_c_ub], [s_state_h, s_state_c], scan_update=[update_h_gm_2, update_h_gm_2_dummy], name="lstm_scan") # end compute # schedule s = tvm.create_schedule([scan_h.op, scan_c.op]) new_build_list = [input_x, weight, bias, update_h_gm, update_c_gm, sync0, update_h_gm_vn, update_c_gm_vn] def gen_reversed_subgraph_list(out_tensor, tensor_list): """ traverse tensors by Depth-First-Search """ if out_tensor is None: return stack = [out_tensor] visited_list = [] while stack: cur_tensor = stack.pop() visited_list.append(cur_tensor) for in_tensor in cur_tensor.op.input_tensors: if in_tensor not in visited_list: stack.append(in_tensor) if "elewise" in in_tensor.op.tag or \ "broadcast" 
== in_tensor.op.tag: if in_tensor not in tensor_list: tensor_list.append(in_tensor) elewise_tensors = [] gen_reversed_subgraph_list(update_h_gm, elewise_tensors) barrier_tensor = c_ub_bias elewise_before_barrier_tensors = [bias_bc_ub] # set scope s[a_l1].set_scope(cce.scope_cbuf) s[b_l1].set_scope(cce.scope_cbuf) s[a_l0a].set_scope(cce.scope_ca) s[b_l0b].set_scope(cce.scope_cb) s[c_l0c].set_scope(cce.scope_cc) s[c_ub].set_scope(cce.scope_ubuf) s[s_init_h].set_scope(cce.scope_ubuf) s[bias_ub].set_scope(cce.scope_ubuf) s[bias_bc_ub].set_scope(cce.scope_ubuf) s[scan_h].set_scope(cce.scope_ubuf) s[scan_c].set_scope(cce.scope_ubuf) s[update_h_ub].set_scope(cce.scope_ubuf) s[update_c_ub].set_scope(cce.scope_ubuf) s[s_state_h_ub].set_scope(cce.scope_ubuf) s[s_state_c_ub].set_scope(cce.scope_ubuf) s[weight_ub].set_scope(cce.scope_ubuf) s[weight_fp16].set_scope(cce.scope_ubuf) s[a_ub].set_scope(cce.scope_ubuf) s[a_ub_fp16].set_scope(cce.scope_ubuf) for tensor in elewise_tensors: s[tensor].set_scope(cce.scope_ubuf) # compute inline compute_inline_tensors = [i_t, j_t, f_t, o_t] for tensor in compute_inline_tensors: s[tensor].compute_inline() # matmul tiling factor_l1_m, factor_l1_n, factor_l1_k, \ factor_l0_m, factor_l0_n, factor_l0_k = \ _get_lstm_tiling(m_size, k_size, n_size) l1_n_outer, l1_n_inner = \ s[c_l0c].split(c_l0c.op.axis[2], factor=factor_l1_n // block_size) l1_m_outer, l1_m_inner = \ s[c_l0c].split(c_l0c.op.axis[3], factor=factor_l1_m) l1_k_outer, l1_k_inner = \ s[c_l0c].split(c_l0c.op.reduce_axis[0], factor=factor_l1_k) l0_n_outer, l0_n_inner = s[c_l0c].split(l1_n_inner, factor=factor_l0_n) l0_m_outer, l0_m_inner = s[c_l0c].split(l1_m_inner, factor=factor_l0_m) l0_k_outer, l0_k_inner = s[c_l0c].split(l1_k_inner, factor=factor_l0_k) s[c_l0c].reorder(l1_n_outer, c_l0c.op.axis[1], l1_m_outer, l1_k_outer, l0_n_outer, l0_m_outer, l0_k_outer, l0_n_inner, l0_m_inner, c_l0c.op.axis[3 + 1], c_l0c.op.axis[4 + 1], l0_k_inner, c_l0c.op.reduce_axis[1]) s[weight_ub].compute_at(s[c_l0c], l1_k_outer) s[weight_fp16].compute_at(s[c_l0c], l1_k_outer) s[a_ub].compute_at(s[c_l0c], l1_k_outer) s[a_ub_fp16].compute_at(s[c_l0c], l1_k_outer) s[a_l0a].compute_at(s[c_l0c], l0_k_outer) s[b_l0b].compute_at(s[c_l0c], l0_k_outer) s[a_l1].compute_at(s[c_l0c], l1_k_outer) s[b_l1].compute_at(s[c_l0c], l1_k_outer) ub_n_outer, ub_n_inner = \ s[c_ub].split(c_ub.op.axis[2], factor=factor_l1_n // block_size) ub_m_outer, ub_m_inner = s[c_ub].split(c_ub.op.axis[3], factor=factor_l1_m) s[c_ub].reorder(ub_n_outer, c_ub.op.axis[1], ub_m_outer, ub_n_inner, ub_m_inner, c_ub.op.axis[4], c_ub.op.axis[5]) s[c_l0c].compute_at(s[c_ub], ub_n_outer) # elewise compute_at barrier_outer, barrier_inner = \ s[barrier_tensor].split(barrier_tensor.op.axis[2], factor=factor_l1_n // block_size) s[barrier_tensor].reorder( barrier_tensor.op.axis[0], barrier_outer, barrier_tensor.op.axis[1], barrier_inner, barrier_tensor.op.axis[3], barrier_tensor.op.axis[4], barrier_tensor.op.axis[5]) s[c_ub].compute_at(s[barrier_tensor], barrier_outer) s[bias_ub].compute_at(s[barrier_tensor], barrier_outer) for tensor in elewise_before_barrier_tensors: s[tensor].compute_at(s[barrier_tensor], barrier_outer) vn_outer, vn_inner = \ s[update_hc_vn].split(update_hc_vn.op.axis[0 + 1], factor=factor_l1_n // block_size) second_split_factor = \ (hidden_size // (factor_l1_n // block_size)) // core_num vn_o_outer, vn_o_inner = \ s[update_hc_vn].split(vn_outer, factor=second_split_factor) s[barrier_tensor].compute_at(s[update_hc_vn], vn_o_inner) for tensor in elewise_tensors: 
if tensor not in elewise_before_barrier_tensors: s[tensor].compute_at(s[update_hc_vn], vn_o_inner) s[update_c_gm].compute_at(s[update_hc_vn], vn_o_inner) s[update_h_gm].compute_at(s[update_hc_vn], vn_o_inner) second_split_factor = hidden_size // core_num res_h_outer, res_h_inner = \ s[update_h_gm_2].split(update_h_gm_2.op.axis[1], factor=hidden_size) s[update_hc_vn].compute_at(s[update_h_gm_2], update_h_gm_2.op.axis[0]) s[update_c_gm_vn].compute_at(s[update_h_gm_2], res_h_outer) s[update_h_gm_vn].compute_at(s[update_h_gm_2], res_h_outer) s[update_c_ub].compute_at(s[update_h_gm_2], res_h_outer) s[update_c_gm_2].compute_at(s[update_h_gm_2], res_h_outer) s[update_h_ub].compute_at(s[update_h_gm_2], res_h_outer) s[update_h_gm_vn].bind_buffer( update_h_gm_vn.op.axis[0], 0, scan_h.op.scan_axis + res_h_outer) s[update_c_gm_vn].bind_buffer( update_c_gm_vn.op.axis[0], 0, scan_h.op.scan_axis + res_h_outer) # bind s[update_hc_vn].bind(vn_o_outer, tvm.thread_axis("blockIdx.x")) # multi core sync s[update_hc_vn].pragma(update_hc_vn.op.axis[0], pragma_type="multicore_sync_wait_after", pragma_value=sync0[0]) s[update_hc_vn].pragma(update_hc_vn.op.axis[0], pragma_type="multicore_sync_set_after", pragma_value=sync0[0]) # modify for extend s[input_x].bind_buffer(0, 0, scan_h.op.scan_axis) s[update_h_gm].buffer_tile((scan_h.op.scan_axis*1, 1), (None, None), (None, None), (None, None), (None, None)) s[update_c_gm].buffer_tile((scan_h.op.scan_axis*1, 1), (None, None), (None, None), (None, None), (None, None)) s[update_h_gm_2].buffer_tile((0, 1), (None, None), (None, None), (None, None), (None, None)) s[update_c_gm_2].buffer_tile((0, 1), (None, None), (None, None), (None, None), (None, None)) # buffer reuse s[update_h_gm].reused_by(update_h_gm_vn) s[update_c_gm].reused_by(update_c_gm_vn) # emit_insn s[a_l1].emit_insn(a_l1.op.axis[0], 'dma_copy') s[b_l1].emit_insn(b_l1.op.axis[0], 'dma_copy') s[a_l0a].emit_insn(a_l0a.op.axis[0], 'dma_copy') s[b_l0b].emit_insn(b_l0b.op.axis[0], 'dma_copy') s[weight_ub].emit_insn(weight_ub.op.axis[0], 'dma_copy') s[weight_fp16].emit_insn(weight_fp16.op.axis[0], 'vector_conv') s[a_ub].emit_insn(a_ub.op.axis[0], 'dma_copy') s[a_ub_fp16].emit_insn(a_ub_fp16.op.axis[0], 'vector_conv') mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer, l0_k_outer]} s[c_l0c].emit_insn(l0_n_inner, 'mad', mad_dict) s[c_ub].emit_insn(ub_n_inner, 'dma_copy') s[s_init_h].emit_insn(s_init_h.op.axis[0], 'dma_copy') s[s_init_c].emit_insn(s_init_c.op.axis[0], 'dma_copy') s[bias_bc_ub].emit_insn(bias_bc_ub.op.axis[0], 'unified_broadcast') s[s_state_h_ub].emit_insn(s_state_h_ub.op.axis[0], 'broadcast') s[s_state_c_ub].emit_insn(s_state_c_ub.op.axis[0], 'broadcast') s[barrier_tensor].emit_insn(barrier_tensor.op.axis[1], 'vector_add') for tensor in elewise_tensors: if tensor != barrier_tensor: insn = get_emit_insn_map(tensor) s[tensor].emit_insn(tensor.op.axis[0], insn) s[bias_ub].emit_insn(bias_ub.op.axis[0], 'dma_copy') s[update_c_gm].emit_insn(s[update_c_gm].op.axis[1], 'dma_copy') s[update_h_gm].emit_insn(s[update_h_gm].op.axis[1], 'dma_copy') s[update_c_ub].emit_insn(update_c_ub.op.axis[1], 'dma_copy') s[update_h_ub].emit_insn(update_h_ub.op.axis[1], 'dma_copy') s[update_hc_vn].emit_insn(vn_inner, 'phony_insn') s[update_c_gm_vn].emit_insn(s[update_c_gm_vn].op.axis[0], 'phony_insn') s[update_h_gm_vn].emit_insn(s[update_h_gm_vn].op.axis[0], 'phony_insn') s[update_h_gm_2].emit_insn(res_h_inner, 'phony_insn') s[update_c_gm_2].emit_insn(s[update_c_gm_2].op.axis[0], 'phony_insn') s[update_h_gm_2_dummy].emit_insn( 
update_h_gm_2_dummy.op.axis[0], 'phony_insn') def _write_workspace_info(shape_list, dtype_list, sync_num, kernel_name): """ modify json after build """ def _write_code(wkspace_dict, fname): fname = os.path.realpath(fname) if fname.startswith(os.getcwd()): if os.path.exists(fname): with open(fname, "r") as f: load_dict = json.load(f) load_dict.update(wkspace_dict) with open(fname, "w") as f: json.dump(load_dict, f, sort_keys=True, indent=4, separators=(',', ':')) def _get_data_width(ele): """ get data width """ m_sea = re.search(r'\d+', ele) if m_sea: return int(m_sea.group(0)) // 8 return 0 if not os.path.exists("kernel_meta"): os.mkdir("kernel_meta") os.chmod("kernel_meta", stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP) num = len(shape_list) wkspace_dict = {} if num: total_size = [functools_reduce(lambda x, y: x * y, list_i) for list_i in shape_list] addr_type_list = [] for i, element in enumerate(dtype_list): total_size[i] = total_size[i] * _get_data_width(element) addr_type_list.append(0) if not os.path.exists("kernel_meta"): os.mkdir("kernel_meta") os.chmod("kernel_meta", stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP) wkspace_dict["workspace"] = {"num": num, "size": total_size, "type": addr_type_list} if sync_num: parameters_list = \ (len(new_build_list) - 2 - sync_num) * [0, ] + sync_num * [1, ] wkspace_dict["parameters"] = parameters_list if wkspace_dict: _write_code(wkspace_dict, "kernel_meta/" + kernel_name + ".json") with build_config: tvm.build(s, new_build_list, "cce", name=kernel_name) _write_workspace_info( [shape_i_t, shape_sync], [input_dtype, sync_dtype], 1, kernel_name)
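
# Illustrative sketch (not part of the kernel above): numerically, one step
# of the cell is the standard LSTM update; the gate order mirrors the
# i_t/j_t/f_t/o_t split of the matmul result. Shapes and names are
# simplified to plain 2-D arrays for the example.
import numpy as np

def _demo_sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def _demo_lstm_cell(x_h, w, b, c_prev):
    # x_h: concatenated [x_t, h_{t-1}] of shape [m, in_x + hidden]
    # w:   weight of shape [in_x + hidden, 4 * hidden], b: [4 * hidden]
    gates = x_h @ w + b
    i, j, f, o = np.split(gates, 4, axis=-1)
    c = _demo_sigmoid(f) * c_prev + _demo_sigmoid(i) * np.tanh(j)
    h = _demo_sigmoid(o) * np.tanh(c)
    return h, c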
def prior_box_compute(feature, img, data_h, data_w, box_height, box_width, y, \ rec_img_h, rec_img_w, step_h, step_w, clip, offset, scale, variance): """ calculating data Parameters ---------- input_x : TVM tensor the placeholder of input_x output_y : dict dict of output_y, include keys(shape and dtype) kernel_name : str kernel name, default value is "prior_box" Returns ------- output tensor """ """ TODO: Please refer to the TE DSL Manual, And code here with TE DSL. """ tensor_dic = {} tensor_list = [] op_list = [] ins_list = [] shape_data_h = data_h.get("shape") shape_data_w = data_w.get("shape") data_dtype = data_h.get("dtype") shape_box = box_height.get("shape") box_dtype = box_height.get("dtype") shape_image = img.get("shape") shape_image_h = shape_image[2] feature_input = tvm.placeholder(shape_data_h, name="feature_input", \ dtype=data_dtype) img_input = tvm.placeholder((shape_image_h,), name="img_input", \ dtype=data_dtype) data_h_input = tvm.placeholder(shape_data_h, name="data_h_input", \ dtype=data_dtype) data_w_input = tvm.placeholder(shape_data_w, name="data_w_input", \ dtype=data_dtype) box_height_input = tvm.placeholder(shape_box, name="box_h_input", \ dtype=box_dtype) box_width_input = tvm.placeholder(shape_box, name="box_w_input", \ dtype=box_dtype) tensor_list.append(feature_input) tensor_list.append(img_input) tensor_list.append(data_h_input) tensor_list.append(data_w_input) tensor_list.append(box_height_input) tensor_list.append(box_width_input) feature_ub = tvm.compute(shape_data_h, lambda *i: feature_input(*i), \ name="feature_ub") img_ub = tvm.compute((shape_image_h,), lambda *i: img_input(*i), \ name="img_ub") tensor_dic["feature_ub"] = feature_ub op_list += [feature_ub] ins_list += ["dma_copy"] tensor_dic["img_ub"] = img_ub op_list += [img_ub] ins_list += ["dma_copy"] move_value = tvm.const(0.0, data_dtype) feature_move = tvm.compute(shape_data_h, \ lambda *i: feature_ub(*i) * move_value, \ name="feature_move") img_move = tvm.compute((shape_image_h,), \ lambda *i: img_ub(*i) * move_value, name="img_move") tensor_dic["feature_move"] = feature_move op_list += [feature_move] ins_list += ["vector_muls"] tensor_dic["img_move"] = img_move op_list += [img_move] ins_list += ["vector_muls"] data_h_ub_temp = tvm.compute(shape_data_h, lambda *i: data_h_input(*i), \ name="data_h_ub_temp") data_w_ub = tvm.compute(shape_data_w, lambda *i: data_w_input(*i), \ name="data_w_ub") box_height_ub = tvm.compute(shape_box, lambda *i: box_height_input(*i), \ name="box_height_ub") box_width_ub = tvm.compute(shape_box, lambda *i: box_width_input(*i), \ name="box_width_ub") tensor_dic["data_h_ub_temp"] = data_h_ub_temp op_list += [data_h_ub_temp] ins_list += ["dma_copy"] tensor_dic["data_w_ub"] = data_w_ub op_list += [data_w_ub] ins_list += ["dma_copy"] tensor_dic["box_height_ub"] = box_height_ub op_list += [box_height_ub] ins_list += ["dma_copy"] tensor_dic["box_width_ub"] = box_width_ub op_list += [box_width_ub] ins_list += ["dma_copy"] data_h_ub_temp1 = tvm.compute(shape_data_h, \ lambda *i: data_h_ub_temp(*i) + feature_move(*i), \ name="data_h_ub_temp1") data_h_ub = tvm.compute(shape_data_h, \ lambda *i: data_h_ub_temp1(*i) + img_move[0], name="data_h_ub") tensor_dic["data_h_ub_temp1"] = data_h_ub_temp1 op_list += [data_h_ub_temp1] ins_list += ["vector_add"] tensor_dic["data_h_ub"] = data_h_ub op_list += [data_h_ub] ins_list += ["vector_adds"] offset_value = tvm.const(offset, data_dtype) step_w_value = tvm.const(step_w, data_dtype) step_h_value = tvm.const(step_h, data_dtype) 
rec_img_w_value = tvm.const(rec_img_w, data_dtype) rec_img_h_value = tvm.const(rec_img_h, data_dtype) scale_value = tvm.const(scale, data_dtype) scale_oppo = 0 - scale scale_value_oppo = tvm.const(scale_oppo, data_dtype) # define 1 or 4 variance_value if len(variance) == 1: variance_value = tvm.const(variance[0], data_dtype) else: variance_value0 = tvm.const(variance[0], data_dtype) variance_value1 = tvm.const(variance[1], data_dtype) variance_value2 = tvm.const(variance[2], data_dtype) variance_value3 = tvm.const(variance[3], data_dtype) w_offset = tvm.compute(shape_data_w, \ lambda *i: data_w_ub(*i) + offset_value, name="w_offset") h_offset = tvm.compute(shape_data_h, \ lambda *i: data_h_ub(*i) + offset_value, name="h_offset") center_x = tvm.compute(shape_data_w, \ lambda *i: w_offset(*i) * step_w_value, name="center_x") center_y = tvm.compute(shape_data_h, \ lambda *i: h_offset(*i) * step_h_value, name="center_y") tensor_dic["w_offset"] = w_offset op_list += [w_offset] ins_list += ["vector_adds"] tensor_dic["h_offset"] = h_offset op_list += [h_offset] ins_list += ["vector_adds"] tensor_dic["center_x"] = center_x op_list += [center_x] ins_list += ["vector_muls"] tensor_dic["center_y"] = center_y op_list += [center_y] ins_list += ["vector_muls"] box_width_scale = tvm.compute(shape_box, \ lambda *i: box_width_ub(*i) * scale_value, name="box_width_scale") box_height_scale = tvm.compute(shape_box, \ lambda *i: box_height_ub(*i) * scale_value, name="box_height_scale") box_width_scale_oppo = tvm.compute(shape_box, \ lambda *i: box_width_ub(*i) * scale_value_oppo, \ name="box_width_scale_oppo") box_height_scale_oppo = tvm.compute(shape_box, \ lambda *i: box_height_ub(*i) * scale_value_oppo, \ name="box_height_scale_oppo") tensor_dic["box_width_scale"] = box_width_scale op_list += [box_width_scale] ins_list += ["vector_muls"] tensor_dic["box_height_scale"] = box_height_scale op_list += [box_height_scale] ins_list += ["vector_muls"] tensor_dic["box_width_scale_oppo"] = box_width_scale_oppo op_list += [box_width_scale_oppo] ins_list += ["vector_muls"] tensor_dic["box_height_scale_oppo"] = box_height_scale_oppo op_list += [box_height_scale_oppo] ins_list += ["vector_muls"] num_box = shape_box[0] h_length = shape_data_h[0] w_length = shape_data_w[0] center_x_minus_calc = tvm.compute((w_length, num_box), \ lambda w, c: center_x[w, 0, 0, 0] + box_width_scale_oppo[c, 0, 0, 0], \ name="center_x_minus_calc") center_y_minus_calc = tvm.compute((h_length, num_box), \ lambda h, c: center_y[h, 0, 0, 0] + box_height_scale_oppo[c, 0, 0, 0], \ name="center_y_minus_calc") center_x_add_calc = tvm.compute((w_length, num_box), \ lambda w, c: center_x[w, 0, 0, 0] + box_width_scale[c, 0, 0, 0], \ name="center_x_add_calc") center_y_add_calc = tvm.compute((h_length, num_box), \ lambda h, c: center_y[h, 0, 0, 0] + box_height_scale[c, 0, 0, 0], \ name="center_y_add_calc") tensor_dic["center_x_minus_calc"] = center_x_minus_calc op_list += [center_x_minus_calc] ins_list += ["vector_add"] tensor_dic["center_y_minus_calc"] = center_y_minus_calc op_list += [center_y_minus_calc] ins_list += ["vector_add"] tensor_dic["center_x_add_calc"] = center_x_add_calc op_list += [center_x_add_calc] ins_list += ["vector_add"] tensor_dic["center_y_add_calc"] = center_y_add_calc op_list += [center_y_add_calc] ins_list += ["vector_add"] top_data_xmin = tvm.compute((w_length, num_box), \ lambda *i: center_x_minus_calc(*i) * rec_img_w_value, \ name="top_data_xmin") top_data_ymin = tvm.compute((h_length, num_box), \ lambda *i: 
center_y_minus_calc(*i) * rec_img_h_value, \ name="top_data_ymin") top_data_xmax = tvm.compute((w_length, num_box), \ lambda *i: center_x_add_calc(*i) * rec_img_w_value, \ name="top_data_xmax") top_data_ymax = tvm.compute((h_length, num_box), \ lambda *i: center_y_add_calc(*i) * rec_img_h_value, \ name="top_data_ymax") tensor_dic["top_data_xmin"] = top_data_xmin op_list += [top_data_xmin] ins_list += ["vector_muls"] tensor_dic["top_data_ymin"] = top_data_ymin op_list += [top_data_ymin] ins_list += ["vector_muls"] tensor_dic["top_data_xmax"] = top_data_xmax op_list += [top_data_xmax] ins_list += ["vector_muls"] tensor_dic["top_data_ymax"] = top_data_ymax op_list += [top_data_ymax] ins_list += ["vector_muls"] top_data_res1 = tvm.compute((h_length, w_length, num_box), \ lambda a, b, c: top_data_xmin[b, c] + move_value, \ name="top_data_res1") top_data_res2 = tvm.compute((h_length, w_length, num_box), \ lambda a, b, c: top_data_ymin[a, c] + move_value, \ name="top_data_res2") top_data_res3 = tvm.compute((h_length, w_length, num_box), \ lambda a, b, c: top_data_xmax[b, c] + move_value, \ name="top_data_res3") top_data_res4 = tvm.compute((h_length, w_length, num_box), \ lambda a, b, c: top_data_ymax[a, c] + move_value, \ name="top_data_res4") tensor_dic["top_data_res1"] = top_data_res1 op_list += [top_data_res1] ins_list += ["vector_add"] tensor_dic["top_data_res2"] = top_data_res2 op_list += [top_data_res2] ins_list += ["vector_add"] tensor_dic["top_data_res3"] = top_data_res3 op_list += [top_data_res3] ins_list += ["vector_add"] tensor_dic["top_data_res4"] = top_data_res4 op_list += [top_data_res4] ins_list += ["vector_add"] top_data = tvm.compute((h_length, w_length, num_box, 4), \ lambda a, b, c, idx: tvm.select(idx < 1, top_data_res1[a, b, c], \ tvm.select(idx < 2, top_data_res2[a, b, c], \ tvm.select(idx < 3, top_data_res3[a, b, c], \ top_data_res4[a, b, c], \ ))), name="top_data") tensor_dic["top_data"] = top_data op_list += [top_data] ins_list += ["data_mov"] top_data_true = top_data if clip: top_data_temp = tvm.compute((h_length, w_length, num_box, 4), \ lambda *i: tvm.max(top_data(*i), 0), name="top_data_temp") top_data_true = tvm.compute((h_length, w_length, num_box, 4), \ lambda *i: tvm.min(top_data_temp(*i), 1), name="top_data_true") tensor_dic["top_data_temp"] = top_data_temp op_list += [top_data_temp] ins_list += ["vector_maxs"] tensor_dic["top_data_true"] = top_data_true op_list += [top_data_true] ins_list += ["vector_mins"] if len(variance) == 1: variance_data = tvm.compute((h_length, w_length, num_box, 4), \ lambda a, b, c, idx: tvm.select(idx < 1, variance_value, \ tvm.select(idx < 2, variance_value, \ tvm.select(idx < 3, variance_value, \ variance_value, \ ))), name="variance_data") tensor_dic["variance_data"] = variance_data op_list += [variance_data] ins_list += ["data_mov"] else: variance_data = tvm.compute((h_length, w_length, num_box, 4), \ lambda a, b, c, idx: tvm.select(idx < 1, variance_value0, \ tvm.select(idx < 2, variance_value1, \ tvm.select(idx < 3, variance_value2, \ variance_value3, \ ))), name="variance_data") tensor_dic["variance_data"] = variance_data op_list += [variance_data] ins_list += ["data_mov"] y = tvm.compute((1, 2, h_length, w_length, num_box, 4), \ lambda i, idx, j, k, l, m: tvm.select(idx == 1, \ variance_data[j, k, l, m], top_data_true[j, k, l, m], \ ), name='result') tensor_dic["y"] = y op_list += [y] ins_list += ["dma_copy"] tensor_list.append(y) return op_list, ins_list, tensor_dic, y, tensor_list
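
# Illustrative sketch (not part of the operator above): for each (h, w) cell
# and each box shape, the computes produce normalized, optionally clipped
# corners (xmin, ymin, xmax, ymax); rec_img_w / rec_img_h are the reciprocal
# image sizes. Plain-Python analogue for one cell and one box:
def _demo_prior_box_corners(w_idx, h_idx, box_w, box_h, step_w, step_h,
                            rec_img_w, rec_img_h, offset, scale, clip=True):
    center_x = (w_idx + offset) * step_w
    center_y = (h_idx + offset) * step_h
    corners = [(center_x - box_w * scale) * rec_img_w,
               (center_y - box_h * scale) * rec_img_h,
               (center_x + box_w * scale) * rec_img_w,
               (center_y + box_h * scale) * rec_img_h]
    if clip:
        corners = [min(max(c, 0.0), 1.0) for c in corners]
    return corners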
def _dynamic_gru_inner(input_list, custom_list): input_x = input_list[0] weight1 = input_list[1] weight2 = input_list[2] bias1 = input_list[3] bias2 = input_list[4] s_init_h_gm = input_list[5] s_state_h_gm_last = input_list[6] is_gate_output = custom_list[0] is_first_round = custom_list[1] is_global_init = custom_list[2] input_dtype = 'float16' bias_dtype = bias1.dtype fp16_input_output = bias_dtype == 'float16' shape_x_input = input_x.shape shape_w1_input = weight1.shape w1_size = 2 w2_size = 1 t_size = shape_x_input[0].value m_size = shape_x_input[2].value k_size = shape_w1_input[1].value hidden_size = shape_w1_input[3].value in_x = k_size - hidden_size shape_b_1 = (1, k_size, w1_size, hidden_size, 16, 16) shape_b_2 = (1, k_size, w2_size, hidden_size, 16, 16) shape_c_1 = (1, w1_size, hidden_size, m_size, 16, 16) shape_c_2 = (1, w2_size, hidden_size, m_size, 16, 16) shape_bias_1 = (1, w1_size, hidden_size, 1, 1, 16) shape_bias_2 = (1, hidden_size, 1, 1, 16) shape_i = (1, hidden_size, m_size, 16, 16) shape_i_t = (t_size, hidden_size, m_size, 16, 16) k0_size = 16 if is_first_round and not is_global_init: s_state_h = tvm.compute( shape_i, lambda *indices: tvm.const(0.0, dtype='float32'), name='s_state_h') s_state_h_fp16 = tvm.compute( shape_i, lambda *indices: s_state_h(*indices).astype('float16'), name="s_state_h_fp16") else: last_h = s_init_h_gm if is_first_round else s_state_h_gm_last if fp16_input_output: s_state_h_fp16 = tvm.compute(shape_i, lambda *indices: last_h(*indices), name='s_state_h_fp16') s_state_h = tvm.compute( shape_i, lambda *indices: s_state_h_fp16(*indices).astype('float32'), name="s_state_h") else: s_state_h = tvm.compute(shape_i, lambda *indices: last_h(*indices), name='s_state_h') s_state_h_fp16 = tvm.compute( shape_i, lambda *indices: s_state_h(*indices).astype('float16'), name="s_state_h_fp16") # compute # input and s_state_h need first to ub and cast to float16 shape_a_z_bigz = (1, m_size, k_size, 16, 16) # input and s_start_h is Nz, need trans to zZ # so change axis 1 and 2 a_l1_1 = tvm.compute( shape_a_z_bigz, lambda *indice: tvm.select( indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[ 3], indice[4]], s_state_h_fp16[0, indice[2] - in_x, indice[1], indice[3], indice[4]]), name="a_l1_1", tag="concat") b_l1_1 = tvm.compute(shape_b_1, lambda *indices: weight1(*indices), name='b_l1_1') a_l0a_1 = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1_1(*indices), name="a_l0a_1") b_l0b_1 = tvm.compute(shape_b_1, lambda *indices: b_l1_1(*indices), name="b_l0b_1") k1_1 = tvm.reduce_axis((0, k_size), name='k1_1') k0_1 = tvm.reduce_axis((0, k0_size), name='k0_1') c_l0c_1 = tvm.compute(shape_c_1, lambda t, nb_0, nb_1, mb, mp, np: tvm.sum((a_l0a_1[t, mb, k1_1, mp, k0_1] * \ b_l0b_1[t, k1_1, nb_0, nb_1, np, k0_1]) \ .astype('float32'), axis=[k1_1, k0_1]), name='c_l0c_1') c_ub_1 = tvm.compute(shape_c_1, lambda *indices: c_l0c_1(*indices), name="c_ub_1") bias_ub_1 = tvm.compute(shape_bias_1, lambda *indices: bias1(*indices), name='bias_ub_1') bias_ub_1_fp32 = bias_ub_1 if fp16_input_output: bias_ub_1_fp32 = tvm.compute( shape_bias_1, lambda *indices: bias_ub_1(*indices).astype('float32'), name="bias_ub_1_fp32") bias_bc_ub_1 = tbe.broadcast(bias_ub_1_fp32, shape_c_1) c_ub_bias_1 = tbe.vadd(c_ub_1, bias_bc_ub_1) # split matmul res r_t_index = 0 i_t_index = 1 r_t = tvm.compute( shape_i, lambda t, i, j, k, l: c_ub_bias_1(t, r_t_index, i, j, k, l), name="r_t") i_t = tvm.compute( shape_i, lambda t, i, j, k, l: c_ub_bias_1(t, i_t_index, i, j, k, l), name="i_t") r_t_sigmoid 
= _sigmoid_compute(r_t) i_t_sigmoid = _sigmoid_compute(i_t) r_t_mid = r_t_sigmoid i_t_mid = i_t_sigmoid if is_gate_output: if fp16_input_output: r_t_sigmoid_fp16 = tvm.compute( shape_i, lambda *indices: r_t_sigmoid(*indices).astype('float16'), name="r_t_sigmoid_fp16") i_t_sigmoid_fp16 = tvm.compute( shape_i, lambda *indices: i_t_sigmoid(*indices).astype('float16'), name="i_t_sigmoid_fp16") r_t_gm = tvm.compute(shape_i, lambda *indices: r_t_sigmoid_fp16(*indices), name="r_t_gm") i_t_gm = tvm.compute(shape_i, lambda *indices: i_t_sigmoid_fp16(*indices), name="i_t_gm") r_t_gm_back = tvm.compute(shape_i, lambda *indices: r_t_gm(*indices), name="r_t_gm_back") i_t_gm_back = tvm.compute(shape_i, lambda *indices: i_t_gm(*indices), name="i_t_gm_back") r_t_gm_back_fp32 = tvm.compute( shape_i, lambda *indices: r_t_gm_back(*indices).astype('float32'), name="r_t_gm_back_fp32") i_t_gm_back_fp32 = tvm.compute( shape_i, lambda *indices: i_t_gm_back(*indices).astype('float32'), name="i_t_gm_back_fp32") r_t_mid = r_t_gm_back_fp32 i_t_mid = i_t_gm_back_fp32 else: r_t_gm = tvm.compute(shape_i, lambda *indices: r_t_sigmoid(*indices), name="r_t_gm") i_t_gm = tvm.compute(shape_i, lambda *indices: i_t_sigmoid(*indices), name="i_t_gm") r_t_gm_back = tvm.compute(shape_i, lambda *indices: r_t_gm(*indices), name="r_t_gm_back") i_t_gm_back = tvm.compute(shape_i, lambda *indices: i_t_gm(*indices), name="i_t_gm_back") r_t_mid = r_t_gm_back i_t_mid = i_t_gm_back r_t_h = tbe.vmul(r_t_mid, s_state_h) r_t_h_fp16 = \ tvm.compute(shape_i, lambda *indices: r_t_h(*indices).astype(input_dtype), name="r_t_h_fp16") # second matmul a_l1_2 = tvm.compute( shape_a_z_bigz, lambda *indice: tvm.select( indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[ 3], indice[4]], r_t_h_fp16[0, indice[2] - in_x, indice[1], indice[3], indice[4]]), name="a_l1_2", tag="concat") b_l1_2 = tvm.compute(shape_b_2, lambda *indices: weight2(*indices), name='b_l1_2') a_l0a_2 = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1_2(*indices), name="a_l0a_2") b_l0b_2 = tvm.compute(shape_b_2, lambda *indices: b_l1_2(*indices), name="b_l0b_2") k1_2 = tvm.reduce_axis((0, k_size), name='k1_2') k0_2 = tvm.reduce_axis((0, k0_size), name='k0_2') c_l0c_2 = tvm.compute(shape_c_2, lambda t, nb_0, nb_1, mb, mp, np: tvm.sum((a_l0a_2[t, mb, k1_2, mp, k0_2] * \ b_l0b_2[t, k1_2, nb_0, nb_1, np, k0_2]) \ .astype('float32'), axis=[k1_2, k0_2]), name='c_l0c_2') c_ub_2 = tvm.compute(shape_i, lambda t, h, m, i, j: c_l0c_2(t, 0, h, m, i, j), name="c_ub_2") bias_ub_2 = tvm.compute(shape_bias_2, lambda t, h, m, i, j: bias2(t, h, m, i, j), name='bias_ub_2') bias_ub_2_fp32 = bias_ub_2 if fp16_input_output: bias_ub_2_fp32 = tvm.compute( shape_bias_2, lambda *indices: bias_ub_2(*indices).astype('float32'), name="bias_ub_2_fp32") bias_bc_ub_2 = tbe.broadcast(bias_ub_2_fp32, shape_i) c_ub_bias_2 = tbe.vadd(c_ub_2, bias_bc_ub_2) h_t_tanh = _tanh_compute(c_ub_bias_2) h_t_tanh_mid = h_t_tanh if is_gate_output: if fp16_input_output: h_t_tanh_fp16 = tvm.compute( shape_i, lambda *indices: h_t_tanh(*indices).astype('float16'), name="h_t_tanh_fp16") n_t_gm = tvm.compute(shape_i, lambda *indices: h_t_tanh_fp16(*indices), name="n_t_gm") n_t_gm_back = tvm.compute(shape_i, lambda *indices: n_t_gm(*indices), name="n_t_gm_back") n_t_gm_back_fp32 = tvm.compute( shape_i, lambda *indices: n_t_gm_back(*indices).astype('float32'), name="n_t_gm_back_fp32") h_t_tanh_mid = n_t_gm_back_fp32 else: n_t_gm = tvm.compute(shape_i, lambda *indices: h_t_tanh(*indices), name="n_t_gm") n_t_gm_back = 
        tvm.compute(shape_i, lambda *indices: n_t_gm(*indices),
                    name="n_t_gm_back")
    h_t_tanh_mid = n_t_gm_back

    c_t_tmp1 = tbe.vsub(s_state_h, h_t_tanh_mid)
    c_t_tmp2 = tbe.vmul(c_t_tmp1, i_t_mid)
    update_h = tbe.vadd(c_t_tmp2, h_t_tanh_mid)
    update_h_ub = update_h
    if fp16_input_output:
        update_h_fp16 = tvm.compute(
            shape_i_t,
            lambda *indices: update_h(*indices).astype('float16'),
            name="update_h_fp16")
        update_h_ub = update_h_fp16

    update_y_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_h_ub(0, i, j, k, l),
        name="update_y_gm")
    update_y_gm_back = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm(0, i, j, k, l),
        name="update_y_gm_back")
    update_h_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm_back(0, i, j, k, l),
        name="update_h_gm")
    # end compute

    # schedule
    s = tvm.schedule.create_schedule([update_h_gm.op])

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """
        traverse tensors by Depth-First-Search
        """
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors_r_t_h_fp16 = []
    gen_reversed_subgraph_list(r_t_h_fp16, elewise_tensors_r_t_h_fp16)

    elewise_tensors = []
    tmp_tensors = []
    gen_reversed_subgraph_list(update_h_gm, tmp_tensors)
    for i in tmp_tensors:
        if i not in elewise_tensors_r_t_h_fp16:
            elewise_tensors.append(i)

    # set scope
    s[s_state_h].set_scope(tbe_platform.scope_ubuf)
    s[s_state_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_1].set_scope(tbe_platform.scope_ca)
    s[b_l0b_1].set_scope(tbe_platform.scope_cb)
    s[c_l0c_1].set_scope(tbe_platform.scope_cc)
    s[c_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[r_t_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_2].set_scope(tbe_platform.scope_ca)
    s[b_l0b_2].set_scope(tbe_platform.scope_cb)
    s[c_l0c_2].set_scope(tbe_platform.scope_cc)
    s[c_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[update_y_gm_back].set_scope(tbe_platform.scope_ubuf)
    if is_gate_output:
        s[r_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[i_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[n_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[i_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[h_t_tanh_fp16].set_scope(tbe_platform.scope_ubuf)
            s[r_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[i_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[n_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
    if fp16_input_output:
        s[bias_ub_1_fp32].set_scope(tbe_platform.scope_ubuf)
        s[bias_ub_2_fp32].set_scope(tbe_platform.scope_ubuf)
        s[update_h_fp16].set_scope(tbe_platform.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, r_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, \
        factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_tiling(m_size, k_size, hidden_size)

    l1_n_outer_1, l1_n_inner_1 = \
        s[c_l0c_1].split(c_l0c_1.op.axis[2], factor=factor_l1_n)
    l1_m_outer_1, l1_m_inner_1 = \
        s[c_l0c_1].split(c_l0c_1.op.axis[3], factor=factor_l1_m)
    l1_k_outer_1, l1_k_inner_1 = \
        s[c_l0c_1].split(c_l0c_1.op.reduce_axis[0], factor=factor_l1_k)
    l0_n_outer_1, l0_n_inner_1 = \
        s[c_l0c_1].split(l1_n_inner_1, factor=factor_l0_n)
    l0_m_outer_1, l0_m_inner_1 = \
        s[c_l0c_1].split(l1_m_inner_1, factor=factor_l0_m)
    l0_k_outer_1, l0_k_inner_1 = \
        s[c_l0c_1].split(l1_k_inner_1, factor=factor_l0_k)
    s[c_l0c_1].reorder(c_l0c_1.op.axis[0], l1_n_outer_1, l1_k_outer_1,
                       c_l0c_1.op.axis[1], l1_m_outer_1, l0_n_outer_1,
                       l0_m_outer_1, l0_k_outer_1, l0_n_inner_1,
                       l0_m_inner_1, c_l0c_1.op.axis[4], c_l0c_1.op.axis[5],
                       l0_k_inner_1, c_l0c_1.op.reduce_axis[1])

    s[a_l1_1].double_buffer()
    s[b_l1_1].double_buffer()
    s[a_l0a_1].double_buffer()
    s[b_l0b_1].double_buffer()
    s[c_l0c_1].double_buffer()
    s[c_ub_1].double_buffer()

    s[a_l1_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l1_1].compute_at(s[c_l0c_1], c_l0c_1.op.axis[1])
    s[a_l0a_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l0b_1].compute_at(s[c_l0c_1], l0_k_outer_1)

    c_ub_bias_1_outer, c_ub_bias_1_inner = s[c_ub_bias_1].split(
        c_ub_bias_1.op.axis[2], factor=factor_l1_n)
    s[c_ub_bias_1].reorder(c_ub_bias_1.op.axis[0], c_ub_bias_1_outer,
                           c_ub_bias_1.op.axis[1], c_ub_bias_1_inner,
                           c_ub_bias_1.op.axis[3], c_ub_bias_1.op.axis[4],
                           c_ub_bias_1.op.axis[5])
    s[c_l0c_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_bc_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    if fp16_input_output:
        s[bias_ub_1_fp32].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_bias_1].emit_insn(c_ub_bias_1.op.axis[1], 'vector_add')

    r_t_h_fp16_outer, r_t_h_fp16_inner = s[r_t_h_fp16].split(
        r_t_h_fp16.op.axis[1], factor=factor_l1_n)
    for tensor in elewise_tensors_r_t_h_fp16:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        if tensor == c_ub_bias_1:
            continue
        s[tensor].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)
    if is_gate_output:
        s[r_t_gm].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        s[r_t_gm_back].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
            s[r_t_gm_back_fp32].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
    s[r_t_h_fp16].emit_insn(r_t_h_fp16_inner, 'vector_conv')

    l1_n_outer_2, l1_n_inner_2 = \
        s[c_l0c_2].split(c_l0c_2.op.axis[2], factor=factor_l1_n)
    l1_m_outer_2, l1_m_inner_2 = \
        s[c_l0c_2].split(c_l0c_2.op.axis[3], factor=factor_l1_m)
    l1_k_outer_2, l1_k_inner_2 = \
        s[c_l0c_2].split(c_l0c_2.op.reduce_axis[0], factor=factor_l1_k)
    l0_n_outer_2, l0_n_inner_2 = \
        s[c_l0c_2].split(l1_n_inner_2, factor=factor_l0_n)
    l0_m_outer_2, l0_m_inner_2 = \
        s[c_l0c_2].split(l1_m_inner_2, factor=factor_l0_m)
    l0_k_outer_2, l0_k_inner_2 = \
        s[c_l0c_2].split(l1_k_inner_2, factor=factor_l0_k)
    s[c_l0c_2].reorder(c_l0c_2.op.axis[0], l1_n_outer_2, l1_k_outer_2,
                       c_l0c_2.op.axis[1], l1_m_outer_2, l0_n_outer_2,
                       l0_m_outer_2, l0_k_outer_2, l0_n_inner_2,
                       l0_m_inner_2, c_l0c_2.op.axis[4], c_l0c_2.op.axis[5],
                       l0_k_inner_2, c_l0c_2.op.reduce_axis[1])

    s[a_l1_2].double_buffer()
    s[b_l1_2].double_buffer()
    s[a_l0a_2].double_buffer()
    s[b_l0b_2].double_buffer()
    s[c_l0c_2].double_buffer()
    s[c_ub_2].double_buffer()

    s[a_l1_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l1_2].compute_at(s[c_l0c_2], c_l0c_2.op.axis[1])
    s[a_l0a_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l0b_2].compute_at(s[c_l0c_2], l0_k_outer_2)

    update_h_gm_outer, update_h_gm_inner = s[update_h_gm].split(
        update_h_gm.op.axis[1], factor=factor_l1_n)
    s[c_l0c_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_bc_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_bias_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
    if fp16_input_output:
        s[bias_ub_2_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
        s[update_h_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
    if is_gate_output:
        s[i_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[i_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        if fp16_input_output:
            s[i_t_sigmoid_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[i_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
            s[h_t_tanh_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[n_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)

    for tensor in elewise_tensors:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        s[tensor].compute_at(s[update_h_gm], update_h_gm_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)

    # emit insn
    if is_first_round and not is_global_init:
        s[s_state_h].emit_insn(s_state_h.op.axis[0], 'broadcast')
        s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'vector_conv')
    else:
        if fp16_input_output:
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'dma_copy')
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'vector_conv')
        else:
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'dma_copy')
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                        'vector_conv')

    s[a_l1_1].emit_insn(a_l1_1.op.axis[0], 'dma_copy')
    s[b_l1_1].emit_insn(b_l1_1.op.axis[0], 'dma_copy')
    s[a_l0a_1].emit_insn(a_l0a_1.op.axis[0], 'dma_copy')
    s[b_l0b_1].emit_insn(b_l0b_1.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_1, l0_k_outer_1]}
    s[c_l0c_1].emit_insn(l0_n_inner_1, 'mad', mad_dict)
    s[c_ub_1].emit_insn(c_ub_1.op.axis[0], 'dma_copy')
    s[bias_ub_1].emit_insn(bias_ub_1.op.axis[0], 'dma_copy')
    if fp16_input_output:
        s[bias_ub_1_fp32].emit_insn(bias_ub_1_fp32.op.axis[0], 'vector_conv')
        s[bias_ub_2_fp32].emit_insn(bias_ub_2_fp32.op.axis[0], 'vector_conv')
        s[update_h_fp16].emit_insn(update_h_fp16.op.axis[0], 'vector_conv')
    s[bias_bc_ub_1].emit_insn(bias_bc_ub_1.op.axis[0], 'unified_broadcast')

    s[a_l1_2].emit_insn(a_l1_2.op.axis[0], 'dma_copy')
    s[b_l1_2].emit_insn(b_l1_2.op.axis[0], 'dma_copy')
    s[a_l0a_2].emit_insn(a_l0a_2.op.axis[0], 'dma_copy')
    s[b_l0b_2].emit_insn(b_l0b_2.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_2, l0_k_outer_2]}
    s[c_l0c_2].emit_insn(l0_n_inner_2, 'mad', mad_dict)
    s[c_ub_2].emit_insn(c_ub_2.op.axis[0], 'dma_copy')
    s[bias_ub_2].emit_insn(bias_ub_2.op.axis[0], 'dma_copy')
    s[bias_bc_ub_2].emit_insn(bias_bc_ub_2.op.axis[0], 'unified_broadcast')

    s[update_y_gm].emit_insn(update_y_gm.op.axis[0], 'dma_copy')
    s[update_y_gm_back].emit_insn(update_y_gm_back.op.axis[0], 'phony_insn')
    s[update_y_gm_back].reused_by(update_h_ub)
    if is_gate_output:
        s[r_t_gm].emit_insn(r_t_gm.op.axis[0], 'dma_copy')
        s[i_t_gm].emit_insn(i_t_gm.op.axis[0], 'dma_copy')
        s[n_t_gm].emit_insn(n_t_gm.op.axis[0], 'dma_copy')
        s[r_t_gm_back].emit_insn(r_t_gm_back.op.axis[0], 'phony_insn')
        s[i_t_gm_back].emit_insn(i_t_gm_back.op.axis[0], 'phony_insn')
        s[n_t_gm_back].emit_insn(n_t_gm_back.op.axis[0], 'phony_insn')
        if fp16_input_output:
            s[r_t_sigmoid_fp16].emit_insn(r_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[i_t_sigmoid_fp16].emit_insn(i_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[h_t_tanh_fp16].emit_insn(h_t_tanh_fp16.op.axis[0],
                                       'vector_conv')
            s[r_t_gm_back_fp32].emit_insn(r_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[i_t_gm_back_fp32].emit_insn(i_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[n_t_gm_back_fp32].emit_insn(n_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[r_t_gm_back_fp32].reused_by(r_t_sigmoid)
            s[i_t_gm_back_fp32].reused_by(i_t_sigmoid)
            s[n_t_gm_back_fp32].reused_by(h_t_tanh)
            s[r_t_gm_back].reused_by(r_t_sigmoid_fp16)
            s[i_t_gm_back].reused_by(i_t_sigmoid_fp16)
            s[n_t_gm_back].reused_by(h_t_tanh_fp16)
        else:
            s[r_t_gm_back].reused_by(r_t_sigmoid)
            s[i_t_gm_back].reused_by(i_t_sigmoid)
            s[n_t_gm_back].reused_by(h_t_tanh)

    s[update_h_gm].emit_insn(update_h_gm_inner, 'dma_copy')

    output_list = [update_y_gm, update_h_gm]
    if is_gate_output:
        output_list.append(r_t_gm)
        output_list.append(i_t_gm)
        output_list.append(n_t_gm)

    return output_list, s
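

# Illustrative sketch only (not part of the original operator): the schedule
# above tiles each matmul by splitting the L0C output axes and the reduction
# axis into L1/L0 factors and then reordering so that compute_at can stage
# the operands at the right loop level. The toy shapes, names and factors
# below are assumptions chosen purely to demonstrate that split/reorder
# pattern on a plain TVM matmul; they are not taken from this file.
def _tiling_pattern_demo():
    data_a = tvm.placeholder((64, 64), name="data_a", dtype="float16")
    data_b = tvm.placeholder((64, 64), name="data_b", dtype="float16")
    reduce_k = tvm.reduce_axis((0, 64), name="reduce_k")
    mat_c = tvm.compute(
        (64, 64),
        lambda i, j: tvm.sum(data_a[i, reduce_k] * data_b[reduce_k, j],
                             axis=reduce_k),
        name="mat_c")
    sch = tvm.create_schedule(mat_c.op)
    # split output rows/cols and the reduction axis into outer/inner tiles
    m_outer, m_inner = sch[mat_c].split(mat_c.op.axis[0], factor=16)
    n_outer, n_inner = sch[mat_c].split(mat_c.op.axis[1], factor=16)
    k_outer, k_inner = sch[mat_c].split(mat_c.op.reduce_axis[0], factor=16)
    # keep tile loops outermost so operand loads could be attached at k_outer
    sch[mat_c].reorder(m_outer, n_outer, k_outer, m_inner, n_inner, k_inner)
    return sch
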
def get_matmul_tensor(x, h, c, w, b, build_list, tensor_list, scope_list,
                      operation_list, is_hisi_es):
    shape_x = x.get("shape")
    shape_h = h.get("shape")
    shape_c = c.get("shape")
    dtype_x = x.get("dtype").lower()
    dtype_c = c.get("dtype").lower()
    dtype_b = b.get("dtype").lower()

    input_dim, batch_dim = shape_x[0:2]
    hidden_dim = shape_h[0]
    output_dim = hidden_dim

    shape_b = b.get("shape")
    shape_b = (shape_b[0] // 16, 16)
    shape_xh = (batch_dim, input_dim + hidden_dim, C0, C0)
    shape_w = w.get("shape")
    shape_w_split = list(shape_w)
    shape_w_split[1] = shape_w_split[1] // 4

    # Inputs in gm
    tensor_x = tvm.placeholder(shape_x, name='tensor_x', dtype=dtype_x)
    tensor_h = tvm.placeholder(shape_h, name='tensor_h', dtype=dtype_x)
    tensor_c = tvm.placeholder(shape_c, name='tensor_c', dtype=dtype_c)
    tensor_w = tvm.placeholder(shape_w, name='tensor_w', dtype=dtype_x)
    tensor_b = tvm.placeholder(shape_b, name='tensor_b', dtype=dtype_c)

    build_list["x"] = tensor_x
    build_list["h"] = tensor_h
    build_list["c"] = tensor_c
    build_list["w"] = tensor_w
    build_list["b"] = tensor_b

    symbol = ["it", "jt", "ft", "ot"]

    def _index_w(str_name, *index):
        if str_name == "it":
            return index[0], index[1], index[2], index[3]
        elif str_name == "jt":
            return index[0], index[1] + output_dim, index[2], index[3]
        elif str_name == "ft":
            return index[0], index[1] + output_dim * 2, index[2], index[3]
        return index[0], index[1] + output_dim * 3, index[2], index[3]

    def _index_bias(str_name):
        if str_name == "it":
            return 0
        elif str_name == "jt":
            return 1
        elif str_name == "ft":
            return 2
        return 3

    matmul_type = "float32"
    if is_hisi_es:
        matmul_type = "float16"

    for t in symbol:
        # concat x and h into one tensor and copy it to L1
        tensor_xh_l1_tmp = tvm.compute(
            shape_xh,
            lambda *indice: tvm.select(
                indice[1] < input_dim,
                tensor_x[indice[1], indice[0], indice[2], indice[3]],
                tensor_h[indice[1] - input_dim, indice[0], indice[2],
                         indice[3]]),
            name="tensor_xh_l1_" + t,
            tag="concat")
        tensor_list["tensor_xh_l1_" + t] = tensor_xh_l1_tmp
        scope_list["tensor_xh_l1_" + t] = cce.scope_cbuf
        # optimization: copy one time
        operation_list["tensor_xh_l1_" + t] = "dma_copy"

        # copy xh from L1 to L0A
        tensor_xh_l0a_tmp = tvm.compute(shape_xh,
                                        lambda *i: tensor_xh_l1_tmp(*i),
                                        name='tensor_xh_l0a_' + t)
        tensor_list["tensor_xh_l0a_" + t] = tensor_xh_l0a_tmp
        scope_list["tensor_xh_l0a_" + t] = cce.scope_ca
        # optimization: copy one time
        operation_list["tensor_xh_l0a_" + t] = "dma_copy"

        # copy w to L1 buf
        tensor_w_l1_tmp = tvm.compute(shape_w_split,
                                      lambda *i: tensor_w(*_index_w(t, *i)),
                                      name='tensor_w_l1_' + t)
        tensor_list["tensor_w_l1_" + t] = tensor_w_l1_tmp
        scope_list["tensor_w_l1_" + t] = cce.scope_cbuf
        operation_list["tensor_w_l1_" + t] = "dma_copy"

        # copy w from L1 to L0B
        tensor_w_l0b_tmp = tvm.compute(shape_w_split,
                                       lambda *i: tensor_w_l1_tmp(*i),
                                       name='tensor_w_l0b_' + t)
        tensor_list["tensor_w_l0b_" + t] = tensor_w_l0b_tmp
        scope_list["tensor_w_l0b_" + t] = cce.scope_cb
        operation_list["tensor_w_l0b_" + t] = "dma_copy"

        # copy bias to ubuf and slice out this gate's segment
        tensor_b_ub_tmp = tvm.compute(
            shape_b,
            lambda i0, i1: tensor_b[_index_bias(t) * output_dim + i0, i1],
            name='tensor_b_ub_' + t)
        tensor_list["tensor_b_ub_" + t] = tensor_b_ub_tmp
        scope_list["tensor_b_ub_" + t] = cce.scope_ubuf
        operation_list["tensor_b_ub_" + t] = "dma_copy"

        tensor_b_ub_true_tmp = tensor_b_ub_tmp
        if not is_hisi_es and dtype_b == "float16":
            tensor_b_ub_true_tmp = tvm.compute(
                shape_b,
                lambda *i: topi.cast(tensor_b_ub_tmp(*i), "float32"),
                name="tensor_b_ub_true_" + t)
            tensor_list["tensor_b_ub_true_" + t] = tensor_b_ub_true_tmp
            scope_list["tensor_b_ub_true_" + t] = cce.scope_ubuf
            operation_list["tensor_b_ub_true_" + t] = "vector_conv"

        # broadcast bias from [output_dim//16, 16] to
        # [output_dim//16, N//16, 16, 16]
        tensor_b_loc_tmp = tvm.compute(
            shape_h,
            lambda i0, i1, i2, i3: tensor_b_ub_true_tmp[i0, i3],
            name='tensor_b_loc_' + t)
        tensor_list["tensor_b_loc_" + t] = tensor_b_loc_tmp
        scope_list["tensor_b_loc_" + t] = cce.scope_cc
        operation_list["tensor_b_loc_" + t] = "dma_copy"

        # do matmul
        reduce_kb = tvm.reduce_axis((0, input_dim + output_dim),
                                    name='reduce_kb_' + t)
        reduce_kp = tvm.reduce_axis((0, C0), name='reduce_kp_' + t)
        tensor_matmul_l0c_tmp = tvm.compute(
            shape_h,
            lambda nb, mb, mp, np: tvm.sum(
                (tensor_xh_l0a_tmp[mb, reduce_kb, mp, reduce_kp] *
                 tensor_w_l0b_tmp[reduce_kb, nb, np, reduce_kp]).astype(
                     matmul_type),
                axis=[reduce_kb, reduce_kp]),
            name='tensor_matmul_l0c_' + t,
            attrs={'input_order': 'positive'})
        tensor_list["tensor_matmul_l0c_" + t] = tensor_matmul_l0c_tmp
        scope_list["tensor_matmul_l0c_" + t] = cce.scope_cc

        # matmul accumulation: it + b_it
        tensor_matmul_result_l0c_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_b_loc_tmp(*i) + tensor_matmul_l0c_tmp(*i),
            name="tensor_matmul_result_l0c_" + t)
        tensor_list["tensor_matmul_result_l0c_" + t] = \
            tensor_matmul_result_l0c_tmp
        scope_list["tensor_matmul_result_l0c_" + t] = cce.scope_cc
        operation_list["tensor_matmul_result_l0c_" + t] = "phony_insn"

        # copy matmul result from L0C to UB
        gate_ub_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_list["tensor_matmul_result_l0c_" + t](*i),
            name=t + "_ub")
        tensor_list[t + "_ub"] = gate_ub_tmp
        scope_list[t + "_ub"] = cce.scope_ubuf
        operation_list[t + "_ub"] = "dma_copy"