def CusMatMulCubeFraczRightMul(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
                               kernel_name="matmulcube"):
    """CusMatMulCubeFraczRightMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    input_x1_shape = input_x1.get("shape")
    input_x1_dtype = input_x1.get("dtype").lower()
    input_x2_shape = input_x2.get("shape")
    input_x2_dtype = input_x2.get("dtype").lower()
    input_x3_shape = input_x3.get("shape")
    input_x3_dtype = input_x3.get("dtype").lower()
    output_shape = output_y.get("shape")
    Supported = [((72, 8, 16, 16), "float16", (72, 72, 16, 16), "float16", (1,), "float32"),
                 ((32, 8, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((8, 32, 16, 16), "float16", (8, 8, 16, 16), "float16", (1,), "float32"),
                 ((4, 4, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
                 ((4, 16, 16, 16), "float16", (4, 4, 16, 16), "float16", (1,), "float32"),
                 ((49, 4, 16, 16), "float16", (49, 49, 16, 16), "float16", (1,), "float32"),
                 ((36, 4, 16, 16), "float16", (36, 36, 16, 16), "float16", (1,), "float32"),
                 ((64, 16, 16, 16), "float16", (64, 64, 16, 16), "float16", (1,), "float32"),
                 ((32, 64, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((32, 16, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((16, 32, 16, 16), "float16", (16, 16, 16, 16), "float16", (1,), "float32"),
                 ((16, 8, 16, 16), "float16", (16, 16, 16, 16), "float16", (1,), "float32"),
                 ((16, 4, 16, 16), "float16", (16, 16, 16, 16), "float16", (1,), "float32"),
                 ((288, 32, 16, 16), "float16", (288, 288, 16, 16), "float16", (1,), "float32"),
                 ((144, 16, 16, 16), "float16", (144, 144, 16, 16), "float16", (1,), "float32"),
                 ((128, 32, 16, 16), "float16", (128, 128, 16, 16), "float16", (1,), "float32"),
                 ((64, 128, 16, 16), "float16", (64, 64, 16, 16), "float16", (1,), "float32"),
                 ((32, 128, 16, 16), "float16", (32, 32, 16, 16), "float16", (1,), "float32"),
                 ((64, 32, 16, 16), "float16", (64, 64, 16, 16), "float16", (1,), "float32"),
                 ((16, 64, 16, 16), "float16", (16, 16, 16, 16), "float16", (1,), "float32")]
    input_shape = (tuple(input_x1_shape), input_x1_dtype, tuple(input_x2_shape), input_x2_dtype,
                   tuple(input_x3_shape), input_x3_dtype)
    if input_shape not in Supported:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    input_x1 = tik_instance.Tensor("float16", input_x1_shape, name="left_matrix", scope=tik.scope_gm)
    input_x2 = tik_instance.Tensor("float16", input_x2_shape, name="right_matrix", scope=tik.scope_gm)
    input_x3 = tik_instance.Tensor("float32", input_x3_shape, name="matrix_max", scope=tik.scope_gm)
    resMatmul = tik_instance.Tensor("float32", output_shape, name="output", scope=tik.scope_gm)
    cus_cube_matmul_right_mul(tik_instance, input_x1, input_x2, input_x3, resMatmul)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
    return tik_instance
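# A minimal invocation sketch (hypothetical, for illustration only): the op takes dict
# descriptors, validates them against the `Supported` whitelist above, and builds a CCE
# kernel. The shapes below are taken from that whitelist; the te/tik compile environment
# is assumed to be set up.
def _example_matmul_fracz_right_mul():
    x1 = {"shape": (32, 8, 16, 16), "dtype": "float16"}   # FracZ left matrix
    x2 = {"shape": (32, 32, 16, 16), "dtype": "float16"}  # FracZ right matrix
    x3 = {"shape": (1,), "dtype": "float32"}              # scalar matrix_max
    y = {"shape": (32, 8, 16, 16), "dtype": "float32"}
    return CusMatMulCubeFraczRightMul(x1, x2, x3, output_y=y,
                                      kernel_name="matmulcube_fracz_right_mul")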
def CusMatMulCubeDenseRight(input_x1, input_x2, input_x3, bias=None, output_y={}, trans_a=False, trans_b=False,
                            kernel_name="matmulcube"):
    """CusMatMulCubeDenseRight"""
    shape_a_temp = (128, 63, 16, 16)
    shape_b_temp = (128, 128, 16, 16)
    shape_output = output_y.get("shape")
    matrix_max_shape = (1,)
    support_shape = [(shape_a_temp, shape_b_temp, matrix_max_shape),]
    shape_a_input = input_x1.get("shape")
    shape_b_input = input_x2.get("shape")
    matrix_max_input = input_x3.get("shape")
    input_shape = (tuple(shape_a_input), tuple(shape_b_input), tuple(matrix_max_input))
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    if shape_a_temp[0] == 128 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 128:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        input_x3 = tik_instance.Tensor("float32", [1,], name="matrix_max", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float32", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            core_m_idx = block_index // 16
            core_n_idx = block_index % 16
            matrix_max_scalar = tik_instance.Scalar("float32")
            matrix_max_local_UB = tik_instance.Tensor("float32", (8,), scope=tik.scope_ubuf,
                                                      name="matrix_max_local_UB")
            tik_instance.data_move(matrix_max_local_UB, input_x3, 0, 1, 1, 0, 0)
            matrix_max_scalar.set_as(matrix_max_local_UB[0])
            resMatmul_local_UB = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_ubuf,
                                                      name="resMatmul_local_UB1")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (256 * 128,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB_local_L0C")
            resMatmul_local_UB_local_L0C1 = tik_instance.Tensor("float32", (240 * 128,), scope=tik.scope_cc,
                                                                name="resMatmul_local_UB_local_L0C1")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (256 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_2_local_L11 = tik_instance.Tensor("float16", (8 * 128 * 16,), scope=tik.scope_cbuf,
                                                    name="input_2_local_L11")
            input_1_local_L1 = tik_instance.Tensor("float16", (8 * 256 * 16,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_1_local_L11 = tik_instance.Tensor("float16", (8 * 240 * 16,), scope=tik.scope_cbuf,
                                                    name="input_1_local_L11")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            input_2_local_L1_local_L0B1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cb,
                                                              name="input_2_local_L1_local_L0B1")
            with tik_instance.if_scope(core_m_idx == 0):
                with tik_instance.for_range(0, 2) as cc1:
                    tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                           0, 8, 128, 1920, 0)
                    tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + cc1 * 4096],
                                           0, 8, 256, 752, 0)
                    with tik_instance.for_range(0, 8) as cc10:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256],
                                              0, 8, 8, 0, True)
                    with tik_instance.for_range(0, 16) as cc101:
                        tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048],
                                              input_1_local_L1[cc101 * 256], 0, 8, 16, 0, False)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 256, 128, 128, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                    tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar,
                                       255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                       matrix_max_scalar, 255, 1, 1, 8, 8)
                    tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                       matrix_max_scalar, 2, 1, 1, 8, 8)
                    tik_instance.data_move(resMatmul[core_n_idx * 129024 + cc1 * 4096], resMatmul_local_UB,
                                           0, 8, 512, 0, 1504)
            with tik_instance.else_scope():
                tik_instance.data_move(input_2_local_L1, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L1, input_x1[core_n_idx * 129024 + 2 * 4096],
                                       0, 8, 256, 752, 0)
                with tik_instance.for_range(0, 8) as cc10:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B[cc10 * 2048], input_2_local_L1[cc10 * 256],
                                          0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc101:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc101 * 2048], input_1_local_L1[cc101 * 256],
                                          0, 8, 16, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B, 256, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB, resMatmul_local_UB, matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[255 * 64], resMatmul_local_UB[255 * 64],
                                   matrix_max_scalar, 255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB[510 * 64], resMatmul_local_UB[510 * 64],
                                   matrix_max_scalar, 2, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 2 * 4096], resMatmul_local_UB,
                                       0, 8, 512, 0, 1504)
                tik_instance.data_move(input_2_local_L11, input_x2[core_n_idx * 262144 + core_n_idx * 2048],
                                       0, 8, 128, 1920, 0)
                tik_instance.data_move(input_1_local_L11, input_x1[core_n_idx * 129024 + 12288],
                                       0, 8, 240, 768, 0)
                with tik_instance.for_range(0, 8) as cc102:
                    tik_instance.load2dv1(input_2_local_L1_local_L0B1[cc102 * 2048], input_2_local_L11[cc102 * 256],
                                          0, 8, 8, 0, True)
                with tik_instance.for_range(0, 16) as cc103:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc103 * 2048], input_1_local_L11[cc103 * 256],
                                          0, 8, 15, 0, False)
                tik_instance.mmad(resMatmul_local_UB_local_L0C1, input_1_local_L1_local_L0A,
                                  input_2_local_L1_local_L0B1, 240, 128, 128, 0)
                tik_instance.data_move(resMatmul_local_UB1, resMatmul_local_UB_local_L0C1, 0, 1, 120, 0, 0)
                tik_instance.vmuls(64, resMatmul_local_UB1, resMatmul_local_UB1, matrix_max_scalar,
                                   255, 1, 1, 8, 8)
                tik_instance.vmuls(64, resMatmul_local_UB1[255 * 64], resMatmul_local_UB1[255 * 64],
                                   matrix_max_scalar, 225, 1, 1, 8, 8)
                tik_instance.data_move(resMatmul[core_n_idx * 129024 + 12288], resMatmul_local_UB1,
                                       0, 8, 480, 0, 1536)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2, input_x3], outputs=[resMatmul])
        return tik_instance
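# Stripped of the fractal tiling and the 32-core schedule above, the arithmetic this
# kernel performs is a dense matmul scaled by the scalar `matrix_max`. A NumPy reference
# model (an illustrative assumption: it ignores the fractal layouts, so the operand
# order is stated only in plain row-major terms):
def _matmul_cube_dense_right_reference(x1, x2, matrix_max):
    import numpy as np
    return (x1.astype(np.float32) @ x2.astype(np.float32)) * np.float32(matrix_max)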
def CusCholeskyTrsm(input_x, output, kernel_name):
    """CusCholeskyTrsm"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
    matrix_dim = input_x_shape[0]
    split_dim = min(matrix_dim, split_dim)
    vector_repeat_times = int(split_dim // 64)
    blocks = int(matrix_dim // split_dim)
    if blocks == 0:
        blocks = 1
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
    with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
        input_x_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="input_x_ub",
                                         scope=tik.scope_ubuf)
        temp_ub = tik_instance.Tensor("float32", (split_dim, split_dim), name="temp_ub", scope=tik.scope_ubuf)
        assist_1_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_1_ub", scope=tik.scope_ubuf)
        assist_2_ub = tik_instance.Tensor("float32", (split_dim,), name="assist_2_ub", scope=tik.scope_ubuf)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.data_move(input_x_ub[i, 0],
                                   input_x[block_index * split_dim + i, block_index * split_dim],
                                   0, 1, vector_repeat_times * 8, 0, 0)
        scalar1 = tik_instance.Scalar("float32", init_value=-0.5)
        with tik_instance.for_range(0, split_dim) as i:
            scalar2 = tik_instance.Scalar("float32")
            # vln -> vmuls(-0.5) -> vexp computes exp(-0.5 * ln(x)) = x ** -0.5,
            # i.e. the reciprocal square root used to scale pivot row i
            tik_instance.vln(64, assist_1_ub[0], input_x_ub[i, 0], vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vmuls(64, assist_2_ub[0], assist_1_ub[0], scalar1, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vexp(64, assist_1_ub[0], assist_2_ub[0], vector_repeat_times, 1, 1, 8, 8)
            scalar2.set_as(assist_1_ub[i])
            tik_instance.vmuls(64, input_x_ub[i, 0], input_x_ub[i, 0], scalar2, vector_repeat_times, 1, 1, 8, 8)
            with tik_instance.for_range(i + 1, split_dim) as j:
                scalar3 = tik_instance.Scalar("float32")
                scalar3.set_as(input_x_ub[i, j])
                tik_instance.vmuls(64, temp_ub[j, 0], input_x_ub[i, 0], scalar3, vector_repeat_times, 1, 1, 8, 8)
            tik_instance.vsub(64, input_x_ub[i + 1, 0], input_x_ub[i + 1, 0], temp_ub[i + 1, 0],
                              (split_dim - 1 - i) * vector_repeat_times, 1, 1, 1, 8, 8, 8)
        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        one = tik_instance.Scalar("float32")
        one.set_as(1.0)
        with tik_instance.for_range(0, split_dim) as i:
            tik_instance.vector_dup(64, temp_ub[i, 0], zero, vector_repeat_times, 1, 8)
            temp_ub.__setitem__(i * split_dim + i, one)
        chol_diag_element_final = tik_instance.Scalar("float32")
        chol_diag_element_final.set_as(input_x_ub[split_dim * split_dim - 1])
        trsm_diag_element = tik_instance.Scalar("float32")
        trsm_diag_element.set_as(1.0 / chol_diag_element_final)
        temp_ub.__setitem__(split_dim * split_dim - 1, trsm_diag_element)
        with tik_instance.for_range(1, split_dim) as i:
            index = split_dim - i - 1
            tik_instance.vector_dup(64, assist_1_ub, zero, vector_repeat_times, 1, 8)
            with tik_instance.for_range(0, i) as j:
                chol_diag_element_loop = tik_instance.Scalar("float32")
                chol_diag_element_loop.set_as(input_x_ub[index, index + 1 + j])
                tik_instance.vmuls(64, assist_2_ub, temp_ub[j + index + 1, 0], chol_diag_element_loop,
                                   vector_repeat_times, 1, 1, 8, 8)
                tik_instance.vadd(64, assist_1_ub, assist_2_ub, assist_1_ub, vector_repeat_times,
                                  1, 1, 1, 8, 8, 8)
            temp_scalar = tik_instance.Scalar("float32")
            temp_scalar.set_as(input_x_ub[index, index])
            chol_diag_element = tik_instance.Scalar("float32")
            chol_diag_element.set_as(1.0 / temp_scalar)
            tik_instance.vsub(64, temp_ub[index, 0], temp_ub[index, 0], assist_1_ub,
                              vector_repeat_times, 1, 1, 1, 8, 8, 8)
            tik_instance.vmuls(64, temp_ub[index, 0], temp_ub[index, 0], chol_diag_element,
                               vector_repeat_times, 1, 1, 8, 8)
        tik_instance.data_move(res[block_index, 0, 0], temp_ub, 0, 1, 8 * vector_repeat_times * split_dim, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
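# A plain NumPy model of what one split_dim x split_dim diagonal block goes through
# (an assumption inferred from the kernel name and the elimination steps above:
# Cholesky-factorize the block, then invert the triangular factor by back-substitution;
# the exact transpose convention depends on the row/column orientation of the tiles):
def _cholesky_trsm_reference(a_block):
    import numpy as np
    chol = np.linalg.cholesky(a_block)  # triangular factor L with A = L @ L.T
    return np.linalg.solve(chol, np.eye(a_block.shape[0], dtype=a_block.dtype))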
def CusTranspose02314(input_x, output, kernel_name="transpose021354"):
    """CusTranspose02314"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    perm = (0, 2, 3, 1, 4)
    input_x_shape = tuple(input_x_shape)
    support_shape = [(32, 128, 7, 7, 16), (32, 32, 7, 7, 16), (32, 32, 14, 14, 16), (32, 64, 14, 14, 16),
                     (32, 16, 14, 14, 16), (32, 16, 28, 28, 16), (32, 32, 28, 28, 16), (32, 8, 28, 28, 16),
                     (32, 8, 56, 56, 16), (32, 16, 56, 56, 16), (32, 4, 56, 56, 16), (32, 4, 112, 112, 16)]
    if input_x_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_x_shape))

    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x = tik_instance.Tensor("float16", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float16", output_shape, name="res", scope=tik.scope_gm)
    dtype = "float16"

    if tuple(input_x_shape) == (32, 4, 112, 112, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    zero = tik_instance.Scalar(dtype="float16", init_value=0)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 14336 + 7168 * db_idx],
                                           0, 4, 448, 12096, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 4, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 14336 + 7168 * db_idx],
                                           0, 4, 448, 2688, 0)
                    with tik_instance.for_range(0, 448) as cc7:
                        with tik_instance.for_range(0, 4) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 64 + cc8 * 16],
                                               input_1_local_UB[7168 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2",
                                                    scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 43008], 0, 4, 448, 2688, 0)
            with tik_instance.for_range(0, 448) as cc72:
                with tik_instance.for_range(0, 4) as cc82:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc72 * 64 + cc82 * 16],
                                       input_1_local_UB2[7168 * cc82 + cc72 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 14) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 802816 + cc1_db * 3584 + 1792 * db_idx],
                                           0, 16, 112, 3024, 0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 802816 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 56, 56, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 401408 + cc1_db * 7168 + 3584 * db_idx],
                                           0, 8, 224, 2912, 0)
                    with tik_instance.for_range(0, 224) as cc7:
                        # 8 C1 tiles of 3584 elements each fill the 28672-element UB buffer
                        with tik_instance.for_range(0, 8) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3584 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 8, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 100352 + cc1_db * 6272 + 3136 * db_idx],
                                           0, 8, 196, 588, 0)
                    with tik_instance.for_range(0, 196) as cc7:
                        with tik_instance.for_range(0, 8) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 128 + cc8 * 16],
                                               input_1_local_UB[3136 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 100352 + cc1_db * 50176 + 25088 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 32, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 7) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 401408 + cc1_db * 1792 + 896 * db_idx],
                                           0, 32, 56, 728, 0)
                    with tik_instance.for_range(0, 56) as cc7:
                        with tik_instance.for_range(0, 32) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 512 + cc8 * 16],
                                               input_1_local_UB[896 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 401408 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 28, 28, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 3) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                    input_1_local_UB = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB",
                                                           scope=tik.scope_ubuf)
                    T_transpose_local_UB = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB",
                                                               scope=tik.scope_ubuf)
                    tik_instance.data_move(input_1_local_UB,
                                           input_x[block_idx * 200704 + cc1_db * 3584 + 1792 * db_idx],
                                           0, 16, 112, 672, 0)
                    with tik_instance.for_range(0, 112) as cc7:
                        with tik_instance.for_range(0, 16) as cc8:
                            tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                               input_1_local_UB[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                    tik_instance.data_move(res[block_idx * 200704 + cc1_db * 57344 + 28672 * db_idx],
                                           T_transpose_local_UB, 0, 1, 1792, 0, 0)
            input_1_local_UB2 = tik_instance.Tensor(dtype, [28672], name="input_1_local_UB2",
                                                    scope=tik.scope_ubuf)
            T_transpose_local_UB2 = tik_instance.Tensor(dtype, [28672], name="T_transpose_local_UB2",
                                                        scope=tik.scope_ubuf)
            tik_instance.data_move(input_1_local_UB2, input_x[block_idx * 200704 + 10752], 0, 16, 112, 672, 0)
            with tik_instance.for_range(0, 112) as cc7:
                with tik_instance.for_range(0, 16) as cc8:
                    tik_instance.vadds(16, T_transpose_local_UB2[cc7 * 256 + cc8 * 16],
                                       input_1_local_UB2[1792 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + 172032], T_transpose_local_UB2, 0, 1, 1792, 0, 0)
    elif tuple(input_x_shape) == (32, 16, 14, 14, 16):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            zero = tik_instance.Scalar(dtype="float16", init_value=0)
            with tik_instance.for_range(0, 2, thread_num=2) as db_idx:
                input_1_local_UB = tik_instance.Tensor(dtype, [25088], name="input_1_local_UB",
                                                       scope=tik.scope_ubuf)
                T_transpose_local_UB = tik_instance.Tensor(dtype, [25088], name="T_transpose_local_UB",
                                                           scope=tik.scope_ubuf)
                tik_instance.data_move(input_1_local_UB, input_x[block_idx * 50176 + 1568 * db_idx],
                                       0, 16, 98, 98, 0)
                with tik_instance.for_range(0, 98) as cc7:
                    with tik_instance.for_range(0, 16) as cc8:
                        tik_instance.vadds(16, T_transpose_local_UB[cc7 * 256 + cc8 * 16],
                                           input_1_local_UB[1568 * cc8 + cc7 * 16], zero, 1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 50176 + 25088 * db_idx], T_transpose_local_UB,
                                       0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 128, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 7, thread_num=2) as cc1:
                input_x_ub = tik_instance.Tensor(dtype, [1, 128, 1, 7, 16], name="input_1_local_UB",
                                                 scope=tik.scope_ubuf)
                transpose_ub = tik_instance.Tensor(dtype, [1, 1, 7, 128, 16], name="transpose_local_UB",
                                                   scope=tik.scope_ubuf)
                tik_instance.data_move(input_x_ub, input_x[block_idx, 0, cc1, 0, 0], 0, 128, 7, 42, 0)
                with tik_instance.for_range(0, 7) as cc7:
                    with tik_instance.for_range(0, 128) as cc8:
                        tik_instance.vadds(16, transpose_ub[0, 0, cc7, cc8, 0], input_x_ub[0, cc8, 0, cc7, 0],
                                           0, 1, 1, 1, 0, 0)
                tik_instance.data_move(res[block_idx * 100352 + 14336 * cc1], transpose_ub, 0, 1, 896, 0, 0)
    elif tuple(input_x_shape) == (32, 32, 7, 7, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 7, 7, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 7, 7, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, 0, 0, 0], 0, 1, 1568, 0, 0)
            with tik_instance.for_range(0, 7) as cc1:
                with tik_instance.for_range(0, 7) as cc2:
                    with tik_instance.for_range(0, 32) as cc3:
                        tik_instance.vadds(16, transpose_ub[0, cc1, cc2, cc3, 0], input_x_ub[0, cc3, cc1, cc2, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 25088], transpose_ub, 0, 1, 1568, 0, 0)
    elif tuple(input_x_shape) == (32, 32, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        def _inner_compute(split_index):
            input_x_ub = tik_instance.Tensor(dtype, [1, 32, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 32, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 32, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 32) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 100352 + split_index * 2 * 7168], transpose_ub,
                                   0, 1, 896, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1)
            _inner_compute(6)
    elif tuple(input_x_shape) == (32, 64, 14, 14, 16) and tuple(perm) == (0, 2, 3, 1, 4) and dtype == "float16":
        def _inner_compute(split_index, block_idx):
            input_x_ub = tik_instance.Tensor(dtype, [1, 64, 2, 14, 16], name="input_1_local_UB",
                                             scope=tik.scope_ubuf)
            transpose_ub = tik_instance.Tensor(dtype, [1, 2, 14, 64, 16], name="transpose_local_UB",
                                               scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[block_idx, 0, split_index * 2, 0, 0], 0, 64, 28, 168, 0)
            with tik_instance.for_range(0, 2) as cc2:
                with tik_instance.for_range(0, 14) as cc3:
                    with tik_instance.for_range(0, 64) as cc4:
                        tik_instance.vadds(16, transpose_ub[0, cc2, cc3, cc4, 0], input_x_ub[0, cc4, cc2, cc3, 0],
                                           0, 1, 1, 1, 0, 0)
            tik_instance.data_move(res[block_idx * 200704 + split_index * 2 * 14336], transpose_ub,
                                   0, 1, 1792, 0, 0)

        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 6, thread_num=2) as cc1:
                _inner_compute(cc1, block_idx)
            _inner_compute(6, block_idx)

    tik_instance.BuildCCE(kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
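# Every branch above implements the same permutation and differs only in its
# shape-specific tiling and double buffering. In NumPy terms the whole op is:
def _transpose02314_reference(x):
    import numpy as np
    # perm = (0, 2, 3, 1, 4): NC1HWC0 -> N H W C1 C0 ordering
    return np.ascontiguousarray(x.transpose(0, 2, 3, 1, 4))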
def CusMatMulCubeDenseLeft(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                           kernel_name="matmulcube"):
    """
    calculating matrix multiplication with bias, C = A*B + bias, support input
    data with fractal format.

    Parameters:
    shape_a: list or tuple
        Shape of the first tensor a with rank > 1
    shape_b: list or tuple
        Shape of the second tensor b with the same type as a;
        shape_a and shape_b must be 2 dims
    src_dtype: str
        The data type of input, support "float32", "float16"
    dst_dtype: str
        The data type of output, support "float32", "float16"
    trans_a: bool
        If True, shape_a is transposed before multiplication
    trans_b: bool
        If True, shape_b is transposed before multiplication
    is_fractal: bool
        If True, the input data format of a and b must be fractal format
    shape_bias: list or tuple
        Shape of bias, only support the input data format with ND

    Returns
    -------
    The built tik_instance on the hand-tiled path, None on the generic TBE path
    """
    print("!!!!come into zzt~~~~~~~!!!!")
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    shape_output = output_y.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    dst_dtype = output_y.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    format_a = "FRACTAL_NZ"
    format_b = "FRACTAL_NZ"
    print("=======================================")
    print(shape_a_temp, shape_b_temp)
    print(format_a, format_b)
    print("=======================================")
    tensor_bias = None
    tensor_a = tvm.placeholder(shape_a_temp, name='tensor_a', dtype=src_dtype)
    tensor_b = tvm.placeholder(shape_b_temp, name='tensor_b', dtype=src_dtype)
    if shape_bias:
        tensor_bias = tvm.placeholder(shape_bias, name='tensor_bias', dtype=dst_dtype)

    if shape_a_temp[0] == 63 and shape_a_temp[1] == 63 and shape_b_temp[0] == 128 and shape_b_temp[1] == 63:
        if util.get_product_version() == util.VERSION_MINI:
            tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
        else:
            tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
        input_x1 = tik_instance.Tensor("float16", shape_a_temp, name="left_matrix", scope=tik.scope_gm)
        input_x2 = tik_instance.Tensor("float16", shape_b_temp, name="right_matrix", scope=tik.scope_gm)
        resMatmul = tik_instance.Tensor("float16", shape_output, name="output", scope=tik.scope_gm)
        with tik_instance.for_range(0, 32, block_num=32) as block_index:
            resMatmul_local_UB = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_ubuf,
                                                     name="resMatmul_local_UB")
            resMatmul_local_UB_local_L0C = tik_instance.Tensor("float32", (128 * 256,), scope=tik.scope_cc,
                                                               name="resMatmul_local_UB")
            input_1_local_L1_local_L0A = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_ca,
                                                             name="input_1_local_L1_local_L0A")
            input_2_local_L1 = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cbuf,
                                                   name="input_2_local_L1")
            input_1_local_L1 = tik_instance.Tensor("float16", (128 * 128,), scope=tik.scope_cbuf,
                                                   name="input_1_local_L1")
            input_2_local_L1_local_L0B = tik_instance.Tensor("float16", (128 * 256,), scope=tik.scope_cb,
                                                             name="input_2_local_L1_local_L0B")
            core_m_idx = block_index % 8
            core_n_idx = block_index // 8
            with tik_instance.if_scope(core_m_idx != 7):
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 8, 128, 55 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 128, 55 * 16, 0)
                with tik_instance.for_range(0, 8) as cc12:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc12 * 2048], input_1_local_L1[cc12 * 256],
                                          0, 8, 8, 0, False)
                with tik_instance.for_range(0, 2) as cc6:
                    with tik_instance.for_range(0, 8) as cc121:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc121 * 4096],
                                              input_2_local_L1[cc6 * 32768 + cc121 * 256], 0, 16, 8, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 128, 128, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 128, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc6 * 256 * 1008 + core_m_idx * 8 * 256
                                                     + core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 256 // 2, 0, 55 * 16 * 2 // 2)
            with tik_instance.else_scope():
                tik_instance.data_move(input_1_local_L1, input_x1[core_m_idx * (8 * 256 + 128 * 1008)],
                                       0, 7, 112, 56 * 16, 0)
                tik_instance.data_move(input_2_local_L1,
                                       input_x2[core_m_idx * 8 * 256 + core_n_idx * 512 * 1008],
                                       0, 32, 112, 56 * 16, 0)
                with tik_instance.for_range(0, 7) as cc10:
                    tik_instance.load2dv1(input_1_local_L1_local_L0A[cc10 * 1792], input_1_local_L1[cc10 * 256],
                                          0, 7, 7, 0, False)
                with tik_instance.for_range(0, 2) as cc5:
                    with tik_instance.for_range(0, 7) as cc101:
                        tik_instance.load2dv1(input_2_local_L1_local_L0B[cc101 * 4096],
                                              input_2_local_L1[cc5 * 28672 + cc101 * 256], 0, 16, 7, 0, True)
                    tik_instance.mmad(resMatmul_local_UB_local_L0C, input_1_local_L1_local_L0A,
                                      input_2_local_L1_local_L0B, 112, 112, 256, 0)
                    tik_instance.data_move(resMatmul_local_UB, resMatmul_local_UB_local_L0C, 0, 1, 112, 0, 0, 1)
                    tik_instance.data_move(resMatmul[cc5 * 256 * 1008 + core_m_idx * 8 * 256
                                                     + core_n_idx * 512 * 1008],
                                           resMatmul_local_UB, 0, 16, 224 // 2, 0, 56 * 16 * 2 // 2)
        tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[resMatmul])
        return tik_instance

    print("come into tbe, shape is error!")
    result = te.lang.cce.matmul(tensor_a, tensor_b, trans_a, trans_b, format_a=format_a, format_b=format_b,
                                dst_dtype=dst_dtype, tensor_bias=tensor_bias)
    with tvm.target.cce():
        schedule = generic.auto_schedule(result)
    tensor_list = [tensor_a, tensor_b, result]
    if shape_bias:
        tensor_list = [tensor_a, tensor_b, tensor_bias, result]
    config = {"print_ir": False, "name": kernel_name, "tensor_list": tensor_list}
    te.lang.cce.cce_build_code(schedule, config)
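# A hypothetical compile-time invocation (shapes and formats below are illustrative
# assumptions, not values from the source): the op reads "ori_shape", "format" and
# "dtype" from its dict descriptors, and any shape outside the hand-tiled
# 63x63 / 128x63 fractal case compiles through the generic TBE matmul path at the
# end of the function.
def _example_matmul_cube_dense_left():
    x1 = {"ori_shape": (1008, 1008), "format": "FRACTAL_NZ", "dtype": "float16"}
    x2 = {"ori_shape": (1008, 1008), "format": "FRACTAL_NZ", "dtype": "float16"}
    y = {"ori_shape": (63, 63, 16, 16), "dtype": "float16"}
    return CusMatMulCubeDenseLeft(x1, x2, output_y=y, kernel_name="matmulcube_dense_left")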
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    """CusBatchMatMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    x1_shape = input_x1.get("shape")
    dtype = input_x1.get("dtype").lower()
    x2_shape = input_x2.get("shape")
    if dtype != input_x2.get("dtype").lower():
        raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s"
                           % (dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # if not transpose_a and transpose_b:
    batch, m, k = x1_shape
    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)

    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                      res, res_index)

    if input_shape == ((5, 128, 128), (5, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 30, block_num=30) as block_idx:
            with tik_instance.for_range(0, 11) as cc1_db:
                with tik_instance.for_range(0, 2, thread_num=2) as thread_idx:
                    with tik_instance.if_scope(((((block_idx % 6) * 22) + (cc1_db * 2) + thread_idx) < 128)):
                        input_1_local_UB = tik_instance.Tensor(dtype, [128], name="input_1_local_UB",
                                                               scope=tik.scope_ubuf)
                        t_1_0_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="t_1_0_local_UB",
                                                             scope=tik.scope_ubuf)
                        tik_instance.data_move(input_1_local_UB,
                                               input1[(block_idx // 6) * 16384 + (block_idx % 6) * 2816
                                                      + cc1_db * 256 + thread_idx * 128],
                                               0, 1, 16, 0, 0)
                        with tik_instance.for_range(0, 2) as vec_i:
                            tik_instance.vadds(64, t_1_0_local_UB[vec_i * 64], input_1_local_UB[vec_i * 64],
                                               0, 64, 1, 1, 16, 0)
                        with tik_instance.for_range(0, 2, thread_num=2) as thread_idx2:
                            input_2_local_UB = tik_instance.Tensor(dtype, [64 * 128], name="input_2_local_UB",
                                                                   scope=tik.scope_ubuf)
                            t_1_local_UB = input_2_local_UB
                            bisec_last_axis_local_UB = input_2_local_UB
                            matmul_hybrid_f_t_local_UB = tik_instance.Tensor(
                                dtype, [64], name="matmul_hybrid_f_t_local_UB", scope=tik.scope_ubuf)
                            matmul_hybrid_f_t_local_UB_dst_tmp = tik_instance.Tensor(
                                dtype, [64], name="matmul_hybrid_f_t_local_UB_dst_tmp", scope=tik.scope_ubuf)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB, 0, 1, 1, 8)
                            tik_instance.data_move(input_2_local_UB,
                                                   input2[(block_idx // 6) * 16384 + thread_idx2 * 8192],
                                                   0, 1, 1024, 0, 0)
                            tik_instance.vmul(64, t_1_local_UB, t_1_0_local_UB, input_2_local_UB,
                                              128, 1, 1, 1, 8, 8, 8)
                            tik_instance.vadd(64, bisec_last_axis_local_UB, t_1_local_UB, t_1_local_UB[64],
                                              64, 1, 1, 1, 16, 16, 16)
                            tik_instance.vector_dup(64, matmul_hybrid_f_t_local_UB_dst_tmp, 0, 1, 1, 8)
                            with tik_instance.for_range(0, 64) as cc6:
                                tik_instance.vcadd(64, matmul_hybrid_f_t_local_UB_dst_tmp[cc6],
                                                   bisec_last_axis_local_UB[cc6 * 128], 1, 1, 1, 8)
                            tik_instance.vadd(64, matmul_hybrid_f_t_local_UB,
                                              matmul_hybrid_f_t_local_UB_dst_tmp,
                                              matmul_hybrid_f_t_local_UB, 1, 1, 1, 1, 8, 8, 8)
                            tik_instance.data_move(
                                res[(block_idx // 6) * 16384 + (block_idx % 6) * 2816 + cc1_db * 256
                                    + thread_idx * 128 + thread_idx2 * 64],
                                matmul_hybrid_f_t_local_UB, 0, 1, 8, 0, 0)

    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)

    if input_shape == ((9, 128, 128), (9, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 27, block_num=27) as block_idx:
            with tik_instance.for_range(0, 42, thread_num=2) as cc0:
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)
            with tik_instance.if_scope((block_idx % 3) < 2):
                input1_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                input2_index = (block_idx // 3) * 16384
                res_index = (block_idx // 3) * 16384 + (block_idx % 3) * 5504 + 42 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)

    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
                input1_index = block_idx * 128 + cc0 * 64
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                             res, res_index)

    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True)]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unit_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unit_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)

    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance
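# All branches above compute the same product with different core/thread tilings.
# Since the op is only registered for transpose_a=False, transpose_b=True, a NumPy
# reference for the whole kernel is simply:
def _batch_matmul_reference(x1, x2):
    import numpy as np
    # res[b] = x1[b] @ x2[b].T for every batch b
    return np.einsum("bmk,bnk->bmn", x1, x2)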
def CusMatMulCubeFraczLeftCast(input_x1, input_x2, bias=None, output_y={}, trans_a=False, trans_b=False,
                               kernel_name="CusMatMulCubeFraczLeftCast"):
    """
    calculating matrix multiplication with bias, C = A*B + bias, support input
    data with fractal format.

    Parameters:
    shape_a: list or tuple
        Shape of the first tensor a with rank > 1
    shape_b: list or tuple
        Shape of the second tensor b with the same type as a;
        shape_a and shape_b must be 2 dims
    src_dtype: str
        The data type of input, support "float32", "float16"
    dst_dtype: str
        The data type of output, support "float32", "float16"
    trans_a: bool
        If True, shape_a is transposed before multiplication
    trans_b: bool
        If True, shape_b is transposed before multiplication
    is_fractal: bool
        If True, the input data format of a and b must be fractal format
    shape_bias: list or tuple
        Shape of bias, only support the input data format with ND

    Returns
    -------
    tik_instance
    """
    shape_a = input_x1.get("ori_shape")
    shape_b = input_x2.get("ori_shape")
    print("============")
    print(input_x1.get("format"), input_x2.get("format"))
    print(shape_a, shape_b)
    print("============")
    if input_x2.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_b
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_b = [n, c1 * h * w * c0]
        shape_a = [n, n]
    if input_x1.get("format") == "FRACTAL_Z":
        n, c, h, w = shape_a
        c0 = 16
        c1 = c // c0
        if c1 == 0:
            c1 = 1
        shape_a = [n, c1 * h * w * c0]
        shape_b = [c1 * h * w * c0, c1 * h * w * c0]
    if input_x2.get("format") == "FRACTAL_NZ":
        shape_a = [shape_b[0], shape_b[0]]
    if input_x1.get("format") == "FRACTAL_NZ":
        shape_b = [shape_a[1], shape_a[1]]
    shape_a = list(shape_a)
    shape_b = list(shape_b)
    shape_a = _get_input_shape(shape_a)
    shape_b = _get_input_shape(shape_b)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_a)
    util.check_shape_rule(shape_b)
    util.check_shape_size(shape_a, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_b, SHAPE_SIZE_LIMIT)
    shape_a = [shape_a[1], shape_a[0]]
    trans_a = bool(1 - trans_a)
    shape_b = [shape_b[1], shape_b[0]]
    trans_b = bool(1 - trans_b)
    shape_bias = ()
    if bias is not None and bool(bias):
        shape_bias = bias.get("shape")
        shape_bias = list(shape_bias)
        shape_bias = _get_bias(shape_bias)
    src_dtype = input_x1.get("dtype").lower()
    _shape_check(shape_a, shape_b, shape_bias, src_dtype, trans_a, trans_b)
    m_shape = shape_a[len(shape_a) - 2]
    km_shape = shape_a[len(shape_a) - 1]
    kn_shape = shape_b[len(shape_a) - 2]
    n_shape = shape_b[len(shape_a) - 1]
    if src_dtype == "float16":
        block_reduce = cce.BLOCK_REDUCE
    block_in = cce.BLOCK_IN
    block_out = cce.BLOCK_OUT
    if trans_a and km_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if not trans_a and m_shape == 1:
        block_in = cce.BLOCK_VECTOR
    if trans_b and kn_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if not trans_b and n_shape == 1:
        block_out = cce.BLOCK_VECTOR
    if trans_a:
        shape_a_temp = (m_shape // block_reduce, km_shape // block_in, block_reduce, block_in)
    else:
        shape_a_temp = (m_shape // block_in, km_shape // block_reduce, block_in, block_reduce)
    if trans_b:
        shape_b_temp = (kn_shape // block_out, n_shape // block_reduce, block_reduce, block_out)
    else:
        shape_b_temp = (kn_shape // block_reduce, n_shape // block_out, block_out, block_reduce)
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x1 = tik_instance.Tensor(input_x1.get("dtype"), shape_a_temp, name="left_matrix", scope=tik.scope_gm)
    input_x2 = tik_instance.Tensor(input_x2.get("dtype"), shape_b_temp, name="right_matrix", scope=tik.scope_gm)
    res_matmul = tik_instance.Tensor(output_y.get("dtype"), output_y.get("shape"), name="output",
                                     scope=tik.scope_gm)
    DIAG_SIZE = 128
    mo_tile, ko_tile, no_tile, diag_opt = get_cus_tile_info(input_x1, input_x2, DIAG_SIZE)
    cus_cube_matmul_cast(tik_instance, input_x1, trans_a, input_x2, trans_b, res_matmul,
                         mo_tile=mo_tile, ko_tile=ko_tile, no_tile=no_tile, diag_opt=diag_opt,
                         diag_size=DIAG_SIZE)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x1, input_x2], outputs=[res_matmul])
    return tik_instance
def fake_quant_with_min_max_vars_gradient_compute(gradients, x, min, max, backprops_wrt_x, backprop_wrt_min,
                                                  backprop_wrt_max, num_bits, narrow_range,
                                                  kernel_name="fake_quant_with_min_max_vars_gradient"):
    """
    Compute gradients for a FakeQuantWithMinMaxVars operation.

    Parameters
    ----------
    gradients: tvm.tensor
        input tensor with shape and dtype attributes
    x: tvm.tensor
        input tensor with shape and dtype attributes
    min: tvm.tensor
        input tensor holding the minimum of the quantization range
    max: tvm.tensor
        input tensor holding the maximum of the quantization range
    backprops_wrt_x: tvm.tensor
        output tensor with shape and dtype attributes
    backprop_wrt_min: tvm.tensor
        output tensor with shape and dtype attributes
    backprop_wrt_max: tvm.tensor
        output tensor with shape and dtype attributes
    num_bits: int
        the bitwidth of the quantization, between 2 and 16
    narrow_range: bool
        whether to quantize into 2^num_bits - 1 distinct values
    kernel_name: str
        cce kernel name, default value is "fake_quant_with_min_max_vars_gradient"

    Returns
    -------
    output_list: list
        the gradient tensors [backprops_wrt_x, backprop_wrt_min, backprop_wrt_max]
    """
    input_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype
    min_broadcast = te.lang.cce.broadcast(min, input_shape, dtype)
    max_broadcast = te.lang.cce.broadcast(max, input_shape, dtype)
    nudged_min, nudged_max = _nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range)
    nudged_min_backup = te.lang.cce.vadds(nudged_min, tvm.const(0, D_TYPE))
    nudged_max_backup = te.lang.cce.vadds(nudged_max, tvm.const(0, D_TYPE))
    between_nudged_min_max = _between_nudged_min_max_compute(x, nudged_min, nudged_max)
    wrt_input_tensor = te.lang.cce.vmul(between_nudged_min_max, gradients)
    shape_list = []
    for i, _ in enumerate(input_shape):
        shape_list.append(i)
    bool_below_min = _less_compare_float32(x, nudged_min_backup)
    below_min_data = te.lang.cce.vmul(bool_below_min, gradients)
    bool_below_max = _less_compare_float32(nudged_max_backup, x)
    below_max_data = te.lang.cce.vmul(bool_below_max, gradients)

    # handle the case where min and max are both zero
    tensor_one = te.lang.cce.broadcast(1, input_shape, dtype)
    bool_both_no_zero = _both_min_max_zero(min, max, input_shape, dtype)
    bool_both_no_zero_reverse = te.lang.cce.vsub(tensor_one, bool_both_no_zero)
    bool_both_no_zero_broad = te.lang.cce.broadcast(bool_both_no_zero, input_shape, dtype)
    bool_both_no_zero_reverse = te.lang.cce.broadcast(bool_both_no_zero_reverse, input_shape, dtype)
    wrt_input_weight = te.lang.cce.vmul(wrt_input_tensor, bool_both_no_zero_broad)
    gradients_weight = te.lang.cce.vmul(gradients, bool_both_no_zero_reverse)
    backprops_wrt_x = te.lang.cce.vadd(wrt_input_weight, gradients_weight)

    # cloud version: eliminate workspace by using atomic reduction
    if util.get_product_version() == util.VERSION_CLOUD:
        # insert a temp node so the last vadd becomes a mid output tensor, eliminating workspace
        temp_insert_node_mul = te.lang.cce.vmuls(backprops_wrt_x, tvm.const(0, D_TYPE))
        temp_insert_node_add = te.lang.cce.vadd(temp_insert_node_mul, below_min_data)
        below_min_data_tensor = te.lang.cce.vmul(temp_insert_node_add, bool_both_no_zero)
        below_max_data_tensor = te.lang.cce.vmul(below_max_data, bool_both_no_zero)
        backprop_wrt_min_max_list = te.lang.cce.tuple_sum([below_min_data_tensor, below_max_data_tensor],
                                                          axis=shape_list)
        output_list = [backprops_wrt_x] + list(backprop_wrt_min_max_list)
    else:
        below_min_data_tensor = te.lang.cce.vmul(below_min_data, bool_both_no_zero)
        below_max_data_tensor = te.lang.cce.vmul(below_max_data, bool_both_no_zero)
        backprop_wrt_min = te.lang.cce.sum(below_min_data_tensor, axis=shape_list)
        backprop_wrt_max = te.lang.cce.sum(below_max_data_tensor, axis=shape_list)
        output_list = [backprops_wrt_x, backprop_wrt_min, backprop_wrt_max]
    return output_list
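# A NumPy sketch of the gradient routing implemented above (illustrative: the nudging
# of min/max is elided and nudged_min/nudged_max are taken as given, as produced by
# _nudged_min_max_compute; boundary strictness is approximate):
def _fake_quant_grad_reference(gradients, x, nudged_min, nudged_max, min_max_both_zero):
    import numpy as np
    if min_max_both_zero:  # degenerate range: gradients pass straight through to x
        return gradients, 0.0, 0.0
    d_x = gradients * ((x >= nudged_min) & (x <= nudged_max))
    d_min = float(np.sum(gradients[x < nudged_min]))  # backprop_wrt_min
    d_max = float(np.sum(gradients[x > nudged_max]))  # backprop_wrt_max
    return d_x, d_min, d_max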
def CusMatrixCombine(input_x, output, kernel_name="matrix_combine"):
    """CusMatrixCombine"""
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")
    split_dim = 128
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    input_x = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)
    blocks = 32
    matrix_dim = input_x_shape[0] * input_x_shape[1]
    if input_x_shape[0] == 1 and input_x_shape[1] == 64:
        tiling_dim = 2
        bs = 1
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x[0, block_index * tiling_dim, 0], 0, 1, 16, 0, 0)
            tik_instance.data_move(res[block_index * tiling_dim, 0], input_x_ub, 0, 1, 16, 0, 0)
    else:
        tiling_dim = 4
        bs = input_x_shape[0]
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            zero = tik_instance.Scalar("float32")
            zero.set_as(0.0)
            with tik_instance.for_range(0, bs) as i:
                repeat_real = tiling_dim * matrix_dim // 64
                if repeat_real <= 255:
                    tik_instance.vector_dup(64, input_x_ub, zero, repeat_real, 1, 8)
                else:
                    repeat_1 = 255
                    repeat_2 = repeat_real - 255
                    tik_instance.vector_dup(64, input_x_ub, zero, repeat_1, 1, 8)
                    tik_instance.vector_dup(64, input_x_ub[255 * 64], zero, repeat_2, 1, 8)
                with tik_instance.for_range(0, tiling_dim) as j:
                    tik_instance.data_move(input_x_ub[j, split_dim * i],
                                           input_x[i, block_index * tiling_dim + j, 0], 0, 1, 16, 0, 0)
                tik_instance.data_move(res[i * split_dim + block_index * tiling_dim, 0], input_x_ub, 0, 1,
                                       tiling_dim * matrix_dim * 4 // 32, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x], outputs=[res])
    return tik_instance
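# In effect the kernel assembles the batch of split_dim x split_dim blocks into one
# block-diagonal matrix: each block is copied onto the diagonal of a zeroed UB tile
# and written back at the matching row/column offset. A NumPy reference:
def _matrix_combine_reference(x):
    import numpy as np
    bs, split_dim, _ = x.shape
    out = np.zeros((bs * split_dim, bs * split_dim), dtype=x.dtype)
    for i in range(bs):
        out[i * split_dim:(i + 1) * split_dim, i * split_dim:(i + 1) * split_dim] = x[i]
    return out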
def CusBatchMatMul(input_x1, input_x2, output, transpose_a=False, transpose_b=True, kernel_name="batchmatmul"):
    """CusBatchMatMul"""
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))
    x1_shape = input_x1.get("shape")
    dtype = input_x1.get("dtype").lower()
    x2_shape = input_x2.get("shape")
    if dtype != input_x2.get("dtype").lower():
        raise RuntimeError("dtype of input_x1 and input_x2 must be same, but got %s vs %s"
                           % (dtype, input_x2.get("dtype").lower()))
    input_shape = (tuple(x1_shape), tuple(x2_shape), dtype, transpose_a, transpose_b)
    support_shape = [((8, 128, 128), (8, 128, 128), "float32", False, True),
                     ((36, 128, 128), (36, 128, 128), "float32", False, True),
                     ((5, 128, 128), (5, 128, 128), "float32", False, True),
                     ((18, 128, 128), (18, 128, 128), "float32", False, True),
                     ((16, 128, 128), (16, 128, 128), "float32", False, True),
                     ((9, 128, 128), (9, 128, 128), "float32", False, True),
                     ((1, 64, 64), (1, 64, 64), "float32", False, True),
                     ((1, 128, 128), (1, 128, 128), "float32", False, True),
                     ((4, 128, 128), (4, 128, 128), "float32", False, True),
                     ((2, 128, 128), (2, 128, 128), "float32", False, True),
                     ((6, 128, 128), (6, 128, 128), "float32", False, True),
                     ((24, 128, 128), (24, 128, 128), "float32", False, True),
                     ((32, 128, 128), (32, 128, 128), "float32", False, True)]
    if input_shape not in support_shape:
        raise RuntimeError("input_shape %s is not supported" % str(input_shape))

    # if not transpose_a and transpose_b:
    batch, m, k = x1_shape
    input1_shape = _get_flattern_shape(x1_shape)
    input1 = tik_instance.Tensor(dtype, input1_shape, name="input1", scope=tik.scope_gm)
    input2_shape = _get_flattern_shape(x2_shape)
    input2 = tik_instance.Tensor(dtype, input2_shape, name="input2", scope=tik.scope_gm)
    output_shape = x1_shape
    res_shape = _get_flattern_shape(output_shape)
    res = tik_instance.Tensor(dtype, res_shape, name="res", scope=tik.scope_gm)

    if input_shape == ((36, 128, 128), (36, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 2) as cc0:
                with tik_instance.for_range(0, 128, thread_num=2) as cc1:
                    input1_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    input2_index = block_idx * 32768 + cc0 * 16384
                    res_index = block_idx * 32768 + cc0 * 16384 + cc1 * 128
                    _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                      res, res_index)

    process_input_shape_640(input_shape, tik_instance, dtype, input1, input2, res)

    if input_shape == ((18, 128, 128), (18, 128, 128), "float32", False, True):
        with tik_instance.for_range(0, 18, block_num=18) as block_idx:
            with tik_instance.for_range(0, 128, thread_num=2) as cc0:
                input1_index = block_idx * 16384 + cc0 * 128
                input2_index = block_idx * 16384
                res_index = block_idx * 16384 + cc0 * 128
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)

    process_input_shape_1152(input_shape, tik_instance, dtype, input1, input2, res)

    if input_shape == ((1, 64, 64), (1, 64, 64), "float32", False, True):
        with tik_instance.for_range(0, 32, block_num=32) as block_idx:
            with tik_instance.for_range(0, 2, thread_num=2) as cc0:
                input1_index = block_idx * 128 + cc0 * 64
                input2_index = 0
                res_index = block_idx * 128 + cc0 * 64
                _inner_matmul_new_1_64_32_64(tik_instance, dtype, input1, input1_index, input2, input2_index,
                                             res, res_index)

    input_shape_list = [((1, 128, 128), (1, 128, 128), "float32", False, True),
                        ((2, 128, 128), (2, 128, 128), "float32", False, True),
                        ((4, 128, 128), (4, 128, 128), "float32", False, True),
                        ((6, 128, 128), (6, 128, 128), "float32", False, True),
                        ((8, 128, 128), (8, 128, 128), "float32", False, True),
                        ((16, 128, 128), (16, 128, 128), "float32", False, True),
                        ((24, 128, 128), (24, 128, 128), "float32", False, True),
                        ((32, 128, 128), (32, 128, 128), "float32", False, True)]
    if input_shape in input_shape_list:
        block_num = 32
        input1_unit_size = 128
        input2_unit_size = 128 * 128
        with tik_instance.for_range(0, block_num, block_num=block_num) as block_idx:
            block_process_ele_num = (batch * m * k) // block_num
            loop_time = (batch * m * k) // block_num // input1_unit_size
            thread_num = 2
            with tik_instance.for_range(0, loop_time, thread_num=thread_num) as cc0:
                input1_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                if batch > 1:
                    input2_index = block_idx // (block_num // batch) * input2_unit_size
                else:
                    input2_index = 0
                res_index = block_idx * block_process_ele_num + cc0 * input1_unit_size
                _inner_matmul_new(tik_instance, dtype, input1, input1_index, input2, input2_index, res, res_index)

    tik_instance.BuildCCE(kernel_name, inputs=[input1, input2], outputs=[res])
    return tik_instance