def matmul_execute(shape_x, shape_y, bias, left_format, right_format, out_format,
                   adj_x, adj_y, dtype, out_dtype, kernel_name, attrs):
    '''
    There are four fractal formats in the Davinci core: zZ, zN, nZ, nN.

    General matmul format combinations:
        left_trans: False, right_trans: False: zZ * nZ = zN
        left_trans: True,  right_trans: False: nN * nZ = zN
        left_trans: False, right_trans: True : zZ * zN = zN
        left_trans: True,  right_trans: True : nN * zN = zN

    We now also need to support: zN * nZ = zN.
    left_format specifies the left matrix data format;
    right_format specifies the right matrix data format.
    '''
    batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y)
    # align each dimension up to a multiple of the 16x16 fractal block size
    m = (m + 15) // 16 * 16
    n = (n + 15) // 16 * 16
    k = (k + 15) // 16 * 16
    shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(
        m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, out_format)
    mod = dynamic_matmul_compile(shape_x, shape_y, bias, left_format, right_format,
                                 out_format, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs)

    # generate data
    m_x, m_y, bench_mark, bias_data = matmul_data(
        batch_tuple, m, k, n, dtype, out_dtype, bias, adj_x, adj_y,
        left_format, right_format, out_format)

    # launch the module
    output = np.full(out_shape, np.nan, out_dtype)
    if bias == 0:
        output = utils.mod_launch(mod, (m_x, m_y, output, 1, 1, 1, 1, 1, 1, 1, 1, 1),
                                  outputs=(2,), expect=bench_mark)
    elif bias == 1:
        output = utils.mod_launch(mod, (m_x, m_y, bias_data, output), expect=bench_mark)

    # compare result
    rtol, atol = get_rtol_atol("matmul", dtype)
    compare_result = compare_tensor(output, bench_mark, rtol=rtol, atol=atol, equal_nan=True)
    return (m_x, m_y), output, bench_mark, compare_result
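

# The `(x + 15) // 16 * 16` pattern recurs in every runner in this file; a
# minimal standalone sketch (`_align16_sketch` is a hypothetical helper, not
# part of the test API) shows the ceiling-alignment it performs.
def _align16_sketch(x):
    """Round x up to the nearest multiple of 16.

    >>> _align16_sketch(1), _align16_sketch(16), _align16_sketch(100)
    (16, 16, 112)
    """
    return (x + 15) // 16 * 16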


def dynamic_matmul_compile(shape_x, shape_y, bias, left_format, right_format, output_format,
                           adj_x, adj_y, dtype, out_dtype, kernel_name, attrs):
    batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y)
    # replace the static dimensions with symbolic vars so one compiled kernel
    # serves every runtime value of m, n and k
    m = akg.tvm.var("I2")
    n = akg.tvm.var("I1")
    k = akg.tvm.var("KO")
    x = akg.tvm.placeholder((1, m, k, 16, 16), name='A', dtype=dtype)
    y = akg.tvm.placeholder((1, k, n, 16, 16), name='B', dtype=dtype)
    shape_xx, shape_yy, bias_shape, out_shape, k = get_converted_shapes(
        m, n, k, batch_tuple, adj_x, adj_y, bias, left_format, right_format, output_format)
    has_bias = (bias == 1)
    if has_bias:
        input_shapes = [shape_xx, shape_yy, bias_shape]
        input_types = [dtype, dtype, dtype]
        op_attrs = [out_dtype, left_format, right_format, output_format,
                    adj_x, adj_y, has_bias, attrs]
    else:
        # pass the placeholders themselves so the build sees the symbolic shapes
        input_shapes = [x, y]
        input_types = [dtype, dtype]
        op_attrs = [None, out_dtype, left_format, right_format, output_format,
                    adj_x, adj_y, has_bias, attrs]
    return utils.op_build_test(matmul, input_shapes, input_types, op_attrs, kernel_name, attrs)
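

# A minimal NumPy sketch (an assumption about the block-layout convention, not
# something this file defines) of how an aligned 2-D matrix maps onto the 5-D
# (batch, M//16, K//16, 16, 16) fractal placeholder built above.
def _to_fractal_sketch(a):
    """Reshape an aligned (M, K) ndarray into (1, M//16, K//16, 16, 16) blocks.

    >>> import numpy as np
    >>> _to_fractal_sketch(np.zeros((32, 48), np.float16)).shape
    (1, 2, 3, 16, 16)
    """
    m, k = a.shape
    assert m % 16 == 0 and k % 16 == 0, "dimensions must be 16-aligned first"
    return a.reshape(m // 16, 16, k // 16, 16).transpose(0, 2, 1, 3)[np.newaxis]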


def matmul4d_ad_run(shape_x, shape_y, bias, adj_x, adj_y, dtype, out_dtype, kernel_name, attrs):
    # calculate the shapes in fractal layout and create the test data
    batch_tuple, m, k, n = extract_dim(shape_x, shape_y, adj_x, adj_y)
    m = (m + 15) // 16 * 16
    n = (n + 15) // 16 * 16
    k = (k + 15) // 16 * 16
    shape_xx, shape_yy, bias_shape, output_shape, k = get_converted_shapes(
        m, n, k, batch_tuple, adj_x, adj_y, bias)

    input_x = random_gaussian(shape_xx, miu=0.5, sigma=0.01).astype(np.float16)
    input_y = random_gaussian(shape_yy, miu=0.5, sigma=0.01).astype(np.float16)
    input_head = random_gaussian(output_shape, miu=0.5, sigma=0.01).astype(np.float16)
    dX_expected = compute_expected(input_y, input_head, adj_x, adj_y, shape_xx)

    if bias_shape is None:
        input_shapes = [output_shape, shape_xx, shape_yy]
        input_types = [out_dtype, dtype, dtype]
        op_attrs = [None, out_dtype, adj_x, adj_y]
    else:
        input_shapes = [output_shape, shape_xx, shape_yy, bias_shape]
        input_types = [out_dtype, dtype, dtype, dtype]
        op_attrs = [out_dtype, adj_x, adj_y]

    mod = utils.op_build_test(matmul4d_ad.matmul4d_ad, input_shapes, input_types,
                              op_attrs, kernel_name, attrs)

    # launch the backward kernel
    dX = np.full(shape_xx, np.nan, dtype)
    dX = utils.mod_launch(mod, (input_head, input_x, input_y, dX), expect=dX_expected)

    return ((input_x, input_y, input_head), dX, dX_expected,
            compare_tensor(dX, dX_expected, rtol=0.01, equal_nan=True))
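

# A minimal NumPy sketch of the gradient this runner checks, for the simplest
# case adj_x=False, adj_y=False with a plain 2-D (non-fractal) layout; the
# real compute_expected also covers the transposed variants and fractal shapes.
def _dx_reference_sketch(y, head):
    """For C = X @ Y, dL/dX = dL/dC @ Y^T, so dX has the shape of X.

    >>> import numpy as np
    >>> y = np.ones((48, 64), np.float16)
    >>> head = np.ones((32, 64), np.float16)  # upstream gradient dL/dC
    >>> _dx_reference_sketch(y, head).shape
    (32, 48)
    """
    return head @ y.T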


def _gen_data_matmul_cube(op_desc: MatmulCubeDesc):
    """Generate test data for matmul_cube."""
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape,
                                                  op_desc.adj_x, op_desc.adj_y)
    m = (m + 15) // 16 * 16
    n = (n + 15) // 16 * 16
    k = (k + 15) // 16 * 16
    _, _, _, out_shape, k = matmul_run.get_converted_shapes(
        m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, op_desc.bias,
        op_desc.left_format, op_desc.right_format, op_desc.out_format)
    m_x, m_y, bench_mark, bias_data = matmul_run.matmul_data(
        batch_tuple, m, k, n, op_desc.dtype, op_desc.out_dtype, op_desc.bias,
        op_desc.adj_x, op_desc.adj_y, op_desc.left_format, op_desc.right_format,
        op_desc.out_format)
    out_data = np.full(out_shape, np.nan, op_desc.out_dtype)
    if op_desc.bias:
        args = (m_x, m_y, bias_data, out_data)
    else:
        args = (m_x, m_y, out_data)
    return args, bench_mark
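

# A usage sketch (hypothetical wiring, mirroring the matmul_execute pattern
# above): the args tuple returned by _gen_data_matmul_cube feeds mod_launch
# directly, with the trailing out_data element serving as the output slot.
#
#     args, expect = _gen_data_matmul_cube(op_desc)
#     output = utils.mod_launch(mod, args, outputs=(-1,), expect=expect)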


def _get_space_matmul_cube(op_desc: MatmulCubeDesc):
    """Get the config space of matmul_cube."""
    if not isinstance(op_desc, MatmulCubeDesc):
        raise TypeError('op_desc must be MatmulCubeDesc')
    config_space = ListConfigSpace(MatmulCubeConfig)
    batch_tuple, m, k, n = matmul_run.extract_dim(op_desc.x_shape, op_desc.y_shape,
                                                  op_desc.adj_x, op_desc.adj_y)
    # tile counts, in units of 16x16 fractal blocks
    mmax = (m + 15) // 16
    nmax = (n + 15) // 16
    kmax = (k + 15) // 16

    double_buffer = True
    mad_fp32 = True
    l1_max_size = 1024 * 1024       # L1 MEM 1024KB
    l0a_max_size = 64 * 1024        # L0A MEM 64KB
    l0b_max_size = 64 * 1024        # L0B MEM 64KB
    l0c_max_size = 256 * 1024       # L0C MEM 256KB
    ub_max_size = (256 - 8) * 1024  # UB MEM 248KB, 8KB reserved for the compiler
    if double_buffer:
        l1_max_size //= 2
        l0a_max_size //= 2
        l0b_max_size //= 2
        l0c_max_size //= 2
        ub_max_size //= 2
    if mad_fp32:
        l0c_max_size //= 2
    if op_desc.out_dtype == 'float32':
        ub_max_size //= 2

    bypass_options = [0, 1, 2]
    for bypass in bypass_options:
        # bypass == 2 keeps only the right matrix in L1 (the left operand skips
        # it, as the l1_size accounting below reflects) and is only legal for
        # compatible left layouts; bypass == 1 is the mirror case for the right
        if bypass == 2 and ((not op_desc.adj_x and op_desc.left_format[0].lower() == 'n') or
                            (op_desc.adj_x and op_desc.left_format[0].lower() == 'z')):
            continue
        if bypass == 1 and ((not op_desc.adj_y and op_desc.right_format[0].lower() == 'z') or
                            (op_desc.adj_y and op_desc.right_format[0].lower() == 'n')):
            continue
        for k_l1 in range(1, kmax + 1):
            if kmax % k_l1 != 0:
                continue
            for k_l0 in range(1, k_l1 + 1):
                if k_l1 % k_l0 != 0:
                    continue
                # no need to cut from L1 to L0 for m and n when k is cut
                for m_l1 in range(1, mmax + 1):
                    if mmax % m_l1 != 0:
                        continue
                    m_l0_range = [m_l1] if k_l1 != kmax else range(1, m_l1 + 1)
                    for m_l0 in m_l0_range:
                        if m_l1 % m_l0 != 0:
                            continue
                        for n_l1 in range(1, nmax + 1):
                            if nmax % n_l1 != 0:
                                continue
                            n_l0_range = [n_l1] if k_l1 != kmax else range(1, n_l1 + 1)
                            for n_l0 in n_l0_range:
                                if n_l1 % n_l0 != 0:
                                    continue
                                # drop tilings that overflow any on-chip buffer
                                if m_l0 * 16 * k_l0 * 16 > l0a_max_size:
                                    continue
                                if n_l0 * 16 * k_l0 * 16 > l0b_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > l0c_max_size:
                                    continue
                                if m_l0 * 16 * n_l0 * 16 > ub_max_size:
                                    continue
                                if bypass == 2:
                                    l1_size = n_l1 * 16 * k_l1 * 16
                                elif bypass == 1:
                                    l1_size = m_l1 * 16 * k_l1 * 16
                                else:
                                    l1_size = (m_l1 * 16 + n_l1 * 16) * k_l1 * 16
                                if l1_size > l1_max_size:
                                    continue
                                # degenerate axes are encoded with sentinel tile values
                                if nmax == 1:
                                    n_l1 = 0
                                    n_l0 = 0
                                if mmax == 1:
                                    m_l1 = 0
                                    m_l0 = 0
                                if kmax == 1:
                                    k_l1 = 16
                                    k_l0 = 16
                                config_space.add(MatmulCubeConfig(n_l1, n_l0, m_l1, m_l0,
                                                                  k_l1, k_l0, bypass))
    shape_xx, shape_yy, _, _, k = matmul_run.get_converted_shapes(
        m, n, k, batch_tuple, op_desc.adj_x, op_desc.adj_y, op_desc.bias,
        op_desc.left_format, op_desc.right_format, op_desc.out_format)
    key = str((shape_xx, shape_yy, op_desc.bias, op_desc.left_format, op_desc.right_format,
               op_desc.out_format, op_desc.adj_x, op_desc.adj_y, op_desc.dtype,
               op_desc.out_dtype))
    return None, config_space, key, None, None
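

# A standalone toy sketch of the divisor-based pruning above: enumerate
# (k_l1, k_l0) tilings for one axis and drop candidates over a toy block
# budget. The numbers are illustrative, not the real Davinci buffer sizes.
def _k_tilings_sketch(kmax, budget_blocks):
    """List (k_l1, k_l0) pairs with k_l1 | kmax, k_l0 | k_l1, k_l0 <= budget.

    >>> _k_tilings_sketch(8, 4)
    [(1, 1), (2, 1), (2, 2), (4, 1), (4, 2), (4, 4), (8, 1), (8, 2), (8, 4)]
    """
    configs = []
    for k_l1 in range(1, kmax + 1):
        if kmax % k_l1:            # the L1 tile count must divide the total
            continue
        for k_l0 in range(1, k_l1 + 1):
            if k_l1 % k_l0:        # the L0 tile must divide the L1 tile
                continue
            if k_l0 > budget_blocks:  # toy stand-in for the L0A/L0B checks
                continue
            configs.append((k_l1, k_l0))
    return configs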