def test_gemm_vectorized_decoupled():
    """
    Runs the vectorized GEMM SDFG with the Xilinx
    ``decouple_array_interfaces`` option enabled.

    The SDFG is built through ``create_gemm_sdfg`` with a vector width of 4,
    its library nodes are expanded and nested SDFGs are inlined. The output
    is compared against the NumPy expression ``alpha * (A @ B) + beta * C``.

    :return: The SDFG that was executed.
    """
    shape = (128, 128)
    A = np.random.rand(*shape).astype(np.float32)
    B = np.random.rand(*shape).astype(np.float32)
    C = np.random.rand(*shape).astype(np.float32)
    alpha, beta = 2.1, 1.5
    vec_width = 4

    sdfg = create_gemm_sdfg("gemm_vectorized", alpha, beta, A, B, C, dace.float32, vec_width=vec_width)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([InlineSDFG])

    # Reference result, taken before the call since C is what gets compared
    # against it afterwards.
    C_regression = alpha * (A @ B) + beta * C

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        sdfg(A=A, B=B, C=C)

    assert np.allclose(C, C_regression, atol=1e-6)
    return sdfg
def test_gemm_size_not_multiples_of_decoupled():
    """
    Runs GEMM with matrix sizes that are not a multiple of the number of PEs
    and of the tile size.

    To achieve II=1 with Xilinx, reads/writes from memory need to be
    decoupled, hence ``decouple_array_interfaces`` is enabled for the run.

    :return: The SDFG that was executed.
    """
    rows, cols = 120, 128
    A = np.random.rand(rows, 128).astype(np.float32)
    B = np.random.rand(128, cols).astype(np.float32)
    C = np.random.rand(rows, cols).astype(np.float32)

    expansion_args = dict(tile_size_m=50, num_pes=7)
    sdfg = create_gemm_sdfg("gemm_not_multiple_of", 1, 1, A, B, C, dace.float32, expansion_args=expansion_args)
    sdfg.expand_library_nodes()
    sdfg.apply_transformations_repeated([InlineSDFG])

    # Ground truth, taken before the call since C is what gets compared
    # against it afterwards.
    C_regression = A @ B + C

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        sdfg(A=A, B=B, C=C)

    assert np.allclose(C, C_regression, atol=1e-6)
    return sdfg
def test_vec_sum_fpga_transform_first_decoupled_interfaces():
    """
    Calls ``run_vec_sum(True)`` with ``decouple_array_interfaces`` enabled.

    For this test, decoupled read/write interfaces are needed to achieve II=1.

    :return: Whatever ``run_vec_sum`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = run_vec_sum(True)
    return result
def test_default_stream_blas_node():
    """
    Checks a matrix multiplication expanded with the ``cuBLAS`` default
    implementation while ``compiler.cuda.max_concurrent_streams`` is set to -1.

    Asserts that the "cuBLAS" environment shows up among the environments of
    the tasklets after expansion, and that the computed product matches NumPy.
    """
    A_desc = dace.float32[10, 5]
    B_desc = dace.float32[5, 3]
    C_desc = dace.float32[10, 3]

    with set_temporary("compiler", "cuda", "max_concurrent_streams", value=-1), change_default(blas, "cuBLAS"):

        @dace.program
        def test_default_stream_blas_node(A: A_desc, B: B_desc, C: C_desc):
            C[:] = A @ B

        A = np.random.rand(*A_desc.shape).astype(np.float32)
        B = np.random.rand(*B_desc.shape).astype(np.float32)
        C = np.zeros(C_desc.shape).astype(np.float32)

        sdfg: dace.SDFG = test_default_stream_blas_node.to_sdfg()
        sdfg.apply_gpu_transformations()
        sdfg.expand_library_nodes()

        # Gather the environments of every tasklet, at any nesting depth.
        environments = set()
        for node, _ in sdfg.all_nodes_recursive():
            if isinstance(node, dace.nodes.Tasklet):
                environments.update(node.environments)
        assert "cuBLAS" in environments

        sdfg(A=A, B=B, C=C)
        assert np.allclose(A @ B, C)
def test_inhibit_state_fusion():
    """
    Tests that state fusion around callbacks is inhibited when the
    ``frontend.dont_fuse_callbacks`` option is set, and not otherwise.
    """

    @dace_inhibitor
    def add(a, b):
        return a + b

    @dace.program
    def calladd(A: dace.float64[20], B: dace.float64[20], C: dace.float64[20], D: dace.float64[20]):
        A[:] = add(B, C)
        D[:] = add(A, C)

    # Option enabled: the simplified SDFG is expected to keep 5 nodes.
    with config.set_temporary('frontend', 'dont_fuse_callbacks', value=True):
        unfused = calladd.to_sdfg(simplify=True)
        assert unfused.number_of_nodes() == 5

    # Option disabled: the simplified SDFG is expected to have a single node.
    with config.set_temporary('frontend', 'dont_fuse_callbacks', value=False):
        fused = calladd.to_sdfg(simplify=True)
        assert fused.number_of_nodes() == 1
def test_fusion_with_transient_fpga_decoupled():
    """
    Applies ``MapFusion`` and ``FPGATransformSDFG`` to the
    ``fusion_with_transient`` program and runs it with the Xilinx
    ``decouple_array_interfaces`` option enabled.

    :return: The SDFG that was executed.
    """
    A = np.random.rand(2, 20)
    expected = A * A * 2

    sdfg = fusion_with_transient.to_sdfg()
    sdfg.simplify()

    num_fusions = sdfg.apply_transformations_repeated(MapFusion)
    assert num_fusions >= 2
    num_fpga = sdfg.apply_transformations_repeated(FPGATransformSDFG)
    assert num_fpga == 1

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        sdfg(A=A)

    assert np.allclose(A, expected)
    return sdfg
def test_map_unroll_processing_elements_decoupled():
    """
    Runs the systolic, vectorized GEMM sample with its processing-element map
    scheduled as ``Unrolled`` and with the Xilinx ``decouple_array_interfaces``
    option enabled.

    :return: The SDFG that was executed.
    :raises ValueError: If the result does not match ``A @ B + C``.
    """
    # Grab the systolic GEMM implementation from the samples directory
    sample_path = Path(__file__).parent.parent.parent / "samples" / "fpga" / "gemm_systolic_vectorized.py"
    spec = importlib.util.spec_from_file_location("gemm", sample_path)
    gemm = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(gemm)

    N = 128
    K = 256
    M = 512
    P = 8
    W = 4
    TN = 32
    TM = 128

    # Create an SDFG with multiple processing elements
    sdfg = gemm.make_sdfg("map_unroll_processing_elements", dace.vector(dace.float32, W))
    sdfg.specialize({"P": P, "W": W, "TN": TN, "TM": TM})

    # Switch every map over "p" from the unroll flag to the Unrolled schedule
    for state in sdfg.states():
        for node in state.nodes():
            if isinstance(node, nodes.MapEntry) and node.params == ["p"]:
                node.unroll = False
                node.schedule = dace.ScheduleType.Unrolled

    # Initialize arrays with random values
    dtype = dace.float32.type
    A = np.random.rand(N, K).astype(dtype)
    B = np.random.rand(K, M).astype(dtype)
    C = np.random.rand(N, M).astype(dtype)

    # Reference result, taken before the call since C is what gets compared
    # against it afterwards.
    C_regression = A @ B + C

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        sdfg(A=A, B=B, C=C, N=N, M=M, K=K)

    if not np.allclose(C_regression, C):
        raise ValueError("Verification failed.")

    return sdfg
def four_interface_to_2_banks(mem_type, decouple_interfaces):
    """
    Builds and runs an SDFG in which one 2x2 array ``a``, placed on banks
    ``0:2`` of the given memory type, is accessed from an unrolled map, and
    checks the generated bank assignment.

    :param mem_type: Value stored in the array's ``memorytype`` location
                     entry; also used as the prefix when counting bank
                     assignments in the generated code.
    :param decouple_interfaces: Value used for the Xilinx
                                ``decouple_array_interfaces`` option during
                                code generation and execution.
    :return: The SDFG that was executed.
    """
    sdfg = SDFG("test_4_interface_to_2_banks_" + mem_type)
    state = sdfg.add_state()

    _, desc_a = sdfg.add_array("a", [2, 2], dace.int32)
    desc_a.location["memorytype"] = mem_type
    desc_a.location["bank"] = "0:2"

    acc_read1 = state.add_read("a")
    acc_write1 = state.add_write("a")

    t1 = state.add_tasklet("r1", {"_x1", "_x2"}, {"_y1"}, "_y1 = _x1 + _x2")

    m1_in, m1_out = state.add_map("m", {"k": "0:2"}, dtypes.ScheduleType.Unrolled)

    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[0, 0]"), dst_conn="_x1")
    state.add_memlet_path(acc_read1, m1_in, t1, memlet=memlet.Memlet("a[1, 0]"), dst_conn="_x2")
    state.add_memlet_path(t1, m1_out, acc_write1, memlet=memlet.Memlet("a[0, 1]"), src_conn="_y1")

    sdfg.apply_fpga_transformations()
    assert sdfg.apply_transformations(InlineSDFG) == 1
    assert sdfg.apply_transformations(MapUnroll) == 1

    # Redirect the output of the first tasklet found to a[1, 1]
    first_state = sdfg.states()[0]
    for node in first_state.nodes():
        if isinstance(node, dace.sdfg.nodes.Tasklet):
            first_state.out_edges(node)[0].data.subset = subsets.Range.from_string("1, 1")
            break

    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=decouple_interfaces):
        bank_assignment = sdfg.generate_code()[3].clean_code

        # If we are not decoupling array interfaces we will use fewer memory
        # interfaces. The expected counts are computed first: writing
        # `assert x == 6 if flag else 4` would parse as
        # `assert ((x == 6) if flag else 4)` and always pass when flag is False.
        expected_interfaces = 6 if decouple_interfaces else 4
        expected_per_bank = 3 if decouple_interfaces else 2
        assert bank_assignment.count("sp") == expected_interfaces
        assert bank_assignment.count(mem_type + "[0]") == expected_per_bank
        assert bank_assignment.count(mem_type + "[1]") == expected_per_bank

        a = np.zeros([2, 2], np.int32)
        a[0, 0] = 2
        a[1, 0] = 3
        sdfg(a=a)
        assert a[0, 1] == 5

    return sdfg
def test_set_temporary():
    """
    Checks that ``set_temporary`` overrides a configuration entry inside the
    ``with`` block and that the previous value is back once the block exits.
    """
    key = ("compiler", "build_type")
    placeholder = "I'm not a build type"
    value_before = Config.get(*key)

    with set_temporary(*key, value=placeholder):
        assert Config.get(*key) == placeholder

    assert Config.get(*key) == value_before
def test_xilinx_decoupled_array_interfaces():
    """
    Calls ``run_atax`` for the FPGA device type with the Xilinx
    ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``run_atax`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = run_atax(dace.dtypes.DeviceType.FPGA)
    return result
def test_hbm_reduce_2x3_2b_decouple_array_interfaces():
    """
    Calls ``exec_test(2, 3, 2, "hbm", "red_2x3_2b_decoupled")`` with the
    Xilinx ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``exec_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = exec_test(2, 3, 2, "hbm", "red_2x3_2b_decoupled")
    return result
def test_ddr_reduce_red_2x40_6b_decouple_array_interfaces():
    """
    Calls ``exec_test(2, 40, 6, "ddr", "red_2x40_6b_decoupled")`` with the
    Xilinx ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``exec_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = exec_test(2, 40, 6, "ddr", "red_2x40_6b_decoupled")
    return result
def test_ddr_reduce_red_1x50_1b_decouple_array_interfaces():
    """
    Calls ``exec_test(1, 50, 1, "ddr", "red_1x50_1b_decoupled")`` with the
    Xilinx ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``exec_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = exec_test(1, 50, 1, "ddr", "red_1x50_1b_decoupled")
    return result
def test_hbm_reduce_red_1x40_8b_decouple_array_interfaces():
    """
    Calls ``exec_test(1, 40, 8, "hbm", "red_1x40_8b_decoupled")`` with the
    Xilinx ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``exec_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = exec_test(1, 40, 8, "hbm", "red_1x40_8b_decoupled")
    return result
def test_hbm_reduce_10x50_4b_decouple_array_interfaces():
    """
    Calls ``exec_test(10, 50, 4, "hbm", "red_10x50_4b_decoupled")`` with the
    Xilinx ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``exec_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = exec_test(10, 50, 4, "hbm", "red_10x50_4b_decoupled")
    return result
def test_dot_xilinx_decoupled():
    """
    Calls ``run_test("xilinx", 64, 16)`` with the Xilinx
    ``decouple_array_interfaces`` option enabled.

    :return: Whatever ``run_test`` returns.
    """
    with set_temporary("compiler", "xilinx", "decouple_array_interfaces", value=True):
        result = run_test("xilinx", 64, 16)
    return result