def setup_env():
    """Align ``ONNXRuntimeCUDA`` stream settings with the DaCe CUDA config.

    Reads ``compiler.cuda.max_concurrent_streams`` and the ``ORT_USE_STREAMS``
    environment variable, then:

    * If ``ORT_USE_STREAMS`` is not set, stream usage is switched off and the
      config value is forced to ``-1`` (logging when it has to be changed).
    * If it is set, ``ONNXRuntimeCUDA.use_streams`` takes the parsed value.
      When that value is truthy, a configured value of ``0`` is replaced by
      ``8``, while a configured value of ``-1`` turns stream usage off again.

    Finally, ``ONNXRuntimeCUDA.max_concurrent_streams`` is refreshed from the
    (possibly updated) configuration.
    """
    # NOTE(review): the original indentation was lost; the ``else`` branch is
    # taken to pair with the ``ORT_USE_STREAMS`` presence check -- confirm.
    stream_key = ("compiler", "cuda", "max_concurrent_streams")
    configured_streams = Config.get(*stream_key)

    if 'ORT_USE_STREAMS' not in os.environ:
        # No explicit opt-in: make sure the config says -1 and disable streams.
        if configured_streams != -1:
            log.info("Setting compiler.cuda.max_concurrent_streams to -1")
            Config.set(*stream_key, value=-1)
        ONNXRuntimeCUDA.use_streams = False
    else:
        ONNXRuntimeCUDA.use_streams = _env2bool(os.environ["ORT_USE_STREAMS"])
        if ONNXRuntimeCUDA.use_streams:
            log.info("Using streams with ORT (experimental)")
            if configured_streams == -1:
                # The config already asks for -1, which overrides the opt-in.
                ONNXRuntimeCUDA.use_streams = False
            elif configured_streams == 0:
                log.info("Setting compiler.cuda.max_concurrent_streams to 8")
                Config.set(*stream_key, value=8)

    # Re-read the value, since it may have been changed above.
    ONNXRuntimeCUDA.max_concurrent_streams = Config.get(*stream_key)
# NOTE(review): this line holds several statements whose original line breaks
# and indentation appear to have been lost; the code is kept verbatim below.
#
# First comes the tail of an SDFG-building function whose header is not
# visible here: it adds an interstate edge from state2 to copy_out_state,
# validates the SDFG and returns it.
#
# Then the script entry point:
#   * parses an optional positional integer N (default 32);
#   * sets compiler.unique_functions to False (per the inline comment, so that
#     both SDFGs are generated);
#   * builds the SDFG with make_nested_sdfg_fpga() and calls it on two random
#     float32 arrays X and Y of length N;
#   * compares Y against X + 2 using norm(ref - Y) / N, printing a success
#     banner when that value is <= 1e-5 and raising Exception otherwise.
sdfg.add_edge(state2, copy_out_state, dace.sdfg.sdfg.InterstateEdge()) sdfg.validate() return sdfg if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("N", type=int, nargs="?", default=32) args = vars(parser.parse_args()) size_n = args["N"] from dace.config import Config # set unique function to false to generate both sdfgs Config.set("compiler", "unique_functions", value=False) sdfg = make_nested_sdfg_fpga() X = np.random.rand(size_n).astype(np.float32) Y = np.random.rand(size_n).astype(np.float32) sdfg(X=X, Y=Y, N=size_n) ref = X+2 diff = np.linalg.norm(ref - Y) / size_n if diff <= 1e-5 : print("==== Program end ====") else: raise Exception("==== Program Error! ====")
# NOTE(review): this line holds several statements whose original line breaks
# and indentation appear to have been lost; the code is kept verbatim below.
#
# First comes what looks like the body of a test whose `def` header is not
# visible here (presumably test_matmul_np, which the entry point calls --
# confirm):
#   * defines the DaCe program matmul_np, which writes A @ B into C for
#     float64 arrays annotated as 128x64, 64x32 and 128x32;
#   * creates random float64 inputs A, B, C with those shapes;
#   * converts the program to an SDFG and applies FPGATransformSDFG;
#   * selects the "FPGA1DSystolic" implementation for the BLAS Gemm node,
#     expands library nodes and repeatedly applies InlineSDFG;
#   * computes the reference result C_regression = A @ B;
#   * runs evaluate(sdfg, A, B, C, C_regression) through Process (presumably
#     multiprocessing.Process -- confirm), waits for it to finish, then
#     deletes the local sdfg reference.
#
# Then the script entry point sets optimizer.transform_on_call to False and
# calls test_gemm_vectorized, test_gemm_size_not_multiples_of and
# test_matmul_np in that order.
@dace.program def matmul_np(A: dace.float64[128, 64], B: dace.float64[64, 32], C: dace.float64[128, 32]): C[:] = A @ B A = np.random.rand(128, 64).astype(np.float64) B = np.random.rand(64, 32).astype(np.float64) C = np.random.rand(128, 32).astype(np.float64) sdfg = matmul_np.to_sdfg() sdfg.apply_transformations([FPGATransformSDFG]) from dace.libraries.blas import Gemm Gemm.default_implementation = "FPGA1DSystolic" # We have to Inline sdfg.expand_library_nodes() sdfg.apply_transformations_repeated([InlineSDFG]) C_regression = A @ B p = Process(target=evaluate, args=(sdfg, A, B, C, C_regression)) p.start() p.join() del sdfg if __name__ == "__main__": # These tests will be executed on separate process. The transform_on_call DACE configuration must be set to false Config.set('optimizer', 'transform_on_call', value=False) test_gemm_vectorized() test_gemm_size_not_multiples_of() test_matmul_np()