def build_checker():
    if hasattr(build_checker, "dll"):
        return build_checker.dll

    checker_code_path = os.path.join(
        os.path.dirname(inspect.getfile(daceml.onnx)), "include",
        "op_checker.h")
    with open(checker_code_path, "r") as f:
        checker_code = f.read()

    program = codeobject.CodeObject("onnx_op_checker",
                                    checker_code,
                                    "cpp",
                                    targets.cpu.CPUCodeGen,
                                    "ONNXOpChecker",
                                    environments={"ONNXRuntime"})

    BUILD_PATH = os.path.join('.dacecache', "onnx_op_checker")
    compiler.generate_program_folder(None, [program], BUILD_PATH)
    compiler.configure_and_compile(BUILD_PATH)

    checker_dll = ctypes.CDLL(
        compiler.get_binary_name(BUILD_PATH, "onnx_op_checker"))
    build_checker.dll = checker_dll
    return checker_dll
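# Usage sketch for build_checker() above (hedged: "check_op" is a hypothetical
# symbol name used only for illustration; the real exports live in
# op_checker.h). The function memoizes the compiled library on its own
# attribute, so repeated calls reuse one ctypes.CDLL handle instead of
# rebuilding.
def _build_checker_usage_sketch():
    dll = build_checker()       # first call: generates, compiles, and loads
    same_dll = build_checker()  # later calls return the cached handle
    assert dll is same_dll
    # Exported symbols are then resolved through ctypes as usual, e.g.:
    # check_fn = dll.check_op
    # check_fn.restype = ctypes.c_int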
def cuda_helper():
    helper_code = """
    #include <dace/dace.h>

    extern "C" {
        int host_to_gpu(void* gpu, void* host, size_t size) {
            auto result = cudaMemcpy(gpu, host, size, cudaMemcpyHostToDevice);
            DACE_CUDA_CHECK(cudaGetLastError());
            DACE_CUDA_CHECK(cudaDeviceSynchronize());
            return result;
        }
    }
    """
    program = codeobject.CodeObject("cuda_helper", helper_code, "cpp",
                                    targets.cpu.CPUCodeGen, "CudaHelper")
    dummy_cuda_target = codeobject.CodeObject("dummy", "", "cu",
                                              targets.cuda.CUDACodeGen,
                                              "CudaDummy")

    build_folder = dace.Config.get('default_build_folder')
    BUILD_PATH = os.path.join(build_folder, "cuda_helper")
    compiler.generate_program_folder(None, [program, dummy_cuda_target],
                                     BUILD_PATH)
    compiler.configure_and_compile(BUILD_PATH)

    checker_dll = compiled_sdfg.ReloadableDLL(
        compiler.get_binary_name(BUILD_PATH, "cuda_helper"), "cuda_helper")

    class CudaHelper:
        def __init__(self):
            self.dll = checker_dll
            checker_dll.load()

            self._host_to_gpu = checker_dll.get_symbol("host_to_gpu")
            self._host_to_gpu.restype = ctypes.c_int

        def __del__(self):
            self.dll.unload()

        def host_to_gpu(self, gpu_ptr: int, numpy_array: np.ndarray):
            size = ctypes.sizeof(
                dtypes._FFI_CTYPES[numpy_array.dtype.type]) * numpy_array.size
            result = ctypes.c_int(
                self._host_to_gpu(
                    ctypes.c_void_p(gpu_ptr),
                    ctypes.c_void_p(
                        numpy_array.__array_interface__["data"][0]),
                    ctypes.c_size_t(size)))
            if result.value != 0:
                raise ValueError("host_to_gpu returned nonzero result!")

    return CudaHelper()
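# Usage sketch for cuda_helper() above (hedged: "gpu_ptr" is assumed to be a
# raw device pointer obtained elsewhere, e.g. from an SDFG's GPU arrays; this
# sketch only illustrates the copy call and does not allocate device memory).
def _cuda_helper_usage_sketch(gpu_ptr: int):
    helper = cuda_helper()  # compiles and loads the helper DLL once
    host_array = np.arange(16, dtype=np.float32)
    # Copies host_array into the device buffer at gpu_ptr; raises ValueError
    # if cudaMemcpy reports a nonzero result.
    helper.host_to_gpu(gpu_ptr, host_array)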
def run_local(self, sdfg: SDFG, driver_file: str):
    workdir = sdfg.build_folder
    if Config.get_bool('diode', 'general', 'library_autoexpand'):
        sdfg.expand_library_nodes()
    code_objects = sdfg.generate_code()
    use_mpi = Executor._use_mpi(code_objects)
    # TODO: Implement (instead of pyrun, use mpirun/mpiexec)
    if use_mpi:
        raise NotImplementedError('Running MPI locally unimplemented')

    # Pipe stdout/stderr back to client output
    stdout = sys.stdout
    stderr = sys.stderr
    sys.stdout = FunctionStreamWrapper(self.show_output, stdout.write)
    sys.stderr = FunctionStreamWrapper(self.show_output, stderr.write)

    # Compile SDFG
    generate_program_folder(sdfg, code_objects, workdir, self._config)
    configure_and_compile(workdir, sdfg.name)

    self.show_output("Running script\n")

    # Run driver script with the compiled SDFG(s) as the default
    old_usecache = Config.get_bool('compiler', 'use_cache')
    Config.set('compiler', 'use_cache', value=True)
    try:
        runpy.run_path(driver_file, run_name='__main__')
    # Catching all exceptions, including SystemExit
    except (Exception, SystemExit) as ex:
        # Corner case: If exited with error code 0, it is a success
        if isinstance(ex, SystemExit):
            # If the exit code is nonzero, "raise" will not trigger a
            # printout on the server
            if ex.code != 0:
                traceback.print_exc()
                raise
        else:
            raise

    self.show_output("Execution Terminated\n")

    # Revert configuration and output redirection
    Config.set('compiler', 'use_cache', value=old_usecache)
    sys.stdout = stdout
    sys.stderr = stderr
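# The stdout/stderr redirection in run_local relies on DIODE's
# FunctionStreamWrapper. A simplified stand-in (a sketch, not the actual DaCe
# class) that tees every write to several callables looks roughly like this:
class _TeeStreamSketch:
    def __init__(self, *writers):
        self.writers = writers

    def write(self, text):
        # Forward the text to every writer, e.g. a client callback and the
        # original stream's write method.
        for w in self.writers:
            w(text)

    def flush(self):
        pass

# Example mirroring the redirection above:
#     sys.stdout = _TeeStreamSketch(self.show_output, stdout.write)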
def test_batchnorm2d_dp_gpu():
    bngpusdfg: dace.SDFG = batchnorm2d_data_parallelism_gpu.to_sdfg(
        strict=True)
    bngpusdfg.apply_transformations(GPUTransformSDFG)
    # bngpusdfg.view()

    # rnsdfg: dace.SDFG = resnet_basicblock_gpu.to_sdfg()
    # rnsdfg.view()

    program_objects = bngpusdfg.generate_code()
    from dace.codegen import compiler
    out_path = '.dacecache/local/batchnorm/' + bngpusdfg.name
    program_folder = compiler.generate_program_folder(bngpusdfg,
                                                      program_objects,
                                                      out_path)
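# The test above stops after generating the program folder. A continuation
# sketch (not part of the original test, assuming the default build
# configuration) would compile the folder and locate the resulting binary
# with the same calls build_checker() uses:
#
#     compiler.configure_and_compile(program_folder)
#     binary_path = compiler.get_binary_name(out_path, bngpusdfg.name)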
def run_remote(self, sdfg: SDFG, dace_state, fail_on_nonzero: bool):
    dace_progname = sdfg.name
    code_objects = sdfg.generate_code()
    use_mpi = Executor._use_mpi(code_objects)
    remote_workdir = self.config_get("execution", "general", "workdir")
    remote_base_path = self.config_get('default_build_folder')
    remote_dace_dir = os.path.join(remote_workdir, remote_base_path,
                                   dace_progname)
    try:
        tmpfolder = tempfile.mkdtemp()
        generate_program_folder(sdfg,
                                code_objects,
                                tmpfolder,
                                config=self._config)
        self.create_remote_directory(remote_dace_dir)
        self.copy_folder_to_remote(tmpfolder, remote_dace_dir)

        # call compile.py on the remote node in the copied folder
        self.remote_compile(remote_dace_dir, dace_progname)

        # copy the input file and the .so file (with the right name)
        # to remote_dace_dir
        so_name = "lib" + dace_progname + "." + self.config_get(
            'compiler', 'library_extension')
        self.copy_file_from_remote(
            os.path.join(remote_dace_dir, 'build', so_name),
            os.path.join(tmpfolder, so_name))
        self.copy_file_to_remote(os.path.join(tmpfolder, so_name),
                                 remote_dace_dir)

        dace_file = dace_state.get_dace_tmpfile()
        if dace_file is None:
            raise ValueError("Dace file is None!")
        remote_dace_file = os.path.join(remote_workdir,
                                        os.path.basename(dace_file))
        self.copy_file_to_remote(dace_file, remote_dace_file)

        self.remote_exec_dace(remote_workdir,
                              remote_dace_file,
                              use_mpi,
                              fail_on_nonzero,
                              repetitions=dace_state.repetitions)

        self.show_output("Execution Terminated\n")

        try:
            self.copy_file_from_remote(remote_workdir + "/results.log", ".")
        except RuntimeError:
            pass

        # Copy back the instrumentation and vectorization results
        try:
            self.copy_folder_from_remote(
                os.path.join(remote_dace_dir, 'perf'), ".")
        except RuntimeError:
            pass

        try:
            self.remote_delete_file(remote_workdir + "/results.log")
        except RuntimeError:
            pass

        self.remote_delete_file(remote_dace_file)
        self.remote_delete_dir(remote_dace_dir)
    except:
        # Running a custom script (the driver file), which can raise
        # any exception
        self.show_output(traceback.format_exc())
        raise

    self.counter += 1
def run_remote(self, sdfg: SDFG, dace_state, fail_on_nonzero: bool):
    dace_progname = sdfg.name
    code_objects = sdfg.generate_code()
    use_mpi = Executor._use_mpi(code_objects)
    remote_workdir = self.config_get("execution", "general", "workdir")
    remote_dace_dir = os.path.join(remote_workdir, ".dacecache",
                                   dace_progname)
    try:
        tmpfolder = tempfile.mkdtemp()
        generate_program_folder(sdfg,
                                code_objects,
                                tmpfolder,
                                config=self._config)
        self.create_remote_directory(remote_dace_dir)
        self.copy_folder_to_remote(tmpfolder, remote_dace_dir)

        # call compile.py on the remote node in the copied folder
        self.remote_compile(remote_dace_dir, dace_progname)

        # copy the input file and the .so file (with the right name)
        # to remote_dace_dir
        so_name = "lib" + dace_progname + "." + self.config_get(
            'compiler', 'library_extension')
        self.copy_file_from_remote(remote_dace_dir + "/build/" + so_name,
                                   tmpfolder + "/" + so_name)
        self.copy_file_to_remote(tmpfolder + "/" + so_name, remote_dace_dir)

        dace_file = dace_state.get_dace_tmpfile()
        if dace_file is None:
            raise ValueError("Dace file is None!")
        remote_dace_file = remote_workdir + "/" + os.path.basename(dace_file)
        self.copy_file_to_remote(dace_file, remote_dace_file)

        papi = PAPIUtils.is_papi_used(sdfg)

        # We got the file there, now we can run with different
        # configurations.
        if papi:
            multirun_num = PAPISettings.perf_multirun_num(
                config=self._config)
            for iteration in range(multirun_num):
                optdict, omp_thread_num = PAPIUtils.get_run_options(
                    self, iteration)

                self.remote_exec_dace(remote_workdir,
                                      remote_dace_file,
                                      use_mpi,
                                      fail_on_nonzero,
                                      omp_num_threads=omp_thread_num,
                                      repetitions=dace_state.repetitions,
                                      additional_options_dict=optdict)
        else:
            self.remote_exec_dace(remote_workdir,
                                  remote_dace_file,
                                  use_mpi,
                                  fail_on_nonzero,
                                  repetitions=dace_state.repetitions)

        self.show_output("Execution Terminated\n")

        try:
            self.copy_file_from_remote(remote_workdir + "/results.log", ".")
        except RuntimeError:
            pass

        if papi:
            # Copy back the vectorization results
            PAPIUtils.retrieve_vectorization_report(self, code_objects,
                                                    remote_dace_dir)

            # Copy back the instrumentation results
            PAPIUtils.retrieve_instrumentation_results(self, remote_workdir)

        try:
            self.remote_delete_file(remote_workdir + "/results.log")
        except RuntimeError:
            pass

        self.remote_delete_file(remote_dace_file)
        self.remote_delete_dir(remote_dace_dir)
    except:
        # Running a custom script (the driver file), which can raise
        # any exception
        self.show_output(traceback.format_exc())
        raise

    self.counter += 1
def run(self, dace_state, fail_on_nonzero=False):
    dace_progname = dace_state.get_sdfg().name
    code_objects = dace_state.get_generated_code()

    # Figure out whether we should use MPI for launching
    use_mpi = False
    for code_object in code_objects:
        if code_object.target.target_name == 'mpi':
            use_mpi = True
            break

    # Check counter validity
    PerfUtils.check_performance_counters(self)

    remote_workdir = Config.get("execution", "general", "workdir")
    remote_dace_dir = remote_workdir + "/.dacecache/%s/" % dace_progname
    self.show_output("Executing DaCe program " + dace_progname + " on " +
                     Config.get("execution", "general", "host") + "\n")

    try:
        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("Generating remote workspace")
        tmpfolder = tempfile.mkdtemp()
        generate_program_folder(dace_state.get_sdfg(), code_objects,
                                tmpfolder)
        self.create_remote_directory(remote_dace_dir)
        self.copy_folder_to_remote(tmpfolder, remote_dace_dir)

        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("Compiling...")

        # call compile.py on the remote node in the copied folder
        self.remote_compile(remote_dace_dir, dace_progname)

        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("Done compiling")

        # copy the input file and the .so file (with the right name)
        # to remote_dace_dir
        so_name = "lib" + dace_progname + "." + Config.get(
            'compiler', 'library_extension')
        self.copy_file_from_remote(remote_dace_dir + "/build/" + so_name,
                                   tmpfolder + "/" + so_name)
        self.copy_file_to_remote(tmpfolder + "/" + so_name, remote_dace_dir)

        dace_file = dace_state.get_dace_tmpfile()
        if dace_file is None:
            raise ValueError("Dace file is None!")

        # copy the SDFG
        try:
            local_sdfg = tmpfolder + "/sdfg.out"
            sdfg = dace_state.get_sdfg()
            sdfg.save(local_sdfg)
            remote_sdfg = remote_workdir + "/sdfg.out"
            self.copy_file_to_remote(local_sdfg, remote_sdfg)
        except:
            print("Could NOT save the SDFG")

        remote_dace_file = remote_workdir + "/" + os.path.basename(dace_file)
        self.copy_file_to_remote(dace_file, remote_dace_file)

        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("All files copied to remote")

        # We got the file there, now we can run with different
        # configurations.
        for iteration in range(0, PerfSettings.perf_multirun_num()):
            optdict, omp_thread_num = PerfUtils.get_run_options(
                self, iteration)

            self.remote_exec_dace(remote_workdir,
                                  remote_dace_file,
                                  use_mpi,
                                  fail_on_nonzero,
                                  omp_num_threads=omp_thread_num,
                                  additional_options_dict=optdict)

            if self.running_async:
                # Add information about what is being run
                self.async_host.notify("Done option threads=" +
                                       str(omp_thread_num))

        self.show_output("Execution Terminated\n")

        try:
            self.copy_file_from_remote(remote_workdir + "/results.log", ".")
        except:
            pass

        # Copy back the vectorization results
        PerfUtils.retrieve_vectorization_report(self, code_objects,
                                                remote_dace_dir)

        # Copy back the instrumentation results
        PerfUtils.retrieve_instrumentation_results(self, remote_workdir)

        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("Cleaning up")

        try:
            self.remote_delete_file(remote_workdir + "/results.log")
        except:
            print(
                "WARNING: results.log could not be transmitted (probably not created)"
            )

        self.remote_delete_file(remote_dace_file)
        self.remote_delete_dir(remote_dace_dir)

        def deferred():
            try:
                res = self.update_performance_plot("results.log",
                                                   str(self.counter))
                os.remove("results.log")
            except FileNotFoundError:
                print("WARNING: results.log could not be read")

        self.async_host.run_sync(deferred)

        if self.running_async:
            # Add information about what is being run
            self.async_host.notify("Done cleaning")

        # Also, update the performance data.
        self.rendered_graphs.set_memspeed_target()
        self.rendered_graphs.render_performance_data(
            Config.get("instrumentation", "papi_mode"))
    except Exception as e:
        print("\n\n\n")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        print("Running the program failed:")
        traceback.print_exc()
        print(
            "Inspect above output for more information about executed command sequence."
        )
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        if self.headless:
            sys.exit(1)

    if self.running_async:
        self.async_host.notify("All done")

    self.counter += 1