def run_HPC():
    """Launch a SLURM-backed dask cluster and run the pipeline on it.

    The worker count is read from the project config under
    ``num_hipergator_workers``. The cluster is pinned to exactly that many
    workers, ``start_tunnel`` is executed on the scheduler, and finally
    ``run`` is called with the config and ``debug=False``.
    """
    config = utils.read_config()
    worker_count = config["num_hipergator_workers"]

    # Extra sbatch directives: per-job stderr/stdout files and the account.
    sbatch_directives = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out",
    ]
    slurm_options = dict(
        processes=2,
        queue='hpg2-compute',
        cores=3,
        memory='11GB',
        walltime='24:00:00',
        job_extra=sbatch_directives,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=150,
    )
    cluster = SLURMCluster(**slurm_options)
    print(cluster.job_script())

    # minimum == maximum, so the adaptive range is a single fixed size.
    cluster.adapt(minimum=worker_count, maximum=worker_count)

    dask_client = Client(cluster)
    dask_client.run_on_scheduler(start_tunnel)

    run(config, debug=False)
def start_dask_cluster(number_of_workers, mem_size="10GB"):
    """Start a fixed-size SLURM dask cluster and return a connected client.

    Args:
        number_of_workers: used as both the minimum and the maximum of the
            adaptive scaling range.
        mem_size: value passed as ``memory`` to ``SLURMCluster``.

    Returns:
        The ``Client`` attached to the new cluster, after ``start_tunnel``
        has been run on the scheduler.
    """
    # Extra sbatch directives: per-job stderr/stdout files and the account.
    sbatch_directives = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out",
    ]
    slurm_options = dict(
        processes=1,
        queue='hpg2-compute',
        cores=1,
        memory=mem_size,
        walltime='24:00:00',
        job_extra=sbatch_directives,
        local_directory="/home/b.weinstein/logs/dask/",
        death_timeout=300,
    )
    cluster = SLURMCluster(**slurm_options)
    print(cluster.job_script())

    # minimum == maximum, so the adaptive range is a single fixed size.
    cluster.adapt(minimum=number_of_workers, maximum=number_of_workers)

    dask_client = Client(cluster)
    dask_client.run_on_scheduler(start_tunnel)
    return dask_client
def cli(scheduler_file, jlab_port, dash_port, notebook_dir, hostname, log_level):
    """Start jupyter lab on the scheduler host and print tunnel instructions.

    Connects to the scheduler described by ``scheduler_file``, asks the
    scheduler for its hostname, runs ``start_jlab`` there, and prints the
    ``ssh`` port-forwarding command together with the local URLs to open.

    Raises:
        KeyError: if the ``USER`` environment variable is not set.
    """
    log = get_logger(log_level)

    log.info('getting client with scheduler file: %s' % scheduler_file)
    scheduler_client = Client(scheduler_file=scheduler_file, timeout=30)
    log.debug('Client: %s' % scheduler_client)

    log.debug('Getting hostname where scheduler is running')
    host = scheduler_client.run_on_scheduler(socket.gethostname)
    log.info('host is %s' % host)

    log.info('Starting jupyter lab on host')
    scheduler_client.run_on_scheduler(
        start_jlab, host=host, port=jlab_port, notebook_dir=notebook_dir)
    log.debug('Done.')

    user = os.environ['USER']

    # Forward both the jupyter lab port and the dashboard (remote port 8787).
    tunnel_command = 'ssh -N -L {}:{}:{} -L {}:{}:8787 {}@{}'.format(
        jlab_port, host, jlab_port, dash_port, host, user, hostname)
    instructions = (
        'Run the following command from your local machine:',
        tunnel_command,
        'Then open the following URLs:',
        '\tJupyter lab: http://localhost:{}'.format(jlab_port),
        '\tDask dashboard: http://localhost:{}'.format(dash_port),
    )
    for line in instructions:
        print(line)
def reload_modules_on_workers(url, modulelist=None):
    """Run ``importlib.reload`` for each module on the workers and scheduler.

    Args:
        url: scheduler address handed to ``Client``.
        modulelist: iterable of modules to reload. ``None`` (the default) or
            an empty collection means there is nothing to do, and no
            connection is opened.
    """
    # The default used to be iterated directly, which raised TypeError.
    if not modulelist:
        return

    # The context manager closes the client connection when the work is done.
    with Client(url) as client:
        for mod in modulelist:
            print("reloading %s" % mod)
            client.run(importlib.reload, mod)
            client.run_on_scheduler(importlib.reload, mod)
def start(cpus=0, gpus=0, mem_size="10GB"):
    """Start a SLURM dask cluster with CPU and/or GPU workers.

    Args:
        cpus: number of CPU worker jobs to scale to; a CPU cluster is only
            created when this is greater than zero.
        gpus: number of GPU worker jobs to scale to; a GPU cluster is only
            created when this is truthy.
        mem_size: value passed as ``memory`` to ``SLURMCluster``.

    Returns:
        A ``Client`` connected to the cluster, after ``start_tunnel`` has
        been run on the scheduler.

    Raises:
        ValueError: if neither CPU nor GPU workers are requested. Previously
            this case fell through to an unbound ``cluster`` variable.
    """
    if not (cpus > 0 or gpus):
        raise ValueError(
            "start() requires cpus > 0 or a non-zero gpus; "
            "no cluster was requested")

    if cpus > 0:
        # Extra sbatch directives: per-job log files and the account.
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out",
        ]

        cluster = SLURMCluster(
            processes=1,
            queue='hpg2-compute',
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources cpu=1'],
            scheduler_options={"dashboard_address": ":8781"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        print(cluster.job_script())
        cluster.scale(cpus)

    # NOTE(review): when both cpus and gpus are requested, `cluster` is
    # rebound below, so the returned client is attached to the GPU cluster
    # only. Confirm that this is the intended behaviour.
    if gpus:
        # Same directives as above plus the GPU partition and one GPU per job.
        extra_args = [
            "--error=/orange/idtrees-collab/logs/dask-worker-%j.err",
            "--account=ewhite",
            "--output=/orange/idtrees-collab/logs/dask-worker-%j.out",
            "--partition=gpu",
            "--gpus=1",
        ]

        cluster = SLURMCluster(
            processes=1,
            cores=1,
            memory=mem_size,
            walltime='24:00:00',
            job_extra=extra_args,
            extra=['--resources gpu=1'],
            scheduler_options={"dashboard_address": ":8787"},
            local_directory="/orange/idtrees-collab/tmp/",
            death_timeout=300)

        cluster.scale(gpus)

    dask_client = Client(cluster)

    # Start dask
    dask_client.run_on_scheduler(start_tunnel)

    return dask_client
def install_libraries_on_workers(url, runlist=None):
    """Run a list of shell install commands on the workers and the scheduler.

    Each command is executed through ``os.system``, first on every worker
    and then on the scheduler, and the returned status is printed.

    Example, if already on the server::

        install_libraries_on_workers('127.0.0.1:8786')

    Args:
        url: scheduler address handed to ``Client``.
        runlist: shell command strings to execute in order. When ``None``,
            a built-in default list is used.
    """
    if runlist is None:
        # ('conda install python=3.6 -y' was once part of this list.)
        runlist = [
            'sudo apt-get -y install build-essential',
            'pip install -U pip',
            'sudo apt install libgl1-mesa-glx -y',
            'conda update scipy -y',
            'pip install git+https://github.com/sods/paramz.git',
            'pip install git+https://github.com/SheffieldML/GPy.git',
            'pip install git+https://github.com/lionfish0/dp4gp.git',
            'conda install dask-searchcv -c conda-forge -y',
            'pip install git+https://github.com/lionfish0/dask_dp4gp.git',
            'pip install numpy',
            'conda remove argcomplete -y',
            'pip install git+https://github.com/lionfish0/dialysis_analysis.git --upgrade',
        ]

    client = Client(url)

    # Pair each banner with the client method that targets that audience.
    targets = (
        ("Installing '%s' on workers...", client.run),
        ("Installing '%s' on scheduler...", client.run_on_scheduler),
    )
    for command in runlist:
        for banner, execute in targets:
            print(banner % command)
            status = execute(os.system, command)
            print(status)
def run_HPC(data_paths):
    """Run ``Generate.run`` for every site on a SLURM-backed dask cluster.

    Args:
        data_paths: indexed by site name; ``data_paths[site]`` is the
            collection mapped over ``Generate.run`` for that site.
    """
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client, wait

    DeepForest_config = config.load_config()
    worker_count = DeepForest_config["num_hipergator_workers"]

    # Extra sbatch directives: per-job stderr/stdout files and the account.
    sbatch_directives = [
        "--error=/home/b.weinstein/logs/dask-worker-%j.err",
        "--account=ewhite",
        "--output=/home/b.weinstein/logs/dask-worker-%j.out",
    ]
    slurm_options = dict(
        processes=1,
        queue='hpg2-compute',
        cores=1,
        memory='13GB',
        walltime='24:00:00',
        job_extra=sbatch_directives,
        local_directory="/home/b.weinstein/logs/",
        death_timeout=300,
    )
    cluster = SLURMCluster(**slurm_options)
    print(cluster.job_script())

    # minimum == maximum, so the adaptive range is a single fixed size.
    cluster.adapt(minimum=worker_count, maximum=worker_count)

    dask_client = Client(cluster)
    dask_client.run_on_scheduler(start_tunnel)

    # Sites are processed one after another; each site's tasks must all
    # finish before the next site is submitted.
    for site in data_paths:
        site_futures = dask_client.map(
            Generate.run,
            data_paths[site],
            site=site,
            DeepForest_config=DeepForest_config,
        )
        wait(site_futures)
        print("{} complete".format(site))

    print("All sites complete")
def cli(scheduler_file, jlab_port, dash_port, notebook_dir, hostname, log_level):
    """Start jupyter lab on the scheduler host and print tunnel instructions.

    Connects to the scheduler described by ``scheduler_file``, asks the
    scheduler for its hostname, runs ``start_jlab`` there, and prints the
    ``ssh`` port-forwarding command together with the local URLs to open.

    The printed ssh command deliberately carries no ``user@`` prefix
    (modification for an existing ssh key), so the ``USER`` environment
    variable is no longer read.
    """
    logger = get_logger(log_level)

    logger.info('getting client with scheduler file: %s' % scheduler_file)
    client = Client(scheduler_file=scheduler_file, timeout=30)
    logger.debug('Client: %s' % client)

    logger.debug('Getting hostname where scheduler is running')
    host = client.run_on_scheduler(socket.gethostname)
    logger.info('host is %s' % host)

    logger.info('Starting jupyter lab on host')
    client.run_on_scheduler(start_jlab, host=host, port=jlab_port,
                            notebook_dir=notebook_dir)
    logger.debug('Done.')

    print('Run the following command from your local machine:')
    # Forward both the jupyter lab port and the dashboard (remote port 8787).
    print('ssh -N -L {}:{}:{} -L {}:{}:8787 {}'.format(
        jlab_port, host, jlab_port, dash_port, host, hostname))
    print('Then open the following URLs:')
    print('\tJupyter lab: http://localhost:{}'.format(jlab_port))
    print('\tDask dashboard: http://localhost:{}'.format(dash_port))
#print (Client(Scheduler_IP)) c = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB') FramesBase = 4187 with open('data3.txt', mode='w') as file: traj_size = [600] for k in traj_size: # we have 3 trajectory sizes block_size = [144] for i in block_size: # changing blocks for j in range(1, 40): # changing files (5 files per block size) c.run_on_scheduler( submitCustomProfiler, '/data/03170/tg824689/BecksteinLab/scripts-DCD/stragglers_test_%d_%d_%d.txt' % (k, i, j)) # Provide the path to my file to all processes total = com_parallel_dask_distributed(FramesBase * k, i) total = delayed(total) start = time.time() output = total.compute(scheduler=c.get) total.visualize(filename='transpose.svg') tot_time = time.time() - start c.run_on_scheduler(removeCustomProfiler) file.write("DCD{} {} {} {} {} {} {} {}\n".format( k, i, j, output[1], output[2], output[3], output[4], tot_time)) file.flush()
def main(args):
    """Run the merge benchmark and print timing and transfer statistics.

    Connects to an existing scheduler (``args.sched_addr``) or creates a
    cluster from ``get_cluster_options``, optionally sets up memory pools
    for the GPU case, executes ``args.runs`` runs (only the last one is
    profiled), and prints a report. In multi-node mode the client calls
    ``shutdown()`` and ``close()`` at the end.

    Parameters
    ----------
    args
        Parsed command-line options, read as attributes throughout.
        ``base_chunks`` and ``other_chunks`` are filled in with the worker
        count when they are falsy.
    """
    options = get_cluster_options(args)
    cluster_cls = options["class"]
    cluster_positional = options["args"]
    cluster_keywords = options["kwargs"]
    scheduler_addr = options["scheduler_addr"]

    if args.sched_addr:
        # Attach to a scheduler that is already running.
        client = Client(args.sched_addr)
    else:
        filterwarnings(
            "ignore", message=".*NVLink.*rmm_pool_size.*", category=UserWarning
        )
        cluster = cluster_cls(*cluster_positional, **cluster_keywords)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)
            client = Client(scheduler_addr)
        else:
            client = Client(cluster)

    if args.type == "gpu":
        shared_pool_options = dict(
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            **shared_pool_options,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            **shared_pool_options,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    # Allow the number of chunks to vary between
    # the "base" and "other" DataFrames
    args.base_chunks = args.base_chunks or n_workers
    args.other_chunks = args.other_chunks or n_workers

    if args.all_to_all:
        all_to_all(client)

    # Only the last run is profiled.
    timings = [
        run(client, args, n_workers, write_profile=None)
        for _ in range(args.runs - 1)
    ]
    timings.append(run(client, args, n_workers, write_profile=args.profile))

    # Collect, aggregate, and print peer-to-peer bandwidths
    def _transfer_log(dask_worker):
        return dask_worker.incoming_transfer_log

    incoming_logs = client.run(_transfer_log)

    raw_bandwidths = defaultdict(list)
    raw_nbytes = defaultdict(list)
    for receiver, transfers in incoming_logs.items():
        for transfer in transfers:
            if transfer["total"] < args.ignore_size:
                continue
            pair = (receiver, transfer["who"])
            raw_bandwidths[pair].append(transfer["bandwidth"])
            raw_nbytes[pair].append(transfer["total"])

    # Re-key by worker name and reduce to quartiles / total bytes.
    bandwidths = {}
    for (w1, w2), samples in raw_bandwidths.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        quartiles = numpy.quantile(samples, [0.25, 0.50, 0.75])
        bandwidths[names] = ["%s/s" % format_bytes(q) for q in quartiles]

    total_nbytes = {}
    for (w1, w2), sizes in raw_nbytes.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        total_nbytes[names] = format_bytes(sum(sizes))

    if args.shuffle_join:
        broadcast = False
    elif args.broadcast_join:
        broadcast = True
    else:
        broadcast = "default"

    t_runs = numpy.empty(len(timings))

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"merge type | {args.type}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"base-chunks | {args.base_chunks}")
    print(f"other-chunks | {args.other_chunks}")
    print(f"broadcast | {broadcast}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.disable_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(timings[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for run_index, (data_processed, took) in enumerate(timings):
        throughput_text = format_bytes(int(data_processed / took))
        # Pad the wall-clock column to 15 characters.
        wall_clock = format_time(took).ljust(15)
        print(f"{wall_clock}| {throughput_text}/s")
        t_runs[run_index] = float(throughput_text.split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        # The string form is used for multi-node / external schedulers,
        # the zero-padded integer form otherwise.
        if args.multi_node or args.sched_addr:
            row_format = "(%s,%s) | %s %s %s (%s)"
        else:
            row_format = "(%02d,%02d) | %s %s %s (%s)"
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(row_format
                  % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
def main(args):
    """Run the merge benchmark on a ``LocalCUDACluster`` and print a report.

    A cluster is created on the local machine (with extra UCX options when
    the protocol is not ``"tcp"``), RMM is reinitialized on every worker and
    on the scheduler, ``args.runs`` runs are executed (only the last one is
    profiled), and throughput plus worker-to-worker transfer statistics are
    printed.

    Parameters
    ----------
    args
        Parsed command-line options, read as attributes throughout.
    """
    # Set up workers on the local machine
    common_cluster_kwargs = dict(
        protocol=args.protocol,
        n_workers=args.n_workers,
        CUDA_VISIBLE_DEVICES=args.devs,
    )
    if args.protocol == "tcp":
        cluster = LocalCUDACluster(**common_cluster_kwargs)
    else:
        transport_flags = dict(
            enable_tcp_over_ucx=args.enable_tcp_over_ucx,
            enable_infiniband=args.enable_infiniband,
            enable_nvlink=args.enable_nvlink,
        )
        cluster = LocalCUDACluster(
            ucx_net_devices="auto",
            **common_cluster_kwargs,
            **transport_flags,
        )
        initialize(create_cuda_context=True, **transport_flags)
    client = Client(cluster)

    def _setup_rmm(initial_pool_size=None):
        import rmm

        rmm.reinitialize(
            pool_allocator=not args.no_rmm_pool,
            devices=0,
            initial_pool_size=initial_pool_size,
        )
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)

    client.run(_setup_rmm)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(_setup_rmm, 1e9)

    # Only the last run is profiled.
    timings = [
        run(client, args, write_profile=None) for _ in range(args.runs - 1)
    ]
    timings.append(run(client, args, write_profile=args.profile))

    # Collect, aggregate, and print peer-to-peer bandwidths
    def _transfer_log(dask_worker):
        return dask_worker.incoming_transfer_log

    incoming_logs = client.run(_transfer_log)

    raw_bandwidths = defaultdict(list)
    raw_nbytes = defaultdict(list)
    for receiver, transfers in incoming_logs.items():
        for transfer in transfers:
            if transfer["total"] < args.ignore_size:
                continue
            pair = (receiver, transfer["who"])
            raw_bandwidths[pair].append(transfer["bandwidth"])
            raw_nbytes[pair].append(transfer["total"])

    # Re-key by worker name and reduce to quartiles / total bytes.
    bandwidths = {}
    for (w1, w2), samples in raw_bandwidths.items():
        names = (
            cluster.scheduler.workers[w1].name,
            cluster.scheduler.workers[w2].name,
        )
        quartiles = numpy.quantile(samples, [0.25, 0.50, 0.75])
        bandwidths[names] = ["%s/s" % format_bytes(q) for q in quartiles]

    total_nbytes = {}
    for (w1, w2), sizes in raw_nbytes.items():
        names = (
            cluster.scheduler.workers[w1].name,
            cluster.scheduler.workers[w2].name,
        )
        total_nbytes[names] = format_bytes(sum(sizes))

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.no_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(timings[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for data_processed, took in timings:
        throughput = int(data_processed / took)
        # Pad the wall-clock column to 15 characters.
        wall_clock = format_time(took).ljust(15)
        print(f"{wall_clock}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        row_format = "(%02d,%02d) | %s %s %s (%s)"
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(row_format
                  % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")
# XTC benchmark driver (this fragment ends right after the timer is started).
# For every block count and repetition it installs a custom profiler on the
# scheduler, copies the reference trajectory to a numbered file, builds a
# Universe on that copy and prepares the delayed computation.
with open('data.txt', mode='a') as file:
    traj_size = [600]
    for k in traj_size:  # trajectory sizes (only 600 is listed here)
        # Creating the universe for doing benchmark
        u1 = mda.Universe(PSF, DCD1)
        longXTC = os.path.abspath(os.path.normpath(os.path.join(os.getcwd(), 'files/newtraj.xtc')))
        # Doing benchmarks
        ii = 1
        # The list elements were written without separating commas, which is
        # a syntax error; commas restored.
        block_size = [1, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72]
        for i in block_size:  # changing blocks
            for j in range(1, 6):  # changing files (5 files per block size)
                # Create a new file
                c.run_on_scheduler(
                    submitCustomProfiler,
                    os.path.abspath(os.path.normpath(os.path.join(
                        os.getcwd(), 'files/XTC_{}_{}_{}.txt'.format(k, i, j)))))
                longXTC1 = os.path.abspath(os.path.normpath(os.path.join(
                    os.getcwd(), 'files/newtraj{}.xtc'.format(ii))))
                copyfile(longXTC, longXTC1)
                # Provide the path to my file to all processes
                my_path = os.path.normpath(os.path.join(os.getcwd(), longXTC1))
                longXTC1 = os.path.abspath(my_path)
                # Define a new universe with the new trajectory
                u = mda.Universe(PSF, longXTC1)
                print(u)
                print("frames in trajectory ", u.trajectory.n_frames)
                print(len(u.trajectory))
                mobile = u.select_atoms("(resid 1:29 or resid 60:121 or resid 160:214) and name CA")
                index = mobile.indices
                total = com_parallel_dask_distributed(mobile, index, i)
                total = delayed(total)
                start = time.time()
if __name__ == '__main__':
    # Usage: <script> SCHEDULER_ADDRESS BLOCK_COUNT PROFILE_OUTPUT_DIR
    Scheduler_IP = sys.argv[1]
    #SLURM_JOBID = sys.argv[2]
    print(Scheduler_IP)
    #print (Client(Scheduler_IP))
    c = Client(Scheduler_IP)
    with open('data3.txt', mode='w') as file:
        # Write the CSV header exactly once, terminated by a newline. It used
        # to be written before every data row without a newline, so it was
        # repeated and fused onto each row.
        file.write(
            'size,blocks,iter,t_comp_avg,t_comp_max,t_all_frame_avg,t_all_frame_max,tot_time\n'
        )
        traj_size = [600]
        for k in traj_size:  # trajectory sizes (only 600 is listed here)
            block_size = [int(sys.argv[2])]
            for i in block_size:  # changing blocks
                for j in range(10):  # ten repetitions per block size
                    # Tell the scheduler-side profiler where to write for
                    # this run.
                    c.run_on_scheduler(
                        submitCustomProfiler,
                        sys.argv[3] + '/stragglers_test_%d_%d_%d.txt' % (k, i, j))
                    # Provide the path to my file to all processes
                    total = com_parallel_dask_distributed(104675 * i, i)
                    total = delayed(total)
                    start = time.time()
                    # NOTE(review): newer dask releases replaced the `get=`
                    # keyword with `scheduler=`; confirm against the dask
                    # version this script targets.
                    output = total.compute(get=c.get)
                    tot_time = time.time() - start
                    c.run_on_scheduler(removeCustomProfiler)
                    file.write("{0},{1},{2},{3},{4},{5},{6},{7}\n".format(
                        k, i, j, output[1], output[2], output[3], output[4], tot_time))
                    # Flush per row so partial results survive an
                    # interrupted run.
                    file.flush()
# Start a one-worker SLURM cluster, watch the client for a while, then launch
# jupyter lab on the scheduler host and print the ssh tunnel command.
cluster = SLURMCluster(project='ewhite', death_timeout=100)
cluster.start_workers(1)
print(cluster.job_script())

from dask.distributed import Client

client = Client(cluster)

# Print a timestamp and the client state ten times, 20 seconds apart.
for counter in range(1, 11):
    print(datetime.now().strftime("%a, %d %B %Y %I:%M:%S"))
    print(client)
    sleep(20)

import socket

host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    """Launch jupyter lab bound to ``host`` and keep the process handle.

    The ``Popen`` object is stored on the scheduler as ``jlab_proc``.
    """
    import subprocess

    command = ['jupyter', 'lab', '--ip', host, '--no-browser']
    dask_scheduler.jlab_proc = subprocess.Popen(command)


client.run_on_scheduler(start_jlab)

print("ssh -N -L 8787:%s:8787 -L 8888:%s:8888 -l b.weinstein hpg2.rc.ufl.edu" % (host, host))
# Example dask-mpi script: connect a client, log an ssh tunnel command for the
# dashboard, and compute a reduction over a random dask array.
from dask_mpi import initialize

# initialize() is called before the remaining imports and before the Client
# is created; keep this ordering.
# NOTE(review): presumably this bootstraps the scheduler/workers across the
# MPI ranks; confirm against the dask-mpi documentation.
initialize()

import socket

from distributed.scheduler import logger
import dask.array as da
from dask.distributed import Client

client = Client()  # Connect this local process to remote workers

# Hostname as seen by the scheduler process, and the dashboard port it reports.
host = client.run_on_scheduler(socket.gethostname)
port = client.scheduler_info()['services']['dashboard']
login_node_address = (
    'supercomputer.university.edu'  # Change this to the address/domain of your login node
)

# Log the ssh command that forwards the dashboard port through the login node.
logger.info(f'ssh -N -L {port}:{host}:{port} {login_node_address}')

# Marker output: one line through the scheduler logger, one through stdout.
logger.info('HELLO' * 10)
print('WORLD' * 10)

# Standard deviation along axis 0 of a chunked random array.
x = da.random.random((200, 10_000, 5_000), chunks=(20, 1_000, 1_000))
y = x.std(axis=0)
y = y.compute()
print(y)
# Connect to the scheduler described by scheduler.json and launch a jupyter
# notebook server on the scheduler host.
from dask.distributed import Client
import socket

client = Client(scheduler_file='scheduler.json')
print(client)

host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    """Launch jupyter notebook bound to ``host`` and keep the process handle.

    The ``Popen`` object is stored on the scheduler as ``jlab_proc``.
    """
    import subprocess

    command = ['jupyter', 'notebook', '--ip', host]
    dask_scheduler.jlab_proc = subprocess.Popen(command)


client.run_on_scheduler(start_jlab)

print("HOST : %s" % host)
def main(args):
    """Run the merge benchmark and print timing and transfer statistics.

    Creates a cluster from ``get_cluster_options``, sets up memory pools on
    the workers and on the scheduler, executes ``args.runs`` runs (only the
    last one is profiled), and prints a report. In multi-node mode the
    client calls ``shutdown()`` and ``close()`` at the end.

    Parameters
    ----------
    args
        Parsed command-line options, read as attributes throughout.
    """
    options = get_cluster_options(args)
    cluster_cls = options["class"]
    cluster_positional = options["args"]
    cluster_keywords = options["kwargs"]
    scheduler_addr = options["scheduler_addr"]

    cluster = cluster_cls(*cluster_positional, **cluster_keywords)
    if args.multi_node:
        import time

        # Allow some time for workers to start and connect to scheduler
        # TODO: make this a command-line argument?
        time.sleep(15)
        client = Client(scheduler_addr)
    else:
        client = Client(cluster)

    client.run(setup_memory_pool, disable_pool=args.no_rmm_pool)
    # Create an RMM pool on the scheduler due to occasional deserialization
    # of CUDA objects. May cause issues with InfiniBand otherwise.
    client.run_on_scheduler(setup_memory_pool, 1e9, disable_pool=args.no_rmm_pool)

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)

    # Only the last run is profiled.
    timings = [
        run(client, args, n_workers, write_profile=None)
        for _ in range(args.runs - 1)
    ]
    timings.append(run(client, args, n_workers, write_profile=args.profile))

    # Collect, aggregate, and print peer-to-peer bandwidths
    def _transfer_log(dask_worker):
        return dask_worker.incoming_transfer_log

    incoming_logs = client.run(_transfer_log)

    raw_bandwidths = defaultdict(list)
    raw_nbytes = defaultdict(list)
    for receiver, transfers in incoming_logs.items():
        for transfer in transfers:
            if transfer["total"] < args.ignore_size:
                continue
            pair = (receiver, transfer["who"])
            raw_bandwidths[pair].append(transfer["bandwidth"])
            raw_nbytes[pair].append(transfer["total"])

    # Re-key by worker name and reduce to quartiles / total bytes.
    bandwidths = {}
    for (w1, w2), samples in raw_bandwidths.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        quartiles = numpy.quantile(samples, [0.25, 0.50, 0.75])
        bandwidths[names] = ["%s/s" % format_bytes(q) for q in quartiles]

    total_nbytes = {}
    for (w1, w2), sizes in raw_nbytes.items():
        names = (scheduler_workers[w1].name, scheduler_workers[w2].name)
        total_nbytes[names] = format_bytes(sum(sizes))

    if args.markdown:
        print("```")
    print("Merge benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"rows-per-chunk | {args.chunk_size}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    print(f"rmm-pool | {(not args.no_rmm_pool)}")
    print(f"frac-match | {args.frac_match}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(timings[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for data_processed, took in timings:
        throughput = int(data_processed / took)
        # Pad the wall-clock column to 15 characters.
        wall_clock = format_time(took).ljust(15)
        print(f"{wall_clock}| {format_bytes(throughput)}/s")
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.backend == "dask":
        if args.markdown:
            print("<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```")
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        # The string form is used in multi-node mode, the zero-padded
        # integer form otherwise.
        if args.multi_node:
            row_format = "(%s,%s) | %s %s %s (%s)"
        else:
            row_format = "(%02d,%02d) | %s %s %s (%s)"
        for (d1, d2), bw in sorted(bandwidths.items()):
            print(row_format
                  % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.multi_node:
        client.shutdown()
        client.close()
def main(args):
    """Run the shuffle benchmark, print a report and optionally append JSON.

    Connects to an existing scheduler (``args.sched_addr``) or creates a
    cluster from ``get_cluster_options``, optionally sets up memory pools
    for the GPU case, executes ``args.runs`` runs (only the last one is
    profiled), and prints throughput and worker-to-worker transfer
    statistics. When ``args.benchmark_json`` is set, one JSON line per run
    is appended to that file. In multi-node mode the client calls
    ``shutdown()`` and ``close()`` at the end.

    Parameters
    ----------
    args
        Parsed command-line options, read as attributes throughout.
    """
    cluster_options = get_cluster_options(args)
    Cluster = cluster_options["class"]
    cluster_args = cluster_options["args"]
    cluster_kwargs = cluster_options["kwargs"]
    scheduler_addr = cluster_options["scheduler_addr"]

    if args.sched_addr:
        # Attach to a scheduler that is already running.
        client = Client(args.sched_addr)
    else:
        filterwarnings(
            "ignore", message=".*NVLink.*rmm_pool_size.*", category=UserWarning
        )

        cluster = Cluster(*cluster_args, **cluster_kwargs)
        if args.multi_node:
            import time

            # Allow some time for workers to start and connect to scheduler
            # TODO: make this a command-line argument?
            time.sleep(15)

        client = Client(scheduler_addr if args.multi_node else cluster)

    if args.type == "gpu":
        client.run(
            setup_memory_pool,
            pool_size=args.rmm_pool_size,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )
        # Create an RMM pool on the scheduler due to occasional deserialization
        # of CUDA objects. May cause issues with InfiniBand otherwise.
        client.run_on_scheduler(
            setup_memory_pool,
            pool_size=1e9,
            disable_pool=args.disable_rmm_pool,
            log_directory=args.rmm_log_directory,
        )

    scheduler_workers = client.run_on_scheduler(get_scheduler_workers)
    n_workers = len(scheduler_workers)
    client.wait_for_workers(n_workers)

    if args.all_to_all:
        all_to_all(client)

    took_list = []
    for _ in range(args.runs - 1):
        took_list.append(run(client, args, n_workers, write_profile=None))
    took_list.append(
        run(client, args, n_workers, write_profile=args.profile)
    )  # Only profiling the last run

    # Collect, aggregate, and print peer-to-peer bandwidths
    incoming_logs = client.run(
        lambda dask_worker: dask_worker.incoming_transfer_log)
    bandwidths = defaultdict(list)
    total_nbytes = defaultdict(list)
    for k, L in incoming_logs.items():
        for d in L:
            # Transfers below the size threshold are left out of the stats.
            if d["total"] >= args.ignore_size:
                bandwidths[k, d["who"]].append(d["bandwidth"])
                total_nbytes[k, d["who"]].append(d["total"])
    # Re-key by worker name and reduce to quartiles / total bytes.
    bandwidths = {
        (scheduler_workers[w1].name, scheduler_workers[w2].name): [
            "%s/s" % format_bytes(x)
            for x in numpy.quantile(v, [0.25, 0.50, 0.75])
        ]
        for (w1, w2), v in bandwidths.items()
    }
    total_nbytes = {
        (
            scheduler_workers[w1].name,
            scheduler_workers[w2].name,
        ): format_bytes(sum(nb))
        for (w1, w2), nb in total_nbytes.items()
    }

    t_runs = numpy.empty(len(took_list))
    if args.markdown:
        print("```")
    print("Shuffle benchmark")
    print("-------------------------------")
    print(f"backend | {args.backend}")
    print(f"partition-size | {format_bytes(args.partition_size)}")
    print(f"in-parts | {args.in_parts}")
    print(f"protocol | {args.protocol}")
    print(f"device(s) | {args.devs}")
    if args.device_memory_limit:
        print(f"memory-limit | {format_bytes(args.device_memory_limit)}")
    print(f"rmm-pool | {(not args.disable_rmm_pool)}")
    if args.protocol == "ucx":
        print(f"tcp | {args.enable_tcp_over_ucx}")
        print(f"ib | {args.enable_infiniband}")
        print(f"nvlink | {args.enable_nvlink}")
    print(f"data-processed | {format_bytes(took_list[0][0])}")
    print("===============================")
    print("Wall-clock | Throughput")
    print("-------------------------------")
    for idx, (data_processed, took) in enumerate(took_list):
        throughput = int(data_processed / took)
        m = format_time(took)
        # Pad the wall-clock column to 15 characters.
        m += " " * (15 - len(m))
        print(f"{m}| {format_bytes(throughput)}/s")
        t_runs[idx] = float(format_bytes(throughput).split(" ")[0])
    print("===============================")
    if args.markdown:
        print("\n```")

    if args.plot is not None:
        plot_benchmark(t_runs, args.plot, historical=True)

    if args.backend == "dask":
        if args.markdown:
            print(
                "<details>\n<summary>Worker-Worker Transfer Rates</summary>\n\n```"
            )
        print("(w1,w2) | 25% 50% 75% (total nbytes)")
        print("-------------------------------")
        for (d1, d2), bw in sorted(bandwidths.items()):
            # The string form is used for multi-node / external schedulers,
            # the zero-padded integer form otherwise.
            fmt = ("(%s,%s) | %s %s %s (%s)"
                   if args.multi_node or args.sched_addr else
                   "(%02d,%02d) | %s %s %s (%s)")
            print(fmt % (d1, d2, bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]))
        if args.markdown:
            print("```\n</details>\n")

    if args.benchmark_json:
        # One key per worker pair and statistic. The first alternative must
        # be an f-string: as a plain literal every pair produced the same
        # key, so the dict collapsed to a single entry.
        bandwidths_json = {
            f"bandwidth_({d1},{d2})_{i}"
            if args.multi_node or args.sched_addr
            else "(%02d,%02d)_%s" % (d1, d2, i): parse_bytes(v.rstrip("/s"))
            for (d1, d2), bw in sorted(bandwidths.items())
            for i, v in zip(
                ["25%", "50%", "75%", "total_nbytes"],
                [bw[0], bw[1], bw[2], total_nbytes[(d1, d2)]],
            )
        }

        # Append one JSON line per run; the bandwidth summary is repeated
        # on every line.
        with open(args.benchmark_json, "a") as fp:
            for data_processed, took in took_list:
                fp.write(
                    dumps(
                        dict(
                            {
                                "backend": args.backend,
                                "partition_size": args.partition_size,
                                "in_parts": args.in_parts,
                                "protocol": args.protocol,
                                "devs": args.devs,
                                "device_memory_limit": args.device_memory_limit,
                                "rmm_pool": not args.disable_rmm_pool,
                                "tcp": args.enable_tcp_over_ucx,
                                "ib": args.enable_infiniband,
                                "nvlink": args.enable_nvlink,
                                "data_processed": data_processed,
                                "wall_clock": took,
                                "throughput": data_processed / took,
                            },
                            **bandwidths_json,
                        )
                    )
                    + "\n"
                )

    if args.multi_node:
        client.shutdown()
        client.close()
class Client(elfi.client.ClientBase):
    """ELFI client that delegates task execution to a dask client."""

    def __init__(self):
        """Create the underlying dask client and an empty task registry."""
        self.dask_client = DaskClient()
        self.tasks = {}
        self._id_counter = itertools.count()

    def apply(self, kallable, *args, **kwargs):
        """Submit `kallable(*args, **kwargs)` and return without waiting.

        Parameters
        ----------
        kallable: callable

        Returns
        -------
        task_id: int
            Identifier under which the submitted task is tracked.

        """
        task_id = next(self._id_counter)
        self.tasks[task_id] = self.dask_client.submit(kallable, *args, **kwargs)
        return task_id

    def apply_sync(self, kallable, *args, **kwargs):
        """Run `kallable(*args, **kwargs)` on the scheduler and return its result.

        Parameters
        ----------
        kallable: callable

        """
        return self.dask_client.run_on_scheduler(kallable, *args, **kwargs)

    def get_result(self, task_id):
        """Return the result of task `task_id`, waiting until it arrives.

        The task is removed from the registry before its result is awaited.

        Parameters
        ----------
        task_id: int

        Returns
        -------
        dict

        """
        return self.tasks.pop(task_id).result()

    def is_ready(self, task_id):
        """Tell whether the task identified by `task_id` is done.

        Parameters
        ----------
        task_id: int

        Returns
        -------
        bool

        """
        future = self.tasks[task_id]
        return future.done()

    def remove_task(self, task_id):
        """Forget task `task_id`, cancelling it if it has not finished.

        Parameters
        ----------
        task_id: int

        """
        future = self.tasks.pop(task_id)
        if future.done():
            return
        future.cancel()

    def reset(self):
        """Shut down through the dask client and clear the task registry."""
        self.dask_client.shutdown()
        self.tasks.clear()

    @property
    def num_cores(self):
        """Return the value of `os.cpu_count()`.

        Returns
        -------
        int

        """
        return os.cpu_count()