def eval_parameter_grid(run_ids, job_name, eval_function, parameter_grid, n_gpus=1):
    """Evaluate one run per grid configuration on a SLURM-backed dask cluster.

    The grid is expanded with ``ParameterGrid`` and configuration ``i`` is
    paired with ``run_ids[i]``. The ``n_samples``, ``loss`` and
    ``fixed_masks`` entries are stripped from every configuration before it
    is forwarded to ``eval_function`` (which is always called with
    ``n_samples=50``) and merged back, using their defaults when they were
    absent, before the configuration is printed with its results.

    Args:
        run_ids: run identifiers, in grid order.
        job_name: name given to the SLURM jobs.
        eval_function: callable submitted to the workers; its result is
            unpacked as ``(metrics_names, eval_res)``.
        parameter_grid: grid specification accepted by ``ParameterGrid``.
        n_gpus: number of GPUs requested per job through ``--gres``.
    """
    grid_configs = list(ParameterGrid(parameter_grid))

    # eval
    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='5:00:00',
        interface='ib0',
        job_extra=[
            f'--gres=gpu:{n_gpus}',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(len(grid_configs))
    client = Client(cluster)

    # (key, default) pairs removed from each configuration before submission.
    stripped_defaults = (
        ('n_samples', None),
        ('loss', 'mae'),
        ('fixed_masks', False),
    )
    stripped_values = [
        {key: config.pop(key, default) for key, default in stripped_defaults}
        for config in grid_configs
    ]

    pending = []
    for run_id, config in zip(run_ids, grid_configs):
        pending.append(client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=50,
            **config,
        ))

    for config, removed, future in zip(grid_configs, stripped_values, pending):
        metrics_names, eval_res = client.gather(future)
        # Restore the stripped entries so the printout shows the full setup.
        config.update(removed)
        print('Parameters', config)
        print(metrics_names)
        print(eval_res)

    print('Shutting down dask workers')
    client.close()
    cluster.close()
def train_eval_dealiasers(contrast='CORPD_FBK', n_epochs=200, n_samples=None, model_name=None, model_size=None, loss='mae'):
    """Train the selected dealiasing models on SLURM, then evaluate them.

    Model specifications come from ``get_model_specs(force_res=True,
    dealiasing=True)`` and can be narrowed by name (first field) and size
    (second field). One ``train_dealiaser`` task is submitted per remaining
    specification; the gathered results are used as run ids and handed to
    ``eval_dealiasers``.

    Args:
        contrast: forwarded to training and evaluation.
        n_epochs: forwarded to training and evaluation.
        n_samples: forwarded to training, and to evaluation as
            ``n_samples_train``.
        model_name: when not ``None``, keep only specs with this name.
        model_size: when not ``None``, keep only specs with this size.
        loss: forwarded to training and evaluation.

    Returns:
        The values returned by the training tasks, in spec order.
    """
    job_name = 'dealiasing_fastmri'
    selected_specs = [
        spec
        for spec in get_model_specs(force_res=True, dealiasing=True)
        if (model_name is None or spec[0] == model_name)
        and (model_size is None or spec[1] == model_size)
    ]

    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.adapt(minimum_jobs=0, maximum_jobs=len(selected_specs))
    client = Client(cluster)

    pending = []
    for spec_name, spec_size, model_fun, model_kwargs, _, n_scales, _ in selected_specs:
        pending.append(client.submit(
            # function to execute
            train_dealiaser,
            model_fun=model_fun,
            model_kwargs=model_kwargs,
            run_id=f'{spec_name}_{spec_size}',
            n_scales=n_scales,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            loss=loss,
        ))
    run_ids = client.gather(pending)
    client.close()
    cluster.close()

    # eval
    eval_dealiasers(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
        loss=loss,
    )
    return run_ids
def eval_parameter_grid(job_name, eval_function, parameter_grid, run_ids, n_samples_eval=None):
    """Evaluate one run per grid configuration and collect the results.

    The grid is expanded with ``ParameterGrid`` and must contain exactly as
    many configurations as there are run ids. A ``n_samples`` entry, when
    present, is withheld from the call to ``eval_function`` (which receives
    ``n_samples=n_samples_eval`` instead) and restored in the returned
    configuration.

    Args:
        job_name: name given to the SLURM jobs.
        eval_function: callable submitted to the workers; its result is
            unpacked as ``(metrics_names, eval_res)``.
        parameter_grid: grid specification accepted by ``ParameterGrid``.
        run_ids: run identifiers, in grid order.
        n_samples_eval: value passed to ``eval_function`` as ``n_samples``.

    Returns:
        ``(metrics_names, results)`` where ``metrics_names`` comes from the
        last gathered task and ``results`` is a list of
        ``(configuration, eval_res)`` pairs.
    """
    configs = list(ParameterGrid(parameter_grid))
    n_configs = len(configs)
    assert n_configs == len(run_ids), 'Not enough run ids provided for grid evaluation'

    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='3:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(n_configs)
    client = Client(cluster)

    # -1 marks configurations that had no 'n_samples' entry to begin with.
    withheld_n_samples = [config.pop('n_samples', -1) for config in configs]

    pending = []
    for run_id, config in zip(run_ids, configs):
        pending.append(client.submit(
            # function to execute
            eval_function,
            run_id=run_id,
            n_samples=n_samples_eval,
            **config,
        ))

    results = []
    for config, future, n_samples in zip(configs, pending, withheld_n_samples):
        metrics_names, eval_res = client.gather(future)
        if n_samples != -1:
            config['n_samples'] = n_samples
        results.append((config, eval_res))

    print('Shutting down dask workers')
    client.close()
    cluster.close()
    return metrics_names, results
class Cluster:
    """Own a ``SLURMCluster`` and a dask ``Client`` connected to it.

    Attributes are set in ``__init__``:
        cluster: the ``SLURMCluster`` instance.
        cli: the ``Client`` connected to the cluster's scheduler address.
    """

    def __init__(self):
        """Create the cluster, request 25 workers and connect a client."""
        print("Start Cluster")
        self.cluster = SLURMCluster(
            memory='16g',
            processes=1,
            cores=1,
            death_timeout=200,
            walltime="168:00:00",
            job_extra=['--partition=Sibirien'],
        )
        self.cluster.start_workers(25)
        self.cli = Client(self.cluster.scheduler.address)

    def close(self):
        """Close the client first, then the cluster.

        The cluster is closed even if closing the client raises, so the
        SLURM jobs are always released.
        """
        client = getattr(self, 'cli', None)
        try:
            if client is not None:
                client.close()
        finally:
            self.cluster.close()
def train_eval_parameter_grid(job_name, train_function, eval_function, parameter_grid, n_samples_eval=None):
    """Train one model per grid configuration on SLURM, then evaluate them.

    Every configuration produced by ``ParameterGrid(parameter_grid)`` is
    passed as keyword arguments to ``train_function`` on its own worker. The
    gathered results are used as run ids for ``eval_parameter_grid``.

    Args:
        job_name: name given to the SLURM jobs (training and evaluation).
        train_function: callable submitted once per configuration.
        eval_function: forwarded to ``eval_parameter_grid``.
        parameter_grid: grid specification accepted by ``ParameterGrid``.
        n_samples_eval: forwarded to ``eval_parameter_grid``.

    Returns:
        Whatever ``eval_parameter_grid`` returns.
    """
    grid_configs = list(ParameterGrid(parameter_grid))

    cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='60GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/understanding-unets',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(len(grid_configs))
    client = Client(cluster)

    pending = []
    for config in grid_configs:
        # function to execute, called with the configuration as kwargs
        pending.append(client.submit(train_function, **config))
    run_ids = client.gather(pending)
    client.close()
    cluster.close()

    # eval
    return eval_parameter_grid(
        job_name,
        eval_function,
        parameter_grid,
        run_ids,
        n_samples_eval=n_samples_eval,
    )
def train_eval_parameter_grid(job_name, train_function, eval_function, parameter_grid):
    """Train one model per grid configuration on SLURM, then evaluate them.

    Every configuration produced by ``ParameterGrid(parameter_grid)`` is
    passed as keyword arguments to ``train_function`` on its own worker. The
    gathered results are handed, as run ids, to ``eval_parameter_grid``.
    Nothing is returned.

    Args:
        job_name: name given to the SLURM jobs (training and evaluation).
        train_function: callable submitted once per configuration.
        eval_function: forwarded to ``eval_parameter_grid``.
        parameter_grid: grid specification accepted by ``ParameterGrid``.
    """
    grid_configs = list(ParameterGrid(parameter_grid))

    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.scale(len(grid_configs))
    client = Client(cluster)

    pending = []
    for config in grid_configs:
        # function to execute, called with the configuration as kwargs
        pending.append(client.submit(train_function, **config))
    run_ids = client.gather(pending)
    client.close()
    cluster.close()

    eval_parameter_grid(run_ids, job_name, eval_function, parameter_grid)
def eval_plug_and_play(
        run_ids,
        job_name='eval_pandp',
        contrast='CORPD_FBK',
        n_samples_train=None,
        n_epochs=200,
        af=4,
        n_primal=5,
        train_partition='gpu_p1',
        model_name=None,
        model_size=None,
    ):
    """Evaluate trained models on SLURM and write their metrics to a CSV file.

    Model specifications come from ``get_model_specs(force_res=False,
    n_primal=n_primal)`` and can be narrowed by name (first field) and size
    (second field). Spec ``i`` is evaluated with ``run_ids[i]`` through
    ``evaluate_xpdnet``. The first two entries of each evaluation result are
    stored as ``psnr`` and ``ssim``.

    Args:
        run_ids: run identifiers, in spec order.
        job_name: name given to the SLURM jobs.
        contrast: forwarded to ``evaluate_xpdnet``.
        n_samples_train: only used to build the output file name.
        n_epochs: forwarded to ``evaluate_xpdnet``.
        af: forwarded to ``evaluate_xpdnet``.
        n_primal: forwarded to ``get_model_specs``.
        train_partition: accepted for signature compatibility; unused here.
        model_name: when not ``None``, keep only specs with this name and add
            it to the output file name.
        model_size: when not ``None``, keep only specs with this size.

    Returns:
        ``run_ids``, unchanged.
    """
    model_specs = list(get_model_specs(force_res=False, n_primal=n_primal))
    if model_name is not None:
        model_specs = [ms for ms in model_specs if ms[0] == model_name]
    if model_size is not None:
        model_specs = [ms for ms in model_specs if ms[1] == model_size]
    n_models = len(model_specs)

    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='2:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    client = None
    try:
        eval_cluster.adapt(minimum_jobs=0, maximum_jobs=n_models)
        client = Client(eval_cluster)
        futures = [
            client.submit(
                # function to execute
                evaluate_xpdnet,
                model_fun=model_fun,
                model_kwargs=kwargs,
                run_id=run_id,
                multicoil=False,
                n_samples=50,
                contrast=contrast,
                af=af,
                n_epochs=n_epochs,
                n_scales=n_scales,
                res=res,
            )
            for run_id, (_, _, model_fun, kwargs, _, n_scales, res)
            in zip(run_ids, model_specs)
        ]

        # Collect plain rows and build the frame once: DataFrame.append was
        # removed in pandas 2.0. The loop names are distinct from the
        # `model_size` parameter so the argument is not rebound.
        rows = []
        for (spec_name, spec_size, _, _, _, _, _), future in zip(model_specs, futures):
            _, eval_res = client.gather(future)
            rows.append(dict(
                model_name=spec_name,
                model_size=spec_size,
                psnr=eval_res[0],
                ssim=eval_res[1],
            ))
        df_results = pd.DataFrame(
            rows, columns='model_name model_size psnr ssim'.split())

        print(df_results)
        outputs_file = f'reconstruction_results_{n_samples_train}.csv'
        if model_name is not None:
            outputs_file = f'reconstruction_results_{n_samples_train}_{model_name}.csv'
        df_results.to_csv(outputs_file)
    finally:
        # Release the SLURM jobs even when an evaluation task fails.
        print('Shutting down dask workers')
        if client is not None:
            client.close()
        eval_cluster.close()
    return run_ids
import logging, time
import xarray as xr
from dask.distributed import Client
from typing import List, Optional, Tuple, Dict, Any
from dask_jobqueue import SLURMCluster

# Script: open a remote dataset, compute the per-calendar-month mean of one
# variable over time while a SLURM-backed dask cluster is running, print the
# result and the elapsed time.

logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

# Name of the data variable to average; used as the dataset key below.
variable = "tas"
uri = 'https://dataserver.nccs.nasa.gov/thredds/dodsC/bypass/CREATE-IP/reanalysis/MERRA2/mon/atmos/tas.ncml'

cluster = SLURMCluster(queue="myNodes")
try:
    cluster.adapt(minimum=1, maximum=4, interval="2s", wait_count=500)
    print("CLUSTER JOB SCRIPT: " + cluster.job_script())
    client = Client(cluster)
    try:
        t0 = time.time()
        # The context manager closes the remote dataset handle once the
        # result has been loaded into memory.
        with xr.open_dataset(uri) as dset:
            da: xr.DataArray = dset[variable]
            da2: xr.DataArray = da.groupby('time.month').mean('time')
            da_monthly = da2.load()
        print(da_monthly)
        print(" Completed computation in " + str(time.time() - t0) + " seconds")
    finally:
        client.close()
finally:
    # Always release the SLURM jobs, including when the computation fails.
    cluster.close()
def full_pipeline_dask(job_name, train_function, eval_function, infer_function, **kwargs):
    """Run training, fine-tuning, inference and evaluation on SLURM clusters.

    Stages, each on its own ``SLURMCluster``:

    1. ``train_function`` once per acceleration factor (4 and 8).
    2. ``train_function`` again for every (acceleration factor, contrast)
       pair, passing the stage-1 result as ``original_run_id``.
    3. ``infer_function`` and ``eval_function`` for every pair, using the
       stage-2 results as run ids. The evaluation results are printed.

    When the ``FASTMRI_DEBUG`` environment variable is set to a non-empty
    value, all epoch and sample counts are reduced to 1.

    Args:
        job_name: name given to the SLURM jobs; also passed to
            ``infer_function`` as ``exp_id``.
        train_function: callable used for training and fine-tuning.
        eval_function: callable whose result is unpacked as
            ``(metrics_names, eval_res)``.
        infer_function: callable run for its side effects; its result is
            gathered and discarded.
        **kwargs: forwarded to every submitted call. A ``loss`` entry is
            forwarded to the two training stages only.
    """
    if os.environ.get('FASTMRI_DEBUG'):
        n_epochs_train = 1
        n_epochs_fine_tune = 1
        n_eval_samples = 1
        n_inference_samples = 1
    else:
        n_epochs_train = 250
        n_epochs_fine_tune = 50
        n_eval_samples = 50
        n_inference_samples = None

    def make_cluster(job_cpu, walltime, n_gpus, qos):
        # The three stages differ only in these four settings.
        return SLURMCluster(
            cores=1,
            job_cpu=job_cpu,
            memory='80GB',
            job_name=job_name,
            walltime=walltime,
            interface='ib0',
            job_extra=[
                f'--gres=gpu:{n_gpus}',
                f'--qos={qos}',
                '--distribution=block:block',
                '--hint=nomultithread',
                '--output=%x_%j.out',
            ],
            env_extra=[
                'cd $WORK/fastmri-reproducible-benchmark',
                '. ./submission_scripts_jean_zay/env_config.sh',
            ],
        )

    acceleration_factors = [4, 8]
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    # One entry per fine-tuned model, in the order the run ids are produced.
    configurations = [
        (af, contrast)
        for af in acceleration_factors
        for contrast in contrasts
    ]

    # original training
    train_cluster = make_cluster(
        job_cpu=20, walltime='100:00:00', n_gpus=1, qos='qos_gpu-t4')
    client = None
    try:
        train_cluster.scale(len(acceleration_factors))
        client = Client(train_cluster)
        futures = [
            client.submit(
                # function to execute
                train_function,
                af=af,
                n_epochs=n_epochs_train,
                **kwargs,
                # this function has side effects, so it must not be
                # treated as pure by the scheduler
                pure=False,
            )
            for af in acceleration_factors
        ]
        run_ids = client.gather(futures)
    finally:
        if client is not None:
            client.close()
        train_cluster.close()

    # fine tuning
    fine_tuning_cluster = make_cluster(
        job_cpu=20, walltime='20:00:00', n_gpus=1, qos='qos_gpu-t3')
    client = None
    try:
        fine_tuning_cluster.scale(len(configurations))
        client = Client(fine_tuning_cluster)
        futures = [
            client.submit(
                # function to execute
                train_function,
                af=af,
                contrast=contrast,
                original_run_id=run_id,
                n_epochs=n_epochs_fine_tune,
                **kwargs,
                pure=False,
            )
            for af, run_id in zip(acceleration_factors, run_ids)
            for contrast in contrasts
        ]
        fine_tuned_run_ids = client.gather(futures)
    finally:
        if client is not None:
            client.close()
        fine_tuning_cluster.close()

    # inference and eval
    inference_eval_cluster = make_cluster(
        job_cpu=40, walltime='20:00:00', n_gpus=4, qos='qos_gpu-t3')
    client = None
    try:
        # One worker per inference task plus one per evaluation task.
        inference_eval_cluster.scale(2 * len(configurations))
        client = Client(inference_eval_cluster)
        # `loss` only concerns training; tolerate callers that never set it.
        kwargs.pop('loss', None)
        inference_futures = []
        eval_futures = []
        for (af, contrast), run_id in zip(configurations, fine_tuned_run_ids):
            inference_futures.append(client.submit(
                # function to execute
                infer_function,
                contrast=contrast,
                af=af,
                run_id=run_id,
                n_epochs=n_epochs_fine_tune,
                n_samples=n_inference_samples,
                exp_id=job_name,
                **kwargs,
                pure=False,
            ))
            eval_futures.append(client.submit(
                # function to execute
                eval_function,
                contrast=contrast,
                af=af,
                run_id=run_id,
                n_epochs=n_epochs_fine_tune,
                n_samples=n_eval_samples,
                **kwargs,
                pure=False,
            ))
        client.gather(inference_futures)

        # eval printing
        for (af, contrast), future in zip(configurations, eval_futures):
            metrics_names, eval_res = client.gather(future)
            print('AF', af)
            print('Contrast', contrast)
            print(metrics_names)
            print(eval_res)
    finally:
        print('Shutting down dask workers')
        if client is not None:
            client.close()
        inference_eval_cluster.close()
class dask_controller:
    """Start a local or SLURM-backed dask client and map a function over FOVs.

    adapted from Charles' code

    ``mapfovs`` stores one future per field-of-view id in ``self.futures``
    (a dict keyed by FOV id); the ``retry_*`` and ``printprogress`` methods
    operate on that dict.
    """

    def __init__(self, n_workers=6, local=True, queue="short", death_timeout=3.,
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 working_directory="./", job_extra=None):
        """Store the settings and create the working directory.

        ``job_extra`` defaults to a fresh empty list so that instances never
        share one list object.
        """
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.death_timeout = death_timeout
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.working_directory = working_directory
        self.job_extra = [] if job_extra is None else job_extra
        writedir(working_directory, overwrite=False)

    def startdask(self):
        """Create the dask client (and the SLURM cluster when not local)."""
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(
                queue=self.queue,
                death_timeout=self.death_timeout,
                walltime=self.walltime,
                processes=self.processes,
                memory=self.memory,
                cores=self.cores,
                local_directory=self.working_directory,
                log_directory=self.working_directory,
                job_extra=self.job_extra,
            )
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        """Restart the client, stop the cluster and remove leftover files.

        Entries of the working directory whose name contains ``worker-``,
        ``slurm-`` or ``.lock`` are deleted.
        """
        self.daskclient.restart()
        if not self.local:
            self.daskcluster.stop_all_jobs()
            self.daskcluster.close()
        for item in os.listdir(self.working_directory):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                # Resolve against the directory that was listed, not the
                # current directory, so the right entries are removed.
                path = os.path.join(self.working_directory, item)
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        """Print ``<finished>/<total>`` for the futures of the last mapping."""
        # self.futures maps FOV ids to futures: the status lives on the values.
        complete = sum(
            1 for future in self.futures.values()
            if future.status == "finished"
        )
        print(str(complete) + "/" + str(len(self.futures)))

    def displaydashboard(self):
        """Display an HTML link to the dask dashboard."""
        # In local mode there is no `daskcluster`; use the client's cluster.
        cluster = self.daskclient.cluster if self.local else self.daskcluster
        link = cluster.dashboard_link
        display(HTML('<a href="' + link + '">Dashboard</a>'))

    def mapfovs(self, function, fov_list, retries=0):
        """Submit ``function(fov)`` for every FOV and remember the futures.

        The function and retry count are stored so that the ``retry_*``
        methods can resubmit with the same settings. Any return value of
        ``function`` is discarded.
        """
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        """Restart the client and resubmit every FOV that is not finished."""
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        self.daskclient.restart()
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        """Restart the client and resubmit every FOV that is still pending."""
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        self.daskclient.restart()
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
def full_pipeline_dask():
    """Run GRAPPA inference and evaluation on a SLURM-backed dask cluster.

    For every (acceleration factor, contrast) pair, ``grappa_inference`` and
    ``eval_grappa`` are submitted as separate tasks. All inference tasks are
    awaited first, then each evaluation result is printed with its
    configuration.
    """
    job_name = 'grappa'
    acceleration_factors = [4, 8]
    contrasts = ['CORPDFS_FBK', 'CORPD_FBK']
    configurations = [
        (af, contrast)
        for af in acceleration_factors
        for contrast in contrasts
    ]

    # inference and eval
    inference_eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=10,
        memory='20GB',
        job_name=job_name,
        walltime='1:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:0',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    client = None
    try:
        # One worker per inference task plus one per evaluation task.
        inference_eval_cluster.scale(2 * len(configurations))
        client = Client(inference_eval_cluster)
        inference_futures = []
        eval_futures = []
        for af, contrast in configurations:
            inference_futures.append(client.submit(
                # function to execute
                grappa_inference,
                contrast=contrast,
                af=af,
                exp_id=job_name,
                # this function has side effects, so it must not be
                # treated as pure by the scheduler
                pure=False,
            ))
            eval_futures.append(client.submit(
                # function to execute
                eval_grappa,
                contrast=contrast,
                af=af,
                n_samples=50,
                pure=False,
            ))
        client.gather(inference_futures)

        # eval printing
        for (af, contrast), future in zip(configurations, eval_futures):
            m = client.gather(future)
            print('AF', af)
            print('Contrast', contrast)
            print(m)
    finally:
        # Release the SLURM jobs even when a task fails.
        print('Shutting down dask workers')
        if client is not None:
            client.close()
        inference_eval_cluster.close()
def train_eval_plug_and_play(
        contrast='CORPD_FBK',
        n_epochs=200,
        n_samples=None,
        af=4,
        n_primal=5,
        loss='compound_mssim',
        train_partition='gpu_p1',
        model_name=None,
        model_size=None,
    ):
    """Train the selected models on SLURM, then evaluate them.

    Model specifications come from ``get_model_specs(force_res=False,
    n_primal=n_primal)`` and can be narrowed by name (first field) and size
    (second field). One ``train_xpdnet`` task is submitted per remaining
    specification; the gathered results are used as run ids and handed to
    ``eval_plug_and_play``.

    Args:
        contrast: forwarded to training and evaluation.
        n_epochs: forwarded to training and evaluation.
        n_samples: forwarded to training, and to evaluation as
            ``n_samples_train``.
        af: forwarded to training and evaluation.
        n_primal: forwarded to ``get_model_specs``, training and evaluation.
        loss: forwarded to training.
        train_partition: SLURM partition requested for the training jobs.
        model_name: when not ``None``, keep only specs with this name.
        model_size: when not ``None``, keep only specs with this size.

    Returns:
        The values returned by the training tasks, in spec order.
    """
    job_name = 'plug_and_play'
    selected_specs = [
        spec
        for spec in get_model_specs(force_res=False, n_primal=n_primal)
        if (model_name is None or spec[0] == model_name)
        and (model_size is None or spec[1] == model_size)
    ]

    cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
            f'--partition {train_partition}',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    cluster.adapt(minimum_jobs=0, maximum_jobs=len(selected_specs))
    client = Client(cluster)

    pending = []
    for _, spec_size, model_fun, model_kwargs, _, n_scales, res in selected_specs:
        pending.append(client.submit(
            # function to execute
            train_xpdnet,
            model_fun=model_fun,
            model_kwargs=model_kwargs,
            model_size=spec_size,
            multicoil=False,
            n_scales=n_scales,
            res=res,
            n_primal=n_primal,
            contrast=contrast,
            n_epochs=n_epochs,
            n_samples=n_samples,
            af=af,
            loss=loss,
        ))
    run_ids = client.gather(pending)
    client.close()
    cluster.close()

    # eval
    eval_plug_and_play(
        run_ids,
        job_name=job_name,
        contrast=contrast,
        n_epochs=n_epochs,
        af=af,
        n_primal=n_primal,
        model_name=model_name,
        model_size=model_size,
        n_samples_train=n_samples,
    )
    return run_ids
def run_benchmarks(args: Args):
    """Run the benchmark suite without a cluster and on each SLURM setup.

    Results are keyed by ``"no-cluster"`` and by the ``name`` of every entry
    of ``CLUSTER_CONFIGS``, written as JSON to ``args.save_path`` and, when
    ``args.upload`` is set, pushed as a package. Any exception is logged
    (with the traceback when ``args.debug`` is set) and the process exits
    with status 1.

    Args:
        args: parsed options; ``save_path``, ``upload`` and ``debug`` are
            read here.
    """
    # Results are stored as they are returned
    all_results = {}
    rule = "=" * 80

    # Try running the benchmarks
    try:
        # Get benchmark resources dir
        resources_dir = Path().parent.parent / "aicsimageio" / "tests" / "resources"

        # Machine config: built but currently discarded.
        _ = {
            "platform": platform.system(),
            "platform_version": platform.version(),
            "architecture": platform.machine(),
            "cpu_total_count": psutil.cpu_count(),
            "cpu_current_utilization": psutil.cpu_percent(),
            "memory_total_gb": psutil.virtual_memory().total / 10e8,
            "memory_available_gb": psutil.virtual_memory().available / 10e8,
        }

        # Python config: built but currently discarded.
        pyversion = sys.version_info
        _ = {
            "python_version": f"{pyversion.major}.{pyversion.minor}.{pyversion.micro}",
            "aicsimageio": aicsimageio.__version__,
            "czifile": czifile.__version__,
            "imageio": imageio.__version__,
            "tifffile": tifffile.__version__,
        }

        # Run tests
        #######################################################################

        log.info("Running tests: no cluster...")
        log.info(rule)

        all_results["no-cluster"] = _run_benchmark_suite(resources_dir=resources_dir)

        #######################################################################

        for cluster_config in CLUSTER_CONFIGS:
            worker_cores = cluster_config["per_worker_cores"]
            n_workers = cluster_config["workers"]
            config_name = cluster_config["name"]

            total_cores = worker_cores * n_workers
            log.info(f"Running tests: {config_name} (Total cores: {total_cores}) ...")
            log.info(rule)

            # Create or get log dir; the timestamp drops the fractional part
            timestamp = datetime.now().isoformat().split(".")[0]
            log_dir = Path(f".dask_logs/{timestamp}").expanduser()
            log_dir.mkdir(parents=True, exist_ok=True)

            # Two GB of memory are requested per worker core
            cluster = SLURMCluster(
                cores=worker_cores,
                memory=f"{worker_cores * 2}GB",
                queue="aics_cpu_general",
                walltime="10:00:00",
                local_directory=str(log_dir),
                log_directory=str(log_dir),
            )
            cluster.scale(n_workers)

            # Create client connection
            client = Client(cluster)

            # Wait for a minute for the cluster to fully spin up
            time.sleep(60)

            # Run benchmark
            all_results[config_name] = _run_benchmark_suite(resources_dir=resources_dir)

            client.shutdown()
            cluster.close()

            # Wait for a minute for the cluster to fully shutdown
            time.sleep(60)

        #######################################################################

        log.info("Completed all tests")
        log.info(rule)

        # Ensure save dir exists and save results
        args.save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(args.save_path, "w") as write_out:
            json.dump(all_results, write_out)

        # Construct and push package
        if args.upload:
            package = Package()
            package.set("results.json", args.save_path)
            package.push(
                "aicsimageio/benchmarks",
                "s3://aics-modeling-packages-test-resources",
                message=f"aicsimageio version: {aicsimageio.__version__}",
            )

    # Catch any exception
    except Exception as e:
        banner = "============================================="
        log.error(banner)
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error(banner)
        log.error("\n\n" + str(e) + "\n")
        log.error(banner)
        sys.exit(1)
def main():
    """Start a SLURM-backed dask scheduler and keep it up for a fixed time.

    Command-line options set the minimum and maximum number of adaptive
    jobs, the per-job walltime (hours) and how long the scheduler stays up
    (hours). After start-up, the SSH port-forwarding command, the command to
    launch the pipeline and the dashboard address are logged. The cluster is
    closed when the up time elapses, on Ctrl+C, or if start-up fails.
    """
    # Distributed host for the FOV pipeline
    p = argparse.ArgumentParser(prog="process", description="Process the FOV pipeline")
    p.add_argument(
        "--min_jobs",
        type=int,
        default=300,
        help="Minimum number of jobs to use",
    )
    p.add_argument(
        "--max_jobs",
        type=int,
        default=300,
        help="Maximum number of jobs to use",
    )
    p.add_argument(
        "--walltime",
        type=int,
        default=5,
        help="Walltime in hours",
    )
    p.add_argument(
        "--up_time",
        type=int,
        default=10,
        help="up time for the scheduler in hours",
    )
    args = p.parse_args()

    cluster = SLURMCluster(
        cores=2,
        memory="16GB",
        walltime="{}:00:00".format(args.walltime),
        queue="aics_cpu_general",
    )
    client = None
    try:
        cluster.adapt(minimum_jobs=args.min_jobs, maximum_jobs=args.max_jobs)
        client = dask.distributed.Client(cluster)  # noqa

        connection_info = {}
        connection_info["HOSTNAME"] = socket.gethostname()
        connection_info["PORT"] = cluster.scheduler_info["address"].split(":")[-1]
        connection_info["DASHBOARD_PORT"] = cluster.scheduler_info["services"]["dashboard"]

        connection_str = (
            "ssh -A -J slurm-master -L {PORT}:{HOSTNAME}:{PORT} -L "
            "{DASHBOARD_PORT}:{HOSTNAME}:{DASHBOARD_PORT} {HOSTNAME}".format(
                **connection_info
            )
        )

        log.info(
            (
                "In a new terimal the machine that you run the pipeline on, copy and paste the following string to forward "
                "ports to this server:"
            )
        )
        log.info(connection_str)
        log.info(" ")
        log.info("Then use the following command to kick off your FPP jobs:")
        log.info("fpp_process --distributed 1 --port {PORT}".format(**connection_info))
        log.info(" ")
        log.info("You can see the dashboard on:")
        # The dashboard is served on its own forwarded port, not on the
        # scheduler port.
        log.info("localhost:{DASHBOARD_PORT}".format(**connection_info))
        log.info(" ")
        log.info("Command + C will teardown the server.")

        try:
            time.sleep(args.up_time * 60 * 60)
        except KeyboardInterrupt:
            log.info("Tearing down scheduler.")
    finally:
        # Release the SLURM jobs however the function ends.
        if client is not None:
            client.close()
        cluster.close()
def esi_cluster_setup(partition="8GBS", n_jobs=2, mem_per_job=None,
                      timeout=180, interactive=True, start_client=True,
                      **kwargs):
    """
    Start a distributed Dask cluster of parallel processing workers using SLURM
    (or local multi-processing)

    Parameters
    ----------
    partition : str
        Name of SLURM partition/queue to use
    n_jobs : int
        Number of jobs to spawn
    mem_per_job : None or str
        Memory booking for each job. Can be specified either in megabytes
        (e.g., ``mem_per_job = "1500MB"``) or gigabytes (e.g., ``mem_per_job = "2GB"``).
        If `mem_per_job` is `None`, it is attempted to infer a sane default value
        from the chosen queue, e.g., for ``partition = "8GBS"`` `mem_per_job` is
        automatically set to the allowed maximum of `'8GB'`. However, even in
        queues with guaranteed memory bookings, it is possible to allocate less
        memory than the allowed maximum per job to spawn numerous low-memory
        jobs. A request exceeding the partition limit is capped at the limit
        and a warning is issued. See Examples for details.
    timeout : int
        Number of seconds to wait for requested jobs to start up.
    interactive : bool
        If `True`, user input is required in case not all jobs could
        be started in the provided waiting period (determined by `timeout`).
        If `interactive` is `False` and the jobs could not be started
        within `timeout` seconds, a `TimeoutError` is raised.
    start_client : bool
        If `True`, a distributed computing client is launched and attached to
        the workers. If `start_client` is `False`, only a distributed
        computing cluster is started to which compute-clients can connect.
    **kwargs : dict
        Additional keyword arguments can be used to control job-submission details.
        The keys read here are ``workers_per_job`` (1 to 8, default 1),
        ``n_cores`` (default 1) and ``slurmWorkingDirectory``.

    Returns
    -------
    proc : object
        A distributed computing client (if ``start_client = True``) or
        a distributed computing cluster (otherwise). `None` is returned if no
        local client was started on a machine without SLURM, or if the
        waiting routine reports that the setup was abandoned.

    Examples
    --------
    The following command launches 10 SLURM jobs with 2 gigabytes memory each
    in the `8GBS` partition

    >>> spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB")

    If you want to access properties of the created distributed computing client,
    assign an explicit return quantity, i.e.,

    >>> client = spy.esi_cluster_setup(n_jobs=10, partition="8GBS", mem_per_job="2GB")

    The underlying distributed computing cluster can be accessed using

    >>> client.cluster

    Notes
    -----
    Syncopy's parallel computing engine relies on the concurrent processing library
    `Dask <https://docs.dask.org/en/latest/>`_. Thus, the distributed computing
    clients used by Syncopy are in fact instances of :class:`dask.distributed.Client`.
    This function specifically acts as a wrapper for :class:`dask_jobqueue.SLURMCluster`.
    Users familiar with Dask in general and its distributed scheduler and cluster
    objects in particular, may leverage Dask's entire API to fine-tune parallel
    processing jobs to their liking (if wanted).

    See also
    --------
    cluster_cleanup : remove dangling parallel processing job-clusters
    """

    # For later reference: dynamically fetch name of current function
    funcName = "Syncopy <{}>".format(inspect.currentframe().f_code.co_name)

    # Be optimistic: prepare success message
    successMsg = "{name:s} Cluster dashboard accessible at {dash:s}"

    # Retrieve all partitions currently available in SLURM
    out, err = subprocess.Popen("sinfo -h -o %P",
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                text=True, shell=True).communicate()
    if len(err) > 0:

        # SLURM is not installed, either allocate `LocalCluster` or just leave
        if "sinfo: not found" in err:
            if interactive:
                msg = "{name:s} SLURM does not seem to be installed on this machine " +\
                    "({host:s}). Do you want to start a local multi-processing " +\
                    "computing client instead? "
                startLocal = user_yesno(msg.format(name=funcName, host=socket.gethostname()),
                                        default="no")
            else:
                startLocal = True
            if startLocal:
                client = Client()
                successMsg = "{name:s} Local parallel computing client ready. \n" + successMsg
                print(successMsg.format(name=funcName, dash=client.cluster.dashboard_link))
                if start_client:
                    return client
                return client.cluster
            return

        # SLURM is installed, but something's wrong
        msg = "SLURM queuing system from node {node:s}. " +\
              "Original error message below:\n{error:s}"
        raise SPYIOError(msg.format(node=socket.gethostname(), error=err))
    options = out.split()

    # Make sure we're in a valid partition (exclude IT partitions from output message)
    if partition not in options:
        valid = list(set(options).difference(["DEV", "PPC"]))
        raise SPYValueError(legal="'" + "or '".join(opt + "' " for opt in valid),
                            varname="partition", actual=partition)

    # Parse job count
    scalar_parser(n_jobs, varname="n_jobs", ntype="int_like", lims=[1, np.inf])

    # Get requested memory per job
    if mem_per_job is not None:
        if not isinstance(mem_per_job, str):
            raise SPYTypeError(mem_per_job, varname="mem_per_job", expected="string")
        if not any(szstr in mem_per_job for szstr in ["MB", "GB"]):
            lgl = "string representation of requested memory (e.g., '8GB', '12000MB')"
            raise SPYValueError(legal=lgl, varname="mem_per_job", actual=mem_per_job)

    # Query memory limit of chosen partition (in MB) and ensure that
    # `mem_per_job` is set for partitions w/o limit
    idx = partition.find("GB")
    if idx > 0:
        mem_lim = int(partition[:idx]) * 1000
    else:
        if partition == "PREPO":
            mem_lim = 16000
        else:
            if mem_per_job is None:
                lgl = "explicit memory amount as required by partition '{}'"
                raise SPYValueError(legal=lgl.format(partition),
                                    varname="mem_per_job", actual=mem_per_job)
            mem_lim = np.inf

    # Consolidate requested memory with chosen partition (or assign default memory)
    if mem_per_job is None:
        mem_per_job = str(mem_lim) + "MB"
    else:
        if "MB" in mem_per_job:
            mem_req = int(mem_per_job[:mem_per_job.find("MB")])
        else:
            mem_req = int(round(float(mem_per_job[:mem_per_job.find("GB")]) * 1000))
        if mem_req > mem_lim:
            # `mem_lim` is expressed in MB: report and apply it with that
            # unit (a "GB" suffix would request a thousand times the limit).
            msg = "`mem_per_job` exceeds limit of {lim:d}MB for partition {par:s}. " +\
                "Capping memory at partition limit. "
            SPYWarning(msg.format(lim=mem_lim, par=partition))
            mem_per_job = str(int(mem_lim)) + "MB"

    # Parse requested timeout period
    scalar_parser(timeout, varname="timeout", ntype="int_like", lims=[1, np.inf])

    # Determine if cluster allocation is happening interactively
    if not isinstance(interactive, bool):
        raise SPYTypeError(interactive, varname="interactive", expected="bool")

    # Determine if a dask client was requested
    if not isinstance(start_client, bool):
        raise SPYTypeError(start_client, varname="start_client", expected="bool")

    # Set/get "hidden" kwargs
    workers_per_job = kwargs.get("workers_per_job", 1)
    scalar_parser(workers_per_job, varname="workers_per_job",
                  ntype="int_like", lims=[1, 8])

    n_cores = kwargs.get("n_cores", 1)
    scalar_parser(n_cores, varname="n_cores",
                  ntype="int_like", lims=[1, np.inf])

    slurm_wdir = kwargs.get("slurmWorkingDirectory", None)
    if slurm_wdir is None:
        usr = getpass.getuser()
        slurm_wdir = "/mnt/hpx/slurm/{usr:s}/{usr:s}_{date:s}"
        slurm_wdir = slurm_wdir.format(usr=usr,
                                       date=datetime.now().strftime('%Y%m%d-%H%M%S'))
        os.makedirs(slurm_wdir, exist_ok=True)
    else:
        io_parser(slurm_wdir, varname="slurmWorkingDirectory", isfile=False)

    # Hotfix for upgraded cluster-nodes: point to correct Python executable if working from /home
    pyExec = sys.executable
    if sys.executable.startswith("/home"):
        pyExec = "/mnt/gs" + sys.executable

    # Create `SLURMCluster` object using provided parameters
    # (no `interface` is passed here)
    out_files = os.path.join(slurm_wdir, "slurm-%j.out")
    cluster = SLURMCluster(cores=n_cores,
                           memory=mem_per_job,
                           processes=workers_per_job,
                           local_directory=slurm_wdir,
                           queue=partition,
                           name="spyswarm",
                           python=pyExec,
                           header_skip=["-t", "--mem"],
                           job_extra=["--output={}".format(out_files)])

    # Compute total no. of workers and up-scale cluster accordingly
    total_workers = n_jobs * workers_per_job
    cluster.scale(total_workers)

    # Fire up waiting routine to avoid premature cluster setups
    if _cluster_waiter(cluster, funcName, total_workers, timeout, interactive):
        return

    # Kill a zombie cluster in non-interactive mode
    if not interactive and _count_running_workers(cluster) == 0:
        cluster.close()
        err = "SLURM jobs could not be started within given time-out " +\
              "interval of {0:d} seconds"
        raise TimeoutError(err.format(timeout))

    # Highlight how to connect to dask performance monitor
    print(successMsg.format(name=funcName, dash=cluster.dashboard_link))

    # If client was requested, return that instead of the created cluster
    if start_client:
        return Client(cluster)
    return cluster
def train_eval_parameter_grid(job_name, train_function, eval_function, parameter_grid):
    """Train one model per grid configuration on SLURM, then evaluate them.

    Every configuration produced by ``ParameterGrid(parameter_grid)`` is
    passed as keyword arguments to ``train_function`` on its own worker and
    the gathered results are used as run ids. Each run is then evaluated
    with ``eval_function`` on a second cluster, using ``n_samples=50``
    instead of the configuration's own ``n_samples`` entry (if any), and
    the results are printed.

    Args:
        job_name: name given to the SLURM jobs of both clusters.
        train_function: callable submitted once per configuration.
        eval_function: callable whose result is unpacked as
            ``(metrics_names, eval_res)``.
        parameter_grid: grid specification accepted by ``ParameterGrid``.
    """
    parameters = list(ParameterGrid(parameter_grid))
    n_parameters_config = len(parameters)

    train_cluster = SLURMCluster(
        cores=1,
        job_cpu=20,
        memory='80GB',
        job_name=job_name,
        walltime='60:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:1',
            '--qos=qos_gpu-t4',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    client = None
    try:
        train_cluster.scale(n_parameters_config)
        client = Client(train_cluster)
        futures = [
            client.submit(
                # function to execute
                train_function,
                **params,
            )
            for params in parameters
        ]
        run_ids = client.gather(futures)
    finally:
        # Release the SLURM jobs even when a training task fails.
        if client is not None:
            client.close()
        train_cluster.close()

    # eval
    eval_cluster = SLURMCluster(
        cores=1,
        job_cpu=40,
        memory='80GB',
        job_name=job_name,
        walltime='20:00:00',
        interface='ib0',
        job_extra=[
            '--gres=gpu:4',
            '--qos=qos_gpu-t3',
            '--distribution=block:block',
            '--hint=nomultithread',
            '--output=%x_%j.out',
        ],
        env_extra=[
            'cd $WORK/fastmri-reproducible-benchmark',
            '. ./submission_scripts_jean_zay/env_config.sh',
        ],
    )
    client = None
    try:
        eval_cluster.scale(n_parameters_config)
        client = Client(eval_cluster)
        for params in parameters:
            # Evaluation uses its own sample count; a grid without an
            # 'n_samples' entry is fine.
            params.pop('n_samples', None)
        futures = [
            client.submit(
                # function to execute
                eval_function,
                run_id=run_id,
                n_samples=50,
                **params,
            )
            for run_id, params in zip(run_ids, parameters)
        ]

        for params, future in zip(parameters, futures):
            metrics_names, eval_res = client.gather(future)
            print('Parameters', params)
            print(metrics_names)
            print(eval_res)
    finally:
        print('Shutting down dask workers')
        if client is not None:
            client.close()
        eval_cluster.close()
def run(
    self,
    distributed: bool = False,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Build and run the example flow: raw -> invert -> cumsum -> plots.

    Parameters
    ----------
    distributed: bool
        Create a SLURMCluster to use for job distribution.
        Default: False (a LocalCluster is created instead)
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)
    **kwargs
        Extra parameters forwarded to the first step only
        (allows passing `--n {some integer}` or other params).

    Notes
    -----
    Documentation on prefect:
    https://docs.prefect.io/core/

    Basic prefect example:
    https://docs.prefect.io/core/
    """
    # Initialize steps
    raw_step = steps.MappedRaw()
    invert_step = steps.MappedInvert()
    cumsum_step = steps.MappedSum()
    plot = steps.Plot()
    fancyplot_step = steps.Fancyplot()

    # Choose executor
    if not distributed:
        # Stop conflicts between Dask and OpenBLAS
        # Info here:
        # https://stackoverflow.com/questions/45086246/too-many-memory-regions-error-with-dask
        os.environ["OMP_NUM_THREADS"] = "1"

        # Spawn local cluster
        cluster = LocalCluster()
    else:
        # Log dir settings; the timestamp drops the fractional seconds
        timestamp = datetime.now().isoformat().split(".")[0]
        log_dir = Path(f".logs/{timestamp}/")
        log_dir.mkdir(parents=True)

        # Spawn cluster
        cluster = SLURMCluster(
            cores=2,
            memory="32GB",
            walltime="10:00:00",
            queue="aics_cpu_general",
            local_directory=str(log_dir),
            log_directory=str(log_dir),
        )

        # Set adaptive scaling
        cluster.adapt(minimum_jobs=1, maximum_jobs=40)

    # Log bokeh info
    if cluster.dashboard_link:
        log.info(f"Dask UI running at: {cluster.dashboard_link}")

    # Executor bound to the cluster's scheduler
    exe = DaskExecutor(cluster.scheduler_address)

    # Options shared by every step:
    # - the executor address, for steps that run a secondary flow with dask
    # - clean, to clean the local staging directories
    # - debug, to enable debugging functionality
    step_options = dict(
        distributed_executor_address=cluster.scheduler_address,
        clean=clean,
        debug=debug,
    )

    # Configure your flow
    with Flow("example_step_workflow") as flow:
        matrices = raw_step(**step_options, **kwargs)
        inversions = invert_step(matrices, **step_options)
        vectors = cumsum_step(inversions, **step_options)
        plot(vectors, **step_options)
        fancyplot_step(vectors, **step_options)

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get plot location
    log.info(f"Plot stored to: {plot.get_result(state, flow)}")

    # Close cluster (only the SLURM cluster is closed explicitly)
    if distributed:
        cluster.close()