def init_cluster(num_workers, wait_for_all_workers=True):
    """
    Start up a dask cluster, wait for workers to launch,
    and then return the resulting distributed.Client object.

    Args:
        num_workers:
            How many workers to launch.
        wait_for_all_workers:
            If True, pause until all workers have been launched before returning.
            Otherwise, just wait for a single worker to launch.

    Returns:
        distributed.Client
    """
    # Local import: LSFCluster probably isn't importable on your local machine,
    # so it's nice to avoid importing it when you're just running local tests without a cluster.
    from dask_jobqueue import LSFCluster

    cluster = LSFCluster(ip='0.0.0.0')
    cluster.scale(num_workers)

    # Never wait for more workers than were requested (num_workers may be 0),
    # otherwise the loop below could never terminate.
    required_workers = num_workers if wait_for_all_workers else min(num_workers, 1)

    client = Client(cluster)

    # Poll the scheduler until enough workers have registered.
    # The wait applies in both modes; only the target count differs.
    # Stop early if the client is no longer running.
    while client.status == "running":
        missing = required_workers - len(cluster.scheduler.workers)
        if missing <= 0:
            break
        print(f"Waiting for {missing} workers...")
        time.sleep(1.0)

    return client
def start_dask_lsfcluster(cluster_size=5, memory_in_gb=20):
    """Start a dask cluster on LSF and block until every worker has joined.

    Args:
        cluster_size: Number of workers to scale the cluster to. Must be at
            least 4.
        memory_in_gb: Memory to request per job, in gigabytes. Defaults to 20.

    Returns:
        A ``(cluster, client)`` tuple: the ``LSFCluster`` and a ``Client``
        connected to it.

    Raises:
        ValueError: If ``cluster_size`` is smaller than 4.
    """
    if cluster_size < 4:
        raise ValueError('Too small of a cluster')

    # Settings for Sanger farm
    # The LSF resource string is expressed in megabytes (lsf_units='mb').
    memory_in_mb = int(memory_in_gb * 1e+3)
    cluster = LSFCluster(
        queue='normal',
        walltime='00:30',
        log_directory=os.path.join(os.getcwd(), 'dask_logs'),
        cores=4,
        memory='{} Gb'.format(memory_in_gb),
        mem=memory_in_gb * 1e+9,  # should be in bytes
        lsf_units='mb',
        job_extra=[
            '-G team152',
            '-g /lt9/dask',
            '-R "select[mem>{}] rusage[mem={}]"'.format(
                memory_in_mb, memory_in_mb),
        ],
        use_stdin=True)

    # To inspect the job submission generated by Dask, call cluster.job_script().

    # Fixed-size scaling. cluster.adapt(...) would be the auto-scaling alternative.
    cluster.scale(cluster_size)

    client = Client(cluster, timeout=120)
    # Block until the full complement of workers has registered.
    client.wait_for_workers(n_workers=cluster_size)

    return cluster, client
def setup_dask_lsf_cluster(
    n_workers: int,
    queue: str,
    memory_gigabytes: int,
    wall_time: str,
    environment_name: str,
) -> "LSFCluster":
    """Set up a dask cluster which integrates with an existing LSF queue manager
    to spawn and manage workers.

    Args:
        n_workers: The number of workers to spawn.
        queue: The queue to submit the workers to.
        memory_gigabytes: The maximum memory to request per worker in GB.
        wall_time: The maximum wall-clock time to spawn each worker for.
        environment_name: The conda environment to activate for each worker.

    Returns:
        The initialized cluster.
    """
    import dask
    from dask_jobqueue import LSFCluster

    # Work on a copy: the value returned by dask.config.get is the object held
    # in the global config, so appending to it in place would accumulate one
    # extra "conda activate" line per call. The `or []` also covers a key that
    # is present but set to null.
    configured_extra = dask.config.get("jobqueue.lsf.env-extra", default=[])
    env_extra = list(configured_extra or [])
    env_extra.append(f"conda activate {environment_name}")

    cluster = LSFCluster(
        queue=queue,
        cores=1,
        memory=f"{memory_gigabytes * 1e9}B",
        walltime=wall_time,
        local_directory="dask-worker-space",
        log_directory="dask-worker-logs",
        env_extra=env_extra,
    )
    cluster.scale(n=n_workers)

    return cluster
def _init_dask(self):
    """
    Starts a dask cluster, according to the cluster type specified in the constructor.
    Sets self.client.
    Also writes useful URLs to graph-links.txt.

    If the 'cluster-type' is 'synchronous', then the cluster will be
    a special stub class (DebugCluster), which provides dummy
    implementations of a few functions from the DistributedCluster API.
    (Mostly just for convenient unit testing.)

    Raises:
        AssertionError:
            If the cluster type is unknown or unimplemented, or if the
            dask config disagrees with a synchronous/processes scheduler choice.
        RuntimeError:
            If not all workers launch within self.cluster_max_wait minutes.
    """
    # Consider using client.register_worker_callbacks() to configure
    # - faulthandler (later)
    # - excepthook?
    # - (okay, maybe it's just best to put that stuff in __init__.py, like in DSS)

    load_and_overwrite_dask_config(self.cluster_type, 'dask-config.yaml', True)
    self._write_driver_graph_urls()

    if self.cluster_type in JOBQUEUE_CLUSTERS:
        update_jobqueue_config_with_defaults(self.cluster_type)

        # Local imports: only the selected jobqueue backend gets imported.
        if self.cluster_type == "lsf":
            from dask_jobqueue import LSFCluster
            cluster = LSFCluster()  # ip='0.0.0.0')
        elif self.cluster_type == "sge":
            from dask_jobqueue import SGECluster
            cluster = SGECluster(ip='0.0.0.0')
        elif self.cluster_type == "slurm":
            from dask_jobqueue import SLURMCluster
            cluster = SLURMCluster(ip='0.0.0.0')
        else:
            raise AssertionError("Unimplemented jobqueue cluster")

        cluster.scale(self.num_workers)

    elif self.cluster_type == "local-cluster":
        cluster = LocalCluster(self.num_workers, threads_per_worker=1, processes=True, ip='0.0.0.0')

    elif self.cluster_type in ("synchronous", "processes"):
        cluster = None
        # synchronous/processes mode is for testing and debugging only
        assert dask.config.get('scheduler', self.cluster_type) == self.cluster_type, \
            "Inconsistency between the dask-config and the scheduler you chose."

        dask.config.set(scheduler=self.cluster_type)
        self.client = DebugClient(self.cluster_type)
    else:
        raise AssertionError("Unknown cluster type")

    dump_dask_config('full-dask-config.yaml')

    if cluster:
        dashboard = cluster.dashboard_link
        logger.info(f"Dashboard running on {dashboard}")
        # Log the same link a second time with the IP swapped for this machine's hostname.
        dashboard_ip = extract_ip_from_link(dashboard)
        dashboard = dashboard.replace(dashboard_ip, socket.gethostname())
        logger.info(f" a.k.a. {dashboard}")

        # Note: Overrides config value: distributed.comm.timeouts.connect
        self.client = Client(cluster, timeout='60s')

        # Wait for the workers to spin up.
        with Timer(f"Waiting for {self.num_workers} workers to launch", logger) as wait_timer:
            while (self.wait_for_workers
                   and self.client.status == "running"
                   and len(self.client.cluster.scheduler.workers) < self.num_workers):

                if wait_timer.seconds > (60 * self.cluster_max_wait):
                    # The f-prefix must be on the fragment that contains the
                    # placeholder, or the braces are emitted literally.
                    msg = (
                        "Not all cluster workers could be launched within the "
                        f"allotted time ({self.cluster_max_wait} minutes).\n"
                        "Try again or adjust the 'cluster-max-wait' setting.\n"
                    )
                    raise RuntimeError(msg)
                time.sleep(0.1)

        if self.wait_for_workers and self.cluster_type == "lsf":
            self._write_worker_graph_urls('graph-links.txt')
return total if __name__ == "__main__": cluster = LSFCluster( name='worker_bee', queue='general', # the queue on Pegasus project='insarlab', # your project name cores=2, memory='2GB', # unused by Pegasus but a required param walltime='00:30', # how long the worker will run for interface='ib0', # which network to use. NECESSARY PARAM job_extra=[ '-R "rusage[mem=2500]"', # how to actually define memory usage "-o WORKER-%J.out" ], # where to write worker output files python=sys.executable, # Where to look for Python executable config_name='lsf') # define your own config in a .yaml file cluster.scale(20) print("JOB FILE:", cluster.job_script()) client = Client(cluster) print("Time to run sequential code:", timeit(stmt=sequential_main, number=1)) print("Time to run parallel code:", timeit(stmt=distributed_main, number=1)) print("Time to run parallel code with ~0 data transfer:", timeit(stmt=distributed_main2, number=1)) client.close()
def activate_client(self,
                    library=('dask', 'LSF'),
                    num_processes=2,
                    timeout=1800):
    """
    Create the parallel client described by `library` and record its workers.

    Parameters
    ----------
    library : tuple(str, str) or None, default ('dask', 'LSF')
        parallelism and scheduler tuple;
        if None, no client is created and the worker bookkeeping is reset
    num_processes : int or None
        number of workers to run with the new client
        if None, num_processes will be adaptive
    timeout : int
        forwarded to distributed.Client as its `timeout` argument
        NOTE(review): the wait for worker fulfillment below is not bounded
        by this value; it polls until enough workers are present

    Raises
    ------
    AssertionError
        if `library` names an unsupported parallelism or scheduler
    Exception
        if the scheduler is supported but has no client-activation code here
    """
    self.library = library

    # Guard clause: no parallelism requested, so reset state and stop.
    if library is None:
        _logger.debug("library is None")
        self.client = None
        self._adapt = False
        self.num_processes = 0
        self.workers = {}
        return

    _logger.debug("library is not None")
    assert library[0] in self.supported_libraries, (
        f"{library[0]} is not a supported parallelism. "
        f"(supported parallelisms are {self.supported_libraries.keys()})"
    )
    assert library[1] in list(self.supported_libraries[library[0]]), (
        f"{library[1]} is not a supported scheduler. "
        f"(supported schedulers are {self.supported_libraries[library[0]]})"
    )

    if library[0] == 'dask':
        _logger.debug("detected dask parallelism...")
        if library[1] == 'LSF':
            _logger.debug("detected LSF scheduler")
            # Local import so dask_jobqueue is only required when LSF is used.
            from dask_jobqueue import LSFCluster
            _logger.debug("creating cluster...")
            cluster = LSFCluster()
            if num_processes is None:
                _logger.debug("adaptive cluster")
                self._adapt = True
                cluster.adapt(minimum=1, interval='1s')
            else:
                _logger.debug("nonadaptive cluster")
                self._adapt = False
                self.num_processes = num_processes
                cluster.scale(self.num_processes)

            _logger.debug("creating client with cluster")
            self.client = distributed.Client(cluster, timeout=timeout)
            if not self._adapt:
                # `<` rather than `!=`: a surplus of workers must not spin forever.
                while len(self.client.nthreads()) < self.num_processes:
                    _logger.debug("waiting for worker request fulfillment...")
                    time.sleep(5)

            # Map a running index to each worker key reported by the client.
            worker_threads = self.client.nthreads()
            self.workers = dict(enumerate(worker_threads))
            _logger.debug(f"workers initialized: {self.workers}")
        else:
            raise Exception(
                f"{library[1]} is supported, but without client-activation functionality!"
            )