from time import sleep

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

import config  # project settings, assumed to define num_hipergator_workers


def start_dask():
    ######################################################
    # Setup dask cluster
    ######################################################
    cluster = SLURMCluster(processes=1, queue='hpg2-compute', threads=2,
                           memory='4GB', walltime='144:00:00')
    print('Starting up workers')
    workers = []
    for _ in range(config.num_hipergator_workers):
        workers.extend(cluster.start_workers(1))
        sleep(60)
    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()['workers']) < config.num_hipergator_workers:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10
        # If 5 minutes go by, try adding workers again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))
    print('All workers accounted for')

    # xr import must come after dask.array, and I think after setting
    # up the cluster/client.
    import dask.array as da
    import xarray as xr
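
# Aside: the snippets in this collection use cluster.start_workers(), which
# newer dask-jobqueue releases removed in favor of cluster.scale(). A minimal
# sketch of the same start-and-wait pattern with the newer API (queue name
# and worker count are illustrative, not from the original):
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(processes=1, cores=2, memory='4GB',
                       queue='hpg2-compute', walltime='144:00:00')
cluster.scale(100)            # replaces the deprecated start_workers() loop
client = Client(cluster)
client.wait_for_workers(100)  # replaces the manual scheduler_info() polling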

from dask.distributed import Client
from dask_jobqueue import SLURMCluster


class Cluster:
    def __init__(self):
        print("Start Cluster")
        self.cluster = SLURMCluster(memory='16g', processes=1, cores=1,
                                    death_timeout=200, walltime="168:00:00",
                                    job_extra=['--partition=Sibirien'])
        self.cluster.start_workers(25)
        self.cli = Client(self.cluster.scheduler.address)

    def close(self):
        self.cli.close()  # close the client before tearing down the cluster
        self.cluster.close()
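
# Aside: in recent dask/distributed releases both the cluster and the client
# work as context managers, so the explicit close() above can be avoided in
# short scripts. A minimal sketch (resource values are illustrative):
with SLURMCluster(memory='16g', processes=1, cores=1) as cluster:
    cluster.scale(25)
    with Client(cluster) as client:
        pass  # submit work here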

import os
import shutil

from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import display, HTML


class dask_controller:  # adapted from Charles' code
    def __init__(self, n_workers=6, local=True, queue="short", death_timeout=3.,
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 working_directory="./", job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.death_timeout = death_timeout
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.working_directory = working_directory
        self.job_extra = job_extra

        writedir(working_directory, overwrite=False)  # project helper, assumed in scope

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,
                                            death_timeout=self.death_timeout,
                                            walltime=self.walltime,
                                            processes=self.processes,
                                            memory=self.memory,
                                            cores=self.cores,
                                            local_directory=self.working_directory,
                                            log_directory=self.working_directory,
                                            job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskclient.restart()
        if not self.local:
            self.daskcluster.stop_all_jobs()
            self.daskcluster.close()
        # Clean up worker scratch files left in the working directory
        for item in os.listdir(self.working_directory):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = os.path.join(self.working_directory, item)
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        # futures is a dict keyed by fov, so iterate over its values
        complete = len([future for future in self.futures.values()
                        if future.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def displaydashboard(self):
        link = self.daskcluster.dashboard_link
        display(HTML('<a href="' + link + '">Dashboard</a>'))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [fov for fov, future in self.futures.items()
                            if future.status != 'finished']
        self.daskclient.restart()
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [fov for fov, future in self.futures.items()
                          if future.status == 'pending']
        self.daskclient.restart()
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
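
# Aside: instead of a fixed pool plus manual cleanup of worker-*/slurm-* files
# as in dask_controller.shutdown(), a SLURMCluster can scale adaptively. A
# minimal sketch (resource values are illustrative):
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(queue='short', cores=1, processes=1,
                       memory='6GB', walltime='01:30:00')
cluster.adapt(minimum=0, maximum=20)  # submit/cancel SLURM jobs with demand
client = Client(cluster)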

import datetime
from time import sleep

from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from dask import delayed
import dask

import hindcast_config  # project settings, assumed to define num_hipergator_workers

today = datetime.datetime.today().date()

######################################################
# Setup dask cluster
######################################################
cluster = SLURMCluster(processes=1, queue='hpg2-compute', cores=1,
                       memory='10GB', walltime='96:00:00',
                       job_extra=['--qos ewhite-b'],
                       death_timeout=600, local_directory='/tmp/',
                       interface='ib0')

print('Starting up workers')
workers = cluster.start_workers(hindcast_config.num_hipergator_workers)
dask_client = Client(cluster)

wait_time = 0
while len(dask_client.scheduler_info()['workers']) < hindcast_config.num_hipergator_workers:
    print('waiting on workers: {s} sec. so far'.format(s=wait_time))
    sleep(10)
    wait_time += 10
    # If 5 minutes go by, try adding workers again
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))
print('All workers accounted for')

# xr import must come after dask.array, and I think after setting
# up the cluster/client.

from dask.distributed import Client, fire_and_forget
from dask_jobqueue import SLURMCluster
from dask import bag as db
from dask import delayed

cluster = SLURMCluster(cores=40, processes=40, memory='250GB',
                       queue='scavenger', walltime='02:00')
cluster.start_workers(2)
client = Client(cluster)

if __name__ == '__main__':
    # load_molecule, perturb_mol, load_crystals, gen_guests, save_expansion,
    # and insert_guests are project helpers assumed to be in scope.
    guest = load_molecule('aaa.res')
    trial_xyz, trial_rad = perturb_mol(guest)
    trial_xyz = delayed(trial_xyz)
    trial_rad = delayed(trial_rad)
    structures = load_crystals()
    bag = db.from_sequence([(structure, trial_xyz, trial_rad)
                            for structure in structures],
                           partition_size=40)
    guest_hits = client.map(gen_guests, bag)
    # fire_and_forget() takes futures, not a callable: submit the save tasks
    # first, then drop the handles.
    fire_and_forget(client.map(save_expansion, guest_hits))
    inserts = client.map(insert_guests, guest_hits)
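
# Aside: wrapping trial_xyz/trial_rad in delayed() embeds them in every task.
# For large shared inputs an alternative is to scatter them to the workers
# once. A sketch, assuming gen_guests were refactored to take trial_xyz and
# trial_rad as keyword arguments (hypothetical):
trial_xyz_f = client.scatter(trial_xyz, broadcast=True)  # one copy per worker
trial_rad_f = client.scatter(trial_rad, broadcast=True)
guest_hits = client.map(gen_guests, structures,
                        trial_xyz=trial_xyz_f, trial_rad=trial_rad_f)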

# (fragment: this snippet begins mid-function, inside a pd.DataFrame(...) call)
                        'std': k_b * T * ti.d_delta_f_.values[0, -1:]},
                       columns=['DG', 'std'])
    return df


if __name__ == "__main__":
    cluster = SLURMCluster(cores=24, processes=24, memory='120GB',
                           walltime='00:59:00', interface='ib0',
                           queue='compute', death_timeout=60,
                           local_directory='/scratch/$USER/$SLURM_JOB_ID')
    cluster.start_workers(96)
    cl = Client(cluster)
    # cl = LocalCluster()

    ionsegs = {'repulsion_to_ghost': mds.discover('/pylon5/mc3bggp/beckstei/Projects/Transporters/SYSTEMS/Na/repulsion_to_ghost/production1/'),
               'ghost_to_ion': mds.discover('/pylon5/mc3bggp/beckstei/Projects/Transporters/SYSTEMS/Na/ghost_to_ion/production1/')}

    dHdls = {}
    for seg in ionsegs:
        dHdls[seg] = [delayed(get_dHdl, pure=True)(sim, lower=5000, step=200)
                      for sim in ionsegs[seg]]

    L_ionDG = {}
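
# Aside: the delayed() objects built above are only task graphs; the original
# is truncated before they are executed. A hypothetical way to run them on
# the cluster (not from the source):
results = {seg: cl.compute(dHdls[seg], sync=True) for seg in dHdls}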

# srun --ntasks=1 --cpus-per-task=2 --mem=2gb -t 90 --pty bash -i
from dask_jobqueue import SLURMCluster
from datetime import datetime
from time import sleep

cluster = SLURMCluster(project='ewhite', death_timeout=100)
cluster.start_workers(1)
print(cluster.job_script())

from dask.distributed import Client
client = Client(cluster)
client  # displays the client repr when run in a notebook

counter = 0
while counter < 10:
    print(datetime.now().strftime("%a, %d %B %Y %I:%M:%S"))
    print(client)
    sleep(20)
    counter += 1

import socket
host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    import subprocess
    proc = subprocess.Popen(['jupyter', 'lab', '--ip', host, '--no-browser'])
    dask_scheduler.jlab_proc = proc
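
# Aside: start_jlab is defined above but never invoked; in the usual version
# of this pattern it is registered via run_on_scheduler, which passes the
# scheduler in as the dask_scheduler argument (assumed, not in the snippet):
client.run_on_scheduler(start_jlab)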

######################################################
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from time import sleep

num_hipergator_workers = 120

cluster = SLURMCluster(processes=1, queue='hpg2-compute', threads=1,
                       memory='4GB', walltime='96:00:00',
                       death_timeout=600, local_directory='/tmp/')

print('Starting up workers')
workers = cluster.start_workers(num_hipergator_workers)
dask_client = Client(cluster)

wait_time = 0
while len(dask_client.scheduler_info()['workers']) < num_hipergator_workers / 2:
    print('waiting on workers: {s} sec. so far'.format(s=wait_time))
    sleep(10)
    wait_time += 10
    # If 5 minutes go by, try adding workers again
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))
print('Most workers accounted for')

import os
import shutil
import time

from dask.distributed import Client
from dask_jobqueue import SLURMCluster


class dask_controller:  # adapted from Charles' code
    def __init__(self, n_workers=6, local=True, queue="short",
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.job_extra = job_extra

    def writedir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,
                                            walltime=self.walltime,
                                            processes=self.processes,
                                            memory=self.memory,
                                            cores=self.cores,
                                            job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskcluster.stop_all_jobs()
        # Clean up worker scratch files left in the current directory
        for item in os.listdir("./"):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        # futures is a dict keyed by fov, so iterate over its values
        complete = len([future for future in self.futures.values()
                        if future.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [fov for fov, future in self.futures.items()
                            if future.status != 'finished']
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [fov for fov, future in self.futures.items()
                          if future.status == 'pending']
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
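
# Aside: instead of restarting the whole client as retry_failed() does,
# distributed can resubmit failed futures directly. A sketch against a
# hypothetical instance `controller` of the class above:
failed = [f for f in controller.futures.values() if f.status == 'error']
controller.daskclient.retry(failed)  # resubmit only the failed tasks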

from dask import delayed

# AFNI modules
from afnipython.construct_template_graph import get_task_graph

# parallelization library
try:
    # TODO: generalize to other clusters
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client

    cluster = SLURMCluster(queue='nimh', memory="8g", processes=1, threads=4,
                           job_extra=['--constraint=10g'])
    print("starting %d workers!" % n_workers)  # n_workers assumed defined upstream
    cluster.start_workers(n_workers)
    client = Client(cluster)
    using_cluster = True
except ImportError as err:
    # Fall back to a local client if dask_jobqueue is unavailable; if even
    # that fails, report the original error and re-raise.
    try:
        from distributed import Client, LocalCluster
        client = Client()
        using_cluster = False
    except ImportError:
        print("Import error: {0}".format(err))
        raise

g_help_string = """
===========================================================================