Example #1
from time import sleep

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# `config` (which provides num_hipergator_workers) is assumed to be a
# project-level module imported elsewhere.


def start_dask(workers):

    ######################################################
    # Setup dask cluster
    ######################################################

    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           threads=2,
                           memory='4GB',
                           walltime='144:00:00')

    print('Starting up workers')
    workers = []
    for _ in range(config.num_hipergator_workers):
        workers.extend(cluster.start_workers(1))
        sleep(60)
    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()['workers']) < config.num_hipergator_workers:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10

        # If 5 minutes goes by try adding them again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))

    print('All workers accounted for')
    # The xr import must come after dask.array, and I think after setting up
    # the cluster/client.
    import dask.array as da
    import xarray as xr
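
Most of the examples on this page call cluster.start_workers(n), which was the worker-launch method in early dask_jobqueue releases; in current releases the equivalent call is cluster.scale(n). A minimal sketch of the same startup against the current API, with placeholder queue, resources, and worker count:

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

# One single-process worker per SLURM job; adjust queue/memory/walltime to your site.
cluster = SLURMCluster(queue='hpg2-compute', cores=1, processes=1,
                       memory='4GB', walltime='144:00:00')
cluster.scale(10)        # request 10 workers (replaces start_workers)
client = Client(cluster)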
Example #2
from dask.distributed import Client
from dask_jobqueue import SLURMCluster


class Cluster:
    def __init__(self):
        print("Start Cluster")
        self.cluster = SLURMCluster(memory='16g',
                                    processes=1,
                                    cores=1,
                                    death_timeout=200,
                                    walltime="168:00:00",
                                    job_extra=['--partition=Sibirien'])
        self.cluster.start_workers(25)
        self.cli = Client(self.cluster.scheduler.address)

    def close(self):
        self.cluster.close()
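
A brief usage sketch for this wrapper, assuming the caller submits its own work through the attached client between construction and close:

c = Cluster()                                     # submits 25 SLURM worker jobs and connects a client
futures = c.cli.map(pow, range(10), range(10))    # any work submitted via c.cli
print(c.cli.gather(futures))
c.close()                                         # shuts the SLURM jobs down when finished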
Example #3
import os
import shutil

from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from IPython.display import HTML, display

# `writedir` is assumed to be a project helper that creates the working
# directory if it does not already exist.


class dask_controller:  # adapted from Charles' code
    def __init__(self, n_workers=6, local=True, queue="short", death_timeout=3.,
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 working_directory="./", job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.death_timeout = death_timeout
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.working_directory = working_directory
        self.job_extra = job_extra

        writedir(working_directory, overwrite=False)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,
                                            death_timeout=self.death_timeout,
                                            walltime=self.walltime,
                                            processes=self.processes,
                                            memory=self.memory,
                                            cores=self.cores,
                                            local_directory=self.working_directory,
                                            log_directory=self.working_directory,
                                            job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        self.daskclient.restart()
        if not self.local:
            self.daskcluster.stop_all_jobs()
            self.daskcluster.close()
        for item in os.listdir(self.working_directory):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        complete = len(
            [item for item in self.futures if item.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def displaydashboard(self):
        link = self.daskcluster.dashboard_link
        display(HTML('<a href="' + link + '">Dashboard</a>'))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        out = self.daskclient.restart()
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
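
A short usage sketch for this controller on SLURM, assuming a per-FOV processing function (the hypothetical process_fov below) and your own partition and resources:

dc = dask_controller(n_workers=20, local=False, queue="short",
                     memory='6GB', working_directory="./dask_scratch/")
dc.startdask()
dc.displaydashboard()                        # link to the scheduler dashboard
dc.mapfovs(process_fov, list(range(100)))    # one task per field of view
dc.printprogress()                           # prints e.g. "37/100" once tasks complete
dc.retry_failed()                            # resubmit anything that did not finish
dc.shutdown()                                # stop jobs and clean worker files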
Example #4
import datetime
from time import sleep

# `hindcast_config` (which provides num_hipergator_workers) is assumed to be
# imported elsewhere in the project.

today = datetime.datetime.today().date()

######################################################
# Setup dask cluster
######################################################
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from dask import delayed
import dask
cluster = SLURMCluster(processes=1,
                       queue='hpg2-compute',
                       cores=1,
                       memory='10GB',
                       walltime='96:00:00',
                       job_extra=['--qos ewhite-b'],
                       death_timeout=600,
                       local_directory='/tmp/',
                       interface='ib0')

print('Starting up workers')
workers = cluster.start_workers(hindcast_config.num_hipergator_workers)
dask_client = Client(cluster)

wait_time = 0
while len(dask_client.scheduler_info()['workers']) < hindcast_config.num_hipergator_workers:
    print('waiting on workers: {s} sec. so far'.format(s=wait_time))
    sleep(10)
    wait_time += 10
    
    # If 5 minutes goes by try adding them again
    if wait_time > 300:
        workers.extend(cluster.start_workers(1))

print('All workers accounted for')
# The xr import must come after dask.array, and I think after setting up
# the cluster/client.
Example #5
from dask.distributed import Client, fire_and_forget
from dask_jobqueue import SLURMCluster
from dask import bag as db
from dask import delayed

cluster = SLURMCluster(cores=40,
                       processes=40,
                       memory='250GB',
                       queue='scavenger',
                       walltime='02:00')

cluster.start_workers(2)
client = Client(cluster)

if __name__ == '__main__':

    guest = load_molecule('aaa.res')
    trial_xyz, trial_rad = perturb_mol(guest)
    trial_xyz = delayed(trial_xyz)
    trial_rad = delayed(trial_rad)
    structures = load_crystals()

    bag = db.from_sequence([(structure, trial_xyz, trial_rad)
                            for structure in structures],
                           partition_size=40)

    guest_hits = client.map(gen_guests, bag)

    fire_and_forget(save_expansion, guest_hits)

    inserts = client.map(insert_guests, guest_hits)
Example #6
                       'std': k_b * T * ti.d_delta_f_.values[0,-1:]},
                      columns=['DG', 'std'])
    return df


if __name__ == "__main__":

    cluster = SLURMCluster(cores=24,
            processes=24,
            memory='120GB',
            walltime='00:59:00',
            interface='ib0',
            queue='compute',
            death_timeout=60,
            local_directory='/scratch/$USER/$SLURM_JOB_ID')
    cluster.start_workers(96)
    cl = Client(cluster)
    #cl = LocalCluster()
    
    ionsegs = {'repulsion_to_ghost':
            mds.discover('/pylon5/mc3bggp/beckstei/Projects/Transporters/SYSTEMS/Na/repulsion_to_ghost/production1/'),
           'ghost_to_ion':
           mds.discover('/pylon5/mc3bggp/beckstei/Projects/Transporters/SYSTEMS/Na/ghost_to_ion/production1/')}
    
    dHdls = {}
    """
    for seg in ionsegs:
        dHdls[seg] = [delayed(get_dHdl, pure=True)(sim, lower=5000, step=200)
    				for sim in ionsegs[seg]]
    
    L_ionDG = {}
Example #7
#srun --ntasks=1 --cpus-per-task=2 --mem=2gb -t 90 --pty bash -i

from dask_jobqueue import SLURMCluster
from datetime import datetime
from time import sleep

cluster = SLURMCluster(project='ewhite', death_timeout=100)
cluster.start_workers(1)

print(cluster.job_script())

from dask.distributed import Client
client = Client(cluster)

client  # bare expression: in a notebook this displays the client/cluster summary

counter = 0
while counter < 10:
    print(datetime.now().strftime("%a, %d %B %Y %I:%M:%S"))
    print(client)
    sleep(20)
    counter += 1

import socket
host = client.run_on_scheduler(socket.gethostname)


def start_jlab(dask_scheduler):
    import subprocess
    proc = subprocess.Popen(['jupyter', 'lab', '--ip', host, '--no-browser'])
    dask_scheduler.jlab_proc = proc
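
The snippet stops right after defining start_jlab; this looks like the standard pattern from the dask documentation for launching Jupyter Lab next to the scheduler, where the function is handed to the scheduler with Client.run_on_scheduler, which fills in the dask_scheduler argument automatically. A likely continuation:

client.run_on_scheduler(start_jlab)   # launch Jupyter Lab on the scheduler host
print('Jupyter Lab should be reachable on host:', host)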
Example #8
    ######################################################
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client
    from time import sleep

    num_hipergator_workers = 120
    cluster = SLURMCluster(processes=1,
                           queue='hpg2-compute',
                           threads=1,
                           memory='4GB',
                           walltime='96:00:00',
                           death_timeout=600,
                           local_directory='/tmp/')

    print('Starting up workers')
    workers = cluster.start_workers(num_hipergator_workers)

    dask_client = Client(cluster)

    wait_time = 0
    while len(dask_client.scheduler_info()['workers']) < num_hipergator_workers / 2:
        print('waiting on workers: {s} sec. so far'.format(s=wait_time))
        sleep(10)
        wait_time += 10

        # If 5 minutes goes by try adding them again
        if wait_time > 300:
            workers.extend(cluster.start_workers(1))

    print('Most workers accounted for')
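
The polling loop above (also used in examples #1 and #4) can be written more directly with current dask APIs: Client.wait_for_workers blocks until a given number of workers have connected, and cluster.adapt lets the cluster grow and shrink on its own. A sketch under those assumptions, reusing the placeholder resources from this example:

from dask.distributed import Client
from dask_jobqueue import SLURMCluster

num_hipergator_workers = 120
cluster = SLURMCluster(queue='hpg2-compute', cores=1, processes=1,
                       memory='4GB', walltime='96:00:00',
                       death_timeout=600, local_directory='/tmp/')
cluster.adapt(minimum=0, maximum=num_hipergator_workers)   # scale jobs with demand
dask_client = Client(cluster)
dask_client.wait_for_workers(num_hipergator_workers // 2)  # wait for half, as above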
Example #9
import os
import shutil
import time

from dask.distributed import Client
from dask_jobqueue import SLURMCluster


class dask_controller:  # adapted from Charles' code
    def __init__(self, n_workers=6, local=True, queue="short",
                 walltime='01:30:00', cores=1, processes=1, memory='6GB',
                 job_extra=[]):
        self.local = local
        self.n_workers = n_workers
        self.walltime = walltime
        self.queue = queue
        self.processes = processes
        self.memory = memory
        self.cores = cores
        self.job_extra = job_extra

    def writedir(self, directory):
        if not os.path.exists(directory):
            os.makedirs(directory)

    def startdask(self):
        if self.local:
            self.daskclient = Client()
            self.daskclient.cluster.scale(self.n_workers)
        else:
            self.daskcluster = SLURMCluster(queue=self.queue,
                                            walltime=self.walltime,
                                            processes=self.processes,
                                            memory=self.memory,
                                            cores=self.cores,
                                            job_extra=self.job_extra)
            self.workers = self.daskcluster.start_workers(self.n_workers)
            self.daskclient = Client(self.daskcluster)

    def shutdown(self):
        if not self.local:
            self.daskcluster.stop_all_jobs()
        for item in os.listdir("./"):
            if "worker-" in item or "slurm-" in item or ".lock" in item:
                path = "./" + item
                if os.path.isfile(path):
                    os.remove(path)
                elif os.path.isdir(path):
                    shutil.rmtree(path)

    def printprogress(self):
        complete = len(
            [item for item in self.futures if item.status == "finished"])
        print(str(complete) + "/" + str(len(self.futures)))

    def mapfovs(self, function, fov_list, retries=0):
        self.function = function
        self.retries = retries

        def mapallfovs(fov_number, function=function):
            function(fov_number)

        self.futures = {}
        for fov in fov_list:
            future = self.daskclient.submit(mapallfovs, fov, retries=retries)
            self.futures[fov] = future

    def retry_failed(self):
        self.failed_fovs = [
            fov for fov, future in self.futures.items()
            if future.status != 'finished'
        ]
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.failed_fovs, retries=self.retries)

    def retry_processing(self):
        self.proc_fovs = [
            fov for fov, future in self.futures.items()
            if future.status == 'pending'
        ]
        self.daskclient.restart()
        time.sleep(5)
        self.mapfovs(self.function, self.proc_fovs, retries=self.retries)
Example #10
from dask import delayed
# AFNI modules
from afnipython.construct_template_graph import get_task_graph
# parallelization library
try:
    # TODO: generalize to other clusters
    from dask_jobqueue import SLURMCluster
    from dask.distributed import Client
    cluster = SLURMCluster(queue='nimh',
                           memory="8g",
                           processes=1,
                           threads=4,
                           job_extra=['--constraint=10g'])
    print("starting %d workers!" % n_workers)
    cluster.start_workers(n_workers)
    client = Client(cluster)
    using_cluster = True
except ImportError as err:
    # Fall back to a purely local client if dask_jobqueue is unavailable.
    try:
        from distributed import Client, LocalCluster
        client = Client()
        using_cluster = False
    except ImportError:
        # Neither dask_jobqueue nor distributed could be imported.
        print("Import error: {0}".format(err))
        raise err

g_help_string = """
    ===========================================================================