def test_publish_bag(s, a, b):
    """Publish a persisted bag with one client and read it back with another.

    Checks that publishing leaves the publisher's futures intact and that
    the second client receives a bag backed by the same keys.
    """
    db = pytest.importorskip('dask.bag')
    scheduler_address = (s.ip, s.port)

    c = Client(scheduler_address, start=False)
    yield c._start()
    f = Client(scheduler_address, start=False)
    yield f._start()

    bag = db.from_sequence([0, 1, 2])
    bagp = c.persist(bag)

    persisted = futures_of(bagp)
    assert len(persisted) == 3
    assert {fut.key for fut in persisted} == set(bag.dask)

    yield c._publish_dataset(data=bagp)

    # check that serialization didn't affect original bag's dask
    assert len(futures_of(bagp)) == 3

    result = yield f._get_dataset('data')
    assert set(result.dask.keys()) == set(bagp.dask.keys())
    fetched_keys = {fut.key for fut in result.dask.values()}
    original_keys = {fut.key for fut in bagp.dask.values()}
    assert fetched_keys == original_keys

    out = yield f.compute(result)._result()
    assert out == [0, 1, 2]

    yield c._shutdown()
    yield f._shutdown()
def test_multiple_maxlen(c, s, a, b):
    """Two clients may open the same channel with different ``maxlen``."""
    other = Client((s.ip, s.port), start=False)
    yield other._start()

    short_chan = c.channel('x', maxlen=10)
    assert short_chan.futures.maxlen == 10
    long_chan = other.channel('x', maxlen=20)
    assert long_chan.futures.maxlen == 20

    for n in range(10):
        short_chan.append(c.submit(inc, n))

    # Wait until the scheduler reports the second client wants all ten keys
    while len(s.wants_what[other.id]) < 10:
        yield gen.sleep(0.01)

    for n in range(10, 20):
        short_chan.append(c.submit(inc, n))

    while len(long_chan) < 20:
        yield gen.sleep(0.01)

    yield gen.sleep(0.1)

    # They stay this long after a delay
    assert len(long_chan) == 20
    assert len(s.task_state) == 20

    yield other._shutdown()
def create_client_and_cluster(n_jobs, num_tasks, dask_kwargs, entityset_size):
    """Return a ``(client, cluster)`` pair for running tasks with dask.

    If ``dask_kwargs`` contains a ``'cluster'`` entry it is used as-is.
    Otherwise a ``LocalCluster`` is created with one thread per worker and
    ``min(n_jobs_to_workers(n_jobs), num_tasks)`` workers; the remaining
    entries of ``dask_kwargs`` are forwarded to ``LocalCluster``.

    Args:
        n_jobs: value handed to ``n_jobs_to_workers`` to get a worker count.
        num_tasks: upper bound on the number of workers to start.
        dask_kwargs (dict): options. The keys ``'cluster'``,
            ``'diagnostics_port'`` and ``'memory_limit'`` are interpreted
            here. The dictionary is not modified.
        entityset_size: size compared against each worker's
            ``'memory_limit'`` as reported by the scheduler.

    Returns:
        tuple: ``(client, cluster)``.

    Raises:
        ValueError: if a worker's memory limit is below ``entityset_size``.
    """
    cluster = None
    if 'cluster' in dask_kwargs:
        cluster = dask_kwargs['cluster']
    else:
        # Work on a copy so the keys consumed below are not removed from
        # the dictionary owned by the caller.
        cluster_kwargs = dict(dask_kwargs)

        # diagnostics_port sets the default port to launch bokeh web interface
        # if it is set to None web interface will not be launched
        diagnostics_port = cluster_kwargs.pop('diagnostics_port', None)

        workers = min(n_jobs_to_workers(n_jobs), num_tasks)

        # Distributed default memory_limit for worker is 'auto'. It calculates
        # worker memory limit as total virtual memory divided by the number
        # of cores available to the workers (always 1 for featuretools setup).
        # This means reducing the number of workers does not increase the
        # memory limit for other workers. Featuretools default is to calculate
        # memory limit as total virtual memory divided by number of workers.
        # To use distributed default memory limit, set
        # dask_kwargs['memory_limit']='auto'
        if 'memory_limit' in cluster_kwargs:
            memory_limit = cluster_kwargs.pop('memory_limit')
        else:
            total_memory = psutil.virtual_memory().total
            memory_limit = int(total_memory / float(workers))

        cluster = LocalCluster(n_workers=workers,
                               threads_per_worker=1,
                               diagnostics_port=diagnostics_port,
                               memory_limit=memory_limit,
                               **cluster_kwargs)

        # if cluster has bokeh port, notify user if unexpected port number
        if diagnostics_port is not None:
            if hasattr(cluster, 'scheduler') and cluster.scheduler:
                info = cluster.scheduler.identity()
                if 'bokeh' in info['services']:
                    msg = "Dashboard started on port {}"
                    print(msg.format(info['services']['bokeh']))

    client = Client(cluster)

    # Warn at most once, but keep checking every worker for the hard limit.
    warned_of_memory = False
    for worker in client.scheduler_info()['workers'].values():
        worker_limit = worker['memory_limit']
        if worker_limit < entityset_size:
            raise ValueError("Insufficient memory to use this many workers")
        elif worker_limit < 2 * entityset_size and not warned_of_memory:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning("Worker memory is between 1 to 2 times the memory"
                           " size of the EntitySet. If errors occur that do"
                           " not occur with n_jobs equals 1, this may be the "
                           "cause. See https://docs.featuretools.com/guides/parallel.html"
                           " for more information.")
            warned_of_memory = True

    return client, cluster
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor that sends the calculation to the workers of a dask cluster
    reached through its scheduler.
    """

    def __init__(self, address):
        """
        Connect to a Dask Scheduler that will distribute the calculation of the features.

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """
        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Derive a chunk size from the number of workers the scheduler reports at call time:
        ``data_length`` divided by five times the worker count, rounded up.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        :return: the chunk size
        :rtype: int
        """
        worker_count = len(self.client.scheduler_info()["workers"])
        quotient, remainder = divmod(data_length, worker_count * 5)
        # A non-zero remainder means one more element per chunk is needed
        return quotient + 1 if remainder else quotient

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Map ``func`` (with ``kwargs`` bound) over the chunks on the cluster and flatten the
        gathered per-chunk results into one list.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the
            application of func to a single element.
        """
        bound_func = partial(func, **kwargs)
        futures = self.client.map(bound_func, partitioned_chunks)
        gathered = self.client.gather(futures)

        flattened = []
        for chunk_result in gathered:
            flattened.extend(chunk_result)
        return flattened

    def close(self):
        """
        Close the client connection to the Dask Scheduler
        """
        self.client.close()
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """
        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        # keep a handle on the cluster so that close() can shut it down again
        self.cluster = LocalCluster(n_workers=n_workers, processes=False, local_dir=self.local_dir_)
        self.client = Client(self.cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the map command to the dask workers on a local
        machine

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is again
            a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should be the result of the application of func
            to a single element.
        """
        result = self.client.gather(self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler, shuts the local cluster down and removes the
        temporary directory created in ``__init__``.
        """
        import shutil

        try:
            self.client.close()
        finally:
            # Release what __init__ created even if closing the client failed;
            # previously the cluster and the mkdtemp() directory were leaked.
            try:
                self.cluster.close()
            finally:
                shutil.rmtree(self.local_dir_, ignore_errors=True)
def test_adaptive_local_cluster_multi_workers():
    """Repeatedly submit work to an adaptively scaled ``LocalCluster``.

    The cluster starts with zero workers; ``Adaptive`` is attached to its
    scheduler. Twenty rounds of 100 ``slowinc`` tasks are gathered, with a
    short pause between rounds.
    """
    loop = IOLoop.current()
    # The keyword is ``diagnostics_port`` (as in the sibling LocalCluster
    # tests); the former spelling ``diagnostic_port`` did not set it.
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    # Keep a reference for the duration of the test
    alc = Adaptive(cluster.scheduler, cluster, interval=100)  # noqa: F841
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    for i in range(20):
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c._gather(futures)
        del futures
        yield gen.sleep(0.1)

    yield c._shutdown()
    yield cluster._close()
def test_adaptive_local_cluster_multi_workers():
    """Workers appear when work is submitted and go away once it is done.

    After the cluster has emptied, a second batch of tasks must still be
    computed.
    """
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, silence_logs=False, nanny=False,
                           diagnostics_port=None, loop=loop, start=False)
    cluster.scheduler.allowed_failures = 1000
    alc = Adaptive(cluster.scheduler, cluster, interval=100)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    futures = c.map(slowinc, range(100), delay=0.01)

    # Wait (at most 15 seconds) until the scheduler knows about a worker
    deadline = time() + 15
    while not cluster.scheduler.worker_info:
        yield gen.sleep(0.01)
        assert time() < deadline

    yield c._gather(futures)
    del futures

    # Wait (at most 5 seconds) until every worker is gone again
    deadline = time() + 5
    while cluster.workers:
        yield gen.sleep(0.01)
        assert time() < deadline

    # ... and make sure the cluster stays empty for a little while
    assert not cluster.workers
    yield gen.sleep(0.2)
    assert not cluster.workers

    futures = c.map(slowinc, range(100), delay=0.01)
    yield c._gather(futures)

    yield c._shutdown()
    yield cluster._close()
def test_wait_for_workers_timeout():
    """The dask backend fails with a clear error when no worker shows up."""
    # Start a cluster with 0 worker:
    cluster = LocalCluster(n_workers=0, processes=False, threads_per_worker=2)
    client = Client(cluster)
    try:
        # Short timeout: DaskDistributedBackend
        with parallel_backend('dask', wait_for_workers_timeout=0.1), \
                pytest.raises(
                    TimeoutError,
                    match="DaskDistributedBackend has no worker after 0.1 seconds."):
            Parallel()(delayed(inc)(i) for i in range(10))

        # No timeout: fallback to generic joblib failure:
        with parallel_backend('dask', wait_for_workers_timeout=0), \
                pytest.raises(
                    RuntimeError,
                    match="DaskDistributedBackend has no active worker"):
            Parallel()(delayed(inc)(i) for i in range(10))
    finally:
        client.close()
        cluster.close()
def test_wait_for_workers(cluster_strategy):
    """``Parallel`` runs on a cluster that has no worker when it is created.

    ``cluster_strategy`` selects how workers are requested: ``"adaptive"``
    or ``"late_scaling"``.
    """
    cluster = LocalCluster(n_workers=0, processes=False, threads_per_worker=2)
    client = Client(cluster)

    if cluster_strategy == "late_scaling":
        # Tell the cluster to start workers but this is a non-blocking call
        # and new workers might take time to connect. In this case the Parallel
        # call should wait for at least one worker to come up before starting
        # to schedule work.
        cluster.scale(2)
    elif cluster_strategy == "adaptive":
        cluster.adapt(minimum=0, maximum=2)

    try:
        with parallel_backend('dask'):
            # The following should wait a bit for at least one worker to
            # become available.
            tasks = (delayed(inc)(i) for i in range(10))
            Parallel()(tasks)
    finally:
        client.close()
        cluster.close()
def __init__(self, address):
    """
    Create the distributor and connect a dask client to the given scheduler; the client is
    stored in ``self.client``.

    :param address: the ip address and port number of the Dask Scheduler
    :type address: str
    """
    from distributed import Client

    scheduler_client = Client(address=address)
    self.client = scheduler_client
def test_scale_up_and_down():
    """Scale an empty ``LocalCluster`` up to two workers, then retire one."""
    loop = IOLoop.current()
    cluster = LocalCluster(0, scheduler_port=0, nanny=False, silence_logs=False,
                           diagnostics_port=None, loop=loop, start=False)
    c = Client(cluster, start=False, loop=loop)
    yield c._start()

    # The cluster was created without workers
    assert not cluster.workers

    yield cluster.scale_up(2)
    assert len(cluster.workers) == 2
    assert len(cluster.scheduler.ncores) == 2

    retired = cluster.workers[0].address
    yield cluster.scale_down([retired])

    assert len(cluster.workers) == 1
    assert retired not in cluster.scheduler.ncores

    yield c._shutdown()
    yield cluster._close()
def __init__(self, n_workers):
    """
    Start a thread-based local dask cluster and connect a client to it.

    :param n_workers: How many workers should the local dask cluster have?
    :type n_workers: int
    """
    from distributed import LocalCluster, Client

    local_cluster = LocalCluster(n_workers=n_workers, processes=False)
    self.client = Client(local_cluster)
    self.n_workers = n_workers
def test_multiple_clients_restart(s, a, b):
    """A restart issued by one client cancels the futures of every client."""
    address = (s.ip, s.port)
    e1 = Client(address, start=False)
    yield e1._start()
    e2 = Client(address, start=False)
    yield e2._start()

    x = e1.submit(inc, 1)
    y = e2.submit(inc, 2)
    first_result = yield x._result()
    second_result = yield y._result()
    assert first_result == 2
    assert second_result == 3

    yield e1._restart()

    # Both futures are cancelled, although only e1 asked for the restart
    assert x.cancelled()
    assert y.cancelled()

    yield e1._shutdown(fast=True)
    yield e2._shutdown(fast=True)
def __init__(self, n_workers):
    """
    Start a thread-based local dask cluster that keeps its files in a fresh temporary
    directory, and connect a client to it.

    :param n_workers: How many workers should the local dask cluster have?
    :type n_workers: int
    """
    import tempfile
    from distributed import LocalCluster, Client

    # attribute .local_dir_ is the path where the local dask workers store temporary files
    self.local_dir_ = tempfile.mkdtemp()
    local_cluster = LocalCluster(n_workers=n_workers, processes=False,
                                 local_dir=self.local_dir_)
    self.client = Client(local_cluster)
    self.n_workers = n_workers
def test_publish_simple(s, a, b):
    """Publish scattered data once; a second publish under the same name fails."""
    address = (s.ip, s.port)
    c = Client(address, start=False)
    yield c._start()
    f = Client(address, start=False)
    yield f._start()

    data = yield c._scatter(range(3))
    yield c._publish_dataset(data=data)
    assert 'data' in s.extensions['publish'].datasets

    # Publishing under an existing name must raise
    with pytest.raises(KeyError) as exc_info:
        yield c._publish_dataset(data=data)

    message = str(exc_info.value)
    assert "exists" in message
    assert "data" in message

    # Both clients see the same list of published datasets
    for client in (c, f):
        result = yield client.scheduler.publish_list()
        assert result == ['data']

    yield c._shutdown()
    yield f._shutdown()
def test_Client_solo(loop):
    """A client created without an address closes its cluster on exit."""
    with Client(loop=loop) as c:
        pass

    status = c.cluster.status
    assert status == 'closed'
def test_client_cluster_synchronous(loop):
    """Neither the client nor its implicit cluster is asynchronous."""
    with clean(threads=False), Client(loop=loop, processes=False) as c:
        assert not c.asynchronous
        assert not c.cluster.asynchronous
def test_Client_twice(loop):
    """Two address-less clients get schedulers on different ports."""
    options = dict(loop=loop, silence_logs=False, dashboard_address=None)
    with Client(**options) as c, Client(**options) as f:
        first_port = c.cluster.scheduler.port
        second_port = f.cluster.scheduler.port
        assert first_port != second_port
def test_Client_solo(loop):
    """A client created without an address closes its cluster on exit."""
    with Client(loop=loop, silence_logs=False) as c:
        pass

    status = c.cluster.status
    assert status == Status.closed
class Ensemble(object):
    '''
    Ensembles represent multiple SUMMA configurations based on
    changing the decisions or parameters of a given run.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    filemanager: (optional)
        Path to the file manager
    configuration:
        Dictionary of runs, along with settings
    num_workers:
        Number of parallel workers to use
    simulations:
        Dictionary of run names and Simulation objects
    submissions:
        List of futures returned by the dask client for submitted work
    '''

    def __init__(self, executable: str, configuration: dict,
                 filemanager: str = None, num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 scheduler: str = None, client: Client = None):
        """
        Create a new Ensemble object. The API mirrors that
        of the Simulation object.

        If ``client`` is given it is used. Otherwise the current dask
        client is looked up with ``get_client()``, and if that raises
        ``ValueError`` a new ``Client`` is started with ``num_workers``
        workers and ``threads_per_worker`` threads each. The ``scheduler``
        argument is accepted but not used here.
        """
        self._status = 'Initialized'
        self.executable: str = executable
        self.filemanager: str = filemanager
        self.configuration: dict = configuration
        self.num_workers: int = num_workers
        self.simulations: dict = {}
        self.submissions: list = []
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            self._ensure_workers()
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                self._ensure_workers()
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self._generate_simulation_objects()

    def _ensure_workers(self):
        """
        Scale the client's cluster up to ``num_workers`` when it currently
        has fewer workers.
        """
        workers = len(self._client.get_worker_logs())
        # Scale to the requested size; scaling to the current worker count
        # (as was done before) never added any workers.
        if workers < self.num_workers:
            self._client.cluster.scale(self.num_workers)

    def _generate_simulation_objects(self):
        """
        Create a mapping of configurations to the simulation objects.
        """
        if self.filemanager:
            for name, config in self.configuration.items():
                self.simulations[name] = Simulation(
                    self.executable, self.filemanager, False)
        else:
            for name, config in self.configuration.items():
                assert config['file_manager'] is not None, \
                    "No filemanager found in configuration or Ensemble!"
                self.simulations[name] = Simulation(
                    self.executable, config['file_manager'], False)

    def _generate_coords(self):
        """
        Generate the coordinates that can be used to merge the output
        of the ensemble runs into a single dataset.

        Returns a dictionary with the keys ``'decisions'``, ``'managers'``
        and ``'parameters'``, filled from the ``'decisions'``,
        ``'file_manager'`` and ``'trial_parameters'`` entries of every
        configuration.
        """
        decision_dims = ChainDict()
        manager_dims = ChainDict()
        parameter_dims = ChainDict()
        for name, conf in self.configuration.items():
            for k, v in conf.get('decisions', {}).items():
                decision_dims[k] = v
            for k, v in conf.get('file_manager', {}).items():
                manager_dims[k] = v
            for k, v in conf.get('trial_parameters', {}).items():
                parameter_dims[k] = v
        return {'decisions': decision_dims,
                'managers': manager_dims,
                'parameters': parameter_dims}

    def merge_output(self):
        """
        Open and merge all of the output datasets from the ensemble
        run into a single dataset.

        Raises ``NameError`` when no decision values can be derived from
        the run names.
        """
        nc = self._generate_coords()
        new_coords = (list(nc.get('decisions', {}))
                      + list(nc.get('parameters', {})))
        # Run names are '++'-separated; the first and last pieces are dropped
        decision_tuples = [tuple(n.split('++')[1:-1])
                           for n in self.configuration.keys()]
        for i, t in enumerate(decision_tuples):
            # 'key=value' pieces become the float value, others stay as text
            decision_tuples[i] = tuple(
                (float(part.split('=')[-1]) if '=' in part else part
                 for part in t))
        decision_names = ['++'.join(tuple(n.split('++')[1:-1]))
                          for n in self.configuration.keys()]
        if sum([len(dt) for dt in decision_tuples]) == 0:
            raise NameError("Simulations in the ensemble do not share all"
                            " common decisions! Please use `open_output`"
                            " to retrieve the output of this Ensemble")
        for i, t in enumerate(decision_names):
            # NOTE(review): ``t`` is a string here, so this iterates over
            # its characters - confirm whether ``t.split('++')`` was meant.
            # The values are replaced by ``new_idx`` below.
            decision_names[i] = '++'.join(ch.split('=')[0] for ch in t)
        new_idx = pd.MultiIndex.from_tuples(
            decision_tuples, names=new_coords)
        out_file_paths = [s.get_output_files()
                          for s in self.simulations.values()]
        out_file_paths = [fi for sublist in out_file_paths for fi in sublist]
        full = xr.open_mfdataset(out_file_paths, concat_dim='run_number',
                                 combine='nested')
        merged = full.assign_coords(run_number=decision_names)
        merged['run_number'] = new_idx
        merged = merged.unstack('run_number')
        return merged

    def start(self, run_option: str, prerun_cmds: list = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for n, s in self.simulations.items():
            config = self.configuration[n]
            self.submissions.append(self._client.submit(
                _submit, s, n, run_option, prerun_cmds, config))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def map(self, fun, args, include_sims=True, monitor=True):
        """
        Submit ``fun`` once per simulation.

        Each call receives ``args`` followed by ``{'config': config}``;
        when ``include_sims`` is true the simulation object and its name
        are passed first. With ``monitor`` true the call waits for the
        submissions via ``monitor()``.
        """
        for n, s in self.simulations.items():
            config = self.configuration[n]
            if include_sims:
                all_args = (s, n, *args, {'config': config})
            else:
                all_args = (*args, {'config': config})
            self.submissions.append(self._client.submit(
                fun, *all_args))
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        and store each gathered result under its ``run_suffix``.
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def summary(self):
        """
        Show the user information about ensemble status

        Returns a dictionary with the run names grouped under
        ``'success'``, ``'error'`` and ``'other'``.
        """
        success, error, other = [], [], []
        for n, s in self.simulations.items():
            if s.status == 'Success':
                success.append(n)
            elif s.status == 'Error':
                error.append(n)
            else:
                other.append(n)
        return {'success': success, 'error': error, 'other': other}

    def rerun_failed(self, run_option: str, prerun_cmds=None,
                     monitor: bool = True):
        """
        Try to re-run failed simulations.

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        run_summary = self.summary()
        # Only the re-submitted runs are tracked from here on
        self.submissions = []
        for n in run_summary['error']:
            config = self.configuration[n]
            s = self.simulations[n]
            s.reset()
            self.submissions.append(self._client.submit(
                _submit, s, n, run_option, prerun_cmds, config))
        if monitor:
            return self.monitor()
        else:
            return True
async def test_simple(cleanup):
    """Submit one task to a scheduler that was started with ``protocol="ucx"``."""
    async with Scheduler(protocol="ucx") as s, \
            Worker(s.address) as a, \
            Client(s.address, asynchronous=True) as c:
        future = c.submit(lambda x: x + 1, 10)
        result = await future
        assert result == 11
def test_dont_assume_function_purity(loop):  # noqa: F811
    """Two calls of the same argument-less function give different results."""
    with cluster() as (s, [a, b]), \
            Client(s['address'], loop=loop) as client, \
            parallel_backend('dask') as (ba, _):  # noqa: F841
        calls = (delayed(random2)() for i in range(2))
        x, y = Parallel()(calls)
        assert x != y
def test_rolling_sync(loop):
    """Compute a rolling mean of a dask dataframe through the client's ``get``."""
    with cluster() as (scheduler, [a, b]):
        address = ('127.0.0.1', scheduler['port'])
        with Client(address, loop=loop) as c:
            frame = pd.util.testing.makeTimeDataFrame()
            ddf = dd.from_pandas(frame, npartitions=10)
            rolled = dd.rolling_mean(ddf.A, 2)
            rolled.compute(get=c.get)
import socket time.sleep(5) return '{0} on {1}'.format(i, socket.gethostname()) if __name__ == '__main__': arg_parser = ArgumentParser(description='compute sum of squares and check ' 'task placement') arg_parser.add_argument('--scheduler', help='scheduler host') arg_parser.add_argument('--scheduler_port', default='8786', help='scheduler port to use') arg_parser.add_argument('--n', type=int, default=100, help='number of terms in sum') arg_parser.add_argument('--verbose', action='store_true', help='give verbose output') options = arg_parser.parse_args() client = Client('{0}:{1}'.format(options.scheduler, options.scheduler_port)) if options.verbose: print('Client: {0}'.format(str(client)), flush=True) futures = client.map(square, range(options.n)) total = client.submit(sum, futures) expected_total = (options.n - 1)*options.n*(2*options.n - 1)//6 print('sum_i=0..99 i^2 = {0:d}, expected {1:d}'.format(total.result(), expected_total)) futures = client.map(get_hostname, range(options.n)) process_locations = client.gather(futures) if options.verbose: print('task placement:') print('\t' + '\n\t'.join(process_locations)) count = dict() for process_location in process_locations: _, _, hostname = process_location.split()
def test_secede_with_no_processes(loop):  # noqa: F811
    """Run joblib's dask backend against a client started with ``processes=False``."""
    # https://github.com/dask/distributed/issues/1775
    with Client(loop=loop, processes=False, set_as_default=True), \
            parallel_backend('dask'):
        jobs = (delayed(id)(i) for i in range(2))
        Parallel(n_jobs=4)(jobs)
def test_stream_shares_client_loop(loop):  # noqa: F811
    """A stream with a scattered node runs on the same loop as the client."""
    with cluster() as (s, [a, b]), \
            Client(s['address'], loop=loop) as client:  # noqa: F841
        source = Stream()
        windowed = source.timed_window('20ms')
        d = windowed.scatter()  # noqa: F841
        assert source.loop is client.loop
async def test_config(cleanup):
    """The ``config`` given to a Nanny is readable with ``dask.config.get`` on its worker."""
    async with Scheduler() as s, \
            Nanny(s.address, config={"foo": "bar"}) as n, \
            Client(s.address, asynchronous=True) as client:
        per_worker = await client.run(dask.config.get, "foo")
        assert per_worker[n.worker_address] == "bar"
#validation para esta parte nos vamos a basar en las funciones que aparece en manual #dask-ml Documentation versión 0.1 que encontré en internet (a partir de la página 12) regdask = LinearRegression() param_grid = [{'C': [5, 10, 15], 'tol': [1e-4, 1e-3, 1e-2]}] # In[22]: dk_grid_search = GridSearchCV(regdask, param_grid=param_grid, n_jobs=-1, cv=5) # In[23]: #Para correr las cosas en distribuido agregamos el siguiente código: from dask.distributed import Client client = Client("scheduler:8786") # In[24]: ini = timeit.default_timer() lr_dask = dk_grid_search.fit(x_train, y_train) print("Tiempo de ejecución: " + str(timeit.default_timer() - ini)) # In[25]: lr_dask.best_estimator_ # In[26]:
def test_rabit_ops():
    """Run ``run_rabit_ops`` against a three-worker local cluster."""
    from distributed import Client, LocalCluster

    n_workers = 3
    with LocalCluster(n_workers=n_workers) as cluster, \
            Client(cluster) as client:
        run_rabit_ops(client, n_workers)
from lib_experimental_utils import simulation, qasm_simulator, ibmqx4, create_direction_only_pass_manager, \
    create_experiment_qobj, get_gate_times, experiment, ibmqx2, ibmq_16_melbourne, create_optimize_only_pass_manager, \
    ibmq_ourense, ibmq_vigo

LOG = logging.getLogger(__name__)


def setup_logging():
    """Configure logging: WARN by default, DEBUG for this module and lib_experimental_utils."""
    import lib_experimental_utils
    logging.basicConfig(format=logging.BASIC_FORMAT, level='WARN')
    logging.getLogger(lib_experimental_utils.__name__).setLevel('DEBUG')
    logging.getLogger(__name__).setLevel('DEBUG')


# Directory containing this module; update_files() uploads files from here
module_path = os.path.dirname(__file__)

# Module-level dask client, created at import time
client = Client(address='localhost:8786')  # type: Client


class BackendEnum(Enum):
    """Identifiers for the simulator and the IBMQ backends."""
    SIMULATOR = 0
    IBMQX2 = 1
    IBMQX4 = 2
    IBMQ_OURENSE = 3
    IBMQ_VIGO = 4


def update_files():
    """Upload the three ``lib_*.py`` files next to this module via the dask client."""
    client.upload_file("{}/lib_circuits.py".format(module_path))
    client.upload_file("{}/lib_experimental_utils.py".format(module_path))
    client.upload_file("{}/lib_experiment_setups.py".format(module_path))
def test_loc_sync(loop):
    """Compute a label-based ``.loc`` slice of a dask dataframe through the client's ``get``."""
    with cluster() as (scheduler, [a, b]):
        address = ('127.0.0.1', scheduler['port'])
        with Client(address, loop=loop) as c:
            frame = pd.util.testing.makeTimeDataFrame()
            ddf = dd.from_pandas(frame, npartitions=10)
            selection = ddf.loc['2000-01-17':'2000-01-24']
            selection.compute(get=c.get)
def register_dask_scheduler_plugin(cls, client: distributed.Client):
    """Register a ``SharedMemoryRefCounter`` built from ``cls.REFCOUNT_TAG`` on the scheduler.

    The plugin is registered idempotently under the name
    ``"metagraph_shared_csr_refcount"``.
    """
    client.register_scheduler_plugin(
        SharedMemoryRefCounter(cls.REFCOUNT_TAG),
        idempotent=True,
        name="metagraph_shared_csr_refcount",
    )
async def test_security_dict_input_no_security(cleanup):
    """An empty ``security`` dict is accepted by scheduler, worker and client."""
    async with Scheduler(security={}) as s, \
            Worker(s.address, security={}) as w, \
            Client(s.address, security={}, asynchronous=True) as c:
        future = c.submit(inc, 1)
        result = await future
        assert result == 2
async def test_no_workers(cleanup):
    """An asynchronous client with ``n_workers=0`` can be entered and exited."""
    options = dict(
        n_workers=0,
        silence_logs=False,
        dashboard_address=None,
        asynchronous=True,
    )
    async with Client(**options) as c:
        pass
def get_dask_client(self):
    """Return a new ``Client`` built from ``self.scheduler``."""
    dask_client = Client(self.scheduler)
    return dask_client
def test_Client_kwargs(loop):
    """Cluster keyword arguments given to ``Client`` reach the implicit cluster."""
    with Client(loop=loop, processes=False, n_workers=2, silence_logs=False) as c:
        workers = c.cluster.workers
        assert len(workers) == 2
        assert all(isinstance(w, Worker) for w in workers.values())

    # Leaving the context closes the cluster
    assert c.cluster.status == Status.closed
from dask.diagnostics import ProgressBar from dask.diagnostics import ResourceProfiler from dask.dot import dot_graph from dask.array.core import map_blocks import dask.bag as db import dask.dataframe as df import random import logging logging.basicConfig(stream=sys.stdout, level=logging.CRITICAL) from chest import Chest from random import shuffle import os.path from distributed import Client hostname = "198.202.115.240:8786" client = Client(hostname) #from multiprocessing.pool import ThreadPool RESULT_DIR = "results" RESULT_FILE_PREFIX = "pair-distance-" HEADER_CSV = "Scenario, Type, Time" #BASE_DIRECTORY=os.getcwd() # Dask has issues with NFS home directory on Comet # BASE_DIRECTORY='/scratch/luckow/7146882' BASE_DIRECTORY = '/oasis/scratch/comet/luckow/temp_project' #BASE_DIRECTORY='/scratch/luckow/7218009/' OUT_DIR = os.path.join(BASE_DIRECTORY, "npy_stack") FILENAMES = [ "../132k_dataset/atom_pos_132K.npy", "../145K_dataset/atom_pos_145K.npy", "../300K_dataset/atom_pos_291K.npy", '../840K_dataset/atom_pos_839K.npy'
def test_blocks_until_full(loop):
    """``nthreads()`` is non-empty as soon as the client context is entered."""
    with Client(loop=loop) as c:
        thread_counts = c.nthreads()
        assert len(thread_counts) > 0
def test_empty_dmatrix_approx():
    """Run the empty-DMatrix regression and classification checks with ``approx``."""
    with LocalCluster(n_workers=kWorkers) as cluster, \
            Client(cluster) as client:
        parameters = {'tree_method': 'approx'}
        for check in (run_empty_dmatrix_reg, run_empty_dmatrix_cls):
            check(client, parameters)
def preprocessing_script(): """ This script will process all the hybridization folders combined in a processing folder. The input parameters are passed using arparse Parameters: ----------- scheduler: string tcp address of the dask.distributed scheduler (ex. tcp://192.168.0.4:7003). default = False. If False the process will run on the local computer using nCPUs-1 path: string Path to the processing directory """ # Inputs of the function parser = argparse.ArgumentParser(description='Preprocessing script') parser.add_argument('-scheduler', default=False, help='dask scheduler address ex. tcp://192.168.0.4:7003') parser.add_argument('-path', help='processing directory') args = parser.parse_args() # Directory to process processing_directory = args.path # Dask scheduler address scheduler_address = args.scheduler if scheduler_address: # Start dask client on server or cluster client=Client(scheduler_address) else: # Start dask client on local machine. It will use all the availabe # cores -1 # number of core to use ncores = multiprocessing.cpu_count()-1 cluster = LocalCluster(n_workers=ncores) client=Client(cluster) # Subdirectories of the processing_directory that need to be skipped for the # analysis blocked_directories = ['_logs'] # Starting logger utils.init_file_logger(processing_directory) logger = logging.getLogger() # Determine the operating system running the code os_windows, add_slash = utils.determine_os() # Check training slash in the processing directory processing_directory=utils.check_trailing_slash(processing_directory,os_windows) # Get a list of the hybridization to process processing_hyb_list = next(os.walk(processing_directory))[1] # Remove the blocked directories from the directories to process processing_hyb_list = [el for el in processing_hyb_list if el not in blocked_directories ] for processing_hyb in processing_hyb_list: # Determine the hyb number from the name hybridization_number = processing_hyb.split('_hyb')[-1] hybridization = 'Hybridization' 
+ hybridization_number hyb_dir = processing_directory + processing_hyb + add_slash # Parse the Experimental metadata file (serial) experiment_infos,image_properties, hybridizations_infos, \ converted_positions, microscope_parameters =\ utils.experimental_metadata_parser(hyb_dir) # Parse the configuration file flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir) # ----------------- .nd2 FILE CONVERSION ------------------------------ # Create the temporary subdirectory tree (serial) tmp_dir_path, tmp_gene_dirs=utils.create_subdirectory_tree(hyb_dir,\ hybridization,hybridizations_infos,processing_hyb,suffix='tmp',add_slash=add_slash) # Get the list of the nd2 files to process inside the directory files_list = glob.glob(hyb_dir+processing_hyb+'_raw_data'+add_slash+'*.nd2') # Get the list of genes that are analyzed in the current hybridization gene_list = list(hybridizations_infos[hybridization].keys()) # Organize the file to process in a list which order match the gene_list for # parallel processing organized_files_list = [f for gene in gene_list for f in files_list if gene+'.nd2' in f ] organized_tmp_dir_list = [f for gene in gene_list for f in tmp_gene_dirs if gene in f ] # Each .nd2 file will be processed in a worker part of a different node # Get the addresses of one process/node to use for conversion node_addresses = utils.identify_nodes(client) workers_conversion = [list(el.items())[0][1] for key,el in node_addresses.items()] # Run the conversion futures_processes=client.map(io.nd2_to_npy,gene_list,organized_files_list, tmp_gene_dirs,processing_hyb=processing_hyb, use_ram=flt_rawcnt_config['use_ram'], max_ram=flt_rawcnt_config['max_ram'], workers=workers_conversion) client.gather(futures_processes) # --------------------------------------------------------------------- # ----------------- FILTERING AND RAW COUNTING ------------------------ # Create directories # Create the directory where to save the filtered images suffix = 'filtered_png' 
filtered_png_img_dir_path, filtered_png_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) suffix = 'filtered_npy' filtered_img_dir_path, filtered_img_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos, processing_hyb,suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Create the directory where to save the counting suffix = 'counting' counting_dir_path, counting_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,flt_rawcnt_config['skip_tags_counting'], flt_rawcnt_config['skip_genes_counting'], analysis_name=flt_rawcnt_config['analysis_name']) if flt_rawcnt_config['illumination_correction']: # Create the directory where to save the counting suffix = 'illumination_funcs' illumination_func_dir_path, illumination_func_gene_dirs = \ utils.create_subdirectory_tree(hyb_dir,hybridization,hybridizations_infos,processing_hyb, suffix,add_slash,analysis_name=flt_rawcnt_config['analysis_name']) # Loop through channels and calculate illumination for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') logger.debug('Create average image for gene %s', gene) # Chunking the image list num_chunks = sum(list(client.ncores().values())) chunked_list = utils.list_chunking(flist_img_to_filter,num_chunks) # Scatter the images sublists to process in parallel futures = client.scatter(chunked_list) # Create dask processing graph output = [] for future in futures: ImgMean = delayed(utils.partial_image_mean)(future) output.append(ImgMean) ImgMean_all = delayed(sum)(output) ImgMean_all = ImgMean_all/float(len(futures)) # Compute the graph ImgMean = ImgMean_all.compute() logger.debug('Create illumination function for gene %s',gene) # Create illumination 
function Illumination=filters.gaussian(ImgMean,sigma=(20,300,300)) # Normalization of the illumination Illumination_flat=np.amax(Illumination,axis=0) Illumination_norm=Illumination_flat/np.amax(Illumination_flat) logger.debug('Save illumination function for gene %s',gene) # Save the illumination function illumination_path = [ill_path for ill_path in illumination_func_gene_dirs if gene in ill_path][0] illumination_fname=illumination_path+gene+'_illumination_func.npy' np.save(illumination_fname,Illumination_norm,allow_pickle=False) # Broadcast the illumination function to all the cores client.scatter(Illumination_norm, broadcast=True) logger.debug('Filtering %s',gene) # Filtering and counting futures_processes=client.map(counting.filtering_and_counting_ill_correction,flist_img_to_filter, \ illumination_function=Illumination_norm,\ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,\ filtered_img_gene_dirs =filtered_img_gene_dirs,\ counting_gene_dirs=counting_gene_dirs,plane_keep=flt_rawcnt_config['plane_keep'], \ min_distance=flt_rawcnt_config['min_distance'], stringency=flt_rawcnt_config['stringency'],\ skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) else: for gene in hybridizations_infos[hybridization].keys(): flist_img_to_filter=glob.glob(hyb_dir+processing_hyb+'_tmp/'+processing_hyb+'_'+gene+'_tmp/*.npy') # filtering logger.debug('Filtering without illumination correction %s',gene) futures_processes=client.map(counting.filtering_and_counting,flist_img_to_filter, \ filtered_png_img_gene_dirs=filtered_png_img_gene_dirs, \ filtered_img_gene_dirs=filtered_img_gene_dirs, \ counting_gene_dirs=counting_gene_dirs, \ plane_keep=flt_rawcnt_config['plane_keep'], min_distance=flt_rawcnt_config['min_distance'],\ stringency=flt_rawcnt_config['stringency'],\ 
skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],skip_tags_counting=flt_rawcnt_config['skip_tags_counting']) client.gather(futures_processes) # --------------------------------------------------------------------- # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------ # # Combine the filter data in one single .ppf for each hybridization # # This step will run in serial mode and will not need to shuffle data # # between cores because everything is on the common file system # logger.debug('Create .ppf.hdf5 file') # # Create the ppf.hdf5 file that contains the filtered data in uint16 # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(hybridizations_infos,processing_hyb, # hybridization,flt_rawcnt_config['analysis_name'], hyb_dir,converted_positions,image_properties) # logger.debug('Write the .npy filtered files into the .ppf file') # # Load and write the .npy tmp images into the hdf5 file # # open the hdf5 file # with h5py.File(preprocessing_file_path) as f_hdl: # # Loop through each gene # for gene in hybridizations_infos[hybridization].keys(): # logger.debug('Writing %s images in .ppf.hdf5',gene) # # list of the files to transfer # filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs if gene in fdir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # loop through the list of file # for f_file in filtered_files_list: # pos = f_file.split('/')[-1].split('_')[-1].split('.')[0] # f_hdl[gene]['FilteredData'][pos][:] =np.load(f_file) # f_hdl.flush() # # --------------------------------------------------------------------- # # ----------------- STITCHING ------------------------ # # Load the stitching parameters from the .yaml file # # Stitch the image in 2D or 3D (3D need more work/testing) # nr_dim = flt_rawcnt_config['nr_dim'] # # Estimated overlapping between images according to the Nikon software # est_overlap = image_properties['Overlapping_percentage'] # # Number of peaks to use 
for the alignment # nr_peaks = flt_rawcnt_config['nr_peaks'] # # Determine if the coords need to be flipped # y_flip = flt_rawcnt_config['y_flip'] # # Method to use for blending # # can be 'linear' or 'non linear' # # The methods that performs the best is the 'non linear' # blend = flt_rawcnt_config['blend'] # # Reference gene for stitching # reference_gene = flt_rawcnt_config['reference_gene'] # pixel_size = image_properties['PixelSize'] # # Get the list of the filtered files of the reference gene # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs if reference_gene in gene_dir][0] # filtered_files_list = glob.glob(filtered_gene_dir+'*.npy') # # Create pointer of the hdf5 file that will store the stitched reference image # # for the current hybridization # # Writing # tile_file_base_name = flt_rawcnt_config['analysis_name']+'_'+ processing_hyb # data_name = (tile_file_base_name # + '_' + reference_gene # + '_stitching_data') # stitching_file_name = tile_file_base_name + '.sf.hdf5' # stitching_file= h5py.File(hyb_dir+stitching_file_name,'w',libver='latest') # replace with 'a' as soon as you fix the error # # Determine the tiles organization # tiles, contig_tuples, nr_pixels, z_count, micData = stitching.get_pairwise_input_npy(image_properties,converted_positions, hybridization, # est_overlap = est_overlap, y_flip = False, nr_dim = 2) # # Align the tiles # futures_processes=client.map(pairwisesingle.align_single_pair_npy,contig_tuples, # filtered_files_list=filtered_files_list,micData=micData, # nr_peaks=nr_peaks) # # Gather the futures # data = client.gather(futures_processes) # # In this case the order of the returned contingency tuples is with # # the order of the input contig_tuples # # P_all = [el for data_single in data for el in data_single[0]] # P_all =[data_single[0] for data_single in data ] # P_all = np.array(P_all) # P_all = P_all.flat[:] # covs_all = [data_single[1] for data_single in data] # alignment = {'P': P_all, # 'covs': 
covs_all} # # Calculates a shift in global coordinates for each tile (global # # alignment) and then applies these shifts to the corner coordinates # # of each tile and returns and saves these shifted corner coordinates. # joining = stitching.get_place_tile_input(hyb_dir, tiles, contig_tuples, # micData, nr_pixels, z_count, # alignment, data_name, # nr_dim=nr_dim) # # Create the hdf5 file structure # stitched_group, linear_blending, blend = hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(stitching_file, joining, nr_pixels, # reference_gene, blend = 'non linear') # # Fill the hdf5 containing the stitched image with empty data and # # create the blending mask # stitched_group['final_image'][:]= np.zeros(joining['final_image_shape'],dtype=np.float64) # if blend is not None: # # make mask # stitched_group['blending_mask'][:] = np.zeros(joining['final_image_shape'][-2:],dtype=np.float64) # tilejoining.make_mask(joining, nr_pixels, stitched_group['blending_mask']) # # Create the subdirectory used to save the blended tiles # suffix = 'blended_tiles' # blended_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Get the directory with the filtered npy images of the reference_gene to use for stitching # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs if reference_gene in npy_dir][0] # # Create the tmp directory where to save the masks # suffix = 'masks' # masked_tiles_directory = utils.create_single_directory(hyb_dir,reference_gene, hybridization,processing_hyb,suffix,add_slash, # analysis_name=flt_rawcnt_config['analysis_name']) # # Create and save the mask files # for corn_value,corner_coords in joining['corner_list']: # if not(np.isnan(corner_coords[0])): # cur_mask = stitched_group['blending_mask'][int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels), # int(corner_coords[1]):int(corner_coords[1]) + 
int(nr_pixels)] # fname = masked_tiles_directory + flt_rawcnt_config['analysis_name'] +'_'+processing_hyb+'_'+reference_gene+'_masks_joining_pos_'+str(corn_value) # np.save(fname,cur_mask) # # Blend all the tiles and save them in a directory # futures_processes = client.map(tilejoining.generate_blended_tile_npy,joining['corner_list'], # stitching_files_dir = stitching_files_dir, # blended_tiles_directory = blended_tiles_directory, # masked_tiles_directory = masked_tiles_directory, # analysis_name = flt_rawcnt_config['analysis_name'], # processing_hyb = processing_hyb,reference_gene = reference_gene, # micData = micData,tiles = tiles,nr_pixels=nr_pixels, # linear_blending=linear_blending) # _ = client.gather(futures_processes) # # Write the stitched image # tilejoining.make_final_image_npy(joining, stitching_file, blended_tiles_directory, tiles,reference_gene, nr_pixels) # # close the hdf5 file # stitching_file.close() # # Delete the directories with blended tiles and masks # shutil.rmtree(blended_tiles_directory) # shutil.rmtree(masked_tiles_directory) # ----------------- DELETE FILES ------------------------ # Don't delete the *.npy files here because can be used to # create the final images using the apply stitching related function client.close()
import glob
import math
import sys
import time
from copy import deepcopy
from datetime import datetime, timedelta

import dask.array as da
import numpy as np
from dask import delayed, compute
from distributed import Client, LocalCluster
from netCDF4 import Dataset
from scipy import ndimage

# Start a local cluster; the number of workers is the first command-line
# argument, and a client is connected to it.
cluster = LocalCluster(n_workers=int(sys.argv[1]))
client = Client(cluster)

# Range of dates and times wanted for the collection of images.
# Start of the range:
start_year, start_month, start_day = 2006, 1, 1
start_hour, start_minute, start_second = 1, 0, 0
# End of the range:
end_year, end_month, end_day = 2006, 3, 1
end_hour, end_minute, end_second = 0, 0, 0
def _get_dask_client(self, options):
    """Return a ``(client, name)`` pair for the Dask cluster to use.

    When ``options.dask_cluster_uri`` is set, the function it refers to is
    imported through ``mlrun.import_function`` and its client and metadata
    name are returned. Otherwise a default ``Client()`` is created and the
    name is ``None``.
    """
    cluster_uri = options.dask_cluster_uri
    if not cluster_uri:
        # No cluster configured: fall back to a default client.
        return Client(), None

    dask_function = mlrun.import_function(cluster_uri)
    return dask_function.client, dask_function.metadata.name
def test_Client_twice(loop):
    """Open two default clients on the same loop and compare their ports.

    Asserts that the scheduler ports of the two clients' clusters differ.
    """
    with Client(loop=loop) as first, Client(loop=loop) as second:
        first_port = first.cluster.scheduler.port
        second_port = second.cluster.scheduler.port
        assert first_port != second_port
def test_publish_roundtrip(s, a, b):
    """Publish scattered data from one client and read it back from another.

    Also checks that asking for an unknown dataset name raises ``KeyError``
    with a message mentioning both "not found" and the requested name.
    """
    publisher = Client((s.ip, s.port), start=False)
    yield publisher._start()
    reader = Client((s.ip, s.port), start=False)
    yield reader._start()

    # Scatter from the first client and publish under the name 'data'.
    scattered = yield publisher._scatter([0, 1, 2])
    yield publisher._publish_dataset(data=scattered)

    first_key = scattered[0].key
    assert 'published-data' in s.who_wants[first_key]

    # The second client must see the same number of items and values.
    fetched = yield reader._get_dataset(name='data')
    assert len(fetched) == len(scattered)
    gathered = yield reader._gather(fetched)
    assert gathered == [0, 1, 2]

    with pytest.raises(KeyError) as exc_info:
        fetched = yield reader._get_dataset(name='nonexistent')

    error_text = str(exc_info.value)
    assert "not found" in error_text
    assert "nonexistent" in error_text

    yield publisher._shutdown()
    yield reader._shutdown()
def test_Client_twice(loop):
    """Open two clients with ``silence_logs=False`` and compare their ports.

    Asserts that the scheduler ports of the two clients' clusters differ.
    """
    with Client(loop=loop, silence_logs=False) as first, \
            Client(loop=loop, silence_logs=False) as second:
        first_port = first.cluster.scheduler.port
        second_port = second.cluster.scheduler.port
        assert first_port != second_port
async def test_get_cluster_details(
    clusters_config: None,
    registered_user: Callable[..., Dict[str, Any]],
    async_client: httpx.AsyncClient,
    local_dask_gateway_server: DaskGatewayServer,
    cluster: Callable[..., Cluster],
    dask_gateway_cluster: GatewayCluster,
    dask_gateway_cluster_client: DaskClient,
):
    """Check that the reported cluster details follow the cluster's state.

    The test walks through four phases, querying ``_get_cluster_details``
    in each one:

    1. a freshly registered cluster reports no workers;
    2. after scaling to ``_NUM_WORKERS`` the workers show up;
    3. while a sleeping task is submitted, the first worker reports
       ``executing == 1``;
    4. once the task is done, the first worker reports ``executing == 0``,
       ``in_memory == 1`` and ``cpu == 0``.

    Phases 2-4 poll once per second through ``AsyncRetrying`` until the
    assertions pass or the stop delay expires (``reraise=True`` surfaces the
    last assertion failure).
    """
    user_1 = registered_user()
    # define the cluster in the DB
    some_cluster = cluster(
        user_1,
        endpoint=local_dask_gateway_server.address,
        authentication=SimpleAuthentication(
            username="******",
            password=local_dask_gateway_server.password).dict(by_alias=True),
    )
    # in its present state, the cluster should have no workers
    cluster_out = await _get_cluster_details(async_client, user_1["id"],
                                             some_cluster.id)
    assert not cluster_out.scheduler.workers, "the cluster should not have any worker!"

    # now let's scale the cluster
    _NUM_WORKERS = 1
    await dask_gateway_cluster.scale(_NUM_WORKERS)
    # poll for up to 60 seconds until the expected number of workers appears
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(60),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            assert cluster_out.scheduler.workers, "the cluster has no workers!"
            assert (
                len(cluster_out.scheduler.workers) == _NUM_WORKERS
            ), f"the cluster is missing {_NUM_WORKERS}, currently has {len(cluster_out.scheduler.workers)}"
            print(
                f"cluster now has its {_NUM_WORKERS}, after {json.dumps(attempt.retry_state.retry_object.statistics)}"
            )
    print(f"!!> cluster dashboard link: {dask_gateway_cluster.dashboard_link}")

    # let's start some computation
    _TASK_SLEEP_TIME = 5

    def do_some_work(x: int):
        # Sleep for ``x`` seconds so the task stays in the executing state
        # long enough to be observed; ``time`` is imported inside the task.
        import time
        time.sleep(x)
        return True

    task = dask_gateway_cluster_client.submit(do_some_work, _TASK_SLEEP_TIME)
    # wait for the computation to start, we should see this in the cluster infos
    # (the 10 second limit is only 5 seconds longer than the task itself)
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(10),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.executing == 1
            ), "worker is not executing the task"
            print(
                f"!!> cluster metrics: {next(iter(cluster_out.scheduler.workers.values())).metrics=}"
            )
    # let's wait for the result
    # NOTE(review): `result` is awaited below, so `task.result(...)` presumably
    # returns an awaitable (asynchronous client) — confirm against the fixture.
    result = task.result(timeout=_TASK_SLEEP_TIME + 5)
    assert result
    assert await result == True
    # wait for the computation to effectively stop
    async for attempt in AsyncRetrying(reraise=True,
                                       stop=stop_after_delay(60),
                                       wait=wait_fixed(1)):
        with attempt:
            cluster_out = await _get_cluster_details(async_client,
                                                     user_1["id"],
                                                     some_cluster.id)
            print(
                f"!!> cluster metrics: {next(iter(cluster_out.scheduler.workers.values())).metrics=}"
            )
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.executing == 0
            ), "worker is still executing the task"
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.in_memory == 1
            ), "worker did not keep the result in memory"
            assert (next(iter(
                cluster_out.scheduler.workers.values())).metrics.cpu == 0
            ), "worker did not update the cpu metrics"
    # since the task is completed the worker should have stopped executing
    cluster_out = await _get_cluster_details(async_client, user_1["id"],
                                             some_cluster.id)
    worker_data = next(iter(cluster_out.scheduler.workers.values()))
    assert worker_data.metrics.executing == 0
    # in dask, the task remains in memory until the result is deleted
    assert worker_data.metrics.in_memory == 1
def main():
    """Run the baseline and reform example and report the macro changes.

    Creates a Dask ``Client``, calls ``runner`` once for a baseline
    specification and once for a reform specification (which differs only
    in ``cit_rate``), then builds a table of percentage differences with
    ``ot.macro_table``, creates plots with ``op.plot_all``, prints the
    table and writes it to ``ogcore_example_output.csv``.

    The client is closed in a ``finally`` clause, so it is released even
    when one of the steps above raises.
    """
    # Define parameters to use for multiprocessing
    client = Client()
    try:
        num_workers = min(multiprocessing.cpu_count(), 7)
        print('Number of workers = ', num_workers)
        run_start_time = time.time()

        # Directories to save data
        CUR_DIR = os.path.dirname(os.path.realpath(__file__))
        base_dir = os.path.join(CUR_DIR, BASELINE_DIR)
        reform_dir = os.path.join(CUR_DIR, REFORM_DIR)

        # Set some OG model parameters
        # See default_parameters.json for more description of these
        # parameters
        alpha_T = np.zeros(50)  # Adjusting the path of transfer spending
        alpha_T[0:2] = 0.09
        alpha_T[2:10] = 0.09 + 0.01
        alpha_T[10:40] = 0.09 - 0.01
        alpha_T[40:] = 0.09
        alpha_G = np.zeros(7)  # Adjusting the path of non-transfer spending
        alpha_G[0:3] = 0.05 - 0.01
        alpha_G[3:6] = 0.05 - 0.005
        alpha_G[6:] = 0.05
        # Set start year for baseline and reform.
        START_YEAR = 2021
        # Also adjust the Frisch elasticity, the start year, the
        # effective corporate income tax rate, and the SS debt-to-GDP ratio
        og_spec = {
            'frisch': 0.41,
            'start_year': START_YEAR,
            'cit_rate': [0.21],
            'debt_ratio_ss': 1.0,
            'alpha_T': alpha_T.tolist(),
            'alpha_G': alpha_G.tolist(),
        }

        # ----------------------------------------------------------------
        # Run baseline policy first
        # ----------------------------------------------------------------
        p = Specifications(
            baseline=True,
            num_workers=num_workers,
            baseline_dir=base_dir,
            output_base=base_dir,
        )
        # Update parameters for baseline from default json file
        p.update_specifications(og_spec)

        start_time = time.time()
        runner(p, time_path=True, client=client)
        print('run time = ', time.time() - start_time)

        # ----------------------------------------------------------------
        # Run reform policy
        # ----------------------------------------------------------------
        # update the effective corporate income tax rate
        og_spec.update({'cit_rate': [0.35]})
        p2 = Specifications(
            baseline=False,
            num_workers=num_workers,
            baseline_dir=base_dir,
            output_base=reform_dir,
        )
        # Update parameters for the reform; og_spec now carries the new
        # corporate income tax rate
        p2.update_specifications(og_spec)

        start_time = time.time()
        runner(p2, time_path=True, client=client)
        print('run time = ', time.time() - start_time)

        # ans - the percentage changes in macro aggregates and prices
        # due to policy changes from the baseline to the reform
        base_tpi = safe_read_pickle(
            os.path.join(base_dir, 'TPI', 'TPI_vars.pkl'))
        base_params = safe_read_pickle(
            os.path.join(base_dir, 'model_params.pkl'))
        reform_tpi = safe_read_pickle(
            os.path.join(reform_dir, 'TPI', 'TPI_vars.pkl'))
        reform_params = safe_read_pickle(
            os.path.join(reform_dir, 'model_params.pkl'))
        ans = ot.macro_table(
            base_tpi, base_params,
            reform_tpi=reform_tpi, reform_params=reform_params,
            var_list=['Y', 'C', 'K', 'L', 'r', 'w'],
            output_type='pct_diff', num_years=10,
            start_year=og_spec['start_year'])

        # create plots of output
        op.plot_all(base_dir, reform_dir,
                    os.path.join(CUR_DIR, 'run_example_plots'))

        print("total time was ", (time.time() - run_start_time))
        print('Percentage changes in aggregates:', ans)
        # save percentage change output to csv file
        ans.to_csv('ogcore_example_output.csv')
    finally:
        # Release the Dask client whether or not the run succeeded.
        client.close()