def main(_):
    # Generate scheduler
    data = da.from_array(np.array(Image.open(r'dota2.jpg')), chunks=(600, 400, 3))
    client = Client(args.address)
    client.upload_file('calcov.py')
    temp3 = np.zeros((3, 3))
    temp3[0, :] = [0.062467, 0.125000, 0.062467]
    temp3[1, :] = [0.125000, 0.250131, 0.125000]
    temp3[2, :] = [0.062467, 0.125000, 0.062467]
    D = []
    B = []
    for i in range(args.queue):
        D.append(np.array(data + i * 10))
        B.append(temp3 + 0.05)
    future = client.map(calcov.calCov, B, D)
    result = [[np.array(_[0]), str(_[1]), str(_[2])] for _ in client.gather(future)]
    shutil.rmtree(r'./data', ignore_errors=True)
    os.mkdir(r'./data')
    for i, (data, time, address) in enumerate(result):
        # str.strip removes a set of characters, not a prefix, so use
        # replace() to drop the leading 'tcp://'
        name = address.replace('tcp://', '')
        new_im = Image.fromarray(data)
        new_im.save('./data/result_%s_%s_(%s).jpg' % (i, time, name))
def test_run_multiple_computational_sidecar_dask(
    event_loop: asyncio.AbstractEventLoop,
    dask_client: Client,
    ubuntu_task: ServiceExampleParam,
    mocker: MockerFixture,
):
    NUMBER_OF_TASKS = 50
    mocker.patch(
        "simcore_service_dask_sidecar.computational_sidecar.core.get_integration_version",
        autospec=True,
        return_value=ubuntu_task.integration_version,
    )
    futures = [
        dask_client.submit(
            run_computational_sidecar,
            ubuntu_task.docker_basic_auth,
            ubuntu_task.service_key,
            ubuntu_task.service_version,
            ubuntu_task.input_data,
            ubuntu_task.output_data_keys,
            ubuntu_task.log_file_url,
            ubuntu_task.command,
            resources={},
        )
        for _ in range(NUMBER_OF_TASKS)
    ]
    results = dask_client.gather(futures)

    # check that each task produced the expected data, no less and no more
    for output_data in results:
        for k, v in ubuntu_task.expected_output_data.items():
            assert k in output_data
            assert output_data[k] == v
def main():
    # define a parallel mcmc wrapper
    def parallel_mcmc(_):
        return mcmc(initial_parameters=epa_0,
                    proposer=normal_prop,
                    param2res=param2res,
                    costfunction=costfunction,
                    nsimu=5000)

    # check job resources to initialize dask workers
    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()

    # run 10 chains
    [[c_form1, j_form1], [c_form2, j_form2], [c_form3, j_form3],
     [c_form4, j_form4], [c_form5, j_form5], [c_form6, j_form6],
     [c_form7, j_form7], [c_form8, j_form8], [c_form9, j_form9],
     [c_form10, j_form10]] = client.gather(client.map(parallel_mcmc, range(0, 10)))

    # print chain 5 output as a test
    formal_c_path = dataPath.joinpath('chain5_pmcmc_c.csv')
    formal_j_path = dataPath.joinpath('chain5_pmcmc_j.csv')
    pd.DataFrame(c_form5).to_csv(formal_c_path, sep=',')
    pd.DataFrame(j_form5).to_csv(formal_j_path, sep=',')
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        Parameters
        ----------
        n_workers : int
            How many workers should the local dask cluster have?
        """
        super().__init__()

        import tempfile

        from distributed import Client, LocalCluster

        # attribute .local_dir_ is the path where the local dask workers
        # store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)
        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a local machine.

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker.
        kwargs : Dict
            Parameters for the map function.

        Returns
        -------
        List
            The result of the calculation as a list - each item should be
            the result of the application of func to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0, Client.map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the local Dask Scheduler.
        """
        self.client.close()
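# --- Illustrative usage (not from the original source) ---
# A minimal sketch of driving the LocalDaskDistributor above; `double` and
# the chunk list are hypothetical placeholders.
def double(x):
    return 2 * x


distributor = LocalDaskDistributor(n_workers=2)
try:
    print(distributor.distribute(double, [1, 2, 3], kwargs={}))  # -> [2, 4, 6]
finally:
    distributor.close()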
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread
    over a cluster.
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute
        the calculation of the features.

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """
        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time,
        meaning when you start the extraction) to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a cluster.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is
            again a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should
            be the result of the application of func to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0, Client.map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler.
        """
        self.client.close()
def dask_evaluate(outputs):
    utils.port_increment += 2
    scheduler_port = 8786 + utils.port_increment
    diagnostics_port = 8787 + utils.port_increment
    cluster = LocalCluster(n_workers=1,
                           threads_per_worker=10,
                           nanny=False,
                           scheduler_port=scheduler_port,
                           diagnostics_port=diagnostics_port)
    client = Client(cluster)
    futures = client.persist(outputs)
    return client.gather(futures)
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread
    over a cluster.
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute
        the calculation of the features.

        :param address: the ip address and port number of the Dask Scheduler
        :type address: str
        """
        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution time,
        meaning when you start the extraction) to find the optimal chunk_size.

        :param data_length: A length which defines how many calculations there need to be.
        :type data_length: int
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a cluster.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is
            again a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should
            be the result of the application of func to a single element.
        """
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the Dask Scheduler.
        """
        self.client.close()
def main():
    # get command line arguments controlling the launch
    threads = 1
    workers = 8
    for x in sys.argv[1:]:
        if x.find("threads") > -1:
            z = x.split("=")
            threads = int(z[1])
        if x.find("workers") > -1:
            z = x.split("=")
            workers = int(z[1])

    # launch with either threads and/or workers specified (0 = default)
    if threads == 0 and workers != 0:
        print("launching %d workers, default threads" % (workers))
        cluster = LocalCluster(n_workers=workers)
    if threads != 0 and workers == 0:
        print("launching %d threads, default workers" % (threads))
        cluster = LocalCluster(threads_per_worker=threads)
    if threads != 0 and workers != 0:
        print("launching %d workers with %d threads" % (workers, threads))
        cluster = LocalCluster(n_workers=workers, threads_per_worker=threads)
    print(cluster)
    client = Client(cluster)
    print(client)

    # do serial
    # NOTE: it is possible to launch an asynchronous client
    # but here we just do serial synchronous. See:
    # https://distributed.dask.org/en/latest/asynchronous.html
    result = []
    print(" pid Start T")
    for i in range(0, 5):
        j = 2
        result.append(client.submit(test, i, j).result())
    print(result)
    print(Counter(result))

    # do parallel
    n = 15
    np.random.seed(1234)
    x = np.random.random(n) * 20
    # set to uniform nonzero to get uniform run times for each task
    x = np.ones(n) * 10
    print(x)
    print(" pid Start T")
    L = client.map(test, range(n), x)
    mylist = client.gather(L)
    pids = []
    for m in mylist:
        x = m.split()[0]
        pids.append(x)
        print(m)
    pids = sorted(set(pids))
    print(len(pids), pids)
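# `test` is not defined in the snippet above; a hedged stand-in consistent
# with how its return value is used (first whitespace-separated token is
# parsed as a pid) - the exact fields are an assumption.
import os
import time


def test(i, x):
    start = time.time()
    time.sleep(x / 10.0)  # simulated work scaled by x
    return '%d %.3f %.3f' % (os.getpid(), start, time.time() - start)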
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """
        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers
        # store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)
        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a local machine.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is
            again a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should
            be the result of the application of func to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0, Client.map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler.
        """
        self.client.close()
def run(spec: dict, scheduler: str):
    class CompleteDaskJob:
        def __init__(self, message: str = ""):
            self.message = message

    class InvalidDaskJob:
        def __init__(self, message: str = ""):
            self.message = message

    class DaskQueryJob:
        def __init__(self, job_spec: dict):
            self.query_string = job_spec.get("query_string")
            self.database = job_spec.get("database")
            self.output_path = job_spec.get("output_path")

        def run_job(self) -> Union[CompleteDaskJob, InvalidDaskJob]:
            # df: DataFrame = dd.read_sql_table(self.query_string)
            if self.output_path:
                # df.to_parquet(self.output_path)
                return CompleteDaskJob(
                    "Job to query via Dask successfully queued to scheduler")
            else:
                return InvalidDaskJob(
                    "Output path required for Dask implementation of table query"
                )

    dask_job = DaskQueryJob(spec)
    mode = "async"
    if scheduler == "local":
        client = Client()
        dask_job.run_job()
    else:
        dask.config.set({'distributed.scheduler.allowed-failures': 50})
        client = Client(scheduler)
        future = client.submit(dask_job.run_job)
        if mode == "sync":
            client.gather(future)
        else:
            fire_and_forget(future)
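# Hypothetical invocation of run() above; the spec keys mirror exactly what
# DaskQueryJob reads, but the values are placeholders.
spec = {
    "query_string": "SELECT * FROM events",  # placeholder query
    "database": "warehouse",                 # placeholder database
    "output_path": "/tmp/events.parquet",
}
run(spec, scheduler="local")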
class LocalDaskDistributor(DistributorBaseClass):
    """
    Distributor using a local dask cluster and inproc communication.
    """

    def __init__(self, n_workers):
        """
        Initiates a LocalDaskDistributor instance.

        :param n_workers: How many workers should the local dask cluster have?
        :type n_workers: int
        """
        from distributed import LocalCluster, Client
        import tempfile

        # attribute .local_dir_ is the path where the local dask workers
        # store temporary files
        self.local_dir_ = tempfile.mkdtemp()
        cluster = LocalCluster(n_workers=n_workers,
                               processes=False,
                               local_dir=self.local_dir_)
        self.client = Client(cluster)
        self.n_workers = n_workers

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a local machine.

        :param func: the function to send to each worker.
        :type func: callable
        :param partitioned_chunks: The list of data chunks - each element is
            again a list of chunks - and should be processed by one worker.
        :type partitioned_chunks: iterable
        :param kwargs: parameters for the map function
        :type kwargs: dict of string to parameter

        :return: The result of the calculation as a list - each item should
            be the result of the application of func to a single element.
        """
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return [item for sublist in result for item in sublist]

    def close(self):
        """
        Closes the connection to the local Dask Scheduler.
        """
        self.client.close()
def main(): """ Use the Dask distributed client to run a function in parallel. """ client = Client(n_workers=8) numbers = [3, 4, 5, 8, 12, 18, 25] futures = [] for n in numbers: a = client.submit(adder, n) futures.append(a) results = client.gather(futures) print(results) client.close()
def test_use_with_dask():
    try:
        import dask
        import dask.distributed
        from distributed import Client
    except ImportError:
        import warnings
        warnings.warn("Dask and/or Distributed are not installed")
        return
    with open(f"{CURRENT_DIR}/test-ogusa-remote.json") as f:
        remote_outputs = json.loads(f.read())
    outputs = cs_storage.read(remote_outputs["outputs"])
    c = Client()
    futures = c.map(cs_storage.screenshot, outputs["renderable"])
    results = c.gather(futures)
    for result in results:
        assert isinstance(result, bytes)
def main():
    from argparse import ArgumentParser
    parser = ArgumentParser()
    # parser.add_argument('min_num', type=int)
    # parser.add_argument('max_num', type=int)
    args = parser.parse_args()
    num_threads = int(
        environ.get('SLURM_CPUS_PER_TASK', environ.get('OMP_NUM_THREADS', 1)))
    initialize(interface='ib0', nthreads=num_threads)
    client = Client()
    min_num = 10
    max_num = 100
    start_time = datetime.now()
    num_primes = sum(
        client.gather(client.map(slow_is_prime, range(min_num, max_num + 1))))
    end_time = datetime.now()
    print(f'{num_primes} primes between {min_num} and {max_num} '
          f'[{end_time - start_time}]')
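# `slow_is_prime` is not shown; a deliberately naive trial-division sketch
# that fits how it is used above (truthy results are summed per number).
def slow_is_prime(n):
    if n < 2:
        return False
    return all(n % d != 0 for d in range(2, n))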
class DaskClient(Thread):
    def __init__(self, clientUrl, clientId, daqObjectGenerator, resultQ):
        Thread.__init__(self, name='DaskClient-%s' % clientId)
        self.client = Client(clientUrl)
        self.clientId = clientId
        self.daqObjectGenerator = daqObjectGenerator
        self.resultQ = resultQ
        self.idQ = Queue()
        self.remoteIdQ = self.client.scatter(self.idQ)
        self.generatorQ = self.client.map(self.daqObjectGenerator.generate,
                                          self.remoteIdQ)
        self.pvQ = self.client.gather(self.generatorQ)
        self.nGenerated = 0
        self.event = Event()

    def putTask(self, objectId):
        # t0 = time.time()
        self.idQ.put(objectId)
        # t1 = time.time()
        # dt = t1 - t0
        # print('PUSH TASK: %s' % dt)
        # self.event.set()

    def getPv(self, timeout=None):
        # t0 = time.time()
        pv = self.pvQ.get(timeout=timeout)
        # t1 = time.time()
        # dt = t1 - t0
        # print('GET PV: %s' % dt)
        return pv

    def run(self):
        print('STARTING THREAD, CLIENT ID: %s' % self.clientId)
        while True:
            pv = self.pvQ.get(timeout=None)
            self.nGenerated += 1
            # print('GOT PV , CLIENT ID %s: %s' % (self.clientId, pv['ArrayId']))
            # print('CLIENT ID %s: N GENERATED=%s' % (self.clientId, self.nGenerated))
            self.resultQ.put((pv, self.clientId))
def dask_compute_grid(ddclient=None, func=None, **kwargs):
    temp_cluster = False
    completed = []
    if ddclient is None:
        print('creating local dask distributed cluster...')
        ddclient = Client()
        temp_cluster = True
    # print('cluster dashboard available at: ' + get_ddclient_dashboard_address(ddclient))
    try:
        print('cluster dashboard available at: '
              + dask_get_ddclient_dashboard_address(ddclient))
        from IPython.display import display
        display(ddclient)
        tfunc = make_return_tuple(func)
        kwargs_list = ([(k, i) for i in v] for k, v in kwargs.items())
        # tuple of cartesian products of
        # {{(arg_name, arg_val) | arg_val in arg_vals} | arg_name in arg_names}
        cart_prod_tup = product(*kwargs_list)
        cart_prod_dicts = [dict(i) for i in cart_prod_tup]
        print('submitting {} jobs to cluster...'.format(len(cart_prod_dicts)))
        futures = [ddclient.submit(tfunc, **kwargs) for kwargs in cart_prod_dicts]
        print('computing jobs...')
        completed = ddclient.gather(futures)
        print('computation done')
    finally:
        if temp_cluster:
            print('shutting down cluster...')
            ddclient.close()
            print('done')
    return completed
def test_distributed_handler_distributed(values, expected_values):
    cluster = LocalCluster(processes=False)
    with DistributedHandler(cluster.scheduler_address) as handler:
        futures = handler.client.map(lambda x: x + 1, values)
        handler_map_results = handler.gather(futures)
    with DistributedHandler(cluster.scheduler_address) as handler:
        handler_batched_results = handler.batched_map(lambda x: x + 1, values)
    client = Client(cluster)
    futures = client.map(lambda x: x + 1, values)
    distributed_results = client.gather(futures)
    handler_map_results = set(handler_map_results)
    handler_batched_results = set(handler_batched_results)
    distributed_results = set(distributed_results)
    assert (handler_map_results == handler_batched_results
            and handler_map_results == distributed_results)
    cluster.close()
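# DistributedHandler comes from the project under test and is not shown; a
# minimal stand-in consistent with how the test above uses it (an assumption,
# not the real implementation).
from distributed import Client


class DistributedHandler:
    def __init__(self, address=None, batch_size=10):
        self.address = address
        self.batch_size = batch_size

    def __enter__(self):
        self.client = Client(self.address)
        return self

    def __exit__(self, *exc):
        self.client.close()

    def gather(self, futures):
        return self.client.gather(futures)

    def batched_map(self, func, iterable):
        # submit in fixed-size batches, then flatten the gathered results
        items = list(iterable)
        results = []
        for i in range(0, len(items), self.batch_size):
            futures = self.client.map(func, items[i:i + self.batch_size])
            results.extend(self.client.gather(futures))
        return results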
class ClusterDaskDistributor(DistributorBaseClass):
    """
    Distributor using a dask cluster, meaning that the calculation is spread
    over a cluster.
    """

    def __init__(self, address):
        """
        Sets up a distributor that connects to a Dask Scheduler to distribute
        the calculation of the features.

        Parameters
        ----------
        address : str
            The ip address and port number of the Dask Scheduler
        """
        super().__init__()

        from distributed import Client

        self.client = Client(address=address)

    def calculate_best_chunk_size(self, data_length):
        """
        Uses the number of dask workers in the cluster (during execution
        time, meaning when you start the extraction) to find the optimal
        chunk_size.

        Parameters
        ----------
        data_length : int
            A length which defines how many calculations there need to be.
        """
        n_workers = len(self.client.scheduler_info()["workers"])
        chunk_size, extra = divmod(data_length, n_workers * 5)
        if extra:
            chunk_size += 1
        return chunk_size

    def distribute(self, func, partitioned_chunks, kwargs):
        """
        Calculates the features in a parallel fashion by distributing the
        map command to the dask workers on a cluster.

        Parameters
        ----------
        func : Callable
            Function to send to each worker.
        partitioned_chunks : List
            List of data chunks, each chunk is processed by one worker.
        kwargs : Dict
            Parameters for the map function.

        Returns
        -------
        List
            The result of the calculation as a list - each item should be
            the result of the application of func to a single element.
        """
        if isinstance(partitioned_chunks, Iterable):
            # since dask 2.0.0, Client.map no longer accepts iterables
            partitioned_chunks = list(partitioned_chunks)
        result = self.client.gather(
            self.client.map(partial(func, **kwargs), partitioned_chunks))
        return result

    def close(self):
        """
        Closes the connection to the Dask Scheduler.
        """
        self.client.close()
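# Hedged usage sketch for the class above: the scheduler address is a
# placeholder and `process_chunk` is hypothetical. Chunks are sized from the
# live worker count; this variant's distribute() returns one result per chunk.
def process_chunk(chunk):
    return [x + 1 for x in chunk]


dist = ClusterDaskDistributor("tcp://127.0.0.1:8786")
data = list(range(100))
size = dist.calculate_best_chunk_size(len(data))
chunks = [data[i:i + size] for i in range(0, len(data), size)]
print(dist.distribute(process_chunk, chunks, kwargs={}))
dist.close()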
    a.append(url)
    return a


def get_url(r):
    url = 'https://s3.amazonaws.com/cloudydap/bytestream/' + r['md5']
    return url


def compute(url):
    # print(url)
    response = urllib.request.urlopen(url)
    buf = response.read()
    # print(len(buf))
    dec = zlib.decompressobj(32 + zlib.MAX_WBITS)
    unzipped = dec.decompress(buf)
    # print(len(unzipped))
    # Pick a specific point: four bytes that form one little-endian float
    # (byte slices keep this working under Python 3, where indexing bytes
    # yields ints)
    a = (unzipped[1:2] + unzipped[13104:13105]
         + unzipped[26208:26209] + unzipped[39312:39313])
    # print(struct.unpack('<f', a))
    return struct.unpack('<f', a)


# a = search(r"PRECCU AND chunk_position:\[0,0,0\] AND filename:MERRA2_100*")
a = search(r"PRECCU AND chunk_position:\[0,91,288\] AND filename:MERRA2_100*")
# a = search(r"PRECCU AND chunk_position:\[0,0,0\] AND filename:*tavgM_2d_int_*")
# search(r"PRECCU AND chunk_position:\[0,91,288\] AND filename: MERRA2_400.tavgM_2d_int_Nx.201507.nc4")
c = Client('localhost:8786')
m = c.map(compute, a)
x = c.gather(m)
print(x)
arg_parser.add_argument('--n', type=int, default=100,
                        help='number of terms in sum')
arg_parser.add_argument('--verbose', action='store_true',
                        help='give verbose output')
options = arg_parser.parse_args()
client = Client('{0}:{1}'.format(options.scheduler, options.scheduler_port))
if options.verbose:
    print('Client: {0}'.format(str(client)), flush=True)
futures = client.map(square, range(options.n))
total = client.submit(sum, futures)
expected_total = (options.n - 1) * options.n * (2 * options.n - 1) // 6
print('sum_i=0..{0:d} i^2 = {1:d}, expected {2:d}'.format(
    options.n - 1, total.result(), expected_total))
futures = client.map(get_hostname, range(options.n))
process_locations = client.gather(futures)
if options.verbose:
    print('task placement:')
    print('\t' + '\n\t'.join(process_locations))
count = dict()
for process_location in process_locations:
    _, _, hostname = process_location.split()
    if hostname not in count:
        count[hostname] = 0
    count[hostname] += 1
for hostname, nr_tasks in count.items():
    print('{0:d} tasks on {1}'.format(nr_tasks, hostname))
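# `square` and `get_hostname` are defined elsewhere; minimal stand-ins that
# fit how the results are parsed above (three whitespace-separated fields,
# hostname last). The same sketch fits the near-identical variant further
# below.
import os
import socket


def square(n):
    return n * n


def get_hostname(n):
    return '{0} {1} {2}'.format(n, os.getpid(), socket.gethostname())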
with np.load("../dask_fft_data_s0000.npz") as df:
    num_channels, num_fft = df["fft_data"].shape
    print(num_channels, num_fft)
    fft_data = da.from_array(df["fft_data"], chunks=(1, num_fft))

# keep the persisted collection (persist returns a new object)
fft_data = dask_client.persist(fft_data)

# Calculate the crosspower using the array interface
res1 = (fft_data[:2, :] * fft_data[-2:, :].conj()).mean(axis=1)
print("type res1 = ", type(res1))
res2 = da.arctan2(res1.real, res1.imag).real
print("type res2 = ", type(res2))
print("result res2 = ", res2.compute())


# Calculate the crosspower using the distributed interface
def cross_phase(ft_data, ch1, ch2):
    _tmp1 = (ft_data[ch1, :] * ft_data[ch2, :].conj()).mean().compute()
    print("** crosspower: type(tmp1) =", type(_tmp1))
    _tmp2 = np.arctan2(_tmp1.real, _tmp1.imag).real
    # _tmp2 = _tmp1.real + _tmp1.imag
    return _tmp2


res_d = dask_client.submit(cross_phase, fft_data, 1, 6)
print("type resd = ", type(res_d))
print("results resd = ", dask_client.gather(res_d))

# End of file test_crossphase.py
# Set up scheduler
s = Scheduler(loop=loop)
s.start()

# Set up Workers
w = Worker('comet-14-02.sdsc.edu', loop=loop)
w.start(0)

# Set up client
client = Client('comet-14-02.sdsc.edu:8786')


def chunks(l, n):
    # Yield successive n-sized chunks from l
    for i in range(0, len(l), n):
        yield l[i:i + n]


# pprint.pprint(list(chunks(range(0, 255), 64)))
output = []
y = list(chunks(range(0, 255), 64))
# print(y[0])
for ix in y:
    # sum each chunk on a worker (submit passes the whole chunk to sum)
    a = client.submit(sum, ix)
    output.append(a)
total = client.submit(sum, output)
# total.visualize()  # a Future carries no task graph to visualize
print(total.result())
client.gather(total)
def parallel_calculate_chunks(chunks, features, approximate, training_window,
                              verbose, save_progress, entityset, n_jobs,
                              no_unapproximated_aggs, cutoff_df_time_var,
                              target_time, pass_columns, dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if the cluster has a bokeh port, notify the user of an
            # unexpected port number
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)

        # scatter the entityset
        # denote futures with a leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s"
                  % (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
from distributed import Client
import time

client = Client("192.168.0.106:8786")
client.restart()

from funcs import create_dirs, get_dirs, add_flag

future = client.map(create_dirs, range(100))
flags = client.submit(get_dirs, future)
client.gather(flags)
print(flags)
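# The `funcs` module is project-specific and not shown; a hedged sketch of
# the helpers the script imports (paths and behavior are assumptions;
# `add_flag` is imported above but unused).
import os


def create_dirs(i):
    path = './data/dir_%03d' % i  # hypothetical layout
    os.makedirs(path, exist_ok=True)
    return path


def get_dirs(paths):
    # receives the gathered results of the create_dirs futures
    return sorted(paths)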
class Distributed(object):
    '''
    Distributed objects represent SUMMA configurations where there are
    multiple GRU/HRU which are expected to be run in parallel. Currently
    only supports GRU based parallelization.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    manager:
        FileManager object
    num_workers:
        Number of parallel workers to use
    chunk_args:
        List of dictionaries containing ``startGRU`` and ``countGRU`` values
    simulations:
        Dictionary of run names and Simulation objects
    '''

    def __init__(self, executable: str, filemanager: str,
                 num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 chunk_size: int = None, num_chunks: int = None,
                 scheduler: str = None, client: Client = None):
        """
        Initialize a new distributed object

        Parameters
        ----------
        executable:
            Path to the SUMMA executable
        filemanager:
            Path to the file manager
        num_workers:
            Number of workers to use for parallel runs
        threads_per_worker:
            Number of threads each worker has
        chunk_size:
            Number of GRU per job (cannot be used with num_chunks)
        num_chunks:
            How many jobs to split the run into (cannot be used with chunk_size)
        scheduler:
            Not used currently
        """
        self._status = 'Initialized'
        self.executable = executable
        self.manager_path = Path(os.path.abspath(
            os.path.realpath(filemanager)))
        self.manager = FileManager(self.manager_path.parent,
                                   self.manager_path.name)
        self.simulations: Dict[str, Simulation] = {}
        self.submissions: List = []
        self.num_workers: int = num_workers
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self.chunk_args = self._generate_args(chunk_size, num_chunks)
        self._generate_simulation_objects()

    def _generate_simulation_objects(self):
        """
        Create each of the required simulation objects
        """
        for argdict in self.chunk_args:
            start = argdict['startGRU']
            stop = argdict['startGRU'] + argdict['countGRU'] - 1
            name = f"g{start}-{stop}"
            self.simulations[name] = Simulation(self.executable,
                                                self.manager_path, False)

    def _generate_args(self, chunk_size: int = None, num_chunks: int = None):
        '''
        Generate the arguments that will be used to start multiple runs
        from the base ``self.simulation``
        '''
        assert not (chunk_size and num_chunks), \
            "Only specify at most one of `chunk_size` or `num_chunks`!"
        start, stop = 0, 0
        sim_size = len(self.manager.local_attributes['gru'])
        if not (chunk_size or num_chunks):
            chunk_size = 12
        if chunk_size:
            sim_truncated = (chunk_size - 1) * (sim_size // (chunk_size - 1))
            starts = np.arange(1, sim_truncated + 1, chunk_size).astype(int)
            stops = np.append(starts[1:], sim_size + 1)
            chunks = np.vstack([starts, stops]).T
        elif num_chunks:
            chunk_size = np.ceil(sim_size / num_chunks).astype(int)
            starts = np.arange(1, sim_size, chunk_size)
            stops = np.append(starts[1:], sim_size + 1)
            chunks = np.vstack([starts, stops]).T
        return [{'startGRU': start, 'countGRU': stop - start}
                for start, stop in chunks]

    def start(self, run_option: str, prerun_cmds: List = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for idx, (name, sim) in enumerate(self.simulations.items()):
            kwargs = self.chunk_args[idx]
            self.submissions.append(
                self._client.submit(_submit, sim, name, run_option,
                                    prerun_cmds, kwargs))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def merge_output(self):
        pass
arg_parser.add_argument('--scheduler', help='scheduler host')
arg_parser.add_argument('--scheduler_port', default='8786',
                        help='scheduler port to use')
arg_parser.add_argument('--n', type=int, default=100,
                        help='number of terms in sum')
arg_parser.add_argument('--verbose', action='store_true',
                        help='give verbose output')
options = arg_parser.parse_args()
client = Client('{0}:{1}'.format(options.scheduler, options.scheduler_port))
if options.verbose:
    print('Client: {0}'.format(str(client)), flush=True)
futures = client.map(square, range(options.n))
total = client.submit(sum, futures)
expected_total = (options.n - 1) * options.n * (2 * options.n - 1) // 6
print('sum_i=0..{0:d} i^2 = {1:d}, expected {2:d}'.format(
    options.n - 1, total.result(), expected_total))
futures = client.map(get_hostname, range(options.n))
process_locations = client.gather(futures)
if options.verbose:
    print('task placement:')
    print('\t' + '\n\t'.join(process_locations))
count = dict()
for process_location in process_locations:
    _, _, hostname = process_location.split()
    if hostname not in count:
        count[hostname] = 0
    count[hostname] += 1
for hostname, nr_tasks in count.items():
    print('{0:d} tasks on {1}'.format(nr_tasks, hostname))
class Ensemble(object):
    '''
    Ensembles represent multiple SUMMA configurations based on changing the
    decisions or parameters of a given run.

    Attributes
    ----------
    executable:
        Path to the SUMMA executable
    filemanager:
        (optional) Path to the file manager
    configuration:
        Dictionary of runs, along with settings
    num_workers:
        Number of parallel workers to use
    simulations:
        Dictionary of run names and Simulation objects
    '''

    def __init__(self, executable: str, configuration: dict,
                 filemanager: str = None, num_workers: int = 1,
                 threads_per_worker: int = OMP_NUM_THREADS,
                 scheduler: str = None, client: Client = None):
        """
        Create a new Ensemble object. The API mirrors that of the
        Simulation object.
        """
        self._status = 'Initialized'
        self.executable: str = executable
        self.filemanager: str = filemanager
        self.configuration: dict = configuration
        self.num_workers: int = num_workers
        self.simulations: dict = {}
        self.submissions: list = []
        # Try to get a client, and if none exists then start a new one
        if client:
            self._client = client
            workers = len(self._client.get_worker_logs())
            if workers <= self.num_workers:
                self._client.cluster.scale(workers)
        else:
            try:
                self._client = get_client()
                # Start more workers if necessary:
                workers = len(self._client.get_worker_logs())
                if workers <= self.num_workers:
                    self._client.cluster.scale(workers)
            except ValueError:
                self._client = Client(n_workers=self.num_workers,
                                      threads_per_worker=threads_per_worker)
        self._generate_simulation_objects()

    def _generate_simulation_objects(self):
        """
        Create a mapping of configurations to the simulation objects.
        """
        if self.filemanager:
            for name, config in self.configuration.items():
                self.simulations[name] = Simulation(self.executable,
                                                    self.filemanager, False)
        else:
            for name, config in self.configuration.items():
                assert config['file_manager'] is not None, \
                    "No filemanager found in configuration or Ensemble!"
                self.simulations[name] = Simulation(self.executable,
                                                    config['file_manager'],
                                                    False)

    def _generate_coords(self):
        """
        Generate the coordinates that can be used to merge the output of
        the ensemble runs into a single dataset.
        """
        decision_dims = ChainDict()
        manager_dims = ChainDict()
        parameter_dims = ChainDict()
        for name, conf in self.configuration.items():
            for k, v in conf.get('decisions', {}).items():
                decision_dims[k] = v
            for k, v in conf.get('file_manager', {}).items():
                manager_dims[k] = v
            # for k, v in conf.get('parameters', {}).items():
            #     parameter_dims[k] = v
            for k, v in conf.get('trial_parameters', {}).items():
                parameter_dims[k] = v
        return {'decisions': decision_dims,
                'managers': manager_dims,
                'parameters': parameter_dims}

    def merge_output(self):
        """
        Open and merge all of the output datasets from the ensemble run
        into a single dataset.
        """
        nc = self._generate_coords()
        new_coords = (list(nc.get('decisions', {}))
                      + list(nc.get('parameters', {})))
        decision_tuples = [tuple(n.split('++')[1:-1])
                           for n in self.configuration.keys()]
        for i, t in enumerate(decision_tuples):
            decision_tuples[i] = tuple(
                (float(l.split('=')[-1]) if '=' in l else l for l in t))
        decision_names = ['++'.join(tuple(n.split('++')[1:-1]))
                          for n in self.configuration.keys()]
        if sum([len(dt) for dt in decision_tuples]) == 0:
            raise NameError("Simulations in the ensemble do not share all"
                            " common decisions! Please use `open_output`"
                            " to retrieve the output of this Ensemble")
        for i, t in enumerate(decision_names):
            decision_names[i] = '++'.join(l.split('=')[0] for l in t)
        new_idx = pd.MultiIndex.from_tuples(decision_tuples,
                                            names=new_coords)
        out_file_paths = [s.get_output_files()
                          for s in self.simulations.values()]
        out_file_paths = [fi for sublist in out_file_paths for fi in sublist]
        full = xr.open_mfdataset(out_file_paths, concat_dim='run_number',
                                 combine='nested')
        merged = full.assign_coords(run_number=decision_names)
        merged['run_number'] = new_idx
        merged = merged.unstack('run_number')
        return merged

    def start(self, run_option: str, prerun_cmds: list = None):
        """
        Start running the ensemble members.

        Parameters
        ----------
        run_option:
            The run type. Should be either 'local' or 'docker'
        prerun_cmds:
            A list of preprocessing commands to run
        """
        for n, s in self.simulations.items():
            # Sleep calls are to ensure writeout happens
            config = self.configuration[n]
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option,
                                    prerun_cmds, config))

    def run(self, run_option: str, prerun_cmds=None, monitor: bool = True):
        """
        Run the ensemble

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        self.start(run_option, prerun_cmds)
        if monitor:
            return self.monitor()
        else:
            return True

    def map(self, fun, args, include_sims=True, monitor=True):
        for n, s in self.simulations.items():
            config = self.configuration[n]
            if include_sims:
                all_args = (s, n, *args, {'config': config})
            else:
                all_args = (*args, {'config': config})
            self.submissions.append(self._client.submit(fun, *all_args))
        if monitor:
            return self.monitor()
        else:
            return True

    def monitor(self):
        """
        Halt computation until submitted simulations are complete
        """
        simulations = self._client.gather(self.submissions)
        for s in simulations:
            self.simulations[s.run_suffix] = s

    def summary(self):
        """
        Show the user information about ensemble status
        """
        success, error, other = [], [], []
        for n, s in self.simulations.items():
            if s.status == 'Success':
                success.append(n)
            elif s.status == 'Error':
                error.append(n)
            else:
                other.append(n)
        return {'success': success, 'error': error, 'other': other}

    def rerun_failed(self, run_option: str, prerun_cmds=None,
                     monitor: bool = True):
        """
        Try to re-run failed simulations.

        Parameters
        ----------
        run_option:
            Where to run the simulation. Can be ``local`` or ``docker``
        prerun_cmds:
            A list of shell commands to run before running SUMMA
        monitor:
            Whether to halt operation until runs are complete
        """
        run_summary = self.summary()
        self.submissions = []
        for n in run_summary['error']:
            config = self.configuration[n]
            s = self.simulations[n]
            s.reset()
            self.submissions.append(
                self._client.submit(_submit, s, n, run_option,
                                    prerun_cmds, config))
        if monitor:
            return self.monitor()
        else:
            return True
def preprocessing_script():
    """
    This script will process all the hybridization folders combined in a
    processing folder. The input parameters are passed using argparse.

    Parameters:
    -----------
    scheduler: string
        tcp address of the dask.distributed scheduler
        (ex. tcp://192.168.0.4:7003). default = False. If False the process
        will run on the local computer using nCPUs - 1.
    path: string
        Path to the processing directory.
    """

    # Inputs of the function
    parser = argparse.ArgumentParser(description='Preprocessing script')
    parser.add_argument('-scheduler', default=False,
                        help='dask scheduler address ex. tcp://192.168.0.4:7003')
    parser.add_argument('-path', help='processing directory')
    args = parser.parse_args()

    # Directory to process
    processing_directory = args.path
    # Dask scheduler address
    scheduler_address = args.scheduler

    if scheduler_address:
        # Start dask client on server or cluster
        client = Client(scheduler_address)
    else:
        # Start dask client on the local machine. It will use all the
        # available cores - 1.
        ncores = multiprocessing.cpu_count() - 1
        cluster = LocalCluster(n_workers=ncores)
        client = Client(cluster)

    # Subdirectories of the processing_directory that need to be skipped for
    # the analysis
    blocked_directories = ['_logs']

    # Starting logger
    utils.init_file_logger(processing_directory)
    logger = logging.getLogger()

    # Determine the operating system running the code
    os_windows, add_slash = utils.determine_os()

    # Check the trailing slash in the processing directory
    processing_directory = utils.check_trailing_slash(processing_directory,
                                                      os_windows)

    # Get a list of the hybridizations to process
    processing_hyb_list = next(os.walk(processing_directory))[1]

    # Remove the blocked directories from the directories to process
    processing_hyb_list = [el for el in processing_hyb_list
                           if el not in blocked_directories]

    for processing_hyb in processing_hyb_list:
        # Determine the hyb number from the name
        hybridization_number = processing_hyb.split('_hyb')[-1]
        hybridization = 'Hybridization' + hybridization_number
        hyb_dir = processing_directory + processing_hyb + add_slash

        # Parse the experimental metadata file (serial)
        experiment_infos, image_properties, hybridizations_infos, \
            converted_positions, microscope_parameters = \
            utils.experimental_metadata_parser(hyb_dir)

        # Parse the configuration file
        flt_rawcnt_config = utils.filtering_raw_counting_config_parser(hyb_dir)

        # ----------------- .nd2 FILE CONVERSION ------------------------------

        # Create the temporary subdirectory tree (serial)
        tmp_dir_path, tmp_gene_dirs = utils.create_subdirectory_tree(
            hyb_dir, hybridization, hybridizations_infos, processing_hyb,
            suffix='tmp', add_slash=add_slash)

        # Get the list of the nd2 files to process inside the directory
        files_list = glob.glob(hyb_dir + processing_hyb + '_raw_data'
                               + add_slash + '*.nd2')

        # Get the list of genes that are analyzed in the current hybridization
        gene_list = list(hybridizations_infos[hybridization].keys())

        # Organize the files to process in lists whose order matches the
        # gene_list for parallel processing
        organized_files_list = [f for gene in gene_list
                                for f in files_list if gene + '.nd2' in f]
        organized_tmp_dir_list = [f for gene in gene_list
                                  for f in tmp_gene_dirs if gene in f]

        # Each .nd2 file will be processed in a worker that is part of a
        # different node. Get the addresses of one process/node to use for
        # the conversion.
        node_addresses = utils.identify_nodes(client)
        workers_conversion = [list(el.items())[0][1]
                              for key, el in node_addresses.items()]

        # Run the conversion
        futures_processes = client.map(io.nd2_to_npy, gene_list,
                                       organized_files_list, tmp_gene_dirs,
                                       processing_hyb=processing_hyb,
                                       use_ram=flt_rawcnt_config['use_ram'],
                                       max_ram=flt_rawcnt_config['max_ram'],
                                       workers=workers_conversion)
        client.gather(futures_processes)

        # ---------------------------------------------------------------------

        # ----------------- FILTERING AND RAW COUNTING ------------------------

        # Create directories

        # Create the directory where to save the filtered images
        suffix = 'filtered_png'
        filtered_png_img_dir_path, filtered_png_img_gene_dirs = \
            utils.create_subdirectory_tree(
                hyb_dir, hybridization, hybridizations_infos, processing_hyb,
                suffix, add_slash,
                analysis_name=flt_rawcnt_config['analysis_name'])

        suffix = 'filtered_npy'
        filtered_img_dir_path, filtered_img_gene_dirs = \
            utils.create_subdirectory_tree(
                hyb_dir, hybridization, hybridizations_infos, processing_hyb,
                suffix, add_slash,
                analysis_name=flt_rawcnt_config['analysis_name'])

        # Create the directory where to save the counting
        suffix = 'counting'
        counting_dir_path, counting_gene_dirs = \
            utils.create_subdirectory_tree(
                hyb_dir, hybridization, hybridizations_infos, processing_hyb,
                suffix, add_slash,
                flt_rawcnt_config['skip_tags_counting'],
                flt_rawcnt_config['skip_genes_counting'],
                analysis_name=flt_rawcnt_config['analysis_name'])

        if flt_rawcnt_config['illumination_correction']:
            # Create the directory where to save the illumination functions
            suffix = 'illumination_funcs'
            illumination_func_dir_path, illumination_func_gene_dirs = \
                utils.create_subdirectory_tree(
                    hyb_dir, hybridization, hybridizations_infos,
                    processing_hyb, suffix, add_slash,
                    analysis_name=flt_rawcnt_config['analysis_name'])

            # Loop through channels and calculate the illumination
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter = glob.glob(hyb_dir + processing_hyb
                                                + '_tmp/' + processing_hyb
                                                + '_' + gene + '_tmp/*.npy')
                logger.debug('Create average image for gene %s', gene)

                # Chunk the image list
                num_chunks = sum(list(client.ncores().values()))
                chunked_list = utils.list_chunking(flist_img_to_filter,
                                                   num_chunks)

                # Scatter the image sublists to process in parallel
                futures = client.scatter(chunked_list)

                # Create the dask processing graph
                output = []
                for future in futures:
                    ImgMean = delayed(utils.partial_image_mean)(future)
                    output.append(ImgMean)
                ImgMean_all = delayed(sum)(output)
                ImgMean_all = ImgMean_all / float(len(futures))

                # Compute the graph
                ImgMean = ImgMean_all.compute()
                logger.debug('Create illumination function for gene %s', gene)

                # Create the illumination function
                Illumination = filters.gaussian(ImgMean, sigma=(20, 300, 300))

                # Normalization of the illumination
                Illumination_flat = np.amax(Illumination, axis=0)
                Illumination_norm = Illumination_flat / np.amax(Illumination_flat)
                logger.debug('Save illumination function for gene %s', gene)

                # Save the illumination function
                illumination_path = [ill_path for ill_path
                                     in illumination_func_gene_dirs
                                     if gene in ill_path][0]
                illumination_fname = (illumination_path + gene
                                      + '_illumination_func.npy')
                np.save(illumination_fname, Illumination_norm,
                        allow_pickle=False)

                # Broadcast the illumination function to all the cores
                client.scatter(Illumination_norm, broadcast=True)

                logger.debug('Filtering %s', gene)
                # Filtering and counting
                futures_processes = client.map(
                    counting.filtering_and_counting_ill_correction,
                    flist_img_to_filter,
                    illumination_function=Illumination_norm,
                    filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                    filtered_img_gene_dirs=filtered_img_gene_dirs,
                    counting_gene_dirs=counting_gene_dirs,
                    plane_keep=flt_rawcnt_config['plane_keep'],
                    min_distance=flt_rawcnt_config['min_distance'],
                    stringency=flt_rawcnt_config['stringency'],
                    skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                    skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)
        else:
            for gene in hybridizations_infos[hybridization].keys():
                flist_img_to_filter = glob.glob(hyb_dir + processing_hyb
                                                + '_tmp/' + processing_hyb
                                                + '_' + gene + '_tmp/*.npy')
                # Filtering
                logger.debug('Filtering without illumination correction %s',
                             gene)
                futures_processes = client.map(
                    counting.filtering_and_counting, flist_img_to_filter,
                    filtered_png_img_gene_dirs=filtered_png_img_gene_dirs,
                    filtered_img_gene_dirs=filtered_img_gene_dirs,
                    counting_gene_dirs=counting_gene_dirs,
                    plane_keep=flt_rawcnt_config['plane_keep'],
                    min_distance=flt_rawcnt_config['min_distance'],
                    stringency=flt_rawcnt_config['stringency'],
                    skip_genes_counting=flt_rawcnt_config['skip_genes_counting'],
                    skip_tags_counting=flt_rawcnt_config['skip_tags_counting'])
                client.gather(futures_processes)

        # ---------------------------------------------------------------------

        # # ----------------- COMBINE THE FILTERED DATA IN .ppf.hdf5 ------------------------
        # # Combine the filtered data in one single .ppf for each hybridization.
        # # This step will run in serial mode and will not need to shuffle data
        # # between cores because everything is on the common file system.
        # logger.debug('Create .ppf.hdf5 file')

        # # Create the ppf.hdf5 file that contains the filtered data in uint16
        # preprocessing_file_path = hdf5_utils.hdf5_create_preprocessing_file(
        #     hybridizations_infos, processing_hyb, hybridization,
        #     flt_rawcnt_config['analysis_name'], hyb_dir, converted_positions,
        #     image_properties)

        # logger.debug('Write the .npy filtered files into the .ppf file')

        # # Load and write the .npy tmp images into the hdf5 file
        # # open the hdf5 file
        # with h5py.File(preprocessing_file_path) as f_hdl:
        #     # Loop through each gene
        #     for gene in hybridizations_infos[hybridization].keys():
        #         logger.debug('Writing %s images in .ppf.hdf5', gene)
        #         # list of the files to transfer
        #         filtered_gene_dir = [fdir for fdir in filtered_img_gene_dirs
        #                              if gene in fdir][0]
        #         filtered_files_list = glob.glob(filtered_gene_dir + '*.npy')
        #         # loop through the list of files
        #         for f_file in filtered_files_list:
        #             pos = f_file.split('/')[-1].split('_')[-1].split('.')[0]
        #             f_hdl[gene]['FilteredData'][pos][:] = np.load(f_file)
        #             f_hdl.flush()
        # # ---------------------------------------------------------------------

        # # ----------------- STITCHING ------------------------
        # # Load the stitching parameters from the .yaml file

        # # Stitch the image in 2D or 3D (3D needs more work/testing)
        # nr_dim = flt_rawcnt_config['nr_dim']

        # # Estimated overlap between images according to the Nikon software
        # est_overlap = image_properties['Overlapping_percentage']

        # # Number of peaks to use for the alignment
        # nr_peaks = flt_rawcnt_config['nr_peaks']

        # # Determine if the coords need to be flipped
        # y_flip = flt_rawcnt_config['y_flip']

        # # Method to use for blending: can be 'linear' or 'non linear'.
        # # The method that performs best is 'non linear'.
        # blend = flt_rawcnt_config['blend']

        # # Reference gene for stitching
        # reference_gene = flt_rawcnt_config['reference_gene']

        # pixel_size = image_properties['PixelSize']

        # # Get the list of the filtered files of the reference gene
        # filtered_gene_dir = [gene_dir for gene_dir in filtered_img_gene_dirs
        #                      if reference_gene in gene_dir][0]
        # filtered_files_list = glob.glob(filtered_gene_dir + '*.npy')

        # # Create a pointer to the hdf5 file that will store the stitched
        # # reference image for the current hybridization
        # # Writing
        # tile_file_base_name = (flt_rawcnt_config['analysis_name'] + '_'
        #                        + processing_hyb)
        # data_name = (tile_file_base_name
        #              + '_' + reference_gene
        #              + '_stitching_data')
        # stitching_file_name = tile_file_base_name + '.sf.hdf5'
        # # replace 'w' with 'a' as soon as you fix the error
        # stitching_file = h5py.File(hyb_dir + stitching_file_name, 'w',
        #                            libver='latest')

        # # Determine the tiles organization
        # tiles, contig_tuples, nr_pixels, z_count, micData = \
        #     stitching.get_pairwise_input_npy(image_properties,
        #                                      converted_positions,
        #                                      hybridization,
        #                                      est_overlap=est_overlap,
        #                                      y_flip=False, nr_dim=2)

        # # Align the tiles
        # futures_processes = client.map(pairwisesingle.align_single_pair_npy,
        #                                contig_tuples,
        #                                filtered_files_list=filtered_files_list,
        #                                micData=micData, nr_peaks=nr_peaks)

        # # Gather the futures
        # data = client.gather(futures_processes)

        # # In this case the order of the returned contingency tuples matches
        # # the order of the input contig_tuples
        # # P_all = [el for data_single in data for el in data_single[0]]
        # P_all = [data_single[0] for data_single in data]
        # P_all = np.array(P_all)
        # P_all = P_all.flat[:]
        # covs_all = [data_single[1] for data_single in data]
        # alignment = {'P': P_all,
        #              'covs': covs_all}

        # # Calculate a shift in global coordinates for each tile (global
        # # alignment), then apply these shifts to the corner coordinates of
        # # each tile and return and save the shifted corner coordinates.
        # joining = stitching.get_place_tile_input(hyb_dir, tiles,
        #                                          contig_tuples, micData,
        #                                          nr_pixels, z_count,
        #                                          alignment, data_name,
        #                                          nr_dim=nr_dim)

        # # Create the hdf5 file structure
        # stitched_group, linear_blending, blend = \
        #     hdf5preparation.create_structures_hdf5_stitched_ref_gene_file_npy(
        #         stitching_file, joining, nr_pixels, reference_gene,
        #         blend='non linear')

        # # Fill the hdf5 containing the stitched image with empty data and
        # # create the blending mask
        # stitched_group['final_image'][:] = np.zeros(
        #     joining['final_image_shape'], dtype=np.float64)
        # if blend is not None:
        #     # make mask
        #     stitched_group['blending_mask'][:] = np.zeros(
        #         joining['final_image_shape'][-2:], dtype=np.float64)
        #     tilejoining.make_mask(joining, nr_pixels,
        #                           stitched_group['blending_mask'])

        # # Create the subdirectory used to save the blended tiles
        # suffix = 'blended_tiles'
        # blended_tiles_directory = utils.create_single_directory(
        #     hyb_dir, reference_gene, hybridization, processing_hyb, suffix,
        #     add_slash, analysis_name=flt_rawcnt_config['analysis_name'])

        # # Get the directory with the filtered npy images of the
        # # reference_gene to use for stitching
        # stitching_files_dir = [npy_dir for npy_dir in filtered_img_gene_dirs
        #                        if reference_gene in npy_dir][0]

        # # Create the tmp directory where to save the masks
        # suffix = 'masks'
        # masked_tiles_directory = utils.create_single_directory(
        #     hyb_dir, reference_gene, hybridization, processing_hyb, suffix,
        #     add_slash, analysis_name=flt_rawcnt_config['analysis_name'])

        # # Create and save the mask files
        # for corn_value, corner_coords in joining['corner_list']:
        #     if not np.isnan(corner_coords[0]):
        #         cur_mask = stitched_group['blending_mask'][
        #             int(corner_coords[0]):int(corner_coords[0]) + int(nr_pixels),
        #             int(corner_coords[1]):int(corner_coords[1]) + int(nr_pixels)]
        #         fname = (masked_tiles_directory
        #                  + flt_rawcnt_config['analysis_name'] + '_'
        #                  + processing_hyb + '_' + reference_gene
        #                  + '_masks_joining_pos_' + str(corn_value))
        #         np.save(fname, cur_mask)

        # # Blend all the tiles and save them in a directory
        # futures_processes = client.map(
        #     tilejoining.generate_blended_tile_npy, joining['corner_list'],
        #     stitching_files_dir=stitching_files_dir,
        #     blended_tiles_directory=blended_tiles_directory,
        #     masked_tiles_directory=masked_tiles_directory,
        #     analysis_name=flt_rawcnt_config['analysis_name'],
        #     processing_hyb=processing_hyb, reference_gene=reference_gene,
        #     micData=micData, tiles=tiles, nr_pixels=nr_pixels,
        #     linear_blending=linear_blending)
        # _ = client.gather(futures_processes)

        # # Write the stitched image
        # tilejoining.make_final_image_npy(joining, stitching_file,
        #                                  blended_tiles_directory, tiles,
        #                                  reference_gene, nr_pixels)

        # # close the hdf5 file
        # stitching_file.close()

        # # Delete the directories with blended tiles and masks
        # shutil.rmtree(blended_tiles_directory)
        # shutil.rmtree(masked_tiles_directory)

        # ----------------- DELETE FILES ------------------------
        # Don't delete the *.npy files here because they can be used to
        # create the final images using the apply-stitching related function

    client.close()
def do(param):
    dataset = pickle.load(open(f'{os.environ["HOME"]}/dataset.pkl', 'rb'))
    Xs, ys, Xst, yst = dataset
    criterion, n_estimators, max_features, max_depth = param
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   criterion=criterion,
                                   max_features=max_features,
                                   max_depth=max_depth)
    model.fit(Xs, ys)
    ysp = model.predict(Xst)
    acc = accuracy_score(yst, ysp)
    print(acc)
    return [acc, list(param)]


params = []
for cri in ['gini', 'entropy']:
    for n_esti in range(5, 15):
        for max_features in range(10, 20):
            for max_depth in range(4, 20):
                params.append((cri, n_esti, max_features, max_depth))

L = client.map(do, params)
ga = client.gather(L)

import json
json.dump(ga, open('ga.json', 'w'), indent=2)
print(ga)
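# The grid search above assumes a running `client` and a pickled
# (Xs, ys, Xst, yst) split at ~/dataset.pkl. A hedged setup sketch using
# synthetic data:
import os
import pickle

from distributed import Client
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
Xs, Xst, ys, yst = train_test_split(X, y, random_state=0)
with open(f'{os.environ["HOME"]}/dataset.pkl', 'wb') as f:
    pickle.dump((Xs, ys, Xst, yst), f)
client = Client()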
                        help='port of the dask scheduler')
options = arg_parser.parse_args()
client = Client(f'{options.host}:{options.port:d}')
if options.implementation == 'python':
    from julia_python import julia_set
elif options.implementation == 'cython':
    from julia_cython import julia_set
    client.register_worker_callbacks(init_pyx)
elif options.implementation == 'cython_omp':
    from julia_cython_omp import julia_set
    client.register_worker_callbacks(init_omp_pyx)
else:
    msg = '{0} version not implemented\n'
    sys.stderr.write(msg.format(options.implementation))
    sys.exit(1)
domain = init_julia((options.re_min, options.re_max),
                    (options.im_min, options.im_max),
                    (options.n_re, options.n_im))
domains = np.array_split(domain, options.partitions)
iterations = np.array_split(
    np.zeros(options.n_re * options.n_im, dtype=np.int32),
    options.partitions)
start_time = time.time()
futures = client.map(julia_set, domains, iterations)
results = client.gather(futures)
end_time = time.time()
print('compute time = {0:.6f} s'.format(end_time - start_time))
np.savetxt('julia.txt',
           np.concatenate(results).reshape(options.n_re, options.n_im))
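# The imported julia_set implementations are not shown; a plain-Python sketch
# matching the call signature used above. The Julia constant and iteration
# cap are assumptions.
def julia_set(domain, iterations, c=-0.622772 + 0.42193j, max_iters=255):
    # domain: 1-D array of complex points; iterations: int32 output array
    for i, z in enumerate(domain):
        n = 0
        while abs(z) <= 2.0 and n < max_iters:
            z = z * z + c
            n += 1
        iterations[i] = n
    return iterations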
# -----------
# monte carlo
# -----------

# define output file names
OUTPUT = init_outputs()
if CUTOFF < NSMPL:
    init_headers()
# initialize simulation
if RESTART:
    STATE = load_samples_restart()
    replica_exchange()
else:
    if DASK:
        STATE = CLIENT.gather(init_samples())
    else:
        STATE = init_samples()
# loop through the number of samples that need to be collected
for STEP in tqdm(range(NSMPL)):
    if VERBOSE and DASK:
        client_info()
    # generate samples
    STATE[:] = gen_samples()
    # generate mc parameters
    if (STEP + 1) > CUTOFF:
        # write data
        write_outputs()
    if DASK:
        # gather results from cluster
        STATE[:] = CLIENT.gather(STATE)
###Aux channels###
##################
chunk = 16384
pad = 256

# Find the data
# cache1 = find_raw_frames(ifo, st1, st1+dur)
# cache2 = find_raw_frames(ifo, st2, st2+dur)

# Connect to Dask scheduler
client = Client(args.address)

for t1, t2 in chunk_segments(segs, chunk, pad):
    print('Getting chunk', t1, t2)

    # Set up the channel list
    params_list = [(chan, ifo, t1, t2) for chan in channels]
    # Add in st1, st2, dur for the psd comparison tool

    # Run jobs on the cluster and return results
    jobs = client.map(aux_feat_get, params_list)
    result = client.gather(jobs)

    # Write out the results
    # Will sort the results by how much difference in the PSD there is
    # result.sort(key=lambda x: x[1], reverse=True)
    with open('results_of_aux_%u-%u.dat' % (t1, (t2 - t1)), 'wb') as fout:
        pickle.dump(result, fout)
# dask client
from distributed import Client
from os.path import join
from math import ceil

import numpy as np

from thredds_configuration import file_list_url, data_request, data_folder, thredds_servers
from dask_configuration import dask_scheduler_url
from thredds_utils import list_thredds_folder, compute_url_to_thredds_server_map, compute_avg_func

array_list = []
file_list = list_thredds_folder(file_list_url)

# connect to dask
client = Client(dask_scheduler_url)

url_list = []
for f in file_list:
    url_list.append(data_request + "/" + data_folder + "/" + f
                    + "?time1[0],Temperature_surface[0][0:360][0:719]")

# allocate urls to thredds servers
server_url_mapping = compute_url_to_thredds_server_map(url_list, thredds_servers)

# launch the dask computation and collect results
avg_results_status = client.map(compute_avg_func, server_url_mapping)
avg_results = client.gather(avg_results_status)
final_avg = np.mean(avg_results)
print(final_avg)