def __init__(self, name, worker=None, client=None):
    """Subscribe to the pubsub topic ``name``.

    When neither ``worker`` nor ``client`` is given, the execution context
    is resolved automatically: ``get_worker()`` is tried first and, if that
    raises (we are not inside a worker task), ``get_client()`` is used.
    Registration with the local pubsub extension and the add-subscriber
    message to the scheduler are both deferred onto the event loop.
    """
    if worker is None and client is None:
        from distributed.worker import get_worker, get_client
        try:
            # Succeeds only when called from inside a worker task.
            worker = get_worker()
        except Exception:
            client = get_client()
    self.worker = worker
    self.client = client
    # The event loop comes from whichever context was resolved above.
    if self.worker:
        self.loop = self.worker.loop
    elif self.client:
        self.loop = self.client.loop
    self.name = name
    self.buffer = deque()  # received-but-not-yet-consumed messages
    if self.worker:
        pubsub = self.worker.extensions["pubsub"]
    elif self.client:
        pubsub = self.client.extensions["pubsub"]
    # Registration must happen on the event-loop thread, not here.
    self.loop.add_callback(pubsub.subscribers[name].add, self)
    msg = {"op": "pubsub-add-subscriber", "name": self.name}
    # Announce this subscriber to the scheduler: workers use their batched
    # stream, clients their scheduler comm.
    if self.worker:
        self.loop.add_callback(self.worker.batched_stream.send, msg)
    elif self.client:
        self.loop.add_callback(self.client.scheduler_comm.send, msg)
    else:
        raise Exception()
    # Clean up pubsub extension state once this subscriber is collected.
    weakref.finalize(self, pubsub.trigger_cleanup)
def __init__(self, name, worker=None, client=None):
    """Create a publisher for the pubsub topic ``name``.

    Resolves the worker/client context automatically when neither is
    supplied (worker first, then client).  Startup — fetching the current
    subscriber set and flushing ``_buffer`` — is deferred to ``_start`` on
    the event loop.
    """
    if worker is None and client is None:
        from distributed import get_worker, get_client
        try:
            # Succeeds only when called from inside a worker task.
            worker = get_worker()
        except Exception:
            client = get_client()
    self.subscribers = dict()  # known remote subscribers for this topic
    self.worker = worker
    self.client = client
    assert client or worker
    if self.worker:
        self.scheduler = self.worker.scheduler
        self.loop = self.worker.loop
    elif self.client:
        self.scheduler = self.client.scheduler
        self.loop = self.client.loop
    self.name = name
    self._started = False
    self._buffer = []  # messages published before _start completes
    # Finish registration on the event-loop thread.
    self.loop.add_callback(self._start)
    if self.worker:
        pubsub = self.worker.extensions['pubsub']
        self.loop.add_callback(pubsub.publishers[name].add, self)
        # Clean up extension state once this publisher is collected.
        finalize(self, pubsub.trigger_cleanup)
def __init__(self, name, worker=None, client=None):
    """Create a publisher for the pubsub topic ``name``.

    Resolves the worker/client context automatically when neither is
    supplied (worker first, then client).  Startup — fetching the current
    subscriber set and flushing ``_buffer`` — is deferred to ``_start`` on
    the event loop.
    """
    if worker is None and client is None:
        from distributed import get_worker, get_client
        try:
            # Succeeds only when called from inside a worker task.
            worker = get_worker()
        except Exception:
            client = get_client()
    self.subscribers = dict()  # known remote subscribers for this topic
    self.worker = worker
    self.client = client
    assert client or worker
    if self.worker:
        self.scheduler = self.worker.scheduler
        self.loop = self.worker.loop
    elif self.client:
        self.scheduler = self.client.scheduler
        self.loop = self.client.loop
    self.name = name
    self._started = False
    self._buffer = []  # messages published before _start completes
    # Finish registration on the event-loop thread.
    self.loop.add_callback(self._start)
    if self.worker:
        pubsub = self.worker.extensions["pubsub"]
        self.loop.add_callback(pubsub.publishers[name].add, self)
        # Clean up extension state once this publisher is collected.
        weakref.finalize(self, pubsub.trigger_cleanup)
def get_worker_wrapper():
    """Return the current dask worker, or ``global_worker`` as a fallback.

    ``get_worker()`` raises when not called from inside a worker task; in
    that case we fall back to the module-level ``global_worker``.
    """
    # FIX: dropped the unnecessary `global global_worker` declaration (the
    # name is only read, never assigned) and the unused `as e` binding.
    try:
        return get_worker()
    except Exception:
        # Not running inside a worker task; use the module-level fallback.
        return global_worker
def process(self, events):
    """Fill dimuon pt/mass histograms and cutflow counters for one chunk.

    Also records the worker name when cached "canary" keys are observed in
    a CachedMapping, so tests can verify cache affinity.
    """
    output = self.accumulator.identity()
    dataset = events.metadata["dataset"]
    print(events.metadata)
    if "checkusermeta" in events.metadata:
        metaname, metavalue = self.expected_usermeta[dataset]
        assert metavalue == events.metadata[metaname]
    mapping = events.behavior["__events_factory__"]._mapping
    muon_pt = events.Muon.pt
    if isinstance(mapping, nanoevents.mapping.CachedMapping):
        keys_in_cache = list(mapping.cache.cache.keys())
        has_canaries = [canary in keys_in_cache for canary in self._canaries]
        # FIX: was `if has_canaries:` — a list of booleans is truthy
        # whenever self._canaries is non-empty, regardless of whether any
        # canary was actually found in the cache.  `any()` tests the intent.
        if any(has_canaries):
            try:
                from distributed import get_worker
                worker = get_worker()
                output["worker"].add(worker.name)
            except ValueError:
                # Not running on a distributed worker; nothing to record.
                pass
    dimuon = ak.combinations(events.Muon, 2)
    dimuon = dimuon["0"] + dimuon["1"]
    output["pt"].fill(dataset=dataset, pt=ak.flatten(muon_pt))
    output["mass"].fill(dataset=dataset, mass=ak.flatten(dimuon.mass))
    output["cutflow"]["%s_pt" % dataset] += sum(ak.num(events.Muon))
    output["cutflow"]["%s_mass" % dataset] += sum(ak.num(dimuon))
    return output
def worker_state(sessionId: Optional[int] = None) -> dict:
    """Retrieve the state(s) of the current worker

    Parameters
    ----------
    sessionId: int, optional
        Worker session state ID. If None, all states of the worker are
        returned.

    Returns
    -------
    state: dict
        Either a single state dict or a dict of state dict
    """
    worker = get_worker()
    # Lazily attach the state container to the worker on first use.
    states = getattr(worker, "_explicit_comm_state", None)
    if states is None:
        states = worker._explicit_comm_state = {}
    if sessionId is None:
        return states
    if sessionId not in states:
        states[sessionId] = {
            "ts": time.time(),
            "eps": {},
            "loop": worker.loop.asyncio_loop,
            "worker": worker,
        }
    return states[sessionId]
def predict_in_block(block):
    """Run the prediction script for one daisy block inside nvidia-docker.

    Uses the GPU(s) assigned to this dask worker and logs output per block.
    """
    from distributed import get_worker
    write_roi = block.write_roi
    predict_script = '/groups/saalfeld/home/hanslovskyp/experiments/quasi-isotropic/predict/predict.py'
    cuda_visible_devices = get_worker().cuda_visible_devices
    # FIX: was `predict_scripts_args` (typo) — the use site below reads
    # `predict_script_args`, which raised NameError.
    predict_script_args = ''
    name = 'predict-%s-%s' % (write_roi.get_begin(), write_roi.get_size())
    log_file = os.path.join(cwd, '%s.log' % name)
    pythonpath = ':'.join([
        '%s/workspace-pycharm/u-net/gunpowder' % _HOME,
        '%s/workspace-pycharm/u-net/CNNectome' % _HOME,
        '/groups/saalfeld/home/papec/Work/my_projects/z5/bld/python'
    ])
    pythonpath_export_str = 'export PYTHONPATH=%s:$PYTHONPATH' % pythonpath
    daisy.call([
        'nvidia-docker', 'run', '--rm',
        # FIX: command arguments should be strings, not int
        '-u', str(os.getuid()),
        '-v', '/groups/turaga:/groups/turaga:rshared',
        '-v', '/groups/saalfeld:/groups/saalfeld:rshared',
        '-v', '/nrs/saalfeld:/nrs/saalfeld:rshared',
        '-w', cwd,
        '--name', name,
        # FIX: a missing comma used to concatenate the image name with
        # '/bin/bash' into one argument, producing an invalid image tag.
        'neptunes5thmoon/gunpowder:v0.3-pre6-dask1',
        '/bin/bash', '-c',
        '"export CUDA_VISIBLE_DEVICES=%s; %s; python -u %s %s 2>&1 > %s"' %
        (cuda_visible_devices, pythonpath_export_str, predict_script,
         predict_script_args, log_file)
    ])
def inside_dask_worker():
    """Check whether the current function is executed inside a Dask worker.
    """
    # This function can not be in joblib._dask because there would be a
    # circular import:
    # _dask imports _parallel_backend that imports _dask ...
    try:
        from distributed import get_worker
    except ImportError:
        # distributed is not installed at all
        return False
    try:
        get_worker()
    except ValueError:
        # get_worker() raises ValueError outside a worker task
        return False
    return True
def __init__(self, name, worker=None, client=None):
    """Subscribe to the pubsub topic ``name``.

    When neither ``worker`` nor ``client`` is given, the execution context
    is resolved automatically: ``get_worker()`` first, then
    ``get_client()``.  Registration with the local pubsub extension and
    the add-subscriber message are both deferred onto the event loop.
    """
    if worker is None and client is None:
        from distributed.worker import get_worker, get_client
        try:
            # Succeeds only when called from inside a worker task.
            worker = get_worker()
        except Exception:
            client = get_client()
    self.worker = worker
    self.client = client
    # The event loop comes from whichever context was resolved above.
    if self.worker:
        self.loop = self.worker.loop
    elif self.client:
        self.loop = self.client.loop
    self.name = name
    self.buffer = deque()  # received-but-not-yet-consumed messages
    # Used to wake up consumers when new messages arrive.
    self.condition = tornado.locks.Condition()
    if self.worker:
        pubsub = self.worker.extensions['pubsub']
    elif self.client:
        pubsub = self.client.extensions['pubsub']
    # Registration must happen on the event-loop thread, not here.
    self.loop.add_callback(pubsub.subscribers[name].add, self)
    msg = {'op': 'pubsub-add-subscriber', 'name': self.name}
    # Announce this subscriber to the scheduler: workers use their batched
    # stream, clients their scheduler comm.
    if self.worker:
        self.loop.add_callback(self.worker.batched_stream.send, msg)
    elif self.client:
        self.loop.add_callback(self.client.scheduler_comm.send, msg)
    else:
        raise Exception()
    # Clean up pubsub extension state once this subscriber is collected.
    finalize(self, pubsub.trigger_cleanup)
async def test_cudf_cluster_device_spill(loop, params):
    """Verify cudf data spills device->host (and optionally ->disk) under
    the memory limits given by ``params``."""
    async with LocalCUDACluster(
        1,
        device_memory_limit=params["device_memory_limit"],
        memory_limit=params["memory_limit"],
        memory_target_fraction=params["host_target"],
        memory_spill_fraction=params["host_spill"],
        death_timeout=300,
        asynchronous=True,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            cdf = dask.datasets.timeseries(
                dtypes={"x": int, "y": float}, freq="30ms"
            ).map_partitions(cudf.from_pandas)

            # Total on-worker payload, used for the spill assertions below.
            sizes = await client.compute(
                cdf.map_partitions(lambda df: df.__sizeof__()))
            sizes = sizes.tolist()
            nbytes = sum(sizes)
            part_index_nbytes = (
                await client.compute(cdf.partitions[0].index)).__sizeof__()

            cdf2 = cdf.persist()
            await wait(cdf2)
            del cdf

            await client.run(worker_assert, nbytes, 32,
                             2048 + part_index_nbytes)

            # Count chunks that ended up in host memory vs. on disk.
            host_chunks = await client.run(
                lambda: len(get_worker().data.host))
            disk_chunks = await client.run(
                lambda: len(get_worker().data.disk))
            for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
                if params["spills_to_disk"]:
                    assert dc > 0
                else:
                    assert hc > 0
                    assert dc == 0

            del cdf2

            # After dropping all references, nothing should remain spilled.
            await client.run(worker_assert, 0, 0, 0)
def log_event(topic: str, msg: dict) -> None:
    """Forward *msg* (tagged with the current thread) to the dask worker's
    event log; silently a no-op when not running on a worker."""
    try:
        import distributed
        worker = distributed.get_worker()
    except (ImportError, ValueError):
        # distributed missing, or not inside a worker task.
        return
    payload = dict(msg, thread=_curthread())
    worker.log_event(topic, payload)
async def f(_):
    """Assert that at most one invocation runs on this worker at a time,
    using a `running` flag stored on the worker object."""
    w = get_worker()
    # Equivalent to: hasattr check + `assert not w.running`.
    assert not getattr(w, "running", False)
    w.running = True
    await asyncio.sleep(0.5)
    assert w.running
    w.running = False
def evaluate(individual, context=context):
    """ concurrently evaluate the given individual

    This is what's invoked on each dask worker to evaluate each individual.

    We log the start and end times for evaluation.

    An individual is viable if an exception is NOT thrown, else it is NOT
    a viable individual.  If not viable, we increment the
    context['leap']['distrib']['non_viable'] count to track such instances.

    This function sets:

    individual.start_eval_time has the time() of when evaluation started.
    individual.stop_eval_time has the time() of when evaluation finished.
    individual.is_viable is True if viable, else False
    individual.exception will be assigned any raised exceptions
    individual.fitness will be NaN if not viable, else the calculated fitness
    individual.hostname is the name of the host on which this individual
        was evaluated
    individual.pid is the process ID associated with evaluating the
        individual

    :param individual: to be evaluated
    :return: evaluated individual
    """
    # NOTE(review): `context=context` binds the module-level context dict at
    # import time — presumably deliberate so workers share one dict; confirm.
    worker = distributed.get_worker()

    individual.start_eval_time = time.time()
    # Workers may or may not carry a logger; guard every log call.
    if hasattr(worker, 'logger'):
        worker.logger.debug(
            f'Worker {worker.id} started evaluating {individual!s}')

    # Any thrown exceptions are now handled inside Individual.evaluate()
    individual.evaluate()

    if hasattr(individual, 'is_viable') and not individual.is_viable:
        # is_viable will be False if an exception was thrown during
        # evaluation. We track the number of such failures on the off
        # chance that this might be useful.
        context['leap']['distrib']['non_viable'] += 1

        if hasattr(worker, 'logger'):
            worker.logger.warning(
                f'Worker {worker.id}: {individual.exception!s} raised for {individual!s}'
            )

    individual.stop_eval_time = time.time()
    individual.hostname = platform.node()
    individual.pid = os.getpid()

    if hasattr(worker, 'logger'):
        worker.logger.debug(
            f'Worker {worker.id} evaluated {individual!s} in '
            f'{individual.stop_eval_time - individual.start_eval_time} '
            f'seconds')

    return individual
async def test_cupy_cluster_device_spill(params):
    """Verify cupy arrays spill device->host (and optionally ->disk) under
    the memory limits given by ``params``."""
    cupy = pytest.importorskip("cupy")
    with dask.config.set({"distributed.worker.memory.terminate": False}):
        async with LocalCUDACluster(
            1,
            scheduler_port=0,
            processes=True,
            silence_logs=False,
            dashboard_address=None,
            asynchronous=True,
            death_timeout=60,
            device_memory_limit=params["device_memory_limit"],
            memory_limit=params["memory_limit"],
            memory_target_fraction=params["host_target"],
            memory_spill_fraction=params["host_spill"],
            memory_pause_fraction=params["host_pause"],
        ) as cluster:
            async with Client(cluster, asynchronous=True) as client:
                rs = da.random.RandomState(RandomState=cupy.random.RandomState)
                x = rs.random(int(250e6), chunks=10e6)
                await wait(x)

                xx = x.persist()
                await wait(xx)

                # Allow up to 1024 bytes overhead per chunk serialized
                await client.run(worker_assert, x.nbytes, 1024, 1024)

                y = client.compute(x.sum())
                res = await y

                # Mean of uniform[0,1) samples should be ~0.5.
                # NOTE(review): parentheses look off — `abs(res/x.size - 0.5)`
                # was probably intended; as written the check is weaker.
                assert (abs(res / x.size) - 0.5) < 1e-3

                await client.run(worker_assert, x.nbytes, 1024, 1024)
                # Count chunks in host memory vs. on disk.
                host_chunks = await client.run(
                    lambda: len(get_worker().data.host))
                disk_chunks = await client.run(
                    lambda: len(get_worker().data.disk or list()))
                for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
                    if params["spills_to_disk"]:
                        assert dc > 0
                    else:
                        assert hc > 0
                        assert dc == 0
def test_client(c, s):
    """Client-only pubsub round-trip: publish 123 on topic "a" and receive
    it back through a Sub, once the scheduler registers the subscriber."""
    # Outside a worker task, get_worker() must raise.
    with pytest.raises(Exception):
        get_worker()
    sub = Sub("a")
    pub = Pub("a")

    sps = s.extensions["pubsub"]
    cps = c.extensions["pubsub"]

    # Wait (max ~3s) for the scheduler to record this client as subscriber.
    start = time()
    while not set(sps.client_subscribers["a"]) == {c.id}:
        yield gen.sleep(0.01)
        assert time() < start + 3

    pub.put(123)

    result = yield sub.__anext__()
    assert result == 123
def test_client(c, s):
    """Client-only pubsub round-trip: publish 123 on topic 'a' and receive
    it back through a Sub, once the scheduler registers the subscriber."""
    # Outside a worker task, get_worker() must raise.
    with pytest.raises(Exception):
        get_worker()
    sub = Sub('a')
    pub = Pub('a')

    sps = s.extensions['pubsub']
    cps = c.extensions['pubsub']

    # Wait (max ~3s) for the scheduler to record this client as subscriber.
    start = time()
    while not set(sps.client_subscribers['a']) == {c.id}:
        yield gen.sleep(0.01)
        assert time() < start + 3

    pub.put(123)

    result = yield sub.__anext__()
    assert result == 123
def ps_task(tf_spec, verbose=False):
    """Run a TensorFlow parameter-server task on the current dask worker.

    Blocks forever in ``server.join()`` serving parameter requests.

    :param tf_spec: TensorFlow cluster spec (unused here; kept for a
        uniform task signature across worker roles)
    :param verbose: print the PS device string before joining
    """
    worker = distributed.get_worker()
    server = worker.tensorflow_server
    ps_device = "/job:%s/task:%d" % (server.server_def.job_name,
                                     server.server_def.task_index)
    # FIX: `if verbose == True:` replaced with the idiomatic truth test.
    if verbose:
        print('PS task')
        print(ps_device)
    # Never returns: the parameter server serves until the process dies.
    worker.tensorflow_server.join()
def _wrapped_function(function, *args, **kwargs):
    """Worker-side wrapper: re-register protocol plugins, set up optional
    per-worker logging and GPU assignment, then invoke ``function``.

    Note: 'available_resources' is read but deliberately left in kwargs
    (``function`` receives it); the other control keys are popped out.
    """
    available_resources = kwargs['available_resources']
    protocols_to_import = kwargs.pop('available_protocols')
    per_worker_logging = kwargs.pop('per_worker_logging')
    gpu_assignments = kwargs.pop('gpu_assignments')

    # Each spun up worker doesn't automatically import
    # all of the modules which were imported in the main
    # launch script, and as such custom plugins will no
    # longer be registered. We re-import / register them
    # here.
    for protocol_class in protocols_to_import:
        module_name = '.'.join(protocol_class.split('.')[:-1])
        class_name = protocol_class.split('.')[-1]

        imported_module = importlib.import_module(module_name)
        available_protocols[class_name] = getattr(imported_module, class_name)

    # Set up the logging per worker if the flag is set to True.
    if per_worker_logging:
        formatter = logging.Formatter(
            fmt='%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s',
            datefmt='%H:%M:%S')

        # Each worker should have its own log file.
        logger = logging.getLogger()

        # Only attach the handler once per worker process.
        if not len(logger.handlers):
            logger_handler = logging.FileHandler('{}.log'.format(
                get_worker().id))
            logger_handler.setFormatter(formatter)

            logger.setLevel(logging.INFO)
            logger.addHandler(logger_handler)

    if available_resources.number_of_gpus > 0:
        worker_id = distributed.get_worker().id
        # Default to GPU "0" for workers without an explicit assignment.
        available_resources._gpu_device_indices = (
            '0' if worker_id not in gpu_assignments
            else gpu_assignments[worker_id])

        logging.info(
            f'Launching a job with access to GPUs {available_resources._gpu_device_indices}'
        )

    return function(*args, **kwargs)
def func():
    """Check that worker_client's scatter mirrors the container type it is
    given and stores dict values in the worker's data by key."""
    with worker_client() as c:
        ok = True
        for sample in ([1, 2], (1, 2), {1, 2}):
            scattered = c.scatter(sample)
            # scatter should return the same container type it received
            ok &= type(scattered) == type(sample)

        sentinel = object()
        c.scatter({'x': sentinel})
        # The scattered object must land in worker data under its key,
        # by identity (no copy).
        ok &= get_worker().data['x'] is sentinel
        return ok
def func():
    """Scatter a list from inside a task and verify the futures' data lands
    in this worker's store; return the sum computed from the futures."""
    with worker_client() as c:
        values = [1, 2, 3, 4, 5]
        futures = c.scatter(values)
        assert isinstance(futures, (list, tuple))
        assert len(futures) == 5

        observed = dict(get_worker().data)
        expected = {fut.key: val for fut, val in zip(futures, values)}
        assert observed == expected

        total = c.submit(sum, futures)
        return total.result()
def fix_mask(good_file_path, variable, data_path, new_version_path,
             dataset_id, id_map=None, verbose=False):
    """Copy the mask from a known-good file onto every chunk of a dataset.

    Reads ``variable`` from ``good_file_path``, extracts its mask, then
    rewrites each file under ``data_path`` into ``new_version_path`` with
    that mask applied (attributes copied over).

    :param id_map: worker-address -> index map; used to give each worker a
        distinct tqdm progress-bar row (assumes this runs on a dask worker)
    :param verbose: show a per-dataset tqdm progress bar
    """
    if id_map:
        # Each dask worker draws its progress bar on its own line.
        addr = get_worker().address
        position = id_map[addr] + 1
    else:
        position = 0

    good_data = cdms2.open(good_file_path)[variable]
    mask = np.ma.getmask(good_data[:])

    files_to_fix = sorted(os.listdir(data_path))
    if verbose:
        pbar = tqdm(total=len(files_to_fix), position=position, leave=False)

    for chunk in files_to_fix:
        if verbose:
            year = get_year_from_file(chunk)
            desc = "{} -> {}".format(dataset_id, year)
            pbar.set_description(desc)
        source = os.path.join(data_path, chunk)
        dest = os.path.join(new_version_path, chunk)

        # create the input pointer in read mode
        ip = cdms2.open(source, 'r')
        data = ip[variable]
        data_copy = data[:]
        data_copy._set_mask(mask)

        # create the output pointer in write mode
        op = cdms2.open(dest, 'w')
        # Preserve the global attributes of the source file.
        for k, v in ip.attributes.items():
            setattr(op, k, v)

        # write out the new dataset
        op.write(data_copy)
        op.close()
        ip.close()
        if verbose:
            pbar.update(1)
    if verbose:
        pbar.close()
def calCov(tem,img): st = time.time() print 'Input image : ', img.shape size = getTemSize(tem) # 扩充图片的像素数 es = (size - 1) / 2 # 区别灰度图像与彩色图像 if len(img.shape) is 2: y, x, z = img.shape[0], img.shape[1], 1 else: y, x, z = img.shape[0], img.shape[1], img.shape[2] # 将图像上下左右各增加(模板宽度-1)/2个像素,用于计算卷积时的边界计算。 # Expand image by es pixel around for edge calculation. if len(img.shape) is 2: eimg = np.uint8(np.zeros((y + 2 * es, x + 2 * es, 1))) eimg[es:y + es, es:x + es, 0] = img[:, :] else: eimg = np.uint8(np.zeros((y + 2 * es, x + 2 * es, z))) eimg[es:y + es, es:x + es, :] = img result = np.uint8(np.zeros(eimg.shape)) x = x + es y = y + es for i in range(z): a = b = 1 # 扩充后图像的坐标,从0开始 # 设置一个dif变量,用于卷积窗内计算。 dif = (size - 1) / 2 while (b < y): a = 1 while (a < x): cntx = 0 dify = size / 2 result[b][a][i] = 0 while (cntx < size): cnty = 0 difx = size / 2 while (cnty < size): # print b,a,b-dify,a-difx result[b][a][i] += tem[cntx][cnty] * eimg[b - dify][a - difx][i] cnty += 1 difx -= 1 cntx += 1 dify -= 1 a += 1 b += 1 et = time.time() print 'Cal time:', str(et - st) work = distributed.get_worker() return [result,str(et - st),work.address]
def test_device_spill(client, scheduler, worker): cudf = pytest.importorskip("cudf") # There's a known issue with datetime64: # https://github.com/numpy/numpy/issues/4983#issuecomment-441332940 # The same error above happens when spilling datetime64 to disk cdf = (dask.datasets.timeseries( dtypes={ "x": int, "y": float }, freq="20ms").reset_index(drop=True).map_partitions( cudf.from_pandas)) sizes = yield client.compute( cdf.map_partitions(lambda df: df.__sizeof__())) sizes = sizes.tolist() nbytes = sum(sizes) part_index_nbytes = (yield client.compute( cdf.partitions[0].index)).__sizeof__() cdf2 = cdf.persist() yield wait(cdf2) del cdf host_chunks = yield client.run(lambda: len(get_worker().data.host)) disk_chunks = yield client.run( lambda: len(get_worker().data.disk or list())) for hc, dc in zip(host_chunks.values(), disk_chunks.values()): if params["spills_to_disk"]: assert dc > 0 else: assert hc > 0 assert dc == 0 yield client.run(worker_assert, nbytes, 32, 2048 + part_index_nbytes) del cdf2 yield client.run(delayed_worker_assert, 0, 0, 0)
def f(i):
    """Bump a shared per-worker counter and assert it never exceeds 3,
    i.e. at most three of these tasks overlap on one worker."""
    with worker_client(separate_thread=False) as client:
        w = get_worker()
        w.count += 1
        assert w.count <= 3
        # Random pause to encourage overlap between concurrent tasks.
        sleep(random.random() / 40)
        assert w.count <= 3
        w.count -= 1
        return i
def kill_init_proc():
    """Kill the init process that was registered for this worker, if any.

    Returns None when there is no registered PID; otherwise the result of
    kill_if_running().
    """
    try:
        worker_addr = get_worker().address
    except ValueError:
        # Special case for synchronous cluster.
        # See run_on_each_worker
        worker_addr = 'tcp://127.0.0.1'

    try:
        pid_to_kill = worker_init_pids[worker_addr]
    except KeyError:
        # No init process was recorded for this worker.
        return None
    return kill_if_running(pid_to_kill, 10.0)
def worker_state(sessionId=None):
    """Retrieve the state(s) of the current worker

    Parameters
    ----------
    sessionId: int, optional
        Worker session state ID. If None, all states of the worker are
        returned.

    Returns
    -------
    state: dict
        Either a single state dict or a dict of state dict
    """
    worker = get_worker()
    if not hasattr(worker, "_explicit_comm_state"):
        worker._explicit_comm_state = {}
    if sessionId is None:
        return worker._explicit_comm_state
    # FIX: the original tested `sessionId is not None` twice; a single
    # early return for the None case makes the lazy-create path clearer.
    if sessionId not in worker._explicit_comm_state:
        worker._explicit_comm_state[sessionId] = {
            "ts": time.time(),
            "eps": {},
            "loop": worker.loop.asyncio_loop,
            "worker": worker,
        }
    return worker._explicit_comm_state[sessionId]
def test_device_spill(client, scheduler, worker):
    """Verify cupy arrays spill device->host (and optionally ->disk)."""
    rs = da.random.RandomState(RandomState=cupy.random.RandomState)
    x = rs.random(int(250e6), chunks=10e6)

    xx = x.persist()
    yield wait(xx)

    # Allow up to 1024 bytes overhead per chunk serialized
    yield client.run(worker_assert, x.nbytes, 1024, 1024)

    y = client.compute(x.sum())
    res = yield y

    # Mean of uniform[0,1) samples should be ~0.5.
    # NOTE(review): parentheses look off — `abs(res/x.size - 0.5)` was
    # probably intended; as written the check is weaker.
    assert (abs(res / x.size) - 0.5) < 1e-3

    yield client.run(worker_assert, x.nbytes, 1024, 1024)
    # Count chunks that ended up in host memory vs. on disk.
    host_chunks = yield client.run(lambda: len(get_worker().data.host))
    disk_chunks = yield client.run(lambda: len(get_worker().data.disk))
    for hc, dc in zip(host_chunks.values(), disk_chunks.values()):
        # NOTE(review): `params` is not a parameter of this function —
        # presumably a module-level parametrization dict; verify it is in
        # scope, otherwise this loop raises NameError.
        if params["spills_to_disk"]:
            assert dc > 0
        else:
            assert hc > 0
            assert dc == 0
def dask_incref(cls, csr):
    """Submit one dummy task per shared-memory segment of `csr` so the
    scheduler records an event tied to the current task's key."""
    def _noop(arg):
        # Does nothing.  Exists only so the scheduler generates an event.
        pass

    key = distributed.get_worker().get_current_task()
    client = distributed.get_client()
    segments = (csr.pointers_shm, csr.indices_shm, csr.values_shm)
    for shm in segments:
        task_name = f"{cls.REFCOUNT_TAG}:{key}:{shm.name}"
        client.submit(_noop, key + shm.name, key=task_name, pure=False)
def register_plugins(client, add=None):
    """Register worker plugins on the cluster that are not yet installed.

    Usage:

        plugins = {
            "MEMCache": {"maxmem": 5e8},
            "ConfigureXRootD": {"proxy_file": None}
        }
        register_plugins(client, add=plugins)

    :param client: distributed client
    :param add: mapping of plugin class name (looked up in this module's
        globals) to its constructor kwargs
    """
    # FIX: the default used to be the mutable `defaultdict(dict)`, shared
    # across every call — the classic mutable-default-argument hazard.
    if add is None:
        add = {}
    # Union of plugin names already installed on any worker.
    plugins = set()
    for p in client.run(lambda: set(get_worker().plugins)).values():
        plugins |= p
    for name, opts in add.items():
        plugin = globals()[name]
        if plugin.name not in plugins:
            client.register_worker_plugin(plugin(**opts))
def get_ext() -> ShuffleWorkerExtension:
    """Look up the ShuffleWorkerExtension on the current worker, raising
    actionable RuntimeErrors when the context or extension is missing."""
    from distributed import get_worker

    try:
        worker = get_worker()
    except ValueError as e:
        # Not running inside a worker task at all.
        raise RuntimeError(
            "`shuffle='p2p'` requires Dask's distributed scheduler. This task is not running on a Worker; "
            "please confirm that you've created a distributed Client and are submitting this computation through it."
        ) from e

    ext: ShuffleWorkerExtension | None = worker.extensions.get("shuffle")
    if not ext:
        raise RuntimeError(
            f"The worker {worker.address} does not have a ShuffleExtension. "
            "Is pandas installed on the worker?")
    return ext
def execute_subgraph(self, SG):
    """Submit the Job nodes of subgraph ``SG`` in dependency order and
    gather their results (exceptions included for failed tasks)."""
    futures = {}
    client = self._client
    worker = get_worker()
    logger.info(f'Computing subgraph')
    # Field name attached to the edge (f -> t); becomes the kwarg name.
    edge = lambda G, f, t: G.edges[(f, t)]['field']
    # Result of a dependency: its pending future if it is a Job node,
    # otherwise the node's callable evaluated eagerly.
    result = lambda G, n: \
        futures[hash(G.nodes[n]['job'])] \
        if isinstance(G.nodes[n]['job'].resource, Job) else \
        G.nodes[n]['job']()
    # Topological order guarantees dependencies are submitted first.
    for resource in nx.topological_sort(SG):
        job = SG.nodes[resource]['job']
        if not isinstance(job.resource, Job):
            continue
        dependencies = {
            edge(SG, dependency, resource): result(SG, dependency)
            for dependency in SG.predecessors(resource)
        }
        logger.info(
            f'Computing job {job.resource} with deps {dependencies}')
        resources = job.resources()
        # 'storage' is not a schedulable dask resource; drop it if present.
        # NOTE(review): bare `except:` also swallows KeyboardInterrupt etc.
        try:
            del resources['storage']
        except:
            pass
        # Pin the task to this worker so data stays local.
        futures[hash(job)] = client.submit(job,
                                           **dependencies,
                                           resources=resources,
                                           workers=[worker.address],
                                           key=str(job),
                                           pure=False)
    logger.info(f'Gathering subgraph')
    # errors='skip' leaves failed futures out; surface their exceptions.
    return {
        k: v if v is not None else futures[k].exception()
        for k, v in self._client.gather(futures, errors='skip').items()
    }
def __init__(self, client=None, storage=None):
    """Bind to a dask client (given, current, or the worker's) and register
    this component with the scheduler."""
    try:
        self.client = client or Client.current()
    except ValueError:
        # No current client in this context: use the worker's client.
        self.client = get_worker().client
    self.storage = storage

    on_loop_thread = getattr(thread_state, "on_event_loop_thread", False)
    if not (self.client.asynchronous or on_loop_thread):
        # Synchronous context: register immediately.
        self.client.run_on_scheduler(register_with_scheduler)
        return

    # On the event loop (or async client): schedule registration instead
    # of blocking here.
    async def _register():
        await self.client.run_on_scheduler(register_with_scheduler)

    self.client.loop.add_callback(_register)
def scoring_task(tf_spec, xval, yval, keras_model, verbose=False):
    """Continuously score the model on (xval, yval) and stream the loss to
    the 'scores' channel (one value per second; never returns).

    NOTE(review): `x` and `y_` in test_data are not defined here —
    presumably module-level TF placeholders created alongside model();
    confirm.  `local_client`/channels are deprecated distributed APIs.
    """
    #run partial of this to configure it to xval and yval
    with local_client() as c:
        # Scores Channel
        scores = c.channel('scores', maxlen=10)
        worker = distributed.get_worker()
        queue = worker.tensorflow_queue
        server = worker.tensorflow_server

        # Make Model
        sess, _, _, _, _, loss = model(server, tf_spec, keras_model)

        # Testing Data
        test_data = {x: xval, y_: yval}

        # Main Loop
        while True:
            score = sess.run(loss, feed_dict=test_data)
            scores.append(float(score))
            time.sleep(1)
def f():
    """Return True when this task is running on one of the worker's
    executor threads (after exercising worker_client)."""
    with worker_client():
        pass
    current = threading.current_thread()
    return current in get_worker().executor._threads
def some_name():
    """Return the key of the task currently executing on this worker."""
    worker = get_worker()
    return worker.get_current_task()
def f(x):
    """Sleep briefly, then force a reschedule whenever this task landed on
    worker `a` (identified by a_address)."""
    sleep(0.1)
    landed_on_a = get_worker().address == a_address
    if landed_on_a:
        raise Reschedule()
def f():
    """Return True when the client obtained inside a task shares the
    worker's event loop."""
    with worker_client() as local:
        return local.loop is get_worker().loop
def _worker_address(_):
    """Return the address of the dask worker this callable runs on
    (the argument is ignored; it only makes the task submittable)."""
    from distributed import get_worker

    worker = get_worker()
    return worker.address