def test_backfill_integration(self):
    """
    Test that DaskExecutor can be used to backfill example dags.

    Spins up a throwaway LocalCluster, backfills each selected example
    DAG for a single date through a DaskExecutor pointed at that
    cluster, and always releases the cluster afterwards.
    """
    cluster = LocalCluster()
    try:
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        # Reset any prior state for the backfill window.
        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=cluster.scheduler_address))
            job.run()
    finally:
        # BUG FIX: the original leaked the LocalCluster when job.run()
        # raised; always close it.
        cluster.close()
def test_bokeh_shutsdown_without_cluster___del__(loop):
    # Check that the diagnostics (bokeh) subprocess exits on its own once
    # the cluster object is deleted, without an explicit close().
    c = LocalCluster(2, loop=loop, scheduler_port=0,
                     services={('http', 0): HTTPScheduler})
    proc = c.diagnostics.process

    # don't run the del, as it isn't ever run in python < 3.5 due to cycles
    # NOTE(review): assigning __del__ on the *instance* has no effect in
    # CPython (special methods are looked up on the type) — confirm intent.
    c.__del__ = lambda self: None
    del c

    # The diagnostics process should terminate within 5 seconds.
    start = time()
    while True:
        if proc.poll() is not None:
            break
        assert time() < start + 5
        sleep(0.01)
class DaskExecutorTest(BaseDaskTest):
    """Integration tests for the Airflow DaskExecutor.

    setUp starts one local Dask cluster shared by every test in the
    case; tearDown shuts it down with a bounded wait.
    """

    def setUp(self):
        # Load the bundled example DAGs and start a shared cluster.
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        # Delegates the success/failure assertions to the base class.
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        # Clear any prior state for the single-date backfill window.
        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        # Bound the wait so a hung scheduler cannot stall the suite.
        self.cluster.close(timeout=5)
def test_dask_executor_functions(self):
    """Submit one succeeding and one failing shell command through the
    DaskExecutor and verify both futures finish with the expected
    exception state.
    """
    import time

    cluster = LocalCluster()
    try:
        executor = DaskExecutor(cluster_address=cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(
            k for k, v in executor.futures.items() if v == 'success')
        fail_future = next(
            k for k, v in executor.futures.items() if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = datetime.datetime.now() + datetime.timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if datetime.datetime.now() > timeout:
                # BUG FIX: "communciating" -> "communicating"
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')
            # BUG FIX: the original loop busy-spun without sleeping,
            # pegging a CPU core for up to 30 seconds.
            time.sleep(0.1)

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)
    finally:
        # BUG FIX: release the cluster even when an assertion fails.
        cluster.close()
def test_dask_missing_value_reg() -> None:
    """Regression with a custom ``missing`` value.

    Half of the matrix is ones and half zeros, with 0.0 declared missing;
    distributed predictions must match a local single-node prediction
    through the booster.
    """
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X_0 = np.ones((20 // 2, kCols))
            X_1 = np.zeros((20 // 2, kCols))
            X = np.concatenate([X_0, X_1], axis=0)
            np.random.shuffle(X)
            X = da.from_array(X)
            X = X.rechunk(20, 1)
            y = da.random.randint(0, 3, size=20)
            # BUG FIX: rechunk() returns a new array; the original call
            # discarded the result, leaving y with its default chunking.
            y = y.rechunk(20)
            regressor = xgb.dask.DaskXGBRegressor(verbosity=1,
                                                  n_estimators=2,
                                                  missing=0.0)
            regressor.client = client
            regressor.set_params(tree_method='hist')
            regressor.fit(X, y, eval_set=[(X, y)])
            dd_predt = regressor.predict(X).compute()

            np_X = X.compute()
            np_predt = regressor.get_booster().predict(
                xgb.DMatrix(np_X, missing=0.0))
            np.testing.assert_allclose(np_predt, dd_predt)
def get_LocalCluster(threads_per_worker: int = 1, n_workers: int = 0, **kwargs):
    """
    Create a distributed.LocalCluster with defaults that make it more similar
    to a deployment on the Janelia Compute cluster. This function is a light
    wrapper around the distributed.LocalCluster constructor.

    Parameters
    ----------
    n_workers: int
        The number of workers to start the cluster with. This defaults to 0 here.
    threads_per_worker: int
        The number of threads to assign to each worker.
    **kwargs:
        Additional keyword arguments passed to the LocalCluster constructor

    Examples
    --------
    >>> cluster = get_LocalCluster(threads_per_worker=8)
    """
    return LocalCluster(n_workers=n_workers,
                        threads_per_worker=threads_per_worker,
                        **kwargs)
def test_no_more_workers_than_tasks():
    # Adaptive scaling must not start more workers than there are runnable
    # tasks: a single long task should yield at most one worker even with
    # maximum=4.
    loop = IOLoop.current()
    cluster = yield LocalCluster(
        0,
        scheduler_port=0,
        silence_logs=False,
        processes=False,
        dashboard_address=None,
        loop=loop,
        asynchronous=True,
    )
    yield cluster._start()
    try:
        adapt = cluster.adapt(minimum=0, maximum=4, interval="10 ms")
        client = yield Client(cluster, asynchronous=True, loop=loop)

        # Teach the scheduler that `slowinc` is expensive so the adaptive
        # decision is based on a long expected duration.
        cluster.scheduler.task_duration["slowinc"] = 1000

        yield client.submit(slowinc, 1, delay=0.100)

        assert len(cluster.scheduler.workers) <= 1
    finally:
        # NOTE(review): if Client() raises above, `client` is unbound and
        # this cleanup itself raises NameError — confirm and guard.
        yield client.close()
        yield cluster.close()
async def test_adapt_down():
    """Re-calling ``adapt`` with a smaller maximum must retire workers."""
    async with LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    ) as cluster:
        async with Client(cluster, asynchronous=True) as c:
            # Grow to five workers under plenty of queued work.
            cluster.adapt(interval="20ms", maximum=5)
            tasks = c.map(slowinc, range(1000), delay=0.1)
            while len(cluster.scheduler.workers) < 5:
                await gen.sleep(0.1)

            # Redefine the adaptive policy with a lower ceiling and wait
            # (bounded) for the worker count to drop to exactly two.
            cluster.adapt(maximum=2)

            start = time()
            while len(cluster.scheduler.workers) != 2:
                await gen.sleep(0.1)
                assert time() < start + 1
def test_dask_regressor() -> None:
    """DaskXGBRegressor trains with sample weights and records rmse history."""
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        X, y, w = generate_array(with_weights=True)

        model = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
        assert model._estimator_type == "regressor"
        assert sklearn.base.is_regressor(model)

        model.set_params(tree_method='hist')
        model.client = client
        model.fit(X, y, sample_weight=w, eval_set=[(X, y)])

        predt = model.predict(X)
        assert predt.ndim == 1
        assert predt.shape[0] == kRows
        assert isinstance(predt, da.Array)

        evals = model.evals_result()
        assert isinstance(evals, dict)
        assert list(evals['validation_0'].keys())[0] == 'rmse'
        assert len(evals['validation_0']['rmse']) == 2
def test_worker_keys():
    """Adaptive scale-down with a ``worker_key`` must retire workers in
    whole groups (here grouped by the name prefix before the dash)."""
    cluster = yield LocalCluster(
        0,
        asynchronous=True,
        processes=False,
        scheduler_port=0,
        silence_logs=False,
        dashboard_address=None,
    )
    try:
        # Two groups of two workers each: "a-*" and "b-*".
        yield [
            cluster.start_worker(name="a-1"),
            cluster.start_worker(name="a-2"),
            cluster.start_worker(name="b-1"),
            cluster.start_worker(name="b-2"),
        ]

        while len(cluster.scheduler.workers) != 4:
            yield gen.sleep(0.01)

        def key(ws):
            # Group workers by the prefix of their name ("a" or "b").
            return ws.name.split("-")[0]

        cluster._adaptive_options = {"worker_key": key}

        adaptive = cluster.adapt(minimum=1)
        yield adaptive._adapt()

        while len(cluster.scheduler.workers) == 4:
            yield gen.sleep(0.01)

        # Exactly one whole group should survive, never a mixed subset.
        names = {ws.name for ws in cluster.scheduler.workers.values()}
        assert names == {"a-1", "a-2"} or names == {"b-1", "b-2"}
    finally:
        yield cluster.close()
def test_data_initialization(self):
    '''Assert each worker has the correct amount of data, and DMatrix
    initialization doesn't generate unnecessary copies of data.
    '''
    with LocalCluster(n_workers=2) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()
            n_partitions = X.npartitions
            m = xgb.dask.DaskDMatrix(client, X, y)
            workers = list(xgb.dask._get_client_workers(client).keys())
            rabit_args = client.sync(xgb.dask._get_rabit_args, workers, client)
            n_workers = len(workers)

            def worker_fn(worker_addr, data_ref):
                # Runs on each worker: rebuild the local DMatrix and
                # check that the per-worker row counts sum to the full
                # dataset via a rabit allreduce.
                with xgb.dask.RabitContext(rabit_args):
                    local_dtrain = xgb.dask._dmatrix_from_worker_map(**data_ref)
                    total = np.array([local_dtrain.num_row()])
                    total = xgb.rabit.allreduce(total, xgb.rabit.Op.SUM)
                    assert total[0] == kRows

            futures = client.map(worker_fn,
                                 workers,
                                 [m.create_fn_args()] * len(workers),
                                 pure=False,
                                 workers=workers)
            client.gather(futures)

            # Count keys held on the cluster; a duplicated partition would
            # appear under the same key on two workers.
            has_what = client.has_what()
            cnt = 0
            data = set()
            for k, v in has_what.items():
                for d in v:
                    cnt += 1
                    data.add(d)

            assert len(data) == cnt
            # Subtract the on disk resource from each worker
            assert cnt - n_workers == n_partitions
def test_adaptive_local_cluster_multi_workers():
    # Adaptive cluster lifecycle: scale up from zero under load, drain back
    # to zero while idle, then scale up again for a second batch of work.
    cluster = yield LocalCluster(0,
                                 scheduler_port=0,
                                 silence_logs=False,
                                 processes=False,
                                 diagnostics_port=None,
                                 asynchronous=True)
    try:
        # Tolerate many worker restarts during rapid scale up/down.
        cluster.scheduler.allowed_failures = 1000
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        c = yield Client(cluster, asynchronous=True)

        futures = c.map(slowinc, range(100), delay=0.01)

        start = time()
        while not cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 15, alc.log

        yield c.gather(futures)
        del futures

        # With no pending work, adaptive should retire every worker.
        start = time()
        # while cluster.workers:
        while cluster.scheduler.workers:
            yield gen.sleep(0.01)
            assert time() < start + 15, alc.log

        # assert not cluster.workers
        assert not cluster.scheduler.workers
        yield gen.sleep(0.2)
        # assert not cluster.workers
        assert not cluster.scheduler.workers

        # New work should bring workers back.
        futures = c.map(slowinc, range(100), delay=0.01)
        yield c.gather(futures)
    finally:
        yield c._close()
        yield cluster._close()
def test_dask_classifier():
    """Multiclass DaskXGBClassifier: array and dataframe inputs both yield
    per-class outputs and a recorded merror evaluation history."""
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        X, y = generate_array()
        y = (y * 10).astype(np.int32)

        model = xgb.dask.DaskXGBClassifier(verbosity=1, n_estimators=2)
        model.client = client
        model.fit(X, y, eval_set=[(X, y)])

        predt = model.predict(X)
        assert predt.ndim == 2
        assert predt.shape[0] == kRows
        assert isinstance(predt, da.Array)

        evals = model.evals_result()
        assert isinstance(evals, dict)
        assert list(evals.keys())[0] == 'validation_0'
        assert list(evals['validation_0'].keys())[0] == 'merror'
        assert len(list(evals['validation_0'])) == 1
        assert len(evals['validation_0']['merror']) == 2
        assert model.n_classes_ == 10

        # Test with dataframe.
        X_df = dd.from_dask_array(X)
        y_df = dd.from_dask_array(y)
        model.fit(X_df, y_df)
        assert model.n_classes_ == 10

        predt = model.predict(X_df)
        assert predt.ndim == 2
        assert predt.shape[0] == kRows
def test_from_dask_dataframe():
    # End-to-end on dask DataFrames: train, then check that predictions
    # from a DMatrix, from a DataFrame, and via inplace_predict all agree.
    with LocalCluster(n_workers=kWorkers) as cluster:
        with Client(cluster) as client:
            X, y = generate_array()

            X = dd.from_dask_array(X)
            y = dd.from_dask_array(y)

            dtrain = DaskDMatrix(client, X, y)
            booster = xgb.dask.train(
                client, {}, dtrain, num_boost_round=2)['booster']

            # Predicting from a DMatrix yields a dask array.
            prediction = xgb.dask.predict(client, model=booster, data=dtrain)

            assert prediction.ndim == 1
            assert isinstance(prediction, da.Array)
            assert prediction.shape[0] == kRows

            with pytest.raises(ValueError):
                # evals_result is not supported in dask interface.
                xgb.dask.train(
                    client, {}, dtrain, num_boost_round=2, evals_result={})

            # force prediction to be computed
            from_dmatrix = prediction.compute()

            # Predicting from a DataFrame yields a dask Series instead.
            prediction = xgb.dask.predict(client, model=booster, data=X)
            from_df = prediction.compute()

            assert isinstance(prediction, dd.Series)
            assert np.all(prediction.compute().values == from_dmatrix)
            assert np.all(from_dmatrix == from_df.to_numpy())

            series_predictions = xgb.dask.inplace_predict(client, booster, X)
            assert isinstance(series_predictions, dd.Series)
            np.testing.assert_allclose(series_predictions.compute().values,
                                       from_dmatrix)
def test_adaptive_local_cluster(loop):
    # An adaptive cluster starting at zero workers should scale up for a
    # submitted task and back down to zero once the result is released.
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      diagnostics_port=None, loop=loop) as cluster:
        alc = Adaptive(cluster.scheduler, cluster, interval=100)
        with Client(cluster, loop=loop) as c:
            assert not c.ncores()
            future = c.submit(lambda x: x + 1, 1)
            assert future.result() == 2
            assert c.ncores()

            sleep(0.1)
            assert c.ncores()  # still there after some time

            # Releasing the only result should let adaptive retire workers.
            del future

            start = time()
            while cluster.scheduler.ncores:
                sleep(0.01)
                assert time() < start + 5

            assert not c.ncores()
def test_avoid_churn():
    """
    We want to avoid creating and deleting workers frequently

    Instead we want to wait a few beats before removing a worker in case the
    user is taking a brief pause between work
    """
    cluster = yield LocalCluster(0, asynchronous=True, processes=False,
                                 scheduler_port=0, silence_logs=False,
                                 diagnostics_port=None)
    client = yield Client(cluster, asynchronous=True)
    try:
        # wait_count=5: five consecutive idle checks are required before a
        # worker is retired, so brief gaps between tasks cause no churn.
        adapt = Adaptive(cluster.scheduler, cluster, interval=20, wait_count=5)

        # Submit tasks with short pauses in between, mimicking a user
        # working interactively.
        for i in range(10):
            yield client.submit(slowinc, i, delay=0.040)
            yield gen.sleep(0.040)

        # Exactly one scale-up event and no scale-downs despite the gaps.
        assert frequencies(pluck(1, adapt.log)) == {'up': 1}
    finally:
        yield client._close()
        yield cluster._close()
def main(args=None):
    """Benchmark a ``(X + X.T).sum()`` reduction on a Dask cluster.

    The cluster is reached over ucx or tcp (remote scheduler on port
    13337) or created as an in-process LocalCluster. Prints the data
    size and the elapsed compute time.

    Parameters
    ----------
    args:
        Optional argument list forwarded to ``parse_args``; ``None``
        means use ``sys.argv``.
    """
    args = parse_args(args)
    if args.protocol == 'ucx':
        sched_str = "ucx://" + args.server + ":13337"
        client = Client(sched_str)
    elif args.protocol == 'tcp':
        sched_str = "tcp://" + args.server + ":13337"
        client = Client(sched_str)
    else:
        kwargs = {'n_workers': 2, 'threads_per_worker': 40}
        kwargs['processes'] = args.protocol == 'tcp'
        cluster = LocalCluster(**kwargs)
        client = Client(cluster)

    print(f"Connected to {client}")
    N = int(args.length)
    P = int(args.length)

    # Generate the random matrix on the GPU via cupy-backed dask arrays.
    RS = da.random.RandomState(RandomState=cupy.random.RandomState)
    X = RS.normal(10, 1, size=(N, P))
    # BUG FIX: persist() returns a new persisted collection; the original
    # discarded the result, so X was never actually kept in memory.
    X = X.persist()
    print(format_bytes(X.nbytes))

    result = (X + X.T).sum()
    # NOTE(review): `clock` looks like time.clock, which was removed in
    # Python 3.8 — confirm the import and consider time.perf_counter.
    start = clock()
    result.compute()
    stop = clock()

    print(result)
    print(format_bytes(X.nbytes))
    print(f"\tTook {stop - start:0.2f}s")
    time.sleep(1)
def spawn_cluster_and_client(
        address: Optional[str] = None,
        **kwargs) -> Tuple[Optional[LocalCluster], Optional[Client]]:
    """Connect to a remote Dask scheduler or spin up a local one.

    Given an address, only a Client is created (the returned cluster is
    None). Without an address, a LocalCluster is created with the extra
    keyword arguments and a Client is attached to it.

    Notes
    -----
    When using this function, the processing machine or container must have
    networking capabilities enabled to function properly.
    """
    if address is not None:
        client = Client(address)
        log.info(f"Connected to Remote Dask Cluster: {client}")
        return None, client

    cluster = LocalCluster(**kwargs)
    client = Client(cluster)
    log.info(f"Connected to Local Dask Cluster: {client}")
    return cluster, client
def test_adaptive_local_cluster(loop):
    """Adaptive cluster scales up for one task and back to zero after
    its only reference is released."""
    with LocalCluster(0, scheduler_port=0, silence_logs=False,
                      dashboard_address=None, loop=loop) as cluster:
        adaptive = cluster.adapt(interval="100 ms")
        with Client(cluster, loop=loop) as client:
            # No workers until there is work.
            assert not client.nthreads()

            fut = client.submit(lambda x: x + 1, 1)
            assert fut.result() == 2
            assert client.nthreads()

            # The worker stays around while the result is still held.
            sleep(0.1)
            assert client.nthreads()

            # Drop the result; adaptive should retire all workers within
            # a bounded wait.
            del fut
            start = time()
            while cluster.scheduler.nthreads:
                sleep(0.01)
                assert time() < start + 5

            assert not client.nthreads()
async def test_adaptive_local_cluster_multi_workers(cleanup):
    # Async variant: scale up from zero under load, drain back to zero
    # while idle, then scale up again for a second batch of work.
    async with LocalCluster(
        0,
        scheduler_port=0,
        silence_logs=False,
        processes=False,
        dashboard_address=None,
        asynchronous=True,
    ) as cluster:
        # Tolerate many worker restarts during rapid scale up/down.
        cluster.scheduler.allowed_failures = 1000
        adapt = cluster.adapt(interval="100 ms")
        async with Client(cluster, asynchronous=True) as c:
            futures = c.map(slowinc, range(100), delay=0.01)

            start = time()
            while not cluster.scheduler.workers:
                await asyncio.sleep(0.01)
                assert time() < start + 15, adapt.log

            await c.gather(futures)
            del futures

            start = time()
            # while cluster.workers:
            while cluster.scheduler.workers:
                await asyncio.sleep(0.01)
                assert time() < start + 15, adapt.log

            # no workers for a while
            for i in range(10):
                assert not cluster.scheduler.workers
                await asyncio.sleep(0.05)

            # New work should bring workers back.
            futures = c.map(slowinc, range(100), delay=0.01)
            await c.gather(futures)
def __init__(self, template=None, cleanup_interval=1000, hostname=None,
             script=None, preexec_commands=(), copy_script=True,
             ip='', **kwargs):
    """
    Dask workers launched by a DRMAA-compatible cluster

    Parameters
    ----------
    template: dict
        Dictionary specifying options to pass to the DRMAA cluster
        and the worker. Relevant items are:

        jobName: string
            Name of the job as known by the DRMAA cluster.
        args: list
            Extra string arguments to pass to dask-worker
        outputPath: string
            Path to the dask-worker stdout. Must start with ':'.
            Defaults to worker.JOBID.TASKID.out in current directory.
        errorPath: string
            Path to the dask-worker stderr. Must start with ':'
            Defaults to worker.JOBID.TASKID.err in current directory.
        workingDirectory: string
            Where dask-worker runs, defaults to current directory
        nativeSpecification: string
            Options native to the job scheduler
    cleanup_interval: int
        Time interval in seconds at which closed workers are cleaned.
        Defaults to 1000
    hostname: string
        Host on which to start the local scheduler, defaults to localhost
    script: string (optional)
        Path to the dask-worker executable script.
        A temporary file will be made if none is provided (recommended)
    preexec_commands: tuple (optional)
        Commands to be executed first by temporary script.
        Cannot be specified at the same time as script.
    copy_script: bool
        Whether should copy the passed script to the current working
        directory. This is primarily to work around an issue with SGE.
    ip: string
        IP of the scheduler, default is the empty string
        which will listen on the primary ip address of the host
    **kwargs:
        Additional keyword arguments to be passed to the local scheduler

    Examples
    --------
    >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
    >>> cluster = DRMAACluster()                     # doctest: +SKIP
    >>> cluster.start_workers(10)                    # doctest: +SKIP

    >>> from distributed import Client               # doctest: +SKIP
    >>> client = Client(cluster)                     # doctest: +SKIP

    >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
    >>> future.result()                              # doctest: +SKIP
    11
    """
    self.hostname = hostname or socket.gethostname()
    logger.info("Start local scheduler at %s", self.hostname)
    # The scheduler lives in an embedded LocalCluster with no workers;
    # all real workers are DRMAA jobs that connect to it.
    self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

    if script is None:
        # Generate a temporary worker launch script and make sure it is
        # removed again at interpreter exit.
        fn = os.path.abspath(tempfile.mktemp(
            suffix='.sh',
            prefix='dask-worker-script-',
            dir=os.path.curdir,
        ))
        self.script = fn
        self._should_cleanup_script = True

        script_contents = make_job_script(executable=worker_bin_path,
                                          name='%s.%s' % (JOB_ID, TASK_ID),
                                          preexec=preexec_commands)
        with open(fn, 'wt') as f:
            f.write(script_contents)

        @atexit.register
        def remove_script():
            if os.path.exists(fn):
                os.remove(fn)

        os.chmod(self.script, 0o777)
    else:
        self._should_cleanup_script = False
        if copy_script:
            with ignoring(EnvironmentError):  # may be in the same path
                shutil.copy(script, os.path.curdir)  # python 2.x returns None
                script = os.path.join(os.path.curdir, os.path.basename(script))
                self._should_cleanup_script = True
        self.script = os.path.abspath(script)
        assert not preexec_commands, "Cannot specify both script and preexec_commands"

    # TODO: check that user-provided script is executable

    self.template = merge(default_template,
                          {'remoteCommand': self.script},
                          template or {})

    # Periodically reap workers whose DRMAA jobs have finished.
    self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                              callback_time=cleanup_interval,
                                              io_loop=self.scheduler.loop)
    self._cleanup_callback.start()

    self.workers = {}  # {job-id: WorkerSpec}
class DRMAACluster(Cluster):
    """A Dask cluster whose workers are jobs on a DRMAA-compatible
    scheduler (SGE, SLURM via DRMAA, etc.).

    The Dask scheduler runs locally inside an embedded, worker-less
    LocalCluster; workers are submitted as DRMAA bulk jobs that run a
    generated (or user-provided) dask-worker launch script.
    """

    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='', **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster
            and the worker. Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout. Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in current directory.
            errorPath: string
                Path to the dask-worker stderr. Must start with ':'
                Defaults to worker.JOBID.TASKID.err in current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to current directory
            nativeSpecification: string
                Options native to the job scheduler
        cleanup_interval: int
            Time interval in seconds at which closed workers are cleaned.
            Defaults to 1000
        hostname: string
            Host on which to start the local scheduler, defaults to localhost
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by temporary script.
            Cannot be specified at the same time as script.
        copy_script: bool
            Whether should copy the passed script to the current working
            directory. This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler, default is the empty string
            which will listen on the primary ip address of the host
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            # Generate a temporary worker launch script; remove it again
            # at interpreter exit.
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)
        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        # Periodically reap workers whose DRMAA jobs have finished.
        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive
        # Replace any previous adaptive policy, merging in new options.
        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}
        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )
        return self._adaptive

    @gen.coroutine
    def _start(self):
        # The embedded LocalCluster is already running; nothing to start.
        pass

    @property
    def scheduler(self):
        # The actual Dask scheduler lives in the embedded LocalCluster.
        return self.local_cluster.scheduler

    def create_job_template(self, **kwargs):
        # Build a DRMAA job template from self.template plus overrides,
        # prepending the scheduler address to the worker arguments.
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        # Submit n workers as one DRMAA bulk job and record their specs.
        if n == 0:
            return

        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s",
                            len(ids), ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template %
                                            dict(jid=jid, ext='out'),
                                     stderr=worker_out_path_template %
                                            dict(jid=jid, ext='err'),
                                     )
                     for jid in ids})

    @gen.coroutine
    def stop_workers(self, worker_ids, sync=False):
        # Gracefully retire workers through the scheduler first, then
        # terminate their DRMAA jobs.
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]
        elif worker_ids:
            worker_ids = list(worker_ids)
        else:
            return

        # Let the scheduler gracefully retire workers first
        ids_to_ips = {
            v['name']: k
            for k, v in self.scheduler.worker_info.items()
        }
        worker_ips = [ids_to_ips[wid]
                      for wid in worker_ids
                      if wid in ids_to_ips]
        retired = yield self.scheduler.retire_workers(workers=worker_ips,
                                                      close_workers=True)
        logger.info("Retired workers %s", retired)

        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            try:
                self.workers.pop(wid)
            except KeyError:
                # If we have multiple callers at once, it may have already
                # been popped off
                pass

        logger.info("Stop workers %s", worker_ids)

        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    @gen.coroutine
    def scale_up(self, n, **kwargs):
        # Scale to n total workers by starting the missing ones.
        yield [self.start_workers(**kwargs)
               for _ in range(n - len(self.workers))]

    @gen.coroutine
    def scale_down(self, workers):
        workers = set(workers)
        yield self.scheduler.retire_workers(workers=workers)

    def close(self):
        # NOTE(review): stop_workers is a gen.coroutine invoked here
        # without yield; it runs only up to its first yield point from a
        # synchronous caller — confirm intended semantics.
        logger.info("Closing DRMAA cluster")
        self.stop_workers(self.workers, sync=True)

        self.local_cluster.close()
        if self._should_cleanup_script and os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        # Drop bookkeeping entries for DRMAA jobs that have finished.
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(self.workers))

    __repr__ = __str__
def test_aft_survival():
    """Smoke-test AFT survival training on a single-worker local cluster."""
    with LocalCluster(n_workers=1) as cluster, Client(cluster) as client:
        run_aft_survival(client, DaskDMatrix)
def test_empty_dmatrix_approx():
    """Empty DMatrix handling with tree_method='approx' for both
    regression and classification."""
    params = {'tree_method': 'approx'}
    with LocalCluster(n_workers=kWorkers) as cluster, Client(cluster) as client:
        run_empty_dmatrix_reg(client, params)
        run_empty_dmatrix_cls(client, params)
def test_boost_from_prediction(tree_method):
    # Training continued from a base_margin (4 + 4 rounds) should closely
    # match a single model trained for 8 rounds.
    if tree_method == 'approx':
        pytest.xfail(reason='test_boost_from_prediction[approx] is flaky')

    from sklearn.datasets import load_breast_cancer
    X, y = load_breast_cancer(return_X_y=True)
    X_ = dd.from_array(X, chunksize=100)
    y_ = dd.from_array(y, chunksize=100)

    with LocalCluster(n_workers=4) as cluster:
        with Client(cluster) as _:
            # First 4 rounds; its margins seed the continuation model.
            model_0 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_0.fit(X=X_, y=y_)
            margin = model_0.predict_proba(X_, output_margin=True)

            # Second 4 rounds, boosted from the first model's margin.
            model_1 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=4,
                tree_method=tree_method,
            )
            model_1.fit(X=X_, y=y_, base_margin=margin)
            predictions_1 = model_1.predict(X_, base_margin=margin)
            proba_1 = model_1.predict_proba(X_, base_margin=margin)

            # Reference: a single 8-round model.
            cls_2 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_2.fit(X=X_, y=y_)
            predictions_2 = cls_2.predict(X_)
            proba_2 = cls_2.predict_proba(X_)

            # A second identically-configured 8-round model estimates the
            # run-to-run noise floor.
            cls_3 = xgb.dask.DaskXGBClassifier(
                learning_rate=0.3,
                random_state=123,
                n_estimators=8,
                tree_method=tree_method,
            )
            cls_3.fit(X=X_, y=y_)
            proba_3 = cls_3.predict_proba(X_)

            # compute variance of probability percentages between two of the
            # same model, use this to check to make sure approx is functioning
            # within normal parameters
            expected_variance = np.max(np.abs(proba_3 - proba_2)).compute()

            if expected_variance > 0:
                margin_variance = np.max(np.abs(proba_1 - proba_2)).compute()
                # Ensure the margin variance is less than the expected variance + 10%
                assert np.all(margin_variance <= expected_variance + .1)
            else:
                np.testing.assert_equal(predictions_1.compute(),
                                        predictions_2.compute())
                np.testing.assert_almost_equal(proba_1.compute(),
                                               proba_2.compute())
import dask.array as da
from distributed import Client, LocalCluster
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

if __name__ == "__main__":
    # Build a synthetic regression problem, stand up a two-worker local
    # Dask cluster, and train a distributed LightGBM regressor on it.
    print("loading data")
    X, y = make_regression(n_samples=1000, n_features=50)

    print("initializing a Dask cluster")
    cluster = LocalCluster(n_workers=2)
    client = Client(cluster)
    print("created a Dask LocalCluster")

    print("distributing training data on the Dask cluster")
    train_X = da.from_array(X, chunks=(100, 50))
    train_y = da.from_array(y, chunks=(100,))

    print("beginning training")
    model = lgb.DaskLGBMRegressor(n_estimators=10)
    model.fit(train_X, train_y)
    assert model.fitted_
    print("done training")
class DaskExecutorTest(unittest.TestCase):
    """Integration tests for the Airflow DaskExecutor against a real,
    in-process Dask LocalCluster shared across the test case."""

    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        """One succeeding and one failing command must both complete,
        with an exception only on the failing future."""
        import time

        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(
            k for k, v in executor.futures.items() if v == 'success')
        fail_future = next(
            k for k, v in executor.futures.items() if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = timezone.utcnow() + timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if timezone.utcnow() > timeout:
                # BUG FIX: "communciating" -> "communicating"
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')
            # BUG FIX: the original loop busy-spun without sleeping,
            # pegging a CPU core for up to 30 seconds.
            time.sleep(0.1)

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        # Clear any prior state for the single-date backfill window.
        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        # Bound the wait so a hung scheduler cannot stall the suite.
        self.cluster.close(timeout=5)
if __name__ == '__main__':
    # Abort early when the requested grouping variable is absent from the
    # metadata table.
    if sum(metadata.columns == grouping_variable) < 1:
        exit('Grouping variable not found in metadata.')

    ## Load ranking databases
    db_fnames = glob.glob(db_folder)

    def name(fname):
        # Database name = file basename without its extension.
        return os.path.basename(fname).split(".")[0]

    dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
    # NOTE(review): bare expression, likely a notebook-cell leftover.
    dbs

    ## Initialize cluster
    local_cluster = LocalCluster(n_workers=n_cores,
                                 threads_per_worker=1,
                                 processes=False,
                                 memory_limit=memory_limit)
    custom_client = Client(local_cluster)

    ## Load TFs
    tf_names = load_tf_names(TFs_file)

    ## Collect here regulons passing correlation filter
    cortest_passed_regulons = []

    for i in range(0, iterations):
        ## Split to train and test
        # NOTE(review): the loop body appears truncated here — confirm
        # against the full script.
        data[grouping_variable] = metadata[grouping_variable]
def run_espei(run_settings):
    """Wrapper around the ESPEI fitting procedure, taking only a settings dictionary.

    Parameters
    ----------
    run_settings : dict
        Dictionary of input settings

    Returns
    -------
    Either a Database (for generate parameters only) or a tuple of (Database, sampler)
    """
    run_settings = get_run_settings(run_settings)
    system_settings = run_settings['system']
    output_settings = run_settings['output']
    generate_parameters_settings = run_settings.get('generate_parameters')
    mcmc_settings = run_settings.get('mcmc')

    # handle verbosity: map the integer setting onto logging levels
    verbosity = {0: logging.WARNING,
                 1: logging.INFO,
                 2: logging.DEBUG}
    logging.basicConfig(level=verbosity[output_settings['verbosity']])

    # load datasets and handle i/o
    logging.debug('Loading and checking datasets.')
    dataset_path = system_settings['datasets']
    datasets = load_datasets(sorted(recursive_glob(dataset_path, '*.json')))
    if len(datasets.all()) == 0:
        logging.warning('No datasets were found in the path {}. This should be a directory containing dataset files ending in `.json`.'.format(dataset_path))
    logging.debug('Finished checking datasets')

    with open(system_settings['phase_models']) as fp:
        phase_models = json.load(fp)

    if generate_parameters_settings is not None:
        refdata = generate_parameters_settings['ref_state']
        excess_model = generate_parameters_settings['excess_model']
        dbf = generate_parameters(phase_models, datasets, refdata, excess_model)
        dbf.to_file(output_settings['output_db'], if_exists='overwrite')

    if mcmc_settings is not None:
        tracefile = output_settings['tracefile']
        probfile = output_settings['probfile']
        # check that the MCMC output files do not already exist
        # only matters if we are actually running MCMC
        if os.path.exists(tracefile):
            raise OSError('Tracefile "{}" exists and would be overwritten by a new run. Use the ``output.tracefile`` setting to set a different name.'.format(tracefile))
        if os.path.exists(probfile):
            raise OSError('Probfile "{}" exists and would be overwritten by a new run. Use the ``output.probfile`` setting to set a different name.'.format(probfile))

        # scheduler setup
        if mcmc_settings['scheduler'] == 'MPIPool':
            # check that cores is not an input setting
            # BUGFIX: compare to None with `is not`, not `!=` (PEP 8)
            if mcmc_settings.get('cores') is not None:
                logging.warning("MPI does not take the cores input setting.")
            from emcee.utils import MPIPool
            # code recommended by emcee: if not master, wait for instructions then exit
            client = MPIPool()
            if not client.is_master():
                logging.debug('MPIPool is not master. Waiting for instructions...')
                client.wait()
                sys.exit(0)
            logging.info("Using MPIPool on {} MPI ranks".format(client.size))
        elif mcmc_settings['scheduler'] == 'dask':
            from distributed import LocalCluster
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                # clamp to the actually-available core count
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            scheduler = LocalCluster(n_workers=cores, threads_per_worker=1, processes=True)
            client = ImmediateClient(scheduler)
            # propagate the verbosity setting to every dask worker process
            client.run(logging.basicConfig, level=verbosity[output_settings['verbosity']])
            logging.info("Running with dask scheduler: %s [%s cores]" % (scheduler, sum(client.ncores().values())))
            try:
                logging.info("bokeh server for dask scheduler at localhost:{}".format(
                    client.scheduler_info()['services']['bokeh']))
            except KeyError:
                logging.info("Install bokeh to use the dask bokeh server.")
        elif mcmc_settings['scheduler'] == 'emcee':
            from emcee.interruptible_pool import InterruptiblePool
            cores = mcmc_settings.get('cores', multiprocessing.cpu_count())
            if cores > multiprocessing.cpu_count():
                cores = multiprocessing.cpu_count()
                logging.warning("The number of cores chosen is larger than available. "
                                "Defaulting to run on the {} available cores.".format(cores))
            client = InterruptiblePool(processes=cores)
            logging.info("Using multiprocessing on {} cores".format(cores))
        elif mcmc_settings['scheduler'] == 'None':
            client = None
            logging.info("Not using a parallel scheduler. ESPEI is running MCMC on a single core.")
        # NOTE(review): an unrecognized scheduler value would leave `client`
        # unbound here -- presumably prevented by settings validation in
        # get_run_settings; confirm.

        # get a Database: either the one just generated above, or a user input db
        if mcmc_settings.get('input_db'):
            dbf = Database(mcmc_settings.get('input_db'))
        # NOTE(review): if neither `generate_parameters` nor `input_db` is
        # given, `dbf` is unbound at this point -- confirm the schema forbids
        # that combination.

        # load the restart chain if needed
        if mcmc_settings.get('restart_chain'):
            restart_chain = np.load(mcmc_settings.get('restart_chain'))
        else:
            restart_chain = None

        # load the remaining mcmc fitting parameters
        mcmc_steps = mcmc_settings.get('mcmc_steps')
        save_interval = mcmc_settings.get('mcmc_save_interval')
        chains_per_parameter = mcmc_settings.get('chains_per_parameter')
        chain_std_deviation = mcmc_settings.get('chain_std_deviation')
        deterministic = mcmc_settings.get('deterministic')

        dbf, sampler = mcmc_fit(
            dbf, datasets,
            scheduler=client,
            mcmc_steps=mcmc_steps,
            chains_per_parameter=chains_per_parameter,
            chain_std_deviation=chain_std_deviation,
            save_interval=save_interval,
            tracefile=tracefile,
            probfile=probfile,
            restart_chain=restart_chain,
            deterministic=deterministic,
        )

        dbf.to_file(output_settings['output_db'], if_exists='overwrite')
        # close the scheduler, if possible
        if hasattr(client, 'close'):
            client.close()
        return dbf, sampler

    # generate-parameters-only run: return just the Database
    return dbf
def run(
    self,
    clean: bool = False,
    debug: bool = False,
    **kwargs,
):
    """
    Run a flow with your steps.

    Parameters
    ----------
    clean: bool
        Should the local staging directory be cleaned prior to this run.
        Default: False (Do not clean)
    debug: bool
        A debug flag for the developer to use to manipulate how much data runs,
        how it is processed, etc.
        Default: False (Do not debug)

    Notes
    -----
    Documentation on prefect:
    https://docs.prefect.io/core/

    Basic prefect example:
    https://docs.prefect.io/core/
    """
    # Initalize steps
    select_data = steps.SelectData()
    compute_cell_metrics = steps.ComputeCellMetrics()
    gather_test_visualize = steps.GatherTestVisualize()

    # Choose executor
    if debug:
        exe = LocalExecutor()
        # BUGFIX: these two were previously only bound in the non-debug
        # branch, so running with debug=True raised NameError when the
        # step calls below referenced them.
        distributed_executor_address = None
        batch_size = None
    else:
        # Create local cluster sized by available memory (~4 GB per worker)
        log.info("Creating LocalCluster")
        current_mem_gb = psutil.virtual_memory().available / 2**30
        n_workers = int(current_mem_gb // 4)
        cluster = LocalCluster(n_workers=n_workers)
        log.info("Created LocalCluster")

        # Set distributed_executor_address
        distributed_executor_address = cluster.scheduler_address

        # Batch size on local cluster
        batch_size = int(psutil.cpu_count() // n_workers)

        # Log dashboard URI
        log.info(f"Dask dashboard available at: {cluster.dashboard_link}")

        # Use dask cluster
        exe = DaskExecutor(distributed_executor_address)

    # Configure your flow
    with Flow("polar_express") as flow:
        # If you want to clean the local staging directories pass clean
        # If you want to utilize some debugging functionality pass debug
        # If you don't utilize any of these, just pass the parameters you need.

        # step 1: select cells and store in annotation file
        selected_cells_manifest = select_data(
            clean=clean,
            debug=debug,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

        # step 2: compute metrics for each of the cells
        cell_metrics_manifest = compute_cell_metrics(
            selected_cells_manifest,
            clean=clean,
            debug=debug,
            distributed_executor_address=distributed_executor_address,
            batch_size=batch_size,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

        # step 3: gather the computed metrics and create visualizations
        gather_test_visualize(
            cell_metrics_manifest,
            clean=clean,
            debug=debug,
            **kwargs,  # Allows us to pass `--n {some integer}` or other params
        )

    # Run flow and get ending state
    state = flow.run(executor=exe)

    # Get and display any outputs you want to see on your local terminal
    log.info(select_data.get_result(state, flow))
    log.info(compute_cell_metrics.get_result(state, flow))
    log.info(gather_test_visualize.get_result(state, flow))
from arboreto.algo import grnboost2
# BUGFIX: `itertools.izip_longest` exists only on Python 2; on Python 3 the
# same function is `zip_longest`. Alias it so the rest of the script is
# unchanged.
from itertools import zip_longest as izip_longest
import sys


def grouper(iterable, n, fillvalue=None):
    """Yield successive n-sized chunks of *iterable*, padding the last chunk with *fillvalue*."""
    args = [iter(iterable)] * n
    return izip_longest(*args, fillvalue=fillvalue)


in_file = '../../PlanExp/rawdata/GSE111764.counts.noheader'
tf_file = '../../PlanExp/rawdata/go-regulators-reddien.txt'

# NOTE(review): LocalCluster, Client, pd and load_tf_names are not imported in
# this fragment -- presumably imported elsewhere in the original file; confirm.
# BUGFIX: corrected "Dusk" -> "Dask" in the progress message.
sys.stderr.write("\nStarting Dask cluster...")
local_cluster = LocalCluster(n_workers=32,
                             threads_per_worker=1,
                             memory_limit=8e10)
custom_client = Client(local_cluster)
sys.stderr.write("done.\n")

# ex_matrix is a DataFrame with gene names as column names
sys.stderr.write("\nReading count matrix...")
ex_matrix = pd.read_csv(in_file, sep='\t', index_col=0, header=None).T
sys.stderr.write("done.\n")

# tf_names is read using a utility function included in Arboreto
sys.stderr.write("\nLoading putative transcription factors...")
tf_names = load_tf_names(tf_file)
sys.stderr.write("done.\n")

sys.stderr.write("\nPredicting co-expression network in chunks...\n")
# chunk counter (the processing loop continues beyond this fragment)
i = 0
# NOTE(review): this snippet originally contained a syntax error -- an
# unterminated call `c.scheduler(` followed by import statements inside the
# parentheses. The imports are hoisted to the top and the stray call is
# reduced to a plain attribute access below.
from threading import Thread

from dask.distributed import Client
from dask_jobqueue import SLURMCluster
from distributed import LocalCluster, Scheduler
from tornado.ioloop import IOLoop

# In-process (threaded) local cluster.
c = LocalCluster(processes=False)
# BUGFIX: was `c.scheduler(` -- an unterminated call; presumably intended to
# inspect the in-process scheduler object. Confirm original intent.
c.scheduler

# SLURM-backed cluster: 12 processes per job, 12 jobs started.
cluster = SLURMCluster(processes=12, queue="DGE", project="davek", memory="36GB")
cluster.start_workers(12)
client = Client(cluster)


def inc(x):
    """Return x + 1 (toy task used to exercise the cluster)."""
    return x + 1


x = client.submit(inc, 10)
L = client.map(inc, range(1000))
class DRMAACluster(object):
    # A Dask cluster whose scheduler runs locally (via LocalCluster with zero
    # workers) and whose workers are launched as jobs on a DRMAA-compatible
    # batch system (SGE, SLURM via DRMAA, etc.).
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), **kwargs):
        """ Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        jobName: string
            Name of the job as known by the DRMAA cluster.
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        args: list
            Extra string arguments to pass to dask-worker
        outputPath: string
        errorPath: string
        workingDirectory: string
            Where dask-worker runs, defaults to current directory
        nativeSpecification: string
            Options native to the job scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        # Scheduler-only local cluster: workers come from DRMAA jobs instead.
        self.local_cluster = LocalCluster(n_workers=0, ip='', **kwargs)

        if script is None:
            # NOTE(review): tempfile.mktemp is deprecated and race-prone
            # (the name is reserved without creating the file); consider
            # tempfile.mkstemp. Left as-is here.
            fn = tempfile.mktemp(suffix='sh',
                                 prefix='dask-worker-script',
                                 dir=os.path.curdir)
            self.script = fn
            # Generate a worker launch script named by the DRMAA job/task ids.
            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            # Remove the generated script when the interpreter exits.
            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            # NOTE(review): 0o777 makes the script world-writable; 0o755
            # would suffice for execution. Left as-is here.
            os.chmod(self.script, 0o777)

        else:
            # A user-supplied script must already embed any preexec commands.
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        # Job template defaults, overridden by the user-supplied template.
        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        # Periodically drop bookkeeping entries for workers whose jobs ended.
        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    @gen.coroutine
    def _start(self):
        # No async startup work needed; the LocalCluster is already running.
        pass

    @property
    def scheduler(self):
        # The distributed Scheduler object owned by the local cluster.
        return self.local_cluster.scheduler

    @property
    def scheduler_address(self):
        # Address workers should connect to (passed as the first worker arg).
        return self.scheduler.address

    def create_job_template(self, **kwargs):
        # Build a DRMAA JobTemplate from self.template, with per-call
        # overrides; prepends the scheduler address to the worker args.
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        # Submit n worker jobs as a DRMAA bulk job and record their specs.
        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids),
                            ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template % dict(jid=jid, kind='out'),
                                     stderr=worker_out_path_template % dict(jid=jid, kind='err'))
                     for jid in ids})

    def stop_workers(self, worker_ids, sync=False):
        # Terminate the given worker jobs; already-gone jobs are ignored.
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]

        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                # Job already finished or unknown to DRMAA; drop it anyway.
                pass
            self.workers.pop(wid)

        logger.info("Stop workers %s", worker_ids)
        if sync:
            # Block until the batch system confirms the jobs are gone.
            get_session().synchronize(worker_ids, dispose=True)

    def close(self):
        # Shut down scheduler, terminate workers, and remove the temp script.
        logger.info("Closing DRMAA cluster")
        self.local_cluster.close()
        if self.workers:
            self.stop_workers(self.workers, sync=True)

        if os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        # Drop bookkeeping for jobs the batch system reports as finished.
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        # Best-effort cleanup on garbage collection.
        # NOTE(review): bare except silences all errors, including
        # KeyboardInterrupt -- `except Exception` would be safer. Left as-is.
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__,
                                     len(self.workers))

    __repr__ = __str__
from dask.distributed import Client
from distributed import LocalCluster

from examples.multiobjective.parallel.zdt1_modified import ZDT1Modified
from jmetal.algorithm.multiobjective.nsgaii import DistributedNSGAII
from jmetal.operator import PolynomialMutation, SBXCrossover
from jmetal.util.termination_criterion import StoppingByEvaluations

if __name__ == '__main__':
    # Problem instance: a ZDT1 variant used for the distributed example.
    problem = ZDT1Modified()

    # Bring up a 24-worker local Dask cluster and count its cores.
    dask_client = Client(LocalCluster(n_workers=24))
    ncores = sum(dask_client.ncores().values())
    print(f'{ncores} cores available')

    # Variation operators for NSGA-II.
    mutation_op = PolynomialMutation(
        probability=1.0 / problem.number_of_variables,
        distribution_index=20,
    )
    crossover_op = SBXCrossover(probability=1.0, distribution_index=20)

    # Stop after a fixed evaluation budget.
    max_evaluations = 25000
    stopping_rule = StoppingByEvaluations(max=max_evaluations)

    # Distributed NSGA-II spreads evaluations across the Dask workers.
    algorithm = DistributedNSGAII(
        problem=problem,
        population_size=100,
        mutation=mutation_op,
        crossover=crossover_op,
        termination_criterion=stopping_rule,
        number_of_cores=ncores,
        client=dask_client,
    )

    algorithm.run()
def __init__(self, template=None, cleanup_interval=1000, hostname=None,
             script=None, preexec_commands=(), **kwargs):
    """ Dask workers launched by a DRMAA-compatible cluster

    Parameters
    ----------
    jobName: string
        Name of the job as known by the DRMAA cluster.
    script: string (optional)
        Path to the dask-worker executable script.
        A temporary file will be made if none is provided (recommended)
    args: list
        Extra string arguments to pass to dask-worker
    outputPath: string
    errorPath: string
    workingDirectory: string
        Where dask-worker runs, defaults to current directory
    nativeSpecification: string
        Options native to the job scheduler

    Examples
    --------
    >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
    >>> cluster = DRMAACluster()                     # doctest: +SKIP
    >>> cluster.start_workers(10)                    # doctest: +SKIP

    >>> from distributed import Client               # doctest: +SKIP
    >>> client = Client(cluster)                     # doctest: +SKIP

    >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
    >>> future.result()                              # doctest: +SKIP
    11
    """
    self.hostname = hostname or socket.gethostname()
    logger.info("Start local scheduler at %s", self.hostname)
    # Scheduler-only local cluster; actual workers are launched as DRMAA jobs.
    self.local_cluster = LocalCluster(n_workers=0, ip='', **kwargs)

    if script is None:
        # NOTE(review): tempfile.mktemp is deprecated and race-prone (the
        # name is reserved without creating the file); tempfile.mkstemp
        # would be safer. Left as-is here.
        fn = tempfile.mktemp(suffix='sh',
                             prefix='dask-worker-script',
                             dir=os.path.curdir)
        self.script = fn
        # Generate the worker launch script, named by DRMAA job/task ids.
        script_contents = make_job_script(executable=worker_bin_path,
                                          name='%s.%s' % (JOB_ID, TASK_ID),
                                          preexec=preexec_commands)
        with open(fn, 'wt') as f:
            f.write(script_contents)

        # Remove the generated script when the interpreter exits.
        @atexit.register
        def remove_script():
            if os.path.exists(fn):
                os.remove(fn)

        # NOTE(review): 0o777 is world-writable; 0o755 would suffice for
        # execution. Left as-is here.
        os.chmod(self.script, 0o777)

    else:
        # A user-supplied script must already embed any preexec commands.
        assert not preexec_commands, "Cannot specify both script and preexec_commands"

    # TODO: check that user-provided script is executable

    # Job template defaults, overridden by the user-supplied template.
    self.template = merge(default_template,
                          {'remoteCommand': self.script},
                          template or {})

    # Periodically drop bookkeeping entries for workers whose jobs ended.
    self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                              callback_time=cleanup_interval,
                                              io_loop=self.scheduler.loop)
    self._cleanup_callback.start()

    self.workers = {}  # {job-id: WorkerSpec}
def setUp(self):
    """Prepare each test: load the bundled example DAGs and start a local Dask cluster."""
    self.cluster = LocalCluster()
    self.dagbag = DagBag(include_examples=True)