Example #1
def dask_client():
    cluster = LocalCluster(n_workers=NUM_WORKERS, threads_per_worker=2)
    client = Client(cluster)
    yield client
    # teardown
    client.close()
    cluster.close()
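A minimal usage sketch for the fixture above, assuming it is registered with pytest's @pytest.fixture decorator (the decorator and the NUM_WORKERS constant sit outside the snippet):

# Hypothetical test consuming the dask_client fixture; pytest injects it by name.
def test_square_on_cluster(dask_client):
    future = dask_client.submit(lambda x: x * x, 4)
    assert future.result() == 16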
Example #2
class TestDaskExecutor(TestBaseDask):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    @pytest.mark.quarantined
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dag = self.dagbag.get_dag('example_bash_operator')

        job = BackfillJob(dag=dag,
                          start_date=DEFAULT_DATE,
                          end_date=DEFAULT_DATE,
                          ignore_first_depends_on_past=True,
                          executor=DaskExecutor(
                              cluster_address=self.cluster.scheduler_address))
        job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
Example #3
    def test_dask_read_combine_instastack(self):

        from distributed import Client, LocalCluster
        from dask.distributed import wait
        cluster = LocalCluster(n_workers=1, threads_per_worker=1)
        c = Client(cluster)
        anxcor = Anxcor()
        anxcor.set_window_length(120.0)
        times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
        bank = WavebankWrapper(source_dir)
        anxcor.add_dataset(bank, 'nodals')

        anxcor.save_at_task(target_dir, 'combine')
        result = anxcor.process(times, dask_client=c, stack=True)

        anxcor = Anxcor()
        anxcor.set_window_length(120.0)
        bank = WavebankWrapper(source_dir)
        anxcor.add_dataset(bank, 'nodals')
        anxcor.load_at_task(target_dir, 'combine')
        result = anxcor.process(times, dask_client=c, stack=True)

        how_many_nc = _how_many_fmt(target_dir, format='.nc')
        _clean_files_in_dir(target_dir)
        c.close()
        cluster.close()
        assert 48 == how_many_nc
Example #4
class DaskExecutorTest(BaseDaskTest):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values() if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
Example #5
def test_get_batch_size_distributed():
    cluster = LocalCluster(processes=False)

    with DistributedHandler(cluster.scheduler_address) as handler:
        assert handler._get_batch_size(handler.client) == DEFAULT_MAX_THREADS

    cluster.close()
Example #6
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        cluster = LocalCluster()

        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=cluster.scheduler_address))
            job.run()

        cluster.close()
Example #7
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        cluster = LocalCluster()

        dags = [
            dag for dag in self.dagbag.dags.values() if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(dag=dag,
                              start_date=DEFAULT_DATE,
                              end_date=DEFAULT_DATE,
                              ignore_first_depends_on_past=True,
                              executor=DaskExecutor(
                                  cluster_address=cluster.scheduler_address))
            job.run()

        cluster.close()
Example #8
class StartCluster():
    def __init__(self, n_cores=None):
        if n_cores is None:
            n_cores = psutil.cpu_count() - 2
        self.cluster = LocalCluster(processes=True, n_workers=1)
        self.client = Client(self.cluster)

    def __enter__(self):
        # return the instance so it can be used in a ``with`` block
        return self

    def __exit__(self, type, value, traceback):
        self.cluster.close()
Example #9
    def test_submit_task_instance_to_dask_cluster(self):
        """
        Test that the DaskExecutor properly submits tasks to the cluster
        """
        cluster = LocalCluster(nanny=False)

        executor = DaskExecutor(cluster_address=cluster.scheduler_address)

        args = dict(start_date=DEFAULT_DATE)

        def fail():
            raise ValueError('Intentional failure.')

        with DAG('test-dag', default_args=args) as dag:
            # queue should be allowed, but ignored
            success_operator = PythonOperator(task_id='success',
                                              python_callable=lambda: True,
                                              queue='queue')

            fail_operator = PythonOperator(task_id='fail',
                                           python_callable=fail)

        success_ti = TaskInstance(success_operator,
                                  execution_date=DEFAULT_DATE)

        fail_ti = TaskInstance(fail_operator, execution_date=DEFAULT_DATE)

        # queue the tasks
        executor.queue_task_instance(success_ti)
        executor.queue_task_instance(fail_ti)

        # the tasks haven't been submitted to the cluster yet
        self.assertTrue(len(executor.futures) == 0)
        # after the heartbeat, they have been submitted
        executor.heartbeat()
        self.assertTrue(len(executor.futures) == 2)

        # wait a reasonable amount of time for the tasks to complete
        for _ in range(2):
            time.sleep(0.25)
            executor.heartbeat()

        # check that the futures were completed
        if len(executor.futures) == 2:
            raise ValueError('Failed to reach cluster before timeout.')
        self.assertTrue(len(executor.futures) == 0)

        # check that the taskinstances were updated
        success_ti.refresh_from_db()
        self.assertTrue(success_ti.state == State.SUCCESS)
        fail_ti.refresh_from_db()
        self.assertTrue(fail_ti.state == State.FAILED)

        cluster.close()
Example #10
class DRMAACluster(object):
    def __init__(self, **kwargs):
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)
        self.session = drmaa.Session()
        self.session.initialize()

        self.worker_template = self.session.createJobTemplate()
        self.worker_template.remoteCommand = os.path.join(
            sys.exec_prefix, 'bin', 'dask-worker')
        self.worker_template.jobName = 'dask-worker'
        self.worker_template.args = [
            '%s:%d' % (socket.gethostname(), self.local_cluster.scheduler.port)
        ]
        self.worker_template.outputPath = ':/%s/out' % os.getcwd()
        self.worker_template.errorPath = ':/%s/err' % os.getcwd()
        self.worker_template.workingDirectory = os.getcwd()

        self.workers = []

    @property
    def scheduler_address(self):
        return self.local_cluster.scheduler_address

    def start_workers(self, n=1):
        ids = self.session.runBulkJobs(self.worker_template, 1, n, 1)
        self.workers.extend(ids)

    def stop_workers(self, worker_ids, sync=False):
        for wid in worker_ids:
            try:
                self.session.control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass

        if sync:
            self.session.synchronize(worker_ids, dispose=True)

    def close(self):
        if self.workers:
            self.stop_workers(self.workers, sync=True)
        self.local_cluster.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        try:
            self.close()
        except:
            pass
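A hedged usage sketch for the class above: because it defines __enter__ and __exit__, it can drive a with-block, and a distributed Client can be pointed at its scheduler_address (a working DRMAA session and the dask-worker script on the worker nodes are assumed):

# Usage sketch; assumes a reachable DRMAA queue and dask-worker available to the jobs.
from distributed import Client

with DRMAACluster() as cluster:
    cluster.start_workers(n=2)
    client = Client(cluster.scheduler_address)
    assert client.submit(lambda x: x + 1, 41).result() == 42
    client.close()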
Example #11
    def test_client(self):
        lc = LocalCluster(diagnostics_port=None)
        passed = Client(lc)

        client, shutdown_callback = _prepare_client(passed)

        self.assertEquals(client, passed)

        shutdown_callback()
        lc.close()

        self.assertEquals(lc.status, 'closed')
Example #12
    def test_dask_cluster_client(self):
        port = 8788
        cluster = LocalCluster(processes=False, scheduler_port=port)

        client = configure_dask_cluster(address=f"localhost:{port}")
        self.assertEqual(None, client.cluster)
        self.assertEqual("running", client.status)

        close_dask_client()
        self.assertEqual("closed", client.status)
        self.assertEqual("running", cluster.status)

        cluster.close(timeout=10)
        self.assertEqual("closed", cluster.status)
Example #13
    def test_dask_execution(self):

        from distributed import Client, LocalCluster
        cluster = LocalCluster(n_workers=1, threads_per_worker=1)
        c = Client(cluster)
        anxcor = Anxcor()
        anxcor.set_window_length(120.0)
        times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
        bank = WavebankWrapper(source_dir)
        anxcor.add_dataset(bank, 'nodals')
        result = anxcor.process(times, dask_client=c)
        pairs = list(result.coords['rec'].values) + list(result.coords['src'].values)
        c.close()
        cluster.close()
        assert 6 == len(pairs)
Example #14
    def test_dask_execution_exclude_with_stack_number(self):
        from distributed import Client, LocalCluster
        cluster = LocalCluster(n_workers=1, threads_per_worker=1)
        c = Client(cluster)
        anxcor = Anxcor()
        anxcor.set_window_length(120)
        anxcor.set_task_kwargs('crosscorrelate', dict(max_tau_shift=20.0))
        times = anxcor.get_starttimes(starttime_stamp, endtime_stamp, 0.5)
        bank = WavebankWrapper(source_dir)
        anxcor.set_must_exclude_single_stations('AX.1')
        anxcor.add_dataset(bank, 'nodals')
        result = anxcor.process(times, dask_client=c, stack=10)
        pairs = list(result.coords['rec'].values) + list(result.coords['src'].values)
        c.close()
        cluster.close()
        assert 4 == len(pairs)
Example #15
def test_twpice_case():
    """ Use a test case from TWP-ICE """
    Grid0 = pyart.io.read_grid(pydda.tests.EXAMPLE_RADAR0)
    Grid1 = pyart.io.read_grid(pydda.tests.EXAMPLE_RADAR1)
    sounding = pyart.io.read_arm_sonde(pydda.tests.SOUNDING_PATH)

    u_init, v_init, w_init = pydda.initialization.make_wind_field_from_profile(
        Grid0, sounding[1], vel_field='corrected_velocity')

    Grids = pydda.retrieval.get_dd_wind_field(
        [Grid0, Grid1], u_init, v_init, w_init, Co=100, Cm=1500.0,
        Cz=0, Cmod=0.0, vel_name='corrected_velocity',
        refl_field='reflectivity', frz=5000.0,
        filt_iterations=0, mask_outside_opt=True, upper_bc=1)

    # In this test grid, we expect the mean flow to be to the southeast
    # Maximum updrafts should be at least 10 m/s
    u_mean = np.nanmean(Grids[0].fields['u']['data'])
    v_mean = np.nanmean(Grids[0].fields['v']['data'])
    w_max = np.max(Grids[0].fields['w']['data'])

    assert u_mean > 0
    assert v_mean < 0
    assert w_max > 10

    # Now we will test the nesting. Do the same retrieval, and make sure
    # that we get the same result within a prescribed tolerance
    cluster = LocalCluster(n_workers=2, processes=True)
    client = Client(cluster)
    Grids2 = pydda.retrieval.get_dd_wind_field_nested(
        [Grid0, Grid1], u_init, v_init, w_init, client, Co=100, Cm=1500.0,
        Cz=0, Cmod=0.0, vel_name='corrected_velocity',
        refl_field='reflectivity', frz=5000.0,
        filt_iterations=0, mask_outside_opt=True, upper_bc=1)

    # Make sure features are correlated between both versions. No reason
    # to expect the same answer, but features should be correlated
    # Nesting tends to make the updrafts a bit better resolved, so expect
    # less of an outright correlation (but still strong)
    assert np.corrcoef(Grids2[0].fields["u"]["data"].flatten(),
                       Grids[0].fields["u"]["data"].flatten())[0, 1] > 0.9
    assert np.corrcoef(Grids2[0].fields["v"]["data"].flatten(),
                       Grids[0].fields["v"]["data"].flatten())[0, 1] > 0.9
    assert np.corrcoef(Grids2[0].fields["w"]["data"].flatten(),
                       Grids[0].fields["w"]["data"].flatten())[0, 1] > 0.5
    cluster.close()
    client.close()
Example #16
    def test_with_distributed_client(self):
        lc = LocalCluster(diagnostics_port=None)
        client = Client(lc)

        graph = create_graph(net1_ex_matrix,
                             net1_gene_names,
                             net1_tf_names,
                             "GBM",
                             SGBM_KWARGS,
                             target_genes=list(self.test_range),
                             client=client)

        network_df = client.compute(graph, sync=True)

        self.assertEquals(len(self.test_range),
                          len(network_df['target'].unique()))

        client.close()
        lc.close()
Example #17
class DaskExecutorTest(BaseDaskTest):

    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)
        self.assert_tasks_on_executor(executor)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
Example #18
    def test_dask_cluster_extraction_one_worker(self):
        cluster = LocalCluster(n_workers=1, threads_per_worker=1, dashboard_address=None)
        client = Client(cluster)
        address = client.scheduler_info()['address']
        Distributor = ClusterDaskDistributor(address=address)

        df = self.create_test_data_sample()
        extracted_features = extract_features(df, column_id="id", column_sort="sort", column_kind="kind",
                                              column_value="val",
                                              distributor=Distributor)

        self.assertIsInstance(extracted_features, pd.DataFrame)
        self.assertTrue(np.all(extracted_features.a__maximum == np.array([71, 77])))
        self.assertTrue(np.all(extracted_features.a__sum_values == np.array([691, 1017])))
        self.assertTrue(np.all(extracted_features.a__abs_energy == np.array([32211, 63167])))
        self.assertTrue(np.all(extracted_features.b__sum_values == np.array([757, 695])))
        self.assertTrue(np.all(extracted_features.b__minimum == np.array([3, 1])))
        self.assertTrue(np.all(extracted_features.b__abs_energy == np.array([36619, 35483])))
        self.assertTrue(np.all(extracted_features.b__mean == np.array([37.85, 34.75])))
        self.assertTrue(np.all(extracted_features.b__median == np.array([39.5, 28.0])))
        cluster.close()
Example #19
def test_distributed_handler_distributed(values, expected_values):
    cluster = LocalCluster(processes=False)

    with DistributedHandler(cluster.scheduler_address) as handler:
        futures = handler.client.map(lambda x: x + 1, values)
        handler_map_results = handler.gather(futures)

    with DistributedHandler(cluster.scheduler_address) as handler:
        handler_batched_results = handler.batched_map(lambda x: x + 1, values)

    client = Client(cluster)
    futures = client.map(lambda x: x + 1, values)

    distributed_results = client.gather(futures)

    handler_map_results = set(handler_map_results)
    handler_batched_results = set(handler_batched_results)
    distributed_results = set(distributed_results)

    assert (handler_map_results == handler_batched_results
            and handler_map_results == distributed_results)

    cluster.close()
Example #20
    def test_dask_executor_functions(self):
        cluster = LocalCluster(nanny=False)

        executor = DaskExecutor(cluster_address=cluster.scheduler_address)

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(k for k, v in executor.futures.items()
                              if v == 'success')
        fail_future = next(k for k, v in executor.futures.items()
                           if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = datetime.datetime.now() + datetime.timedelta(seconds=0.5)
        while not (success_future.done() and fail_future.done()):
            if datetime.datetime.now() > timeout:
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

        # tell the executor to shut down
        executor.end()
        self.assertTrue(len(executor.futures) == 0)

        cluster.close()
Example #21
    def test_dask_executor_functions(self):
        cluster = LocalCluster()

        executor = DaskExecutor(cluster_address=cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(
            k for k, v in executor.futures.items() if v == 'success')
        fail_future = next(
            k for k, v in executor.futures.items() if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = datetime.datetime.now() + datetime.timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if datetime.datetime.now() > timeout:
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

        cluster.close()
Example #22
import pandas as pd
import argparse
from distributed import Client, LocalCluster

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_line', nargs=1, type=str, help='cell line to run on')
    parser.add_argument('--name', nargs=1, type=str, help='name of dataset')
    args = parser.parse_args()

    cl = args.cell_line[0]
    name = args.name[0]

    from arboreto.algo import grnboost2, genie3
    from arboreto.utils import load_tf_names

    ex_matrix = pd.read_csv('~/data/spate116/GCN/%s/%s_expression_matrix_imputed.tsv' % (cl, name), sep='\t').transpose()

    cluster = LocalCluster()
    client = Client(cluster)
    print('here')
    network = grnboost2(expression_data=ex_matrix.to_numpy(), gene_names=ex_matrix.columns, client_or_address=client)
    network.to_csv('~/data/spate116/GCN/%s/%s_GRN.tsv' % (cl, name), sep='\t', header=True, index=False)
    client.close()
    cluster.close()
Example #23
class DaskExecutorTest(unittest.TestCase):
    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(k for k, v in executor.futures.items()
                              if v == 'success')
        fail_future = next(k for k, v in executor.futures.items()
                           if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = timezone.utcnow() + timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if timezone.utcnow() > timeout:
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values() if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
Example #24
class JobQueueCluster(Cluster):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly; use the inherited class appropriate for your queueing system (e.g.
    PBSCluster or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job
    memory: str
        Total amount of memory per job
    processes : int
        Number of processes per job
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : str
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    """

    _script_template = """
#!/bin/bash

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    scheduler_name = ''
    _adaptive_options = {'worker_key': lambda ws: _job_id_from_worker_name(ws.name)}
    job_id_regexp = r'(?P<job_id>\d+)'

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 walltime=None,
                 threads=None,
                 **kwargs
                 ):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError('JobQueueCluster is an abstract class that should not be instantiated.')

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' % self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' % self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name)

        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError("You must specify how many cores to use per job like ``cores=8``")

        if memory is None:
            raise ValueError("You must specify how much memory to use per job like ``memory='24 GB'``")

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ' --interface  %s ' % interface
            kwargs.setdefault('ip', get_ip_interface(interface))
        else:
            kwargs.setdefault('ip', '')

        # Bokeh diagnostics server should listen on all interfaces
        diagnostics_ip_and_port = ('', 8787)
        self.local_cluster = LocalCluster(n_workers=0, diagnostics_port=diagnostics_ip_and_port,
                                          **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = '%(python)s -m distributed.cli.dask_worker' % dict(python=sys.executable)
        self._command_template = ' '.join([dask_worker_command, self.scheduler.address])
        self._command_template += " --nthreads %d" % self.worker_threads
        if processes is not None and processes > 1:
            self._command_template += " --nprocs %d" % processes

        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(' ', '')
        self._command_template += " --memory-limit %s" % mem
        self._command_template += " --name %s--${JOB_ID}--" % name

        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra is not None:
            self._command_template += extra

    def __repr__(self):
        running_workers = sum(len(value) for value in self.running_jobs.values())
        running_cores = running_workers * self.worker_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                '(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)' %
                (running_cores, format_bytes(running_memory), running_workers,
                 total_workers, len(self.running_jobs), total_jobs)
                )

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs

    @property
    def running_jobs(self):
        """ Jobs with currenly active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_threads(self):
        return int(self.worker_cores / self.worker_processes)

    def job_script(self):
        """ Construct a job submission script """
        pieces = {'job_header': self.job_header,
                  'env_header': self._env_header,
                  'worker_command': self._command_template}
        return self._script_template % pieces

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension='sh') as fn:
            with open(fn, 'w') as f:
                logger.debug("writing job script: \n%s" % self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug('starting %s workers' % n)
        num_jobs = math.ceil(n / self.worker_processes)
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out.decode())
                logger.debug("started job: %s" % job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _calls(self, cmds, **kwargs):
        """ Call a command using subprocess.communicate

        This centralizes calls out to the command line, providing consistent outputs, logging, and an opportunity
        to go asynchronous in the future

        Parameters
        ----------
        cmd: List(List(str))
            A list of commands, each of which is a list of strings to hand to subprocess.communicate

        Examples
        --------
        >>> self._calls([['ls'], ['ls', '/foo']])

        Returns
        -------
        The stdout result as a string
        Also logs any stderr information
        """
        logger.debug("Submitting the following calls to command line")
        procs = []
        for cmd in cmds:
            logger.debug(' '.join(cmd))
            procs.append(subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs))

        result = []
        for proc in procs:
            out, err = proc.communicate()
            if err:
                logger.error(err.decode())
            result.append(out)
        return result

    def _call(self, cmd, **kwargs):
        """ Singular version of _calls """
        return self._calls([cmd], **kwargs)[0]

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        logger.debug("Stopping workers: %s" % workers)
        if not workers:
            return
        jobs = self._del_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w['name']))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(set(jobs))

    def stop_jobs(self, jobs):
        """ Stop a list of jobs"""
        logger.debug("Stopping jobs: %s" % jobs)
        if jobs:
            jobs = list(jobs)
            self._call([self.cancel_command] + list(set(jobs)))

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        logger.debug("Scaling up to %d workers." % n)
        active_and_pending = sum([len(j) for j in self.running_jobs.values()])
        active_and_pending += self.worker_processes * len(self.pending_jobs)
        logger.debug("Found %d active/pending workers." % active_and_pending)
        self.start_workers(n - active_and_pending)

    def scale_down(self, workers):
        ''' Close the workers with the given addresses '''
        logger.debug("Scaling down. Workers: %s" % workers)
        worker_states = []
        for w in workers:
            try:
                # Get the actual WorkerState
                worker_states.append(self.scheduler.workers[w])
            except KeyError:
                logger.debug('worker %s is already gone' % w)
        self.stop_workers(worker_states)

    def stop_all_jobs(self):
        ''' Stops all running and pending jobs '''
        jobs = self._del_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))

    def close(self):
        ''' Stops all running and pending jobs and stops scheduler '''
        self.stop_all_jobs()
        self.local_cluster.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
        self.local_cluster.__exit__(type, value, traceback)

    def _del_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Deleting pending jobs %s" % jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        match = re.search(self.job_id_regexp, out)
        if match is None:
            msg = ('Could not parse job id from submission command '
                   "output.\nJob id regexp is {!r}\nSubmission command "
                   'output is:\n{}'.format(self.job_id_regexp, out))
            raise ValueError(msg)

        job_id = match.groupdict().get('job_id')
        if job_id is None:
            msg = ("You need to use a 'job_id' named group in your regexp, e.g. "
                   "r'(?P<job_id>\d+)', in your regexp. Your regexp was: "
                   "{!r}".format(self.job_id_regexp))
            raise ValueError(msg)

        return job_id
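JobQueueCluster above is abstract: a concrete scheduler must provide submit_command, cancel_command, scheduler_name, and fill in job_header. A minimal, hypothetical subclass sketch (the SLURM directives are illustrative and assume the matching jobqueue.slurm defaults exist in the dask config):

# Hypothetical minimal subclass; real implementations such as PBSCluster or
# SLURMCluster build job_header from scheduler-specific resource options.
class MinimalSlurmLikeCluster(JobQueueCluster):
    submit_command = 'sbatch'
    cancel_command = 'scancel'
    scheduler_name = 'slurm'

    def __init__(self, queue='normal', **kwargs):
        super(MinimalSlurmLikeCluster, self).__init__(**kwargs)
        # job_header must be set so that job_script() can render the template
        self.job_header = '\n'.join([
            '#SBATCH -J %s' % self.name,
            '#SBATCH -p %s' % queue,
            '#SBATCH -n %d' % self.worker_cores,
        ])

# Example instantiation (cores and memory are required by the base class):
# cluster = MinimalSlurmLikeCluster(queue='debug', cores=8, memory='16 GB')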
Example #25
class DRMAACluster(Cluster):
    def __init__(self, template=None, cleanup_interval=1000, hostname=None,
                 script=None, preexec_commands=(), copy_script=True,
                 ip='',
                 **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        template: dict
            Dictionary specifying options to pass to the DRMAA cluster
            and the worker. Relevant items are:

            jobName: string
                Name of the job as known by the DRMAA cluster.
            args: list
                Extra string arguments to pass to dask-worker
            outputPath: string
                Path to the dask-worker stdout. Must start with ':'.
                Defaults to worker.JOBID.TASKID.out in current directory.
            errorPath: string
                Path to the dask-worker stderr. Must start with ':'
                Defaults to worker.JOBID.TASKID.err in current directory.
            workingDirectory: string
                Where dask-worker runs, defaults to current directory
            nativeSpecification: string
                Options native to the job scheduler

        cleanup_interval: int
            Time interval in seconds at which closed workers are cleaned.
            Defaults to 1000
        hostname: string
            Host on which to start the local scheduler, defaults to localhost
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        preexec_commands: tuple (optional)
            Commands to be executed first by temporary script. Cannot be
            specified at the same time as script.
        copy_script: bool
            Whether should copy the passed script to the current working
            directory. This is primarily to work around an issue with SGE.
        ip: string
            IP of the scheduler, default is the empty string
            which will listen on the primary ip address of the host
        **kwargs:
            Additional keyword arguments to be passed to the local scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip=ip, **kwargs)

        if script is None:
            fn = os.path.abspath(tempfile.mktemp(
                suffix='.sh',
                prefix='dask-worker-script-',
                dir=os.path.curdir,
            ))
            self.script = fn
            self._should_cleanup_script = True

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)

        else:
            self._should_cleanup_script = False
            if copy_script:
                with ignoring(EnvironmentError):  # may be in the same path
                    shutil.copy(script, os.path.curdir)  # python 2.x returns None
                    script = os.path.join(os.path.curdir, os.path.basename(script))
                    self._should_cleanup_script = True
            self.script = os.path.abspath(script)
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template,
                              {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(callback=self.cleanup_closed_workers,
                                                  callback_time=cleanup_interval,
                                                  io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    def adapt(self, **kwargs):
        """ Turn on adaptivity

        For keyword arguments see dask_drmaa.adaptive.Adaptive

        Examples
        --------
        >>> cluster.adapt(minimum=0, maximum=10, interval='500ms')

        See Also
        --------
        Cluster: an interface for other clusters to inherit from
        """
        from .adaptive import Adaptive

        with ignoring(AttributeError):
            self._adaptive.stop()
        if not hasattr(self, '_adaptive_options'):
            self._adaptive_options = {}

        self._adaptive_options.update(kwargs)
        self._adaptive = Adaptive(
            self, self.scheduler, **self._adaptive_options
        )

        return self._adaptive

    @gen.coroutine
    def _start(self):
        pass

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    def create_job_template(self, **kwargs):
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        if n == 0:
            return

        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids), ids[0].split('.')[0])
                self.workers.update(
                    {jid: WorkerSpec(job_id=jid, kwargs=kwargs,
                                     stdout=worker_out_path_template % dict(jid=jid, ext='out'),
                                     stderr=worker_out_path_template % dict(jid=jid, ext='err'),
                                     )
                     for jid in ids})

    @gen.coroutine
    def stop_workers(self, worker_ids, sync=False):
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]
        elif worker_ids:
            worker_ids = list(worker_ids)
        else:
            return

        # Let the scheduler gracefully retire workers first
        ids_to_ips = {
            v['name']: k for k, v in self.scheduler.worker_info.items()
        }
        worker_ips = [ids_to_ips[wid]
                      for wid in worker_ids
                      if wid in ids_to_ips]
        retired = yield self.scheduler.retire_workers(workers=worker_ips,
                                                      close_workers=True)
        logger.info("Retired workers %s", retired)
        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            try:
                self.workers.pop(wid)
            except KeyError:
                # If we have multiple callers at once, it may have already
                # been popped off
                pass

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    @gen.coroutine
    def scale_up(self, n, **kwargs):
        yield [self.start_workers(**kwargs)
               for _ in range(n - len(self.workers))]

    @gen.coroutine
    def scale_down(self, workers):
        workers = set(workers)
        yield self.scheduler.retire_workers(workers=workers)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.stop_workers(self.workers, sync=True)

        self.local_cluster.close()
        if self._should_cleanup_script and os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(self.workers))

    __repr__ = __str__
Example #26
#load TF list from file
tf_names = load_tf_names(net1_tf_path)
#Quick inspection
tf_names[:5]
len(tf_names)

# Set up the local computational environment
# Observation: fewer assertion errors when fewer people are using the cluster
from distributed import LocalCluster, Client
local_cluster = LocalCluster(n_workers=6, threads_per_worker=1)
custom_client = Client(local_cluster)
custom_client

#Start Job
network = grnboost2(expression_data=ex_matrix,
                    tf_names=tf_names,
                    client_or_address=custom_client)

#QC job
network.head()
len(network)

#Save output
wd_output = '/home/pezoldt/NAS2/pezoldt/Analysis/scRNAseq/scenic/' + sample_ID + '/' + cell_type + '/int/GRNBoost_linklist.tsv'
network.to_csv(wd_output, sep='\t', header=False, index=False)

#close client
custom_client.close()
local_cluster.close()
Example #27
class DRMAACluster(object):
    def __init__(
            self,
            jobName='dask-worker',
            remoteCommand=os.path.join(sys.exec_prefix, 'bin', 'dask-worker'),
            args=(),
            outputPath=':%s/out' % os.getcwd(),
            errorPath=':%s/err' % os.getcwd(),
            workingDirectory=os.getcwd(),
            nativeSpecification='',
            max_runtime='1:00:00',  #1 hour
            **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        jobName: string
            Name of the job as known by the DRMAA cluster.
        remoteCommand: string
            Path to the dask-worker executable
        args: list
            Extra string arguments to pass to dask-worker
        outputPath: string
        errorPath: string
        workingDirectory: string
            Where dask-worker runs, defaults to current directory
        nativeSpecification: string
            Options native to the job scheduler
        max_runtime: string
            Maximum runtime of worker jobs in format ``"HH:MM:SS"``

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        logger.info("Start local scheduler")
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)
        logger.info("Initialize connection to job scheduler")

        self.jobName = jobName
        self.remoteCommand = remoteCommand
        self.args = [
            '%s:%d' % (socket.gethostname(), self.local_cluster.scheduler.port)
        ] + list(args)
        self.outputPath = outputPath
        self.errorPath = errorPath
        self.nativeSpecification = nativeSpecification
        self.max_runtime = max_runtime

        self._cleanup_callback = PeriodicCallback(
            callback=self.cleanup_closed_workers,
            callback_time=1000,
            io_loop=self.scheduler.loop)
        # self._cleanup_callback.start()

        self.workers = {}  # {job-id: {'resource': quantity}}

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def createJobTemplate(self, nativeSpecification=''):
        wt = get_session().createJobTemplate()
        wt.jobName = self.jobName
        wt.remoteCommand = self.remoteCommand
        wt.args = self.args
        wt.outputPath = self.outputPath
        wt.errorPath = self.errorPath
        wt.nativeSpecification = self.nativeSpecification + ' ' + nativeSpecification
        return wt

    def start_workers(self, n=1, **kwargs):
        with log_errors():
            wt = self.createJobTemplate(**kwargs)

            ids = get_session().runBulkJobs(wt, 1, n, 1)
            logger.info("Start %d workers. Job ID: %s", len(ids),
                        ids[0].split('.')[0])
            self.workers.update({jid: kwargs for jid in ids})

    def stop_workers(self, worker_ids, sync=False):
        worker_ids = list(worker_ids)
        for wid in worker_ids:
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            self.workers.pop(wid)

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.local_cluster.close()
        if self.workers:
            self.stop_workers(self.workers, sync=True)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) == 'closed':
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(
            self.workers))

    __repr__ = __str__
Example #28
class DaskYARNCluster(object):
    """
    Implements a dask cluster with YARN containers running the worker processes.
    A dask scheduler is started locally upon instantiation, but you must call
    ``start()`` to initiate the building of containers by YARN.
    
    Parameters
    ----------
    nn, nn_port, rm, rm_port, user, autodetect: see knit.Knit
    env: str or None
        If provided, the path of a zipped conda env to put in containers
    packages: list of str
        Packages to install in the env to provide to containers *if* env is 
        None. Uses conda spec for pinning versions. dask and distributed will
        always be included.
    channels: list of str
        If building an environment, pass these extra channels to conda using
        ``-c`` (i.e., in addition but of superior priority to any system
        default channels).
    conda_pars: dict
        Things to pass to CondaCreator
    ip: IP-like string or None
        Address for the scheduler to listen on. If not given, uses the system
        IP.
    """
    def __init__(self,
                 autodetect=True,
                 packages=None,
                 ip=None,
                 env=None,
                 channels=None,
                 conda_pars=None,
                 **kwargs):

        ip = ip or socket.gethostbyname(socket.gethostname())

        self.env = env
        self.application_master_container = None
        self.app_id = None
        self.channels = channels
        self.conda_pars = conda_pars

        try:
            self.local_cluster = LocalCluster(n_workers=0, ip=ip)
        except (OSError, IOError):
            self.local_cluster = LocalCluster(n_workers=0,
                                              scheduler_port=0,
                                              ip=ip)

        self.packages = list(
            sorted(unique((packages or []) + global_packages, key=first_word)))

        self.knit = Knit(autodetect=autodetect, **kwargs)

        atexit.register(self.stop)

    @property
    def scheduler_address(self):
        return self.local_cluster.scheduler_address

    def start(self, n_workers=1, cpus=1, memory=2048, checks=True, **kwargs):
        """
        Initiate workers. If required, environment is first built and uploaded
        to HDFS, and then a YARN application with the required number of
        containers is created.
        
        Parameters
        ----------
        n_workers: int
            How many containers to create
        cpus: int=1
            How many CPU cores is available in each container
        memory: int=2048
            Memory available to each dask worker (in MB)
        checks: bool=True
            Whether to run pre-flight checks before submitting app to YARN
        kwargs: passed to ``Knit.start()``
        
        Returns
        -------
        YARN application ID.
        """
        c = CondaCreator(channels=self.channels, **(self.conda_pars or {}))
        if self.env is None:
            env_name = 'dask-' + sha1('-'.join(
                self.packages).encode()).hexdigest()
            env_path = os.path.join(c.conda_envs, env_name)
            if os.path.exists(env_path + '.zip'):
                # zipfile exists, ready to upload
                self.env = env_path + '.zip'
            elif os.path.exists(env_path):
                # environment exists, can zip and upload
                c.zip_env(env_path)
                self.env = env_path + '.zip'
            else:
                # create env from scratch
                self.env = c.create_env(env_name=env_name,
                                        packages=self.packages)
        elif not self.env.endswith('.zip'):
            # given env directory, so zip it
            c.zip_env(self.env)
            self.env = self.env + '.zip'

        # TODO: memory should not be total available?
        command = '$PYTHON_BIN $CONDA_PREFIX/bin/dask-worker --nprocs=1 ' \
                  '--nthreads=%d --memory-limit=%d %s > ' \
                  '/tmp/worker-log.out 2> /tmp/worker-log.err' % (
                      cpus, memory * 1e6,
                      self.local_cluster.scheduler.address)

        app_id = self.knit.start(command,
                                 env=self.env,
                                 num_containers=n_workers,
                                 virtual_cores=cpus,
                                 memory=memory,
                                 checks=checks,
                                 **kwargs)
        self.app_id = app_id
        return app_id

    def remove_worker(self, container_id):
        """
        Stop worker and remove container

        Parameters
        ----------
        container_id

        Returns
        -------
        None
        """
        self.knit.remove_containers(container_id)

    @property
    def workers(self):
        """
        list of running container ids
        """

        # remove container ...00001 -- this is the application master's container
        # and should not be removed or counted as a worker

        containers = list(self.knit.get_container_statuses())
        containers.sort()
        self.application_master_container = containers.pop(0)
        return containers

    @gen.coroutine
    def _start(self):
        pass

    def stop(self):
        """Kill the YARN application and all workers"""
        if self.knit:
            self.knit.kill()

    def add_workers(self, n_workers=1, cpus=1, memory=2048):
        """
        Non-blocking function to ask Yarn for more containers/dask-workers

        Parameters
        ----------
        n_workers: int
            number of containers to add (default: 1)

        cpus: int
            number of cpus (default: 1)
        memory: int
            amount of memory to allocate per container

        Returns
        -------
        None
        """

        self.knit.add_containers(num_containers=n_workers,
                                 virtual_cores=cpus,
                                 memory=memory)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def close(self):
        """Stop the scheduler and workers"""
        self.stop()
        self.local_cluster.close()
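
A brief usage sketch for the YARN-backed cluster class above. The name DaskYARNCluster is an assumption made for illustration (the class statement falls outside this excerpt), and running it requires a working Hadoop/YARN environment.

# Hedged usage sketch; DaskYARNCluster stands in for the class defined above.
from distributed import Client

cluster = DaskYARNCluster(packages=['numpy', 'pandas'])   # builds or reuses a conda env
app_id = cluster.start(n_workers=4, cpus=2, memory=4096)  # submits the YARN application
client = Client(cluster.scheduler_address)

print(client.submit(lambda x: x + 1, 10).result())        # 11

cluster.add_workers(n_workers=2)   # ask YARN for two more containers
client.close()
cluster.close()                    # kills the YARN app and stops the local scheduler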
def parallel_calculate_chunks(chunks,
                              features,
                              approximate,
                              training_window,
                              verbose,
                              save_progress,
                              entityset,
                              n_jobs,
                              no_unapproximated_aggs,
                              cutoff_df_time_var,
                              target_time,
                              pass_columns,
                              dask_kwargs=None):
    from distributed import Client, LocalCluster, as_completed
    from dask.base import tokenize

    client = None
    cluster = None
    # dask_kwargs defaults to None; normalise it so the membership checks below work
    dask_kwargs = dask_kwargs or {}
    try:
        if 'cluster' in dask_kwargs:
            cluster = dask_kwargs['cluster']
        else:
            diagnostics_port = None
            if 'diagnostics_port' in dask_kwargs:
                diagnostics_port = dask_kwargs['diagnostics_port']
                del dask_kwargs['diagnostics_port']

            workers = n_jobs_to_workers(n_jobs)
            workers = min(workers, len(chunks))
            cluster = LocalCluster(n_workers=workers,
                                   threads_per_worker=1,
                                   diagnostics_port=diagnostics_port,
                                   **dask_kwargs)
            # if the cluster has a bokeh port, notify the user of the dashboard port number
            if diagnostics_port is not None:
                if hasattr(cluster, 'scheduler') and cluster.scheduler:
                    info = cluster.scheduler.identity()
                    if 'bokeh' in info['services']:
                        msg = "Dashboard started on port {}"
                        print(msg.format(info['services']['bokeh']))

        client = Client(cluster)
        # scatter the entityset
        # denote future with leading underscore
        start = time.time()
        es_token = "EntitySet-{}".format(tokenize(entityset))
        if es_token in client.list_datasets():
            print("Using EntitySet persisted on the cluster as dataset %s" %
                  (es_token))
            _es = client.get_dataset(es_token)
        else:
            _es = client.scatter([entityset])[0]
            client.publish_dataset(**{_es.key: _es})

        # save features to a tempfile and scatter it
        pickled_feats = cloudpickle.dumps(features)
        _saved_features = client.scatter(pickled_feats)
        client.replicate([_es, _saved_features])
        end = time.time()
        scatter_time = end - start
        scatter_string = "EntitySet scattered to workers in {:.3f} seconds"
        print(scatter_string.format(scatter_time))

        # map chunks
        # TODO: consider handling task submission dask kwargs
        _chunks = client.map(calculate_chunk,
                             chunks,
                             features=_saved_features,
                             entityset=_es,
                             approximate=approximate,
                             training_window=training_window,
                             profile=False,
                             verbose=False,
                             save_progress=save_progress,
                             no_unapproximated_aggs=no_unapproximated_aggs,
                             cutoff_df_time_var=cutoff_df_time_var,
                             target_time=target_time,
                             pass_columns=pass_columns)

        feature_matrix = []
        iterator = as_completed(_chunks).batches()
        if verbose:
            pbar_str = ("Elapsed: {elapsed} | Remaining: {remaining} | "
                        "Progress: {l_bar}{bar}| "
                        "Calculated: {n}/{total} chunks")
            pbar = make_tqdm_iterator(total=len(_chunks), bar_format=pbar_str)
        for batch in iterator:
            results = client.gather(batch)
            for result in results:
                feature_matrix.append(result)
                if verbose:
                    pbar.update()
        if verbose:
            pbar.close()
    except Exception:
        raise
    finally:
        if 'cluster' not in dask_kwargs and cluster is not None:
            cluster.close()
        if client is not None:
            client.close()

    return feature_matrix
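
The EntitySet handling above follows a scatter-once / publish-dataset caching pattern: a large object is shipped to the cluster once, published under a deterministic name, and reused on subsequent calls. Below is a minimal, self-contained sketch of that pattern; the names are illustrative and separate from the function above.

from distributed import Client, LocalCluster
from dask.base import tokenize


def scatter_once(client, obj, prefix="dataset"):
    # Derive a deterministic name from the object's contents; reuse a published
    # copy if one exists, otherwise scatter the object and publish it.
    name = "{}-{}".format(prefix, tokenize(obj))
    if name in client.list_datasets():
        return client.get_dataset(name)
    future = client.scatter([obj])[0]
    client.publish_dataset(**{name: future})
    return future


if __name__ == "__main__":
    cluster = LocalCluster(n_workers=2, threads_per_worker=1)
    client = Client(cluster)
    data = list(range(1000))
    fut = scatter_once(client, data)         # first call scatters and publishes
    fut = scatter_once(client, data)         # second call reuses the published copy
    print(client.submit(len, fut).result())  # 1000
    client.close()
    cluster.close()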
Example #30
0
class JobQueueCluster(ClusterManager):
    """ Base class to launch Dask Clusters for Job queues

    This class should not be used directly; use an inherited class appropriate for your queueing system (e.g. PBSCluster
    or SLURMCluster).

    Parameters
    ----------
    name : str
        Name of Dask workers.
    cores : int
        Total number of cores per job
    memory: str
        Total amount of memory per job
    processes : int
        Number of processes per job
    interface : str
        Network interface like 'eth0' or 'ib0'.
    death_timeout : float
        Seconds to wait for a scheduler before closing workers
    local_directory : str
        Dask worker local directory for file spilling.
    extra : list
        Additional arguments to pass to `dask-worker`
    env_extra : list
        Other commands to add to script before launching worker.
    log_directory : str
        Directory to use for job scheduler logs.
    shebang : str
        Path to desired interpreter for your batch submission script.
    python : str
        Python executable used to launch Dask workers.
    config_name : str
        Section to use from jobqueue.yaml configuration file.
    kwargs : dict
        Additional keyword arguments to pass to `LocalCluster`

    Attributes
    ----------
    submit_command: str
        Abstract attribute for job scheduler submit command,
        should be overridden
    cancel_command: str
        Abstract attribute for job scheduler cancel command,
        should be overridden

    See Also
    --------
    PBSCluster
    SLURMCluster
    SGECluster
    OARCluster
    LSFCluster
    MoabCluster
    """

    _script_template = """
%(shebang)s

%(job_header)s

%(env_header)s

%(worker_command)s
""".lstrip()

    # Following class attributes should be overridden by extending classes.
    submit_command = None
    cancel_command = None
    job_id_regexp = r"(?P<job_id>\d+)"

    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """ """
        # """
        # This initializer should be considered as Abstract, and never used directly.
        # """
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )

        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if dask.config.get("jobqueue.%s.threads", None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            extra += ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)

    def __repr__(self):
        running_workers = self._count_active_workers()
        running_cores = running_workers * self.worker_process_threads
        total_jobs = len(self.pending_jobs) + len(self.running_jobs)
        total_workers = total_jobs * self.worker_processes
        running_memory = running_workers * self.worker_memory / self.worker_processes

        return (self.__class__.__name__ +
                "(cores=%d, memory=%s, workers=%d/%d, jobs=%d/%d)" % (
                    running_cores,
                    format_bytes(running_memory),
                    running_workers,
                    total_workers,
                    len(self.running_jobs),
                    total_jobs,
                ))

    @property
    def pending_jobs(self):
        """ Jobs pending in the queue """
        return self._scheduler_plugin.pending_jobs

    @property
    def running_jobs(self):
        """ Jobs with currently active workers """
        return self._scheduler_plugin.running_jobs

    @property
    def finished_jobs(self):
        """ Jobs that have finished """
        return self._scheduler_plugin.finished_jobs

    @property
    def worker_process_threads(self):
        return int(self.worker_cores / self.worker_processes)

    @property
    def worker_process_memory(self):
        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(" ", "")
        return mem

    @property
    def jobqueue_worker_spec(self):
        """ single worker process info needed for scaling on cores or memory """
        return {
            "cores": self.worker_process_threads,
            "memory": self.worker_process_memory,
        }

    @property
    def workers(self):
        """ workers currently connected to the scheduler """
        return self.scheduler.workers

    def job_script(self):
        """ Construct a job submission script """
        pieces = {
            "shebang": self.shebang,
            "job_header": self.job_header,
            "env_header": self._env_header,
            "worker_command": self._command_template,
        }
        return self._script_template % pieces

    @contextmanager
    def job_file(self):
        """ Write job submission script to temporary file """
        with tmpfile(extension="sh") as fn:
            with open(fn, "w") as f:
                logger.debug("writing job script: \n%s", self.job_script())
                f.write(self.job_script())
            yield fn

    def _submit_job(self, script_filename):
        return self._call(shlex.split(self.submit_command) + [script_filename])

    def start_workers(self, n=1):
        """ Start workers and point them to our local scheduler """
        logger.debug("starting %s workers", n)
        num_jobs = int(math.ceil(n / self.worker_processes))
        for _ in range(num_jobs):
            with self.job_file() as fn:
                out = self._submit_job(fn)
                job = self._job_id_from_submit_output(out)
                if not job:
                    raise ValueError(
                        "Unable to parse jobid from output of %s" % out)
                logger.debug("started job: %s", job)
                self.pending_jobs[job] = {}

    @property
    def scheduler(self):
        """ The scheduler of this cluster """
        return self.local_cluster.scheduler

    def _call(self, cmd, **kwargs):
        """ Call a command using subprocess.Popen.

        This centralizes calls out to the command line, providing consistent
        outputs, logging, and an opportunity to go asynchronous in the future.

        Parameters
        ----------
        cmd: List(str)
            A command, given as a list of strings to hand to
            subprocess.Popen

        Examples
        --------
        >>> self._call(['ls', '/foo'])

        Returns
        -------
        The stdout produced by the command, as string.

        Raises
        ------
        RuntimeError if the command exits with a non-zero exit code
        """
        cmd_str = " ".join(cmd)
        logger.debug(
            "Executing the following command on the command line:\n{}".format(
                cmd_str))

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                **kwargs)

        out, err = proc.communicate()
        if six.PY3:
            out, err = out.decode(), err.decode()
        if proc.returncode != 0:
            raise RuntimeError("Command exited with non-zero exit code.\n"
                               "Exit code: {}\n"
                               "Command:\n{}\n"
                               "stdout:\n{}\n"
                               "stderr:\n{}\n".format(proc.returncode, cmd_str,
                                                      out, err))
        return out

    def stop_workers(self, workers):
        """ Stop a list of workers"""
        logger.debug("Stopping workers: %s", workers)
        if not workers:
            return
        jobs = self._del_pending_jobs()  # stop pending jobs too
        for w in workers:
            if isinstance(w, dict):
                jobs.append(_job_id_from_worker_name(w["name"]))
            else:
                jobs.append(_job_id_from_worker_name(w.name))
        self.stop_jobs(jobs)

    def stop_jobs(self, jobs):
        """ Stop a list of jobs"""
        logger.debug("Stopping jobs: %s", jobs)
        if jobs:
            jobs = list(jobs)
            self._call(shlex.split(self.cancel_command) + list(set(jobs)))

        # if any of these jobs were pending, we should remove those now
        for job_id in jobs:
            if job_id in self.pending_jobs:
                del self.pending_jobs[job_id]

    def scale_up(self, n, **kwargs):
        """ Brings total worker count up to ``n`` """
        active_and_pending = self._count_active_and_pending_workers()
        if n >= active_and_pending:
            logger.debug("Scaling up to %d workers.", n)
            self.start_workers(n - active_and_pending)
        else:
            # scale_up should not be called if n < active + pending jobs
            logger.warning("JobQueueCluster.scale_up was called with a"
                           " number of workers lower that what is already"
                           " running or pending")

    def _count_active_and_pending_workers(self):
        active_and_pending = (self._count_active_workers() +
                              self._count_pending_workers())
        logger.debug("Found %d active/pending workers.", active_and_pending)
        assert len(self.scheduler.workers) <= active_and_pending
        return active_and_pending

    def _count_active_workers(self):
        active_workers = sum([len(j) for j in self.running_jobs.values()])
        assert len(self.scheduler.workers) == active_workers
        return active_workers

    def _count_pending_workers(self):
        return self.worker_processes * len(self.pending_jobs)

    def scale_down(self, workers, n=None):
        """ Close the workers with the given addresses """
        if n is None:
            # Adaptive currently calls directly scale_down, we need to handle this
            # Need to only keep active workers minus those adaptive wants to stop
            n = self._count_active_workers() - len(workers)
        logger.debug("Scaling down to %d Workers: %s", n, workers)
        active_and_pending = self._count_active_and_pending_workers()
        n_to_close = active_and_pending - n
        if n_to_close < 0:
            logger.warning("JobQueueCluster.scale_down was called with"
                           " a number of worker greater than what is"
                           " already running or pending.")
        elif n_to_close <= self._count_pending_workers():
            # We only need to kill some pending jobs,
            to_kill = int(n_to_close / self.worker_processes)
            jobs = list(self.pending_jobs.keys())[-to_kill:]
            logger.debug("%d jobs to stop, stopping jobs %s", to_kill, jobs)
            self.stop_jobs(jobs)
        else:
            worker_states = []
            for w in workers:
                try:
                    # Get the actual WorkerState
                    worker_states.append(self.scheduler.workers[w])
                except KeyError:
                    logger.debug("worker %s is already gone", w)
            self.stop_workers(worker_states)

    def stop_all_jobs(self):
        """ Stops all running and pending jobs """
        jobs = self._del_pending_jobs()
        jobs += list(self.running_jobs.keys())
        self.stop_jobs(set(jobs))

    def close(self, **kwargs):
        """ Stops all running and pending jobs and stops scheduler """
        self.stop_all_jobs()
        return self.local_cluster.close(**kwargs)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.close()
        self.local_cluster.__exit__(type, value, traceback)

    def _del_pending_jobs(self):
        jobs = list(self.pending_jobs.keys())
        logger.debug("Deleting pending jobs %s" % jobs)
        for job_id in jobs:
            del self.pending_jobs[job_id]
        return jobs

    def _job_id_from_submit_output(self, out):
        match = re.search(self.job_id_regexp, out)
        if match is None:
            msg = ("Could not parse job id from submission command "
                   "output.\nJob id regexp is {!r}\nSubmission command "
                   "output is:\n{}".format(self.job_id_regexp, out))
            raise ValueError(msg)

        job_id = match.groupdict().get("job_id")
        if job_id is None:
            msg = (
                "You need to use a 'job_id' named group in your regexp, e.g. "
                "r'(?P<job_id>\\d+)', in your regexp. Your regexp was: "
                "{!r}".format(self.job_id_regexp))
            raise ValueError(msg)

        return job_id

    def worker_key(self, worker_state):
        return _job_id_from_worker_name(worker_state.name)
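
As the docstring notes, JobQueueCluster is meant to be subclassed. A hedged sketch of what a minimal Slurm-flavoured subclass might look like follows; this is not the real dask-jobqueue SLURMCluster, and it assumes a "jobqueue.slurm" configuration section (as shipped by dask-jobqueue) is available.

class MinimalSlurmCluster(JobQueueCluster):
    # The two abstract attributes the base class requires.
    submit_command = "sbatch"
    cancel_command = "scancel"

    def __init__(self, queue=None, walltime="01:00:00", config_name="slurm", **kwargs):
        super(MinimalSlurmCluster, self).__init__(config_name=config_name, **kwargs)
        header_lines = [
            "#SBATCH -J %s" % self.name,
            "#SBATCH --cpus-per-task=%d" % self.worker_cores,
            "#SBATCH -t %s" % walltime,
        ]
        if queue is not None:
            header_lines.append("#SBATCH -p %s" % queue)
        # job_header is the abstract piece that job_script() splices into the template.
        self.job_header = "\n".join(header_lines)

Per-job resources are still passed to the constructor, e.g. MinimalSlurmCluster(cores=8, memory='24 GB', queue='normal'), after which start_workers() or scale_up() submit batch jobs built from job_script().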
Example #31
0
class Cluster:
    def _wait_workers_start(self, n=1, timeout=None):
        """ Wait for number of workers, seen by scheduler, will become not less
        than n and return True. If timeout is reached, and it is still
        not happens, return False """
        dt = timeout or self._wait_timeout
        end_time = time() + dt
        while True:
            if self.n_workers >= n:
                return True
            if time() > end_time:
                return False
            sleep(self._wait_timestep)

    def _start_local_cluster(self, **kwargs):
        ip = kwargs.pop("ip", socket.gethostbyname(self._hostname))
        scheduler_port = kwargs.pop("scheduler_port", 0)
        self._local_cluster = LocalCluster(n_workers=0,
                                           ip=ip,
                                           scheduler_port=scheduler_port,
                                           **kwargs)
        logger.info("Started local scheduler at {addr}".format(
            addr=self.scheduler_address))

    def __init__(self,
                 slurm_kwargs=None,
                 hostname=None,
                 task_name=None,
                 nanny=True,
                 bokeh=True,
                 bokeh_port=None,
                 timeout=10.,
                 extra_path=None,
                 tmp_dir=None,
                 **kwargs):
        """
        Dask.Distribued workers launched via Slurm workload manager

        Parameters
        ----------
        slurm_kwargs : dict
            A dictionary with arguments, passed to Slurm batch script
            (see Examples). If None, defaults to empty dictionary.
        hostname : None or string
            Hostname of a controller node, visible by other Slurm nodes.
            If None, determined automatically through 'socket.gethostname()'.
        task_name : string or None
            Name of the job, passed to Slurm. If None, defaults to
            'dask-workers'.
        nanny : boolean
            Start Dask workers in nanny process for management.
            Default is True.
        bokeh : boolean
            Whether to launch Bokeh Web UI
            Default is True.
        bokeh_port: None or int
            Bokeh port for dask-worker. None means default.
        timeout : float
            Default time to wait until workers start
            (see ``self.start_workers``).
        extra_path : None or str or List of str
            Extra module path values, that are injected to the workers via
            PYTHONPATH environment variable
        tmp_dir : str or None
            Directory for temporary files. If not specified, defaults to
            "slurmified_files" in working directory.
            For now it is assumed that this directory is accessible from all
            nodes of the cluster. If you need more flexible behaviour, please
            file a bug.

        **kwargs: dict
            Keyword arguments, passed directly to 'distributed.LocalCluster'
            constructor.

        Examples
        --------
        >>> from slurmified import Cluster
        >>> slurm_kwargs = {
        ...     'partition': 'default',
        ...     'mem-per-cpu': '100',
        ...     'time': '1-00:00:00'
        ... }
        >>> cluster = Cluster(slurm_kwargs)
        >>> cluster.start_workers(10)
        >>> from distributed import Client
        >>> client = Client(cluster)
        >>> future = client.submit(lambda x: x + 1, 10)
        >>> future.result()
        11
        """
        self._hostname = hostname or socket.gethostname()
        self._start_local_cluster(**kwargs)

        self._slurm_kwargs = slurm_kwargs.copy() if slurm_kwargs else {}
        nthreads1 = self._slurm_kwargs.pop("cpus-per-task", None)
        nthreads2 = self._slurm_kwargs.pop("c", None)
        self._nthreads = nthreads1 or nthreads2 or 1
        self._jobid = None

        self._task_name = task_name or "dask-workers"
        self._wait_timeout = timeout
        self._wait_timestep = 1

        self._worker_exec = os.path.join(sys.exec_prefix, 'bin', 'dask-worker')
        logger.info("Using dask-worker executable '{exe}'".format(
            exe=self._worker_exec))
        self._nanny = nanny
        self._bokeh = bokeh
        self._bokeh_port = bokeh_port
        if isinstance(extra_path, str):
            self._extra_path = [extra_path]
        else:
            self._extra_path = extra_path

        self._tmp_dir = tmp_dir or os.path.abspath("slurmified_files")
        if not os.path.exists(self._tmp_dir):
            os.makedirs(self._tmp_dir)
            self._remove_tmp_dir = True
        else:
            self._remove_tmp_dir = False

    @property
    def scheduler(self):
        return self._local_cluster.scheduler

    @property
    def scheduler_address(self):
        return ('{hostname}:{port}'.format(hostname=self._hostname,
                                           port=self.scheduler.port))

    @property
    def n_workers(self):
        return len(self.scheduler.workers)

    def start_workers(self, n=1, n_min=None, timeout=None, **kwargs):
        """Start Dask workers via Slurm batch script. If workers are started
        already, they are terminated. Returns self.

        Parameters
        ----------
        n: int
            Number of workers to start.
        n_min: None or int
            Minimal number of workers needed to start calculations. The
            function waits until that many workers have started, then returns.
            If it is not reached within ``timeout``, RuntimeError is raised.
            If None, the function waits for all ``n`` workers to start, but
            only a warning (never an error) is emitted.
        timeout: None or int
            Time in seconds to wait for workers to start. If it elapses before
            the workers have started, a warning is emitted. If None, the
            default provided in the constructor is used.
        **kwargs: dict
            Dictionary with strings as keys and values, can be used to override
            Slurm kwargs, passed to the constructor.
        """
        if self._jobid:
            self.stop_workers()
        slurm_kwargs = merge(self._slurm_kwargs, kwargs or {}, {
            "array": "0-{}".format(n - 1),
            "cpus-per-task": self._nthreads
        })
        if self._extra_path:
            pythonpath_cmd = (
                "[[ -z \"$PYTHONPATH\" ]] && "
                "export PYTHONPATH=\"{new_entries}\" || "
                "export PYTHONPATH=\"{new_entries}:$PYTHONPATH\"".format(
                    new_entries=":".join(self._extra_path)))
        else:
            pythonpath_cmd = ""

        s = slurmpy.Slurm(self._task_name,
                          slurm_kwargs=slurm_kwargs,
                          scripts_dir=self._tmp_dir)
        # This command writes the job id to stderr, which is nicer to suppress
        with redirect_stderr(_Sink):
            self._jobid = s.run(pythonpath_cmd + "\n" + " ".join((
                self._worker_exec, "--nthreads", str(self._nthreads),
                "--nprocs", "1", "--reconnect",
                "--nanny" if self._nanny else "--no-nanny",
                "--bokeh" if self._bokeh else "--no-bokeh",
                ("--bokeh-port {}".format(self._bokeh_port) if self._bokeh_port
                 else ""), "--local-directory \"{}\"".format(self._tmp_dir),
                self.scheduler_address)))
        if self._wait_workers_start(n_min or n, timeout):
            m = ("Started {n} workers, job number {jobid}".format(
                n=self.n_workers, jobid=self._jobid))
            logger.info(m)
        elif n_min:
            m = ("Not enough workers to continue "
                 "({n}, minimal provided {n_min})".format(n=self.n_workers,
                                                          n_min=n_min))
            self.stop_workers()
            raise RuntimeError(m)
        else:
            m = ("Timeout is reached while waiting for {n} workers to start. "
                 "{n_started} actually started. Job number {jobid}.".format(
                     n=n, n_started=self.n_workers, jobid=self._jobid))
            logger.warning(m)
        return self

    def stop_workers(self):
        """ Stop running workers. """
        # The retire_workers call sometimes raises a variety of exceptions that
        # differ between distributed releases, so we just suppress everything here.
        # The workers are killed via Slurm afterwards anyway; this is only an
        # attempt to shut them down politely first.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            with suppress(Exception):
                sync(loop=self._local_cluster.loop,
                     func=self.scheduler.retire_workers,
                     remove=True)
        if self._jobid:
            try:
                subprocess.check_call(("scancel", str(self._jobid)))
            except subprocess.CalledProcessError as ex:
                m = ("scancel returned non-zero exit status {code} while "
                     "stopping Slurm job number {jobid} for workers. "
                     "You should check manually whether they are "
                     "terminated successfully.".format(code=ex.returncode,
                                                       jobid=self._jobid))
                logger.error(m)
            finally:
                self._jobid = None

    def _start(self):
        return self._local_cluster._start()

    def close(self):
        """ Close the cluster. """
        logger.info("Closing workers and cluster")
        if self._jobid:
            self.stop_workers()
        self._local_cluster.close()

        if self._remove_tmp_dir:
            shutil.rmtree(self._tmp_dir)
            self._remove_tmp_dir = False

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        self.close()
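
A short follow-up to the docstring example above, showing the n_min / timeout behaviour of start_workers together with context-manager cleanup. The partition, memory, and time values are placeholders for whatever your Slurm site expects.

from distributed import Client

slurm_kwargs = {"partition": "default", "mem-per-cpu": "1000", "time": "02:00:00"}
with Cluster(slurm_kwargs) as cluster:
    # Ask for 20 workers but proceed as soon as 5 have registered; a
    # RuntimeError is raised if fewer than 5 appear within 120 seconds.
    cluster.start_workers(20, n_min=5, timeout=120)
    with Client(cluster.scheduler_address) as client:
        print(client.submit(sum, range(100)).result())  # 4950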
Example #32
0
class DaskExecutorTest(unittest.TestCase):

    def setUp(self):
        self.dagbag = DagBag(include_examples=True)
        self.cluster = LocalCluster()

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_dask_executor_functions(self):
        executor = DaskExecutor(cluster_address=self.cluster.scheduler_address)

        # start the executor
        executor.start()

        success_command = 'echo 1'
        fail_command = 'exit 1'

        executor.execute_async(key='success', command=success_command)
        executor.execute_async(key='fail', command=fail_command)

        success_future = next(
            k for k, v in executor.futures.items() if v == 'success')
        fail_future = next(
            k for k, v in executor.futures.items() if v == 'fail')

        # wait for the futures to execute, with a timeout
        timeout = timezone.utcnow() + timedelta(seconds=30)
        while not (success_future.done() and fail_future.done()):
            if timezone.utcnow() > timeout:
                raise ValueError(
                    'The futures should have finished; there is probably '
                    'an error communicating with the Dask cluster.')

        # both tasks should have finished
        self.assertTrue(success_future.done())
        self.assertTrue(fail_future.done())

        # check task exceptions
        self.assertTrue(success_future.exception() is None)
        self.assertTrue(fail_future.exception() is not None)

    @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration')
    def test_backfill_integration(self):
        """
        Test that DaskExecutor can be used to backfill example dags
        """
        dags = [
            dag for dag in self.dagbag.dags.values()
            if dag.dag_id in [
                'example_bash_operator',
                # 'example_python_operator',
            ]
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True,
                executor=DaskExecutor(
                    cluster_address=self.cluster.scheduler_address))
            job.run()

    def tearDown(self):
        self.cluster.close(timeout=5)
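
The test above waits for futures by polling done() against a wall-clock deadline. Here is a self-contained sketch of the same wait-with-deadline idea using distributed.wait, which raises on timeout instead of busy-waiting; the helper below is illustrative and not part of the DaskExecutor API above.

import subprocess

from distributed import Client, LocalCluster, wait


def run_command(command):
    # check_call raises CalledProcessError on a non-zero exit code, so a
    # failing command surfaces as the future's exception, as in the test above.
    subprocess.check_call(command, shell=True)


if __name__ == "__main__":
    with LocalCluster(n_workers=1, threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        ok = client.submit(run_command, "echo 1", pure=False)
        bad = client.submit(run_command, "exit 1", pure=False)
        wait([ok, bad], timeout=30)      # raises if not done within 30 seconds
        assert ok.exception() is None
        assert bad.exception() is not None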
Example #33
0
class DRMAACluster(object):
    def __init__(self,
                 template=None,
                 cleanup_interval=1000,
                 hostname=None,
                 script=None,
                 preexec_commands=(),
                 **kwargs):
        """
        Dask workers launched by a DRMAA-compatible cluster

        Parameters
        ----------
        jobName: string
            Name of the job as known by the DRMAA cluster.
        script: string (optional)
            Path to the dask-worker executable script.
            A temporary file will be made if none is provided (recommended)
        args: list
            Extra string arguments to pass to dask-worker
        outputPath: string
        errorPath: string
        workingDirectory: string
            Where dask-worker runs, defaults to current directory
        nativeSpecification: string
            Options native to the job scheduler

        Examples
        --------
        >>> from dask_drmaa import DRMAACluster          # doctest: +SKIP
        >>> cluster = DRMAACluster()                     # doctest: +SKIP
        >>> cluster.start_workers(10)                    # doctest: +SKIP

        >>> from distributed import Client               # doctest: +SKIP
        >>> client = Client(cluster)                     # doctest: +SKIP

        >>> future = client.submit(lambda x: x + 1, 10)  # doctest: +SKIP
        >>> future.result()                              # doctest: +SKIP
        11
        """
        self.hostname = hostname or socket.gethostname()
        logger.info("Start local scheduler at %s", self.hostname)
        self.local_cluster = LocalCluster(n_workers=0, ip='', **kwargs)

        if script is None:
            fn = tempfile.mktemp(suffix='sh',
                                 prefix='dask-worker-script',
                                 dir=os.path.curdir)
            self.script = fn

            script_contents = make_job_script(executable=worker_bin_path,
                                              name='%s.%s' % (JOB_ID, TASK_ID),
                                              preexec=preexec_commands)
            with open(fn, 'wt') as f:
                f.write(script_contents)

            @atexit.register
            def remove_script():
                if os.path.exists(fn):
                    os.remove(fn)

            os.chmod(self.script, 0o777)

        else:
            assert not preexec_commands, "Cannot specify both script and preexec_commands"

        # TODO: check that user-provided script is executable

        self.template = merge(default_template, {'remoteCommand': self.script},
                              template or {})

        self._cleanup_callback = PeriodicCallback(
            callback=self.cleanup_closed_workers,
            callback_time=cleanup_interval,
            io_loop=self.scheduler.loop)
        self._cleanup_callback.start()

        self.workers = {}  # {job-id: WorkerSpec}

    @gen.coroutine
    def _start(self):
        pass

    @property
    def scheduler(self):
        return self.local_cluster.scheduler

    @property
    def scheduler_address(self):
        return self.scheduler.address

    def create_job_template(self, **kwargs):
        template = self.template.copy()
        if kwargs:
            template.update(kwargs)
        template['args'] = [self.scheduler_address] + template['args']

        jt = get_session().createJobTemplate()
        valid_attributes = dir(jt)

        for key, value in template.items():
            if key not in valid_attributes:
                raise ValueError("Invalid job template attribute %s" % key)
            setattr(jt, key, value)

        return jt

    def start_workers(self, n=1, **kwargs):
        with log_errors():
            with self.create_job_template(**kwargs) as jt:
                ids = get_session().runBulkJobs(jt, 1, n, 1)
                logger.info("Start %d workers. Job ID: %s", len(ids),
                            ids[0].split('.')[0])
                self.workers.update({
                    jid: WorkerSpec(
                        job_id=jid,
                        kwargs=kwargs,
                        stdout=worker_out_path_template %
                        dict(jid=jid, kind='out'),
                        stderr=worker_out_path_template %
                        dict(jid=jid, kind='err'),
                    )
                    for jid in ids
                })

    def stop_workers(self, worker_ids, sync=False):
        if isinstance(worker_ids, str):
            worker_ids = [worker_ids]

        for wid in list(worker_ids):
            try:
                get_session().control(wid, drmaa.JobControlAction.TERMINATE)
            except drmaa.errors.InvalidJobException:
                pass
            self.workers.pop(wid)

        logger.info("Stop workers %s", worker_ids)
        if sync:
            get_session().synchronize(worker_ids, dispose=True)

    def close(self):
        logger.info("Closing DRMAA cluster")
        self.local_cluster.close()
        if self.workers:
            self.stop_workers(self.workers, sync=True)

        if os.path.exists(self.script):
            os.remove(self.script)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def cleanup_closed_workers(self):
        for jid in list(self.workers):
            if get_session().jobStatus(jid) in ('closed', 'done'):
                logger.info("Removing closed worker %s", jid)
                del self.workers[jid]

    def __del__(self):
        try:
            self.close()
        except:
            pass

    def __str__(self):
        return "<%s: %d workers>" % (self.__class__.__name__, len(
            self.workers))

    __repr__ = __str__
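
Per-batch overrides of the DRMAA job template go through start_workers keyword arguments, which create_job_template merges into the template before submission. A hedged usage sketch (nativeSpecification is a standard DRMAA job-template attribute; the value shown is only an example, and a working DRMAA setup is required):

from distributed import Client

cluster = DRMAACluster()
cluster.start_workers(4, nativeSpecification="-l h_vmem=4G")  # per-batch template override

client = Client(cluster)
print(client.submit(lambda x: x * 2, 21).result())  # 42
client.close()
cluster.close()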
Example #34
0
class DaskRunner(object):
    def __init__(self, config):
        self.config = config
        self.scheduler_mode = config.getvalue("dask_scheduler_mode")
        remote_cluster_address = config.getvalue('dask_scheduler_address')
        if remote_cluster_address:
            self.client = Client(remote_cluster_address)
        else:
            self.cluster = LocalCluster(
                ip='127.0.0.1',
                n_workers=int(config.getvalue('dask_nworkers')),
                processes=config.getvalue('dask_scheduler_mode') == 'process')
            self.client = Client(self.cluster, set_as_default=True)

    def __getstate__(self):
        return {'config': None}

    def __setstate__(self, state):
        for k in state:
            pass

    def pytest_runtestloop(self, session):
        if (session.testsfailed
                and not session.config.option.continue_on_collection_errors):
            raise session.Interrupted("%d errors during collection" %
                                      session.testsfailed)

        unregister_plugins = ['debugging', 'terminalreporter']
        for p in unregister_plugins:
            session.config.pluginmanager.unregister(p)

        if session.config.option.collectonly:
            return True

        def generate_tasks(session):
            for i, item in enumerate(session.items):

                # @delayed(pure=False)
                def run_test(_item):
                    # ensure that the plugin manager gets recreated appropriately.
                    _item.config.pluginmanager.__recreate__()
                    results = self.pytest_runtest_protocol(item=_item,
                                                           nextitem=None)
                    return results

                # hook = item.ihook
                # try to ensure that the module gets treated as a dynamic module that does not
                # exist.

                # delattr(item.module, '__file__')
                # setup = hook.pytest_runtest_setup
                # make_report = hook.pytest_runtest_makereport

                fut = self.client.submit(run_test, item, pure=False)
                yield fut

        with self.remote_syspath_ctx():
            tasks = generate_tasks(session)

            # log these reports to the console.
            for resolved in as_completed(tasks):
                t = resolved.result()
                for report in t:
                    session.ihook.pytest_runtest_logreport(report=report)

        return True

    @contextmanager
    def remote_syspath_ctx(self):
        # Test directories can be dynamic in certain cases, so we make sure the
        # workers use the same PYTHONPATH that we are using here.
        original_sys_path = self.client.run(get_imports)
        logger.debug("Original remote sys path %s", original_sys_path)
        updated_sys_path = self.client.run(update_syspath, sys.path)
        logger.debug("Updated remote sys path %s", updated_sys_path)
        try:
            yield
        finally:
            # restore correct syspath
            for worker, value in original_sys_path.items():
                self.client.run(restore_syspath, value, workers=[worker])

            original_sys_path2 = self.client.run(get_imports)
            assert original_sys_path == original_sys_path2

    def call_and_report(self, item, when, log=True, **kwds):
        call = self.call_runtest_hook(item, when, **kwds)
        hook = item.ihook
        report = hook.pytest_runtest_makereport(item=item, call=call)
        return report

    def call_runtest_hook(self, item, when, **kwds):
        hookname = "pytest_runtest_" + when
        ihook = getattr(item.ihook, hookname)
        return CallInfo(lambda: ihook(item=item, **kwds), when=when)

    # VENDORED so that we have access to the report objects and not just T/F
    def pytest_runtest_protocol(self, item, log=True, nextitem=None):
        hasrequest = hasattr(item, "_request")
        if hasrequest and not item._request:
            item._initrequest()
        rep = self.call_and_report(item, "setup", log)
        reports = [rep]
        if rep.passed:
            if item.config.option.setupshow:
                # TODO figure out how to pass this test
                # show_test_item(item)
                pass
            if not item.config.option.setuponly:
                rep = self.call_and_report(item, "call", log)
                reports.append(rep)
        rep = self.call_and_report(item, "teardown", log, nextitem=None)
        reports.append(rep)
        # after all teardown hooks have been called
        # want funcargs and request info to go away
        if hasrequest:
            item._request = False
            item.funcargs = None
        return reports

    def pytest_runtest_setup(self, item):
        item.session._setupstate.prepare(item)

    def pytest_unconfigure(self, config):
        """ called before test process is exited.  """
        if hasattr(self, 'cluster'):
            self.cluster.close()
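
A standalone sketch of the submission pattern DaskRunner relies on: impure tasks (pure=False, so identical calls are not deduplicated) streamed back with as_completed. The names below are illustrative and not part of the pytest plugin's API.

from distributed import Client, LocalCluster, as_completed


def run_one(name):
    # Stand-in for executing a single collected test item and returning its report.
    return (name, "passed")


if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, \
            Client(cluster) as client:
        futures = [client.submit(run_one, "test_%d" % i, pure=False)
                   for i in range(5)]
        for fut in as_completed(futures):
            name, outcome = fut.result()
            print(name, outcome)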