示例#1
0
async def test_stress():
    """Repeatedly rechunk a persisted random array on a UCX ``LocalCluster``.

    The test is skipped when ``get_ip_interface("ib0")`` fails for any reason.
    """
    from distributed.utils import get_ip_interface

    # this check should be removed once UCX + TCP works
    try:
        get_ip_interface("ib0")
    except Exception:
        pytest.skip("ib0 interface not found")

    import dask.array as da
    from distributed import wait

    chunksize = "10 MB"
    # The two chunk layouts the array is bounced between.
    layout_a = (-1, chunksize)
    layout_b = (chunksize, -1)

    cluster_options = dict(protocol="ucx", interface="ib0", asynchronous=True)
    async with LocalCluster(**cluster_options) as cluster:
        async with Client(cluster, asynchronous=True) as client:
            random_state = da.random.RandomState()
            x = random_state.random((10000, 10000), chunks=layout_a).persist()
            await wait(x)

            for _ in range(10):
                # Switch to the other layout and back, then materialise.
                x = x.rechunk(layout_b).rechunk(layout_a).persist()
                await wait(x)
示例#2
0
def test_get_ip_interface():
    """Check the loopback lookup and the ``KeyError`` for an unknown name.

    Only macOS (``lo0``) and Linux (``lo``) are covered; other platforms skip.
    """
    platform = sys.platform
    if platform == 'darwin':
        loopback = 'lo0'
    elif platform.startswith('linux'):
        loopback = 'lo'
    else:
        loopback = None

    if loopback is None:
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))

    assert get_ip_interface(loopback) == '127.0.0.1'

    with pytest.raises(KeyError):
        get_ip_interface('__non-existent-interface')
示例#3
0
def test_get_ip_interface():
    """Loopback interface maps to 127.0.0.1; an unknown name raises KeyError.

    Skipped on platforms other than macOS and Linux.
    """
    is_macos = sys.platform == "darwin"
    is_linux = sys.platform.startswith("linux")

    if not (is_macos or is_linux):
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))

    # macOS names its loopback device "lo0"; Linux uses "lo".
    loopback_name = "lo0" if is_macos else "lo"
    assert get_ip_interface(loopback_name) == "127.0.0.1"

    with pytest.raises(KeyError):
        get_ip_interface("__non-existent-interface")
示例#4
0
def test_get_ip_interface():
    """Verify ``get_ip_interface`` on the platform's loopback device.

    The loopback name must resolve to ``127.0.0.1`` and a made-up interface
    name must raise ``KeyError``.  Unsupported platforms are skipped.
    """
    if sys.platform == 'darwin':
        expected_pairs = [('lo0', '127.0.0.1')]
    elif sys.platform.startswith('linux'):
        expected_pairs = [('lo', '127.0.0.1')]
    else:
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))

    for ifname, address in expected_pairs:
        assert get_ip_interface(ifname) == address

    with pytest.raises(KeyError):
        get_ip_interface('__non-existent-interface')
示例#5
0
async def test_interface_async(loop, Worker):
    """Scheduler, worker and client all bind to the loopback interface.

    The interface is discovered by probing every name reported by ``psutil``
    until one resolves to ``127.0.0.1``; the test is skipped if none does.
    """
    from distributed.utils import get_ip_interface

    psutil = pytest.importorskip("psutil")
    if_names = sorted(psutil.net_if_addrs())

    loopback_name = None
    for candidate in if_names:
        try:
            candidate_ip = get_ip_interface(candidate)
        except ValueError:
            # This interface could not be resolved; try the next one.
            continue
        if candidate_ip == "127.0.0.1":
            loopback_name = candidate
            break

    if loopback_name is None:
        pytest.skip("Could not find loopback interface. "
                    "Available interfaces are: %s." % (if_names, ))

    expected_prefix = "tcp://127.0.0.1"
    async with Scheduler(interface=loopback_name) as s:
        assert s.address.startswith(expected_prefix)
        async with Worker(s.address, interface=loopback_name) as w:
            assert w.address.startswith(expected_prefix)
            assert w.ip == "127.0.0.1"
            async with Client(s.address, asynchronous=True) as c:
                info = c.scheduler_info()
                assert expected_prefix in info["address"]
                worker_hosts = (d["host"] for d in info["workers"].values())
                assert all("127.0.0.1" == host for host in worker_hosts)
示例#6
0
def test_interface(loop):
    """Run ``dask-scheduler`` and ``dask-worker`` with ``--interface``.

    A loopback interface is located by probing every name from ``psutil``;
    the test is skipped when none resolves to ``127.0.0.1``.  Once a worker
    has registered, the scheduler and every worker must report 127.0.0.1.
    """
    if_names = sorted(psutil.net_if_addrs())

    loopback_name = None
    for candidate in if_names:
        try:
            candidate_ip = get_ip_interface(candidate)
        except ValueError:
            # This interface could not be resolved; try the next one.
            continue
        if candidate_ip == "127.0.0.1":
            loopback_name = candidate
            break

    if loopback_name is None:
        pytest.skip("Could not find loopback interface. "
                    "Available interfaces are: %s." % (if_names, ))

    scheduler_cmd = [
        "dask-scheduler", "--no-dashboard", "--interface", loopback_name
    ]
    worker_cmd = [
        "dask-worker", "127.0.0.1:8786", "--no-dashboard",
        "--interface", loopback_name
    ]

    with popen(scheduler_cmd) as s:
        with popen(worker_cmd) as a:
            with Client("tcp://127.0.0.1:%d" % Scheduler.default_port,
                        loop=loop) as c:
                # Poll until a worker shows up, failing after 30 seconds.
                began = time()
                while len(c.nthreads()) == 0:
                    sleep(0.1)
                    assert time() - began < 30

                info = c.scheduler_info()
                assert "tcp://127.0.0.1" in info["address"]
                worker_hosts = (d["host"] for d in info["workers"].values())
                assert all("127.0.0.1" == host for host in worker_hosts)
示例#7
0
def test_interface(loop):
    """Run ``dask-scheduler`` and ``dask-worker`` with ``--interface``.

    Requires ``psutil`` (skipped otherwise).  A loopback interface is found by
    probing each reported name; the test is skipped when none resolves to
    ``127.0.0.1``.
    """
    psutil = pytest.importorskip('psutil')
    if_names = sorted(psutil.net_if_addrs())

    loopback_name = None
    for candidate in if_names:
        try:
            candidate_ip = get_ip_interface(candidate)
        except ValueError:
            # This interface could not be resolved; try the next one.
            continue
        if candidate_ip == '127.0.0.1':
            loopback_name = candidate
            break

    if loopback_name is None:
        pytest.skip("Could not find loopback interface. "
                    "Available interfaces are: %s." % (if_names,))

    scheduler_cmd = ['dask-scheduler', '--no-bokeh', '--interface', loopback_name]
    worker_cmd = ['dask-worker', '127.0.0.1:8786', '--no-bokeh',
                  '--interface', loopback_name]

    with popen(scheduler_cmd) as s:
        with popen(worker_cmd) as a:
            with Client('tcp://127.0.0.1:%d' % Scheduler.default_port, loop=loop) as c:
                # Poll until a worker shows up, failing after 5 seconds.
                began = time()
                while len(c.ncores()) == 0:
                    sleep(0.1)
                    assert time() - began < 5

                info = c.scheduler_info()
                assert 'tcp://127.0.0.1' in info['address']
                worker_hosts = (d['host'] for d in info['workers'].values())
                assert all('127.0.0.1' == host for host in worker_hosts)
示例#8
0
def test_get_ip_interface():
    """Check the loopback lookup and the error raised for an unknown name.

    The ``ValueError`` message must match a pattern containing the requested
    name, the words "network interface" and the quoted loopback name of the
    current platform.  Platforms other than macOS and Linux are skipped.
    """
    if sys.platform == "darwin":
        loopback = "lo0"
    elif sys.platform.startswith("linux"):
        loopback = "lo"
    else:
        pytest.skip("test needs to be enhanced for platform %r" % (sys.platform,))

    assert get_ip_interface(loopback) == "127.0.0.1"

    non_existent_interface = "__non-existent-interface"
    # Regex: repr of the bad name ... "network interface" ... quoted loopback.
    pattern_head = "{!r}.+network interface.+".format(non_existent_interface)
    expected_error_message = pattern_head + "'" + loopback + "'"

    with pytest.raises(ValueError, match=expected_error_message):
        get_ip_interface(non_existent_interface)
示例#9
0
def main(scheduler_file, interface, nthreads, local_directory, memory_limit,
         scheduler, bokeh_port, bokeh_prefix, nanny, bokeh_worker_port):
    """Run this process as either the scheduler or a worker.

    The process becomes the scheduler when the module-level ``rank`` is 0 and
    ``scheduler`` is truthy; otherwise it starts a ``Nanny`` (if ``nanny``) or
    a ``Worker``.  Both roles use the module-level ``loop``.  When
    ``interface`` is given, the listening host is resolved from it.
    """
    host = get_ip_interface(interface) if interface else None

    if rank == 0 and scheduler:
        # The bokeh dashboard is optional: run without services if missing.
        try:
            from distributed.bokeh.scheduler import BokehScheduler
        except ImportError:
            services = {}
        else:
            dashboard = partial(BokehScheduler, prefix=bokeh_prefix)
            services = {('bokeh', bokeh_port): dashboard}

        scheduler = Scheduler(scheduler_file=scheduler_file,
                              loop=loop,
                              services=services)
        addr = uri_from_host_port(host, None, 8786)
        scheduler.start(addr)
        try:
            loop.start()
            loop.close()
        finally:
            scheduler.stop()
        return

    # Worker role.
    worker_cls = Nanny if nanny else Worker
    worker = worker_cls(scheduler_file=scheduler_file,
                        loop=loop,
                        name=rank if scheduler else None,
                        ncores=nthreads,
                        local_dir=local_directory,
                        services={('bokeh', bokeh_worker_port): BokehWorker},
                        memory_limit=memory_limit)
    addr = uri_from_host_port(host, None, 0)

    @gen.coroutine
    def run():
        # Start the worker, then poll until it reports itself closed.
        yield worker._start(addr)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    @gen.coroutine
    def close():
        yield worker._close(timeout=2)

    loop.run_sync(run)
    loop.close()

    loop.run_sync(close)
示例#10
0
def main(scheduler_file, interface, nthreads, local_directory, memory_limit,
         scheduler, bokeh_port, bokeh_prefix, nanny, bokeh_worker_port):
    """Start the scheduler on rank 0 (if requested) or a worker elsewhere.

    ``rank`` and ``loop`` are module-level names.  The scheduler listens on
    port 8786 by default and the worker on an arbitrary port (0); the host
    comes from ``interface`` when one is supplied.
    """
    if interface:
        host = get_ip_interface(interface)
    else:
        host = None

    run_scheduler = rank == 0 and scheduler
    if run_scheduler:
        services = {}
        try:
            from distributed.bokeh.scheduler import BokehScheduler
        except ImportError:
            # No bokeh available: keep the empty service table.
            pass
        else:
            services[('bokeh', bokeh_port)] = partial(BokehScheduler,
                                                      prefix=bokeh_prefix)

        scheduler = Scheduler(scheduler_file=scheduler_file,
                              loop=loop,
                              services=services)
        scheduler.start(uri_from_host_port(host, None, 8786))
        try:
            loop.start()
            loop.close()
        finally:
            scheduler.stop()
    else:
        W = Nanny if nanny else Worker
        worker = W(scheduler_file=scheduler_file,
                   loop=loop,
                   name=rank if scheduler else None,
                   ncores=nthreads,
                   local_dir=local_directory,
                   services={('bokeh', bokeh_worker_port): BokehWorker},
                   memory_limit=memory_limit)
        worker_addr = uri_from_host_port(host, None, 0)

        @gen.coroutine
        def run():
            # Start the worker, then poll until it reports itself closed.
            yield worker._start(worker_addr)
            while not worker.status == 'closed':
                yield gen.sleep(0.2)

        @gen.coroutine
        def close():
            yield worker._close(timeout=2)

        loop.run_sync(run)
        loop.close()

        loop.run_sync(close)
示例#11
0
def main(scheduler_file, interface, nthreads, local_directory, memory_limit,
         scheduler):
    """Run this process as either the scheduler or a worker.

    The scheduler role is taken when the module-level ``rank`` is 0 and
    ``scheduler`` is truthy; every other process starts a ``Worker``.  Both
    roles use the module-level ``loop``.
    """
    host = get_ip_interface(interface) if interface else None

    if rank == 0 and scheduler:
        scheduler = Scheduler(scheduler_file=scheduler_file,
                              loop=loop,
                              services={('bokeh', 8787): BokehScheduler})
        scheduler.start(uri_from_host_port(host, None, 8786))
        try:
            loop.start()
            loop.close()
        finally:
            scheduler.stop()
        return

    # Worker role.
    worker = Worker(scheduler_file=scheduler_file,
                    loop=loop,
                    name=rank if scheduler else None,
                    ncores=nthreads,
                    local_dir=local_directory,
                    services={'bokeh': BokehWorker},
                    memory_limit=memory_limit)
    worker_addr = uri_from_host_port(host, None, 0)

    @gen.coroutine
    def run():
        # Start the worker, then poll until it reports itself closed.
        yield worker._start(worker_addr)
        while worker.status != 'closed':
            yield gen.sleep(0.2)

    @gen.coroutine
    def close():
        yield worker._close(timeout=2)

    loop.run_sync(run)
    loop.close()

    loop.run_sync(close)
示例#12
0
def address_from_user_args(
    host=None,
    port=None,
    interface=None,
    protocol=None,
    peer=None,
    security=None,
    default_port=0,
) -> str:
    """Get an address to listen on from common user provided arguments

    Parameters
    ----------
    host : str, optional
        Host name or address, optionally already prefixed with ``scheme://``.
    port : int, optional
        Port to listen on.
    interface : str, optional
        Network interface name; resolved to a host with ``get_ip_interface``.
        Mutually exclusive with ``host``.
    protocol : str, optional
        Scheme such as ``"tcp"`` or ``"tcp://"``.  Defaults to ``"tls"`` when
        ``security.require_encryption`` is set.
    peer : optional
        Accepted for interface compatibility; not used by this function.
    security : optional
        Object whose ``require_encryption`` attribute is consulted.
    default_port : int
        Passed through to ``uri_from_host_port``.

    Returns
    -------
    str
        The listening address, or ``""`` when nothing was specified.

    Raises
    ------
    ValueError
        If the in-process protocol is combined with a host, port or
        interface, or if both ``interface`` and ``host`` are given.
    """

    if security and security.require_encryption and not protocol:
        protocol = "tls"

    # Bare scheme name with any trailing ":" / "/" removed ("tcp://" -> "tcp").
    scheme = protocol.rstrip(":/") if protocol else None

    # The check used to compare against "inplace" only, so the real "inproc"
    # protocol was never validated here.  "inplace" is still accepted so that
    # any caller relying on the old spelling keeps working.
    if protocol and scheme in ("inproc", "inplace"):
        if host or port or interface:
            raise ValueError(
                "Can not specify inproc protocol and host or port or interface"
            )
        return "inproc://"

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host", interface, host)
        host = get_ip_interface(interface)

    # Prefix the host with the scheme unless it already carries one.
    if protocol and host and "://" not in host:
        host = scheme + "://" + host

    if host or port:
        addr = uri_from_host_port(host, port, default_port)
    else:
        addr = ""

    # Force the requested scheme, replacing whatever the address had.
    if protocol:
        addr = scheme + "://" + addr.split("://")[-1]

    return addr
示例#13
0
    def _start(self, ip=None, n_workers=0):
        """
        Start the scheduler and ``n_workers`` workers.

        Does nothing when the cluster status is already ``"running"``.  The
        scheduler address is the bare protocol for ``inproc://``; otherwise it
        is built from ``ip`` (falling back to the configured interface, then
        to 127.0.0.1) plus the scheduler port when one is set.  Each worker
        gets its own ``CUDA_VISIBLE_DEVICES`` value and a ``gpu-<i>`` name.
        """
        if self.status == "running":
            return

        if self.protocol == "inproc://":
            address = self.protocol
        else:
            if ip is None:
                ip = get_ip_interface(self.interface) if self.interface else "127.0.0.1"

            # An ip that already carries a scheme is used verbatim.
            address = ip if "://" in ip else self.protocol + ip
            if self.scheduler_port:
                address += ":" + str(self.scheduler_port)

        self.scheduler.start(address)

        pending_workers = []
        for index in range(n_workers):
            pending_workers.append(
                self._start_worker(
                    **self.worker_kwargs,
                    env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(index)},
                    name="gpu-" + str(index),
                )
            )
        # Wait for all workers to come up together.
        yield pending_workers

        self.status = "running"

        raise gen.Return(self)
示例#14
0
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
         use_xheaders, pid_file, scheduler_file, interface, local_directory,
         preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    """Start a scheduler and run the event loop until it stops.

    Parameters used by this body:

    - ``host`` / ``port`` / ``interface``: where to listen.  ``interface`` is
      resolved with ``get_ip_interface`` and cannot be combined with ``host``.
      The default port is 8786.
    - ``tls_ca_file`` / ``tls_cert`` / ``tls_key``: passed to ``Security``;
      if any is set and no host was given, the host becomes ``'tls://'``.
    - ``pid_file``: path that receives this process's PID and is removed at
      interpreter exit.
    - ``local_directory``: directory added to ``sys.path``; a temporary one is
      created when omitted.  A directory created here is deleted on shutdown.
    - ``_bokeh`` / ``bokeh_port`` / ``bokeh_prefix``: optional bokeh service.
    - ``scheduler_file``: forwarded to ``Scheduler``.
    - ``preload`` / ``preload_argv``: modules to preload; each falls back to
      the dask configuration when empty.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    referenced in this body.

    Raises ``ValueError`` if both ``interface`` and ``host`` are given.
    """

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    # TLS material without an explicit host: use a bare tls:// scheme as host.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        # Remove the PID file again when the interpreter exits.
        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # Track whether the directory was created here so that only a directory
    # made by this function is removed during shutdown.
    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    # On Linux, raise the soft open-file limit to at least half the hard limit.
    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        # The bokeh service is optional; an ImportError is silently ignored.
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler, {
                'prefix': bokeh_prefix
            })
    scheduler = Scheduler(loop=loop,
                          services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)
    # Fall back to the dask configuration when no preload options were given.
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload,
                    parameter=scheduler,
                    file_dir=local_directory,
                    argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(loop)

    try:
        # Blocks until the loop is stopped.
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
示例#15
0
def get_host_from_interface(interface=None):
    """Return the IP address for ``interface``, or ``None`` when it is falsy."""
    if not interface:
        return None
    return get_ip_interface(interface)
示例#16
0
def main(host, port, bokeh_port, show, _bokeh, bokeh_whitelist, bokeh_prefix,
        use_xheaders, pid_file, scheduler_file, interface,
        local_directory, preload, preload_argv, tls_ca_file, tls_cert, tls_key):
    """Start a scheduler and run the event loop until it stops.

    Parameters used by this body:

    - ``host`` / ``port`` / ``interface``: where to listen.  ``interface`` is
      resolved with ``get_ip_interface`` and cannot be combined with ``host``.
      The default port is 8786.
    - ``tls_ca_file`` / ``tls_cert`` / ``tls_key``: passed to ``Security``;
      if any is set and no host was given, the host becomes ``'tls://'``.
    - ``pid_file``: path that receives this process's PID and is removed at
      interpreter exit.
    - ``local_directory``: directory added to ``sys.path``; a temporary one is
      created when omitted.  A directory created here is deleted on shutdown.
    - ``_bokeh`` / ``bokeh_port`` / ``bokeh_prefix``: optional bokeh service;
      an import failure is logged rather than raised.
    - ``scheduler_file``: forwarded to ``Scheduler``.
    - ``preload`` / ``preload_argv``: modules to preload; each falls back to
      the dask configuration when empty.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    referenced in this body.

    Raises ``ValueError`` if both ``interface`` and ``host`` are given.
    """

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_scheduler_cert=tls_cert,
                   tls_scheduler_key=tls_key,
                   )

    # TLS material without an explicit host: use a bare tls:// scheme as host.
    if not host and (tls_ca_file or tls_cert or tls_key):
        host = 'tls://'

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        # Remove the PID file again when the interpreter exits.
        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        atexit.register(del_pid_file)

    # Track whether the directory was created here so that only a directory
    # made by this function is removed during shutdown.
    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    # On Linux, raise the soft open-file limit to at least half the hard limit.
    if sys.platform.startswith('linux'):
        import resource   # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {}
    if _bokeh:
        # The bokeh service is optional: log why it is missing and carry on.
        try:
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = (BokehScheduler,
                                               {'prefix': bokeh_prefix})
        except ImportError as error:
            if str(error).startswith('No module named'):
                logger.info('Web dashboard not loaded.  Unable to import bokeh')
            else:
                logger.info('Unable to import bokeh: %s' % str(error))

    scheduler = Scheduler(loop=loop, services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)
    # Fall back to the dask configuration when no preload options were given.
    if not preload:
        preload = dask.config.get('distributed.scheduler.preload')
    if not preload_argv:
        preload_argv = dask.config.get('distributed.scheduler.preload-argv')
    preload_modules(preload, parameter=scheduler, file_dir=local_directory, argv=preload_argv)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)

    install_signal_handlers(loop)

    try:
        # Blocks until the loop is stopped.
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
示例#17
0
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    """Start one ``Nanny`` per GPU and run until any of them closes.

    The number of nannies is the number of comma-separated entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is unset.  Each nanny receives its own
    ``CUDA_VISIBLE_DEVICES`` value from ``cuda_visible_devices(i)``.

    Parameters used by this body:

    - ``scheduler`` / ``scheduler_file``: how to find the scheduler.  One of
      them, or a ``"scheduler-address"`` entry in ``config``, is required.
    - ``host`` / ``interface``: listening host.  ``interface`` is resolved
      with ``get_ip_interface`` and cannot be combined with ``host``.
    - ``nthreads``: threads per worker, derived from ``_ncores`` when falsy.
    - ``name``: worker name; suffixed with ``-<i>`` when there are several
      processes and a name was given.
    - ``resources``: string of ``key=value`` pairs separated by commas and/or
      whitespace; values are converted to ``float``.
    - ``pid_file``: path that receives this process's PID and is removed at
      interpreter exit.
    - ``dashboard`` / ``dashboard_address`` / ``bokeh_prefix``: optional
      dashboard service.
    - ``death_timeout``: parsed with ``parse_timedelta`` (default unit "s").
    - ``preload``: extra preload modules; ``dask_cuda.initialize_context`` is
      always appended.
    - The remaining arguments are forwarded to ``Security`` or ``Nanny``.

    Raises ``ValueError`` when no scheduler address can be determined or when
    both ``interface`` and ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    # One worker process per visible GPU.
    # NOTE(review): if get_n_gpus() can return 0, the division below raises
    # ZeroDivisionError -- confirm.
    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    # NOTE(review): min(1, ...) never exceeds one thread and evaluates to 0
    # when there are fewer cores than processes -- confirm whether max(1, ...)
    # was intended.
    if not nthreads:
        nthreads = min(1, _ncores // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        # Remove the PID file again when the interpreter exits.
        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        # The dashboard is optional; a failed import simply leaves it out.
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    # Turn "a=1,b=2" or "a=1 b=2" into {"a": 1.0, "b": 2.0}.
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=(preload or []) + ["dask_cuda.initialize_context"],
          preload_argv=preload_argv,
          security=sec,
          contact_address=None,
          env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        # Start every nanny, then poll until at least one reports "closed".
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
示例#18
0
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, bokeh_prefix, use_xheaders, pid_file, scheduler_file,
         interface, local_directory, preload, prefix, tls_ca_file, tls_cert,
         tls_key):
    """Start a scheduler and run the event loop until it stops.

    Parameters used by this body:

    - ``bokeh_internal_port`` / ``prefix``: removed options; passing either
      prints a message and exits with status 1.
    - ``host`` / ``port`` / ``interface``: where to listen.  ``interface`` is
      resolved with ``get_ip_interface`` and cannot be combined with ``host``.
      The default port is 8786.
    - ``tls_ca_file`` / ``tls_cert`` / ``tls_key``: passed to ``Security``.
    - ``pid_file``: path that receives this process's PID and is removed at
      interpreter exit.
    - ``local_directory``: directory added to ``sys.path``; a temporary one is
      created when omitted.  A directory created here is deleted on shutdown.
    - ``http_port``: port key for the ``HTTPScheduler`` service.
    - ``_bokeh`` / ``bokeh_port`` / ``bokeh_prefix``: optional bokeh service.
    - ``scheduler_file``: forwarded to ``Scheduler``.
    - ``preload``: forwarded to ``preload_modules``.

    ``show``, ``bokeh_whitelist`` and ``use_xheaders`` are accepted but not
    referenced in this body.

    Raises ``ValueError`` if both ``interface`` and ``host`` are given.
    """

    # Reject options that no longer exist, pointing at their replacements.
    if bokeh_internal_port:
        print("The --bokeh-internal-port keyword has been removed.\n"
              "The internal bokeh server is now the default bokeh server.\n"
              "Use --bokeh-port %d instead" % bokeh_internal_port)
        sys.exit(1)

    if prefix:
        print("The --prefix keyword has moved to --bokeh-prefix")
        sys.exit(1)

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_scheduler_cert=tls_cert,
        tls_scheduler_key=tls_key,
    )

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        # Remove the PID file again when the interpreter exits.
        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # Track whether the directory was created here so that only a directory
    # made by this function is removed during shutdown.
    local_directory_created = False
    if local_directory:
        if not os.path.exists(local_directory):
            os.mkdir(local_directory)
            local_directory_created = True
    else:
        local_directory = tempfile.mkdtemp(prefix='scheduler-')
        local_directory_created = True
    if local_directory not in sys.path:
        sys.path.insert(0, local_directory)

    # On Linux, raise the soft open-file limit to at least half the hard limit.
    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        # The bokeh service is optional; an ImportError is silently ignored.
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_port)] = partial(BokehScheduler,
                                                      prefix=bokeh_prefix)
    scheduler = Scheduler(loop=loop,
                          services=services,
                          scheduler_file=scheduler_file,
                          security=sec)
    scheduler.start(addr)
    preload_modules(preload, parameter=scheduler, file_dir=local_directory)

    logger.info('Local Directory: %26s', local_directory)
    logger.info('-' * 47)
    try:
        # Blocks until the loop is stopped.
        loop.start()
        loop.close()
    finally:
        scheduler.stop()
        if local_directory_created:
            shutil.rmtree(local_directory)

        logger.info("End scheduler at %r", addr)
示例#19
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, reconnect, resources, bokeh,
         bokeh_port, local_directory, scheduler_file, interface, death_timeout,
         preload, bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    """Start ``nprocs`` workers (or nannies) and run until one of them closes.

    Parameters used by this body:

    - ``scheduler`` / ``scheduler_file``: scheduler address, or a JSON file
      whose ``address`` entry provides it.  The file is waited for.
    - ``host`` / ``interface``: listening host.  ``interface`` is resolved
      with ``get_ip_interface`` and cannot be combined with ``host``.
    - ``nanny``: start ``Nanny`` processes when truthy, plain ``Worker``
      otherwise.
    - ``worker_port`` / ``nanny_port``: the listening port is ``nanny_port``
      when ``nanny`` is set and ``worker_port`` otherwise.
    - ``nprocs``: number of processes.  Values above 1 cannot be combined
      with a non-zero ``worker_port``, with ``name``, or with ``nanny`` being
      falsy; each of these logs an error and exits with status 1.
    - ``nthreads``: threads per process, ``_ncores // nprocs`` when falsy.
    - ``resources``: string of ``key=value`` pairs separated by commas and/or
      whitespace; values are converted to ``float``.
    - ``pid_file``: path that receives this process's PID and is removed at
      interpreter exit.
    - ``http_port``, ``bokeh``, ``bokeh_port``, ``bokeh_prefix``: service
      configuration.
    - The remaining arguments are forwarded to ``Security`` or to the worker
      class.

    Raises ``ValueError`` when no scheduler address is available or when both
    ``interface`` and ``host`` are given.
    """
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    # The process that listens on `addr` is the nanny when one is used.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # Options that only make sense for a single process.
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        # Remove the PID file again when the interpreter exits.
        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        # The bokeh service is optional; a failed import simply leaves it out.
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    # Turn "a=1,b=2" or "a=1 b=2" into {"a": 1.0, "b": 2.0}.
    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        # Block until the file exists, then make up to ten attempts to read
        # the address from it.  If every attempt fails, `scheduler` keeps the
        # value that was passed in.
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Only nannies are closed explicitly; the loop is stopped either way.
        try:
            if nanny:
                yield [n._close(timeout=2) for n in nannies]
        finally:
            loop.stop()

    def handle_signal(signum, frame):
        logger.info("Exiting on signal %d", signum)
        # Stop a running loop from within the loop; otherwise exit directly.
        if loop._running:
            loop.add_callback_from_signal(loop.stop)
        else:
            exit(0)

    # NOTE: We can't use the generic install_signal_handlers() function from
    # distributed.cli.utils because we're handling the signal differently.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    for n in nannies:
        n.start(addr)

    @gen.coroutine
    def run():
        # Poll until at least one process reports itself closed.
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")

    # Clean exit: unregister all workers from scheduler
    loop.run_sync(close_all)
示例#20
0
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    """Launch one ``Nanny`` per GPU and block until all of them finish.

    The number of processes is the number of comma-separated entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is not set.

    Parameters
    ----------
    scheduler, scheduler_file
        Scheduler location, forwarded to every ``Nanny``. One of them, or a
        ``"scheduler-address"`` entry in ``config``, is required.
    host, interface
        Mutually exclusive. When ``interface`` is given, ``host`` is resolved
        from it with ``get_ip_interface``.
    nthreads
        Threads per worker; when falsy it is derived from the CPU count.
    name
        Worker name; suffixed with ``-<index>`` when more than one process is
        started and a name was given.
    memory_limit, device_memory_limit, local_directory
        Forwarded to the ``DeviceHostFile`` data store (``memory_limit`` is
        also passed to the ``Nanny``). A ``device_memory_limit`` of
        ``"auto"`` or ``0`` selects ``get_device_total_memory`` for the
        device.
    pid_file
        If set, the current PID is written there and the file is removed at
        interpreter exit.
    resources
        String of ``key=value`` pairs separated by commas and/or whitespace;
        values are converted to ``float``.
    dashboard, dashboard_address, dashboard_prefix
        Control registration of the ``BokehWorker`` service; the service is
        skipped silently when it cannot be imported.
    death_timeout
        Accepted but not used by this function.
    preload
        Iterable of modules to preload (may be ``None`` or empty);
        ``"dask_cuda.initialize_context"`` is always appended.
    tls_ca_file, tls_cert, tls_key
        Passed to ``Security``.
    **kwargs
        Accepted but discarded: the name is rebound to a fixed set of Nanny
        keyword arguments below.

    Raises
    ------
    ValueError
        If no scheduler address can be determined, or if both ``interface``
        and ``host`` are given.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # NOTE(review): min(1, ...) can never exceed 1 and evaluates to 0 when
        # there are more processes than CPUs -- confirm this is intended.
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # The dashboard is optional; run without it when unavailable.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # Any keyword arguments supplied by the caller are dropped here.
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    # Normalise ``preload`` to a list first: concatenating a tuple (or None)
    # with a list would raise TypeError.
    preload_modules = list(preload or []) + ["dask_cuda.initialize_context"]

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=preload_modules,
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        # Start every nanny, then wait until each reports it has finished.
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
示例#21
0
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 log_directory=None,
                 threads=None,
                 shebang=None,
                 python=sys.executable,
                 config_name=None,
                 **kwargs):
        """ """
        # This initializer should be considered abstract and never used
        # directly: ``config_name`` must be supplied (by a subclass) and names
        # the ``jobqueue.<config_name>`` section of the dask config.
        #
        # Every argument left as None falls back to the matching
        # ``jobqueue.<config_name>.<key>`` config entry. ``threads`` is
        # deprecated: passing it raises ValueError, and a config entry for it
        # triggers a warning. Remaining ``kwargs`` go to ``LocalCluster``.
        #
        # Raises ValueError when ``threads`` is given or when ``cores`` or
        # ``memory`` are still None after the config lookup, and
        # NotImplementedError when ``config_name`` is None.
        super(JobQueueCluster, self).__init__()

        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if config_name is None:
            raise NotImplementedError(
                "JobQueueCluster is an abstract class that should not be instantiated."
            )

        if name is None:
            name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        # The key must be interpolated with ``config_name``; the literal
        # "jobqueue.%s.threads" never exists, so the warning could not fire.
        if dask.config.get("jobqueue.%s.threads" % config_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError(
                "You must specify how many cores to use per job like ``cores=8``"
            )

        if memory is None:
            raise ValueError(
                "You must specify how much memory to use per job like ``memory='24 GB'``"
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            # Build a new list instead of ``+=``: ``extra`` may be the list
            # object held by the dask config or owned by the caller, and an
            # in-place extend would leak ``--interface`` into it.
            extra = list(extra or []) + ["--interface", interface]
            kwargs.setdefault("ip", get_ip_interface(interface))
        else:
            kwargs.setdefault("ip", "")

        # Bokeh diagnostics server should listen on all interfaces
        kwargs.setdefault("dashboard_address", ("", 8787))
        self.local_cluster = LocalCluster(n_workers=0, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        # plugin for tracking job status
        self._scheduler_plugin = JobQueuePlugin()
        self.local_cluster.scheduler.add_plugin(self._scheduler_plugin)

        self._adaptive = None

        self.shebang = shebang

        self._env_header = "\n".join(env_extra)

        # dask-worker command line build
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler.address]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", "%s--${JOB_ID}--" % name]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        # Arguments may be non-strings (ints, etc.), hence the ``map(str, ...)``.
        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                os.makedirs(self.log_directory)
示例#22
0
文件: slurm.py 项目: leej3/pangeo
    def __init__(self,
                 name='dask',
                 queue='dav',
                 project=None,
                 threads_per_worker=4,
                 processes=8,
                 memory='7GB',
                 walltime='00:30:00',
                 interface=None,
                 extra='',
                 **kwargs):
        """ Initialize a SLURM Cluster

        Parameters
        ----------
        name : str
            Name of worker jobs. Passed to `#SBATCH -J` option.
        queue : str
            Destination queue for each worker job.
            Passed to `#SBATCH -p` option.
        project : str
            Accounting string associated with each worker job. Passed to
            `#SBATCH -A` option.
        threads_per_worker : int
            Number of threads per process.
        processes : int
            Number of processes per node.
        memory : str
            Bytes of memory that the worker can use. This should be a string
            like "7GB" that can be interpretted both by PBS and Dask.
        walltime : str
            Walltime for each worker job.
        interface : str
            Network interface like 'eth0' or 'ib0'.
        extra : str
            Additional arguments to pass to `dask-worker`
        kwargs : dict
            Additional keyword arguments to pass to `LocalCluster`
        """
        self._template = """
#!/bin/bash

#SBATCH -J %(name)s
#SBATCH -n %(processes)d
#SBATCH -p %(queue)s
#SBATCH -A %(project)s
#SBATCH -t %(walltime)s
#SBATCH -e %(name)s.err
#SBATCH -o %(name)s.out

%(base_path)s/dask-worker %(scheduler)s \
    --nthreads %(threads_per_worker)d \
    --nprocs %(processes)s \
    --memory-limit %(memory)s \
    --name %(name)s-%(n)d \
     %(extra)s
""".lstrip()

        if interface:
            host = get_ip_interface(interface)
            extra += ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        project = project or os.environ.get('SLURM_ACCOUNT')
        if not project:
            raise ValueError("Must specify a project like `project='UCLB1234' "
                             "or set SLURM_ACCOUNT environment variable")
        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)
        memory = memory.replace(' ', '')
        self.config = {
            'name': name,
            'queue': queue,
            'project': project,
            'threads_per_worker': threads_per_worker,
            'processes': processes,
            'scheduler': self.scheduler.address,
            'walltime': walltime,
            'base_path': dirname,
            'memory': memory,
            'extra': extra
        }
        self.jobs = dict()
        self.n = 0
        self._adaptive = None
        self._submitcmd = 'sbatch'
        self._cancelcmd = 'scancel'

        logger.debug("Job script: \n %s" % self.job_script())
示例#23
0
    def __init__(self,
                 name=dask.config.get('jobqueue.name'),
                 threads=dask.config.get('jobqueue.threads'),
                 processes=dask.config.get('jobqueue.processes'),
                 memory=dask.config.get('jobqueue.memory'),
                 interface=dask.config.get('jobqueue.interface'),
                 death_timeout=dask.config.get('jobqueue.death-timeout'),
                 local_directory=dask.config.get('jobqueue.local-directory'),
                 extra=dask.config.get('jobqueue.extra'),
                 env_extra=dask.config.get('jobqueue.env-extra'),
                 **kwargs):
        """ """
        # This initializer should be considered abstract and never used
        # directly: subclasses must define ``submit_command`` and
        # ``cancel_command``, otherwise NotImplementedError is raised.
        #
        # It starts a ``LocalCluster`` with no workers (remaining ``kwargs``
        # are forwarded to it) and builds ``self._command_template``, the
        # ``dask-worker`` command line. Options whose value is None are left
        # out of the command. ``extra`` is a string of additional arguments.
        if not self.cancel_command or not self.submit_command:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instanciated.')

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            # ``extra`` may be None (it is None-checked further down), so do
            # not concatenate onto it directly.
            extra = (extra or '') + ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, threads and memory, for use in
        # subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_threads = threads
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = ('%(python)s -m distributed.cli.dask_worker' %
                               dict(python=sys.executable))
        self._command_template = ' '.join(
            [dask_worker_command, self.scheduler.address])
        if threads is not None:
            self._command_template += " --nthreads %d" % threads
        if processes is not None:
            self._command_template += " --nprocs %d" % processes
        if memory is not None:
            self._command_template += " --memory-limit %s" % memory
        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra:
            # Guarantee a separator so ``extra`` cannot fuse with the
            # previous argument (e.g. "/tmp--foo").
            if not extra[0].isspace():
                self._command_template += " "
            self._command_template += extra
示例#24
0
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name,
         memory_limit, pid_file, reconnect, resources, bokeh,
         bokeh_port, local_directory, scheduler_file, interface,
         death_timeout, preload, preload_argv, bokeh_prefix, tls_ca_file,
         tls_cert, tls_key):
    """Validate the options, start ``nprocs`` workers and run until one closes.

    ``Nanny`` instances are started when ``nanny`` is truthy, plain ``Worker``
    instances otherwise. Invalid option combinations are logged as errors and
    terminate the process with exit status 1. A missing scheduler address, or
    giving both ``interface`` and ``host``, raises ``ValueError``.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    def abort(message):
        # Report a fatal configuration problem and leave with status 1.
        logger.error(message)
        exit(1)

    # --- option sanity checks ------------------------------------------
    several_procs = nprocs > 1

    if several_procs and worker_port != 0:
        abort("Failed to launch worker.  You cannot use the --port argument when nprocs > 1.")

    if several_procs and not nanny:
        abort("Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1.")

    if contact_address and not listen_address:
        abort("Failed to launch worker. "
              "Must specify --listen-address when --contact-address is given")

    if several_procs and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when nprocs > 1.")

    if (worker_port or host) and listen_address:
        abort("Failed to launch worker. "
              "You cannot specify --listen-address when --worker-port or --host is given.")

    try:
        if listen_address:
            host, worker_port = get_address_host_port(listen_address, strict=True)

        if not contact_address:
            # Without an explicit contact address, workers are contacted on
            # the address they listen on.
            contact_address = listen_address
        else:
            # Parsed only to validate it; the result itself is not needed.
            _contact_host, _contact_port = get_address_host_port(
                contact_address, strict=True)
    except ValueError as e:
        abort("Failed to launch worker. " + str(e))

    port = nanny_port if nanny else worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    # --- pid file ---------------------------------------------------------
    if pid_file:
        with open(pid_file, 'w') as pid_handle:
            pid_handle.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    # --- optional bokeh service -------------------------------------------
    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            BokehWorker = None

        if BokehWorker is not None:
            if bokeh_prefix:
                services[('bokeh', bokeh_port)] = (BokehWorker,
                                                   {'prefix': bokeh_prefix})
            else:
                services[('bokeh', bokeh_port)] = BokehWorker

    # --- resources: "a=1,b=2" or "a=1 b=2" -> {'a': 1.0, 'b': 2.0} --------
    if resources:
        pairs = resources.replace(',', ' ').split()
        resources = valmap(float, dict(pair.split('=') for pair in pairs))
    else:
        resources = None

    loop = IOLoop.current()

    # --- which class to start, and its extra keyword arguments ------------
    if nanny:
        worker_cls = Nanny
        extra_kwargs = {'worker_port': worker_port,
                        'listen_address': listen_address}
    else:
        worker_cls = Worker
        extra_kwargs = {}
        if nanny_port:
            extra_kwargs['service_ports'] = {'nanny': nanny_port}

    if not (scheduler or scheduler_file or 'scheduler-address' in config):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface and host:
        raise ValueError("Can not specify both interface and host")
    if interface:
        host = get_ip_interface(interface)

    # None lets an appropriate address be chosen for the scheduler.
    addr = uri_from_host_port(host, port, 0) if (host or port) else None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, 's')

    nannies = []
    for i in range(nprocs):
        if nprocs == 1 or not name:
            worker_name = name
        else:
            worker_name = name + '-' + str(i)
        nannies.append(
            worker_cls(scheduler, scheduler_file=scheduler_file,
                       ncores=nthreads, services=services, loop=loop,
                       resources=resources, memory_limit=memory_limit,
                       reconnect=reconnect, local_dir=local_directory,
                       death_timeout=death_timeout, preload=preload,
                       preload_argv=preload_argv, security=sec,
                       contact_address=contact_address, name=worker_name,
                       **extra_kwargs))

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler (nanny mode only)
        if nanny:
            yield [proc._close(timeout=2) for proc in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [proc._start(addr) for proc in nannies]
        # Poll until at least one of the processes reports 'closed'.
        while not any(proc.status == 'closed' for proc in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
示例#25
0
    def __init__(self,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 walltime=None,
                 threads=None,
                 **kwargs
                 ):
        """ """
        # This initializer should be considered abstract and never used
        # directly: subclasses must define ``scheduler_name``, which names
        # the ``jobqueue.<scheduler_name>`` section of the dask config.
        #
        # Every argument left as None (except ``walltime`` and ``threads``)
        # falls back to the matching ``jobqueue.<scheduler_name>.<key>``
        # config entry. ``threads`` is deprecated: passing it raises
        # ValueError, and a config entry for it triggers a warning.
        # ``walltime`` is not used in this initializer. Remaining ``kwargs``
        # go to ``LocalCluster``.
        if threads is not None:
            raise ValueError(threads_deprecation_message)

        if not self.scheduler_name:
            raise NotImplementedError('JobQueueCluster is an abstract class '
                                      'that should not be instanciated.')

        if name is None:
            name = dask.config.get('jobqueue.%s.name' % self.scheduler_name)
        if cores is None:
            cores = dask.config.get('jobqueue.%s.cores' % self.scheduler_name)
        if memory is None:
            memory = dask.config.get('jobqueue.%s.memory' % self.scheduler_name)
        if processes is None:
            processes = dask.config.get('jobqueue.%s.processes' % self.scheduler_name)
        if interface is None:
            interface = dask.config.get('jobqueue.%s.interface' % self.scheduler_name)
        if death_timeout is None:
            death_timeout = dask.config.get('jobqueue.%s.death-timeout' % self.scheduler_name)
        if local_directory is None:
            local_directory = dask.config.get('jobqueue.%s.local-directory' % self.scheduler_name)
        if extra is None:
            extra = dask.config.get('jobqueue.%s.extra' % self.scheduler_name)
        if env_extra is None:
            env_extra = dask.config.get('jobqueue.%s.env-extra' % self.scheduler_name)

        # The key must be interpolated with the scheduler name; the literal
        # 'jobqueue.%s.threads' never exists, so the warning could not fire.
        if dask.config.get('jobqueue.%s.threads' % self.scheduler_name, None):
            warnings.warn(threads_deprecation_message)

        if cores is None:
            raise ValueError("You must specify how many cores to use per job "
                             "like ``cores=8``")

        if memory is None:
            raise ValueError("You must specify how much memory to use per job "
                             "like ``memory='24 GB'``")

        # This attribute should be overridden
        self.job_header = None

        if interface:
            host = get_ip_interface(interface)
            # ``extra`` can still be None here if the config holds no value,
            # so do not concatenate onto it directly.
            extra = (extra or '') + ' --interface  %s ' % interface
        else:
            host = socket.gethostname()

        self.local_cluster = LocalCluster(n_workers=0, ip=host, **kwargs)

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory)

        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name

        self.jobs = dict()
        self.n = 0
        self._adaptive = None

        self._env_header = '\n'.join(env_extra)

        # dask-worker command line build
        dask_worker_command = (
            '%(python)s -m distributed.cli.dask_worker' % dict(python=sys.executable))
        self._command_template = ' '.join([dask_worker_command, self.scheduler.address])
        self._command_template += " --nthreads %d" % self.worker_threads
        if processes is not None and processes > 1:
            self._command_template += " --nprocs %d" % processes

        # Memory limit per worker process, without the space that
        # ``format_bytes`` puts between number and unit.
        mem = format_bytes(self.worker_memory / self.worker_processes)
        mem = mem.replace(' ', '')
        self._command_template += " --memory-limit %s" % mem

        if name is not None:
            self._command_template += " --name %s" % name
            self._command_template += "-%(n)d"  # Keep %(n) to be replaced later
        if death_timeout is not None:
            self._command_template += " --death-timeout %s" % death_timeout
        if local_directory is not None:
            self._command_template += " --local-directory %s" % local_directory
        if extra:
            # Guarantee a separator so ``extra`` cannot fuse with the
            # previous argument (e.g. "/tmp--foo").
            if not extra[0].isspace():
                self._command_template += " "
            self._command_template += extra
示例#26
0
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    """Launch one ``Nanny`` per GPU and block until all of them finish.

    The number of processes is the number of comma-separated entries in the
    ``CUDA_VISIBLE_DEVICES`` environment variable, or ``get_n_gpus()`` when
    that variable is not set.

    Parameters
    ----------
    scheduler, scheduler_file
        Scheduler location, forwarded to every ``Nanny``. One of them, or a
        ``"scheduler-address"`` entry in ``config``, is required.
    host, interface
        Mutually exclusive; giving both raises ``ValueError``.
    nthreads
        Threads per worker; when falsy it is derived from the CPU count.
    name
        Worker name; suffixed with ``-<index>`` when more than one process is
        started and a name was given.
    memory_limit
        Normalised with ``parse_memory_limit`` and passed to both the
        ``Nanny`` and the ``DeviceHostFile`` data store.
    device_memory_limit
        ``"auto"`` or ``0`` selects ``get_device_total_memory`` for the
        device; anything else goes through ``parse_bytes``.
    rmm_pool_size
        When not None, ``rmm`` must be importable; the value is converted
        with ``parse_bytes`` and handed to the ``RMMPool`` plugin.
    pid_file
        If set, the current PID is written there and the file is removed at
        interpreter exit.
    resources
        String of ``key=value`` pairs separated by commas and/or whitespace;
        values are converted to ``float``.
    dashboard, dashboard_address, dashboard_prefix
        Control registration of the ``BokehWorker`` service; the service is
        skipped silently when it cannot be imported.
    local_directory
        Passed to the ``Nanny`` and to the ``DeviceHostFile`` data store.
    death_timeout
        Accepted but not used by this function.
    preload
        Iterable of modules to preload (may be ``None`` or empty);
        ``"dask_cuda.initialize"`` is always appended.
    tls_ca_file, tls_cert, tls_key
        ``Security`` is only configured when all three are given.
    enable_tcp_over_ucx, enable_infiniband, enable_nvlink, enable_rdmacm, net_devices
        Forwarded to ``get_ucx_config``; ``net_devices`` is also used to pick
        each worker's interface via ``get_ucx_net_devices``.
    **kwargs
        Only ``preload_argv`` is read; everything else is discarded when the
        name is rebound to a fixed set of Nanny keyword arguments.

    Raises
    ------
    ValueError
        If no scheduler address can be determined, if both ``interface`` and
        ``host`` are given, or if an RMM pool is requested without ``rmm``
        being installed.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # NOTE(review): min(1, ...) can never exceed 1 and evaluates to 0 when
        # there are more processes than CPUs -- confirm this is intended.
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # The dashboard is optional; run without it when unavailable.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # Read ``preload_argv`` before the caller's kwargs are dropped below.
    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            # NOTE(review): ``host`` is not passed to the Nanny below (each
            # worker's interface comes from get_ucx_net_devices) -- confirm.
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    # Apply the empty-list fallback *before* ``list(...)``: ``list(None)``
    # raises TypeError, which made the previous ``list(x) or []`` fallback
    # unreachable.
    preload_modules = list(preload or []) + ["dask_cuda.initialize"]
    preload_arguments = list(preload_argv or []) + ["--create-cuda-context"]

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=preload_modules,
            preload_argv=preload_arguments,
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        # Start every nanny, then wait until each reports it has finished.
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
示例#27
0
def main(host, port, http_port, bokeh_port, bokeh_internal_port, show, _bokeh,
         bokeh_whitelist, prefix, use_xheaders, pid_file, scheduler_file,
         interface):
    """Start a scheduler, optionally with the Bokeh web UI, and run the IO loop.

    Parameters
    ----------
    host, port
        Combined by ``uri_from_host_port(host, port, 8786)`` into the address
        the scheduler is started on.
    http_port
        Port key under which ``HTTPScheduler`` is registered as a service; it
        is also forwarded to ``BokehWebInterface``.
    bokeh_port
        Port handed to ``BokehWebInterface``; ``0`` means "pick one with
        ``open_port()``".
    bokeh_internal_port
        Port key under which ``BokehScheduler`` is registered as a service.
    show, bokeh_whitelist, prefix, use_xheaders
        Forwarded unchanged to ``BokehWebInterface``.
    _bokeh
        When falsy, neither the Bokeh service nor the web interface is set up.
    pid_file
        If set, the current PID is written to this path and the file is
        removed again at interpreter exit.
    scheduler_file
        Forwarded unchanged to ``Scheduler``.
    interface
        Network interface name resolved to an IP with ``get_ip_interface``;
        mutually exclusive with ``host``.

    Raises
    ------
    ValueError
        If both ``interface`` and ``host`` are given.
    """
    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    if sys.platform.startswith('linux'):
        import resource  # module fails importing on Windows
        # Raise the soft open-file limit to at least half of the hard limit.
        soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
        limit = max(soft, hard // 2)
        resource.setrlimit(resource.RLIMIT_NOFILE, (limit, hard))

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    addr = uri_from_host_port(host, port, 8786)

    loop = IOLoop.current()
    logger.info('-' * 47)

    services = {('http', http_port): HTTPScheduler}
    if _bokeh:
        # The in-scheduler Bokeh service is optional: skip it silently when
        # the import fails.
        with ignoring(ImportError):
            from distributed.bokeh.scheduler import BokehScheduler
            services[('bokeh', bokeh_internal_port)] = BokehScheduler
    scheduler = Scheduler(loop=loop,
                          services=services,
                          scheduler_file=scheduler_file)
    scheduler.start(addr)

    bokeh_proc = None
    if _bokeh:
        if bokeh_port == 0:  # This is a hack and not robust
            bokeh_port = open_port()  # This port may be taken by the OS
        try:  # before we successfully pass it to Bokeh
            from distributed.bokeh.application import BokehWebInterface
            bokeh_proc = BokehWebInterface(http_port=http_port,
                                           scheduler_address=scheduler.address,
                                           bokeh_port=bokeh_port,
                                           bokeh_whitelist=bokeh_whitelist,
                                           show=show,
                                           prefix=prefix,
                                           use_xheaders=use_xheaders,
                                           quiet=False)
        except ImportError:
            logger.info("Please install Bokeh to get Web UI")
        except Exception:
            # Best effort: a broken web UI must not prevent the scheduler from
            # running.  ``Logger.warn`` is a deprecated alias that no longer
            # exists on Python 3.13+, so use ``warning``.
            logger.warning("Could not start Bokeh web UI", exc_info=True)

    logger.info('-' * 47)
    try:
        loop.start()
        loop.close()
    finally:
        # Runs on normal loop termination and on any exception (including
        # KeyboardInterrupt) so the scheduler and the UI are always shut down.
        scheduler.stop()
        if bokeh_proc:
            bokeh_proc.close()

        logger.info("End scheduler at %r", addr)
示例#28
0
    def __init__(self,
                 scheduler=None,
                 name=None,
                 cores=None,
                 memory=None,
                 processes=None,
                 nanny=True,
                 interface=None,
                 death_timeout=None,
                 local_directory=None,
                 extra=None,
                 env_extra=None,
                 header_skip=None,
                 log_directory=None,
                 shebang=None,
                 python=sys.executable,
                 job_name=None,
                 config_name=None,
                 **kwargs):
        """Resolve job settings and build the ``dask_worker`` command line.

        Every argument left as ``None`` is looked up in the dask config under
        ``jobqueue.<config_name>.<key>``; ``config_name`` itself falls back to
        ``self.default_config_name()``.

        Parameters
        ----------
        scheduler
            Placed directly after the worker command in the command line.
        name
            Passed to the worker as ``--name`` (after ``str()``).
        cores, memory
            Required, either as arguments or through the config.
        processes
            Worker process count; when missing from both the argument and the
            config it is the first value of ``nprocesses_nthreads(cores)``.
        nanny
            Selects ``--nanny`` or ``--no-nanny``.
        interface
            If truthy, ``--interface <interface>`` is appended to ``extra`` and
            ``kwargs["host"]`` defaults to that interface's IP.
        death_timeout, local_directory
            Added as ``--death-timeout`` / ``--local-directory`` when not None.
        extra
            Additional command-line arguments appended last.
        env_extra
            Lines joined with newlines into ``self._env_header`` (falsy entries
            are dropped).
        header_skip
            Stored as the set ``self.header_skip``.
        log_directory
            Created if it does not exist.
        shebang
            Stored on ``self.shebang``.
        python
            Interpreter used in the worker command.
        job_name
            Stored on ``self.job_name``.
        config_name
            Config section name; see above.
        **kwargs
            Only receives a ``"host"`` default here; nothing else in this
            method reads it.

        Raises
        ------
        ValueError
            If ``cores`` or ``memory`` cannot be determined.
        """
        self.scheduler = scheduler
        self.job_id = None

        super().__init__()

        default_config_name = self.default_config_name()
        if config_name is None:
            config_name = default_config_name
        self.config_name = config_name

        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % self.config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % self.config_name)

        if cores is None or memory is None:
            job_class_name = self.__class__.__name__
            cluster_class_name = job_class_name.replace("Job", "Cluster")
            raise ValueError(
                "You must specify how much cores and memory per job you want to use, for example:\n"
                "cluster = {}(cores={}, memory={!r})".format(
                    cluster_class_name, cores or 8, memory or "24GB"))

        if job_name is None:
            job_name = dask.config.get("jobqueue.%s.name" % self.config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" %
                                        self.config_name)
            if processes is None:
                processes, _ = nprocesses_nthreads(cores)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" %
                                        self.config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" %
                                            self.config_name)
        if local_directory is None:
            local_directory = dask.config.get("jobqueue.%s.local-directory" %
                                              self.config_name)
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % self.config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" %
                                        self.config_name)
        if header_skip is None:
            header_skip = dask.config.get(
                "jobqueue.%s.header-skip" % self.config_name, ())
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" %
                                            self.config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % self.config_name)

        # This attribute should be set in the derived class
        self.job_header = None

        if interface:
            # ``extra`` may still be None here (the command-line build below
            # explicitly allows for that), so normalise before concatenating
            # instead of failing with ``None + list``.  A new list is built so
            # the caller's / config's list is never mutated.
            extra = list(extra or []) + ["--interface", interface]
            kwargs.setdefault("host", get_ip_interface(interface))
        else:
            kwargs.setdefault("host", "")

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(
            memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name
        self.job_name = job_name

        self.shebang = shebang

        self._env_header = "\n".join(filter(None, env_extra))
        self.header_skip = set(header_skip)

        # dask-worker command line build.  ``worker_process_threads`` and
        # ``worker_process_memory`` are defined outside this method and are
        # read only after the ``worker_*`` attributes above have been set.
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python)
        command_args = [dask_worker_command, self.scheduler]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", str(name)]
        command_args += ["--nanny" if nanny else "--no-nanny"]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                # ``exist_ok`` closes the window in which another job creates
                # the same directory between the check and the call.
                os.makedirs(self.log_directory, exist_ok=True)
示例#29
0
文件: core.py 项目: jni/dask-jobqueue
    def __init__(
        self,
        scheduler=None,
        name=None,
        cores=None,
        memory=None,
        processes=None,
        nanny=True,
        interface=None,
        death_timeout=None,
        local_directory=None,
        extra=None,
        env_extra=None,
        header_skip=None,
        log_directory=None,
        shebang=None,
        python=sys.executable,
        job_name=None,
        config_name=None,
        **kwargs
    ):
        """Resolve job settings and build the ``dask_worker`` command line.

        Every argument left as ``None`` is looked up in the dask config under
        ``jobqueue.<config_name>.<key>``; ``config_name`` itself falls back to
        the ``config_name`` attribute of the concrete class.

        Parameters
        ----------
        scheduler
            Placed directly after the worker command in the command line.
        name
            Passed to the worker as ``--name`` (after ``str()``).
        cores, memory
            Required, either as arguments or through the config.
        processes
            Added as ``--nprocs`` when greater than 1.
        nanny
            Selects ``--nanny`` or ``--no-nanny``.
        interface
            If truthy, ``--interface <interface>`` is appended to ``extra`` and
            ``kwargs["host"]`` defaults to that interface's IP.
        death_timeout, local_directory
            Added as ``--death-timeout`` / ``--local-directory`` when not None.
        extra
            Additional command-line arguments appended last.
        env_extra
            Lines joined with newlines into ``self._env_header`` (falsy entries
            are dropped).
        header_skip
            Stored as the set ``self.header_skip``.
        log_directory
            Created if it does not exist.
        shebang
            Stored on ``self.shebang``.
        python
            Interpreter used in the worker command.
        job_name
            Stored on ``self.job_name``.
        config_name
            Config section name; see above.
        **kwargs
            Only receives a ``"host"`` default here; nothing else in this
            method reads it.

        Raises
        ------
        ValueError
            If no ``config_name`` can be determined, or if ``cores`` or
            ``memory`` is missing from both the arguments and the config.
        """
        self.scheduler = scheduler
        self.job_id = None

        super().__init__()

        if config_name is None:
            config_name = getattr(type(self), "config_name")
        if config_name is None:
            raise ValueError(
                "Looks like you are trying to create a class that inherits from dask_jobqueue.core.Job. "
                "If that is the case, you need to:\n"
                "- set the 'config_name' class variable to a non-None value\n"
                "- create a section in jobqueue.yaml with the value of 'config_name'\n"
                "If that is not the case, please open an issue in https://github.com/dask/dask-jobqueue/issues."
            )

        if job_name is None:
            job_name = dask.config.get("jobqueue.%s.name" % config_name)
        if cores is None:
            cores = dask.config.get("jobqueue.%s.cores" % config_name)
        if memory is None:
            memory = dask.config.get("jobqueue.%s.memory" % config_name)
        if processes is None:
            processes = dask.config.get("jobqueue.%s.processes" % config_name)
        if interface is None:
            interface = dask.config.get("jobqueue.%s.interface" % config_name)
        if death_timeout is None:
            death_timeout = dask.config.get("jobqueue.%s.death-timeout" % config_name)
        if local_directory is None:
            local_directory = dask.config.get(
                "jobqueue.%s.local-directory" % config_name
            )
        if extra is None:
            extra = dask.config.get("jobqueue.%s.extra" % config_name)
        if env_extra is None:
            env_extra = dask.config.get("jobqueue.%s.env-extra" % config_name)
        if header_skip is None:
            header_skip = dask.config.get("jobqueue.%s.header-skip" % config_name, ())
        if log_directory is None:
            log_directory = dask.config.get("jobqueue.%s.log-directory" % config_name)
        if shebang is None:
            shebang = dask.config.get("jobqueue.%s.shebang" % config_name)

        if cores is None or memory is None:
            raise ValueError(
                "You must specify how much cores and memory per job you want to use, for example:\n"
                "cluster = {}(cores={}, memory={!r})".format(
                    self.__class__.__name__, cores or 8, memory or "24GB"
                )
            )

        # This attribute should be overridden
        self.job_header = None

        if interface:
            # ``extra`` may still be None here (the command-line build below
            # explicitly allows for that), so normalise before concatenating
            # instead of failing with ``None + list``.  A new list is built so
            # the caller's / config's list is never mutated.
            extra = list(extra or []) + ["--interface", interface]
            kwargs.setdefault("host", get_ip_interface(interface))
        else:
            kwargs.setdefault("host", "")

        # Keep information on process, cores, and memory, for use in subclasses
        self.worker_memory = parse_bytes(memory) if memory is not None else None
        self.worker_processes = processes
        self.worker_cores = cores
        self.name = name
        self.job_name = job_name

        self.shebang = shebang

        self._env_header = "\n".join(filter(None, env_extra))
        self.header_skip = set(header_skip)

        # dask-worker command line build.  ``worker_process_threads`` and
        # ``worker_process_memory`` are defined outside this method and are
        # read only after the ``worker_*`` attributes above have been set.
        dask_worker_command = "%(python)s -m distributed.cli.dask_worker" % dict(
            python=python
        )
        command_args = [dask_worker_command, self.scheduler]
        command_args += ["--nthreads", self.worker_process_threads]
        if processes is not None and processes > 1:
            command_args += ["--nprocs", processes]

        command_args += ["--memory-limit", self.worker_process_memory]
        command_args += ["--name", str(name)]
        command_args += ["--nanny" if nanny else "--no-nanny"]

        if death_timeout is not None:
            command_args += ["--death-timeout", death_timeout]
        if local_directory is not None:
            command_args += ["--local-directory", local_directory]
        if extra is not None:
            command_args += extra

        self._command_template = " ".join(map(str, command_args))

        self.log_directory = log_directory
        if self.log_directory is not None:
            if not os.path.exists(self.log_directory):
                # ``exist_ok`` closes the window in which another job creates
                # the same directory between the check and the call.
                os.makedirs(self.log_directory, exist_ok=True)
示例#30
0
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads, nprocs,
         nanny, name, memory_limit, pid_file, temp_filename, reconnect,
         resources, bokeh, bokeh_port, local_directory, scheduler_file,
         interface, death_timeout, preload):
    """Launch ``nprocs`` workers (or nannies), run them, then tear them down.

    Flow, in order:

    1. Validate argument combinations (``--port`` / ``--name`` are rejected
       when ``nprocs > 1``) and default ``nthreads`` to ``_ncores // nprocs``.
    2. Optionally write a PID file that is removed at interpreter exit.
    3. Build the service map and parse ``resources``.
    4. Resolve the scheduler address, reading it from ``scheduler_file`` when
       that is given.
    5. Create and start ``nprocs`` instances of ``Nanny`` or ``Worker``.
    6. Run the IO loop until at least one instance reports ``'closed'``.
    7. On a second, fresh IO loop, ask the scheduler to unregister the
       workers (nanny mode only), terminate nanny processes, and stop every
       instance.

    Raises
    ------
    ValueError
        If no scheduler address is available, or if both ``interface`` and
        ``host`` are given.
    """
    # The listening port belongs to whichever object is started directly.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # NOTE(review): ``exit`` below is the helper injected by the ``site``
    # module, not ``sys.exit`` -- confirm that is intended.
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    # Split the available cores evenly between processes when no explicit
    # thread count was requested.
    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    # The Bokeh service is optional: silently skipped when the import fails.
    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            services[('bokeh', bokeh_port)] = BokehWorker

    # ``resources`` arrives as a string such as "A=1,B=2" (commas and/or
    # whitespace separate pairs) and becomes a dict of floats.
    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # ``t`` is the class that gets instantiated once per process.
    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        # Block until the file exists (no timeout), then try up to ten times
        # to read the scheduler address from it.
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    nannies = [
        t(scheduler,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          **kwargs) for i in range(nprocs)
    ]

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    for n in nannies:
        if host:
            n.start((host, port))
        else:
            n.start(port)
        # Nannies are also recorded in the module-level ``global_nannies``.
        if t is Nanny:
            global_nannies.append(n)

    if temp_filename:

        # Once the first instance is running, write its port and local
        # directory to ``temp_filename`` as JSON.
        @gen.coroutine
        def f():
            while nannies[0].status != 'running':
                yield gen.sleep(0.01)
            import json
            msg = {
                'port': nannies[0].port,
                'local_directory': nannies[0].local_dir
            }
            # Note: ``f`` is rebound to the file object inside this coroutine.
            with open(temp_filename, 'w') as f:
                json.dump(msg, f)

        loop.add_callback(f)

    # Poll every 0.2 s; returns as soon as ANY instance is 'closed'.
    @gen.coroutine
    def run():
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
        loop.close()

    # Clean exit: unregister all workers from scheduler

    # The original loop is closed at this point, so a fresh one is used.
    loop2 = IOLoop()

    # Redefines ``f``; in nanny mode asks the scheduler (2 s timeout) to
    # unregister every nanny's worker that has a process and an address.
    @gen.coroutine
    def f():
        with rpc(nannies[0].scheduler.address) as scheduler:
            if nanny:
                yield gen.with_timeout(timeout=timedelta(seconds=2),
                                       future=All([
                                           scheduler.unregister(
                                               address=n.worker_address,
                                               close=True) for n in nannies
                                           if n.process and n.worker_address
                                       ]),
                                       io_loop=loop2)

    loop2.run_sync(f)

    if nanny:
        for n in nannies:
            if isalive(n.process):
                n.process.terminate()

    # Give terminated processes up to one second to exit.
    if nanny:
        start = time()
        while (any(isalive(n.process) for n in nannies)
               and time() < start + 1):
            sleep(0.1)

    # Note: this loop rebinds the ``nanny`` parameter; nothing reads the
    # flag after this point.
    for nanny in nannies:
        nanny.stop()
示例#31
0
def _test_dask_cuda_worker_ucx_net_devices(enable_rdmacm):
    """Check that ``dask-cuda-worker --net-devices auto`` configures UCX.

    Starts a ``dask-scheduler`` and a ``dask-cuda-worker`` as subprocesses
    using the UCX protocol, connects a client and asserts on every worker
    that:

    * the expected connection-manager protocol (``rdmacm`` or ``sockcm``)
      appears in the UCX ``TLS`` and ``SOCKADDR_TLS_PRIORITY`` settings;
    * the UCX ``NET_DEVICES`` setting equals the openfabrics device listed
      for the first GPU in that worker's ``CUDA_VISIBLE_DEVICES``.

    Parameters
    ----------
    enable_rdmacm : bool
        When true, RDMACM is enabled for scheduler, worker and client, and
        the scheduler listens on the IP of the ``ib0`` interface instead of
        ``127.0.0.1``.
    """
    loop = IOLoop.current()
    ucp = pytest.importorskip("ucp")

    cm_protocol = "rdmacm" if enable_rdmacm else "sockcm"
    net_devices = _get_dgx_net_devices()
    # Each entry is a comma-separated list; the first element is used as the
    # openfabrics device name.
    openfabrics_devices = [d.split(",")[0] for d in net_devices]

    sched_addr = "127.0.0.1"

    # Enable proper variables for scheduler
    sched_env = os.environ.copy()
    sched_env["DASK_UCX__INFINIBAND"] = "True"
    sched_env["DASK_UCX__TCP"] = "True"
    sched_env["DASK_UCX__CUDA_COPY"] = "True"
    sched_env["DASK_UCX__NET_DEVICES"] = openfabrics_devices[0]

    if enable_rdmacm:
        sched_env["DASK_UCX__RDMACM"] = "True"
        sched_addr = get_ip_interface("ib0")

    sched_url = "ucx://" + sched_addr + ":9379"

    # Enable proper variables for workers
    worker_ucx_opts = [
        "--enable-infiniband",
        "--net-devices",
        "auto",
    ]
    if enable_rdmacm:
        worker_ucx_opts.append("--enable-rdmacm")

    # Enable proper variables for client
    initialize(
        enable_tcp_over_ucx=True,
        enable_infiniband=True,
        enable_rdmacm=enable_rdmacm,
        net_devices=openfabrics_devices[0],
    )

    scheduler_cmd = [
        "dask-scheduler",
        "--protocol",
        "ucx",
        "--host",
        sched_addr,
        "--port",
        "9379",
        "--no-dashboard",
    ]
    worker_cmd = [
        "dask-cuda-worker",
        sched_url,
        "--no-dashboard",
    ] + worker_ucx_opts

    with subprocess.Popen(scheduler_cmd, env=sched_env) as sched_proc:
        # ``Popen.__exit__`` waits for the child.  The kills therefore live in
        # ``finally`` blocks so that a failed assertion (or a worker that
        # cannot be launched) does not leave the test hanging on a live
        # subprocess.
        try:
            # Scheduler with UCX will take a few seconds to fully start
            sleep(5)

            with subprocess.Popen(worker_cmd) as worker_proc:
                try:
                    with Client(sched_url, loop=loop) as client:

                        def _timeout_callback():
                            # We must ensure processes are terminated to
                            # avoid hangs if a timeout occurs
                            worker_proc.kill()
                            sched_proc.kill()

                        assert wait_workers(
                            client, timeout_callback=_timeout_callback)

                        workers_tls = client.run(
                            lambda: ucp.get_config()["TLS"])
                        workers_tls_priority = client.run(
                            lambda: ucp.get_config()["SOCKADDR_TLS_PRIORITY"])
                        for tls, tls_priority in zip(
                                workers_tls.values(),
                                workers_tls_priority.values()):
                            assert cm_protocol in tls
                            assert cm_protocol in tls_priority

                        worker_net_devices = client.run(
                            lambda: ucp.get_config()["NET_DEVICES"])
                        cuda_visible_devices = client.run(
                            lambda: os.environ["CUDA_VISIBLE_DEVICES"])

                        for net_dev, visible in zip(
                                worker_net_devices.values(),
                                cuda_visible_devices.values()):
                            # The first visible device is the GPU the worker
                            # is compared against.
                            dev_idx = int(visible.split(",")[0])
                            assert net_dev == openfabrics_devices[dev_idx]
                finally:
                    # A dask-worker with UCX protocol will not close until
                    # some work is dispatched, therefore we kill the worker
                    # to ensure timely closing.
                    worker_proc.kill()
        finally:
            # Same reasoning for the scheduler.
            sched_proc.kill()
示例#32
0
def test_get_ip_interface():
    """The loopback interface maps to 127.0.0.1; an unknown name raises.

    The ``ValueError`` message must mention the requested name, the phrase
    "network interface", and the quoted loopback interface name.
    """
    if MACOS:
        iface = "lo0"
    else:
        iface = "lo"
    expected_message = f"'__notexist'.+network interface.+'{iface}'"

    loopback_ip = get_ip_interface(iface)
    assert loopback_ip == "127.0.0.1"

    with pytest.raises(ValueError, match=expected_message):
        get_ip_interface("__notexist")
示例#33
0
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    """Launch ``nprocs`` workers (or nannies) and run them until one closes.

    Invalid option combinations are logged and terminate the process with
    exit status 1:

    * ``nprocs > 1`` together with ``--port``, ``--name``, ``--no-nanny`` or
      ``--listen-address``;
    * ``--contact-address`` without ``--listen-address``;
    * ``--listen-address`` together with ``--worker-port`` or ``--host``;
    * a listen/contact address that ``get_address_host_port`` rejects.

    Other notable behaviour:

    * ``nthreads`` defaults to ``_ncores // nprocs``.
    * ``pid_file``, if set, receives the current PID and is removed at
      interpreter exit.
    * ``resources`` is a string such as ``"A=1,B=2"`` (commas and/or
      whitespace separate pairs) and is converted to a dict of floats.
    * ``contact_address`` falls back to ``listen_address``.
    * ``interface`` is resolved to an IP with ``get_ip_interface``.

    Raises
    ------
    ValueError
        If neither ``scheduler`` nor ``scheduler_file`` is given, or if both
        ``interface`` and ``host`` are given.
    SystemExit
        With code 1 for the invalid combinations listed above.
    """
    # The bare ``exit`` name is injected by the ``site`` module for
    # interactive use and is missing under ``python -S`` or in frozen builds;
    # ``sys.exit`` raises the same ``SystemExit`` and is always available.
    import sys

    def fail(message):
        """Log a launch failure and terminate the process with status 1."""
        logger.error(message)
        sys.exit(1)

    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    if nprocs > 1 and worker_port != 0:
        fail(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )

    if nprocs > 1 and name:
        fail(
            "Failed to launch worker.  You cannot use the --name argument when nprocs > 1."
        )

    if nprocs > 1 and not nanny:
        fail(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )

    if contact_address and not listen_address:
        fail("Failed to launch worker. "
             "Must specify --listen-address when --contact-address is given")

    if nprocs > 1 and listen_address:
        fail("Failed to launch worker. "
             "You cannot specify --listen-address when nprocs > 1.")

    if (worker_port or host) and listen_address:
        fail(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )

    try:
        if listen_address:
            host, worker_port = get_address_host_port(listen_address,
                                                      strict=True)

        if contact_address:
            # Parsed only for validation; the result is not needed.
            get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        fail("Failed to launch worker. " + str(e))

    # The listening port belongs to whichever object is started directly.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    # The Bokeh service is optional: silently skipped when the import fails.
    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # ``t`` is the class that gets instantiated once per process.
    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          contact_address=contact_address,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n.start(addr) for n in nannies]
        # Poll every 0.2 s; returns as soon as ANY instance is 'closed'.
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    bokeh,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    """Launch ``nprocs`` workers (or nannies) and run them until one closes.

    Invalid option combinations are logged and followed by ``exit(1)``:

    * ``nprocs > 1`` together with ``--port``, ``--no-nanny`` or
      ``--listen-address``;
    * ``--contact-address`` without ``--listen-address``;
    * ``--listen-address`` together with ``--worker-port`` or ``--host``;
    * a listen/contact address that ``get_address_host_port`` rejects.

    Other notable behaviour:

    * The garbage-collector thresholds are tripled on entry.
    * A non-None ``bokeh_port`` triggers a rename warning and overrides
      ``dashboard_address``.
    * ``nthreads`` defaults to ``_ncores // nprocs``.
    * ``death_timeout``, when not None, is converted with
      ``parse_timedelta(death_timeout, "s")``.
    * With more than one process and a non-empty ``name``, each instance is
      named ``"<name>-<index>"``.

    Raises
    ------
    ValueError
        If no scheduler address is available from ``scheduler``,
        ``scheduler_file`` or the ``scheduler-address`` config value, or if
        both ``interface`` and ``host`` are given.
    """
    g0, g1, g2 = gc.get_threshold(
    )  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # Deprecated flag: still honoured, but it overrides ``dashboard_address``.
    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    # NOTE(review): ``exit`` below is the helper injected by the ``site``
    # module, not ``sys.exit`` -- confirm that is intended.
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker.  You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker.  You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address,
                                                        strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    # The listening port belongs to whichever object is started directly.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # Split the available cores evenly between processes when no explicit
    # thread count was requested.
    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    # The Bokeh service is optional: silently skipped when the import fails.
    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("bokeh", dashboard_address)] = result

    # ``resources`` arrives as a string such as "A=1,B=2" (commas and/or
    # whitespace separate pairs) and becomes a dict of floats.
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # ``t`` is the class that gets instantiated once per process.
    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    # A scheduler address may come from the argument, a scheduler file, or
    # the ``scheduler-address`` config value.
    if (not scheduler and not scheduler_file
            and dask.config.get("scheduler-address", None) is None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          preload_argv=preload_argv,
          security=sec,
          contact_address=contact_address,
          # With several processes, suffix the name with the process index.
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    # Signal handler: logs the signal and starts ``close_all`` without
    # waiting for its result.
    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        # Poll every 0.2 s; returns as soon as ANY instance is "closed".
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")