Example #1
def test_resource_limit(monkeypatch):
    assert parse_memory_limit("250MiB", 1, total_cores=1) == 1024 * 1024 * 250

    new_limit = 1024 * 1024 * 200
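    # Importing distributed.worker also loads the distributed.system module,
    # whose MEMORY_LIMIT is patched below.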
    import distributed.worker

    monkeypatch.setattr(distributed.system, "MEMORY_LIMIT", new_limit)
    assert parse_memory_limit("250MiB", 1, total_cores=1) == new_limit
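
A minimal standalone sketch of calling parse_memory_limit directly (assuming a distributed version that exposes it from distributed.worker, as these snippets do):

from distributed.worker import parse_memory_limit

# Explicit sizes are parsed to bytes; "auto" and fractional limits are
# resolved against the machine's total memory.
limit = parse_memory_limit("250MiB", 1, total_cores=1)
assert limit == 250 * 1024 * 1024
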
Example #2
def test_resource_limit():
    assert parse_memory_limit("250MiB", 1, total_cores=1) == 1024 * 1024 * 250

    # get current limit
    resource = pytest.importorskip("resource")
    try:
        hard_limit = resource.getrlimit(resource.RLIMIT_RSS)[1]
    except OSError:
        pytest.skip("resource could not get the RSS limit")
    memory_limit = psutil.virtual_memory().total
    if hard_limit > memory_limit or hard_limit < 0:
        hard_limit = memory_limit

    # decrease memory limit by one byte
    new_limit = hard_limit - 1
    try:
        resource.setrlimit(resource.RLIMIT_RSS, (new_limit, new_limit))
        assert parse_memory_limit(hard_limit, 1, total_cores=1) == new_limit
    except OSError:
        pytest.skip("resource could not set the RSS limit")
Example #3
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit="auto",
        device_memory_limit=0.8,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        ucx_net_devices=None,
        rmm_pool_size=None,
        rmm_managed_memory=False,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(
            map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
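        # With memory_limit="auto", each worker's host memory limit is roughly
        # the total system memory scaled by threads_per_worker / n_workers
        # (n_workers is passed as parse_memory_limit's total_cores argument).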
        self.host_memory_limit = parse_memory_limit(memory_limit,
                                                    threads_per_worker,
                                                    n_workers)
        self.device_memory_limit = parse_device_memory_limit(
            device_memory_limit, device_index=0)

        self.rmm_pool_size = rmm_pool_size
        self.rmm_managed_memory = rmm_managed_memory
        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool or managed memory requested but module 'rmm' "
                    "is not available. For installation instructions, please "
                    "see https://github.com/rapidsai/rmm")  # pragma: no cover
            if self.rmm_pool_size is not None:
                self.rmm_pool_size = parse_bytes(self.rmm_pool_size)
        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`. Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details")

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit":
                    self.device_memory_limit,
                    "memory_limit":
                    self.host_memory_limit,
                    "local_directory":
                    local_directory or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

        if ucx_net_devices == "auto":
            try:
                from ucp._libs.topological_distance import TopologicalDistance  # NOQA
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband
        self.host = kwargs.get("host", None)

        initialize(
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
            enable_rdmacm=enable_rdmacm,
            net_devices=ucx_net_devices,
            cuda_device_index=0,
        )

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=self.host_memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            config={
                "ucx":
                get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_nvlink=enable_nvlink,
                    enable_infiniband=enable_infiniband,
                    enable_rdmacm=enable_rdmacm,
                )
            },
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)
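
For context, a minimal usage sketch of the constructor above (a hedged illustration, assuming it is dask_cuda.LocalCUDACluster; the values are only examples):

from dask.distributed import Client
from dask_cuda import LocalCUDACluster

# One worker per GPU in CUDA_VISIBLE_DEVICES; host memory is split via
# parse_memory_limit and device memory capped via parse_device_memory_limit.
cluster = LocalCUDACluster(
    threads_per_worker=1,
    memory_limit="auto",
    device_memory_limit=0.8,
)
client = Client(cluster)
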
Example #4
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

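    # Resolve the host memory limit once for all workers: with "auto" it is
    # roughly the total system memory scaled by nthreads / nprocs.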
    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #5
    def __init__(
        self,
        scheduler=None,
        host=None,
        nthreads=0,
        name=None,
        memory_limit="auto",
        device_memory_limit="auto",
        rmm_pool_size=None,
        rmm_managed_memory=False,
        pid_file=None,
        resources=None,
        dashboard=True,
        dashboard_address=":0",
        local_directory=None,
        scheduler_file=None,
        interface=None,
        death_timeout=None,
        preload=[],
        dashboard_prefix=None,
        security=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        net_devices=None,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        enable_proctitle_on_current()
        enable_proctitle_on_children()

        try:
            nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
        except KeyError:
            nprocs = get_n_gpus()

        if not nthreads:
            nthreads = min(1, multiprocessing.cpu_count() // nprocs)

        memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

        if pid_file:
            with open(pid_file, "w") as f:
                f.write(str(os.getpid()))

            def del_pid_file():
                if os.path.exists(pid_file):
                    os.remove(pid_file)

            atexit.register(del_pid_file)

        services = {}

        if dashboard:
            try:
                from distributed.dashboard import BokehWorker
            except ImportError:
                pass
            else:
                if dashboard_prefix:
                    result = (BokehWorker, {"prefix": dashboard_prefix})
                else:
                    result = BokehWorker
                services[("dashboard", dashboard_address)] = result

        if resources:
            resources = resources.replace(",", " ").split()
            resources = dict(pair.split("=") for pair in resources)
            resources = valmap(float, resources)
        else:
            resources = None

        loop = IOLoop.current()

        preload_argv = kwargs.get("preload_argv", [])
        kwargs = {"worker_port": None, "listen_address": None}
        t = Nanny

        if (
            not scheduler
            and not scheduler_file
            and dask.config.get("scheduler-address", None) is None
        ):
            raise ValueError(
                "Need to provide scheduler address like\n"
                "dask-worker SCHEDULER_ADDRESS:8786"
            )

        if interface and host:
            raise ValueError("Can not specify both interface and host")

        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool requested but module 'rmm' is not available. "
                    "For installation instructions, please see "
                    "https://github.com/rapidsai/rmm"
                )  # pragma: no cover
            if rmm_pool_size is not None:
                rmm_pool_size = parse_bytes(rmm_pool_size)
        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`.  Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details"
                )

        if enable_nvlink and rmm_managed_memory:
            raise ValueError(
                "RMM managed memory and NVLink are currently incompatible."
            )

        # Ensure this parent dask-cuda-worker process uses the same UCX
        # configuration as child worker processes created by it.
        initialize(
            create_cuda_context=False,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            enable_rdmacm=enable_rdmacm,
            net_devices=net_devices,
            cuda_device_index=0,
        )

        self.nannies = [
            t(
                scheduler,
                scheduler_file=scheduler_file,
                nthreads=nthreads,
                services=services,
                loop=loop,
                resources=resources,
                memory_limit=memory_limit,
                interface=_get_interface(interface, host, i, net_devices),
                host=host,
                preload=(list(preload) or []) + ["dask_cuda.initialize"],
                preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
                security=security,
                env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
                plugins={
                    CPUAffinity(get_cpu_affinity(i)),
                    RMMSetup(rmm_pool_size, rmm_managed_memory),
                },
                name=name if nprocs == 1 or not name else name + "-" + str(i),
                local_directory=local_directory,
                config={
                    "ucx": get_ucx_config(
                        enable_tcp_over_ucx=enable_tcp_over_ucx,
                        enable_infiniband=enable_infiniband,
                        enable_nvlink=enable_nvlink,
                        enable_rdmacm=enable_rdmacm,
                        net_devices=net_devices,
                        cuda_device_index=i,
                    )
                },
                data=(
                    DeviceHostFile,
                    {
                        "device_memory_limit": parse_device_memory_limit(
                            device_memory_limit, device_index=i
                        ),
                        "memory_limit": memory_limit,
                        "local_directory": local_directory,
                    },
                ),
                **kwargs,
            )
            for i in range(nprocs)
        ]
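
A minimal sketch of how this constructor is typically reached (hedged: it assumes the class is exposed as dask_cuda.CUDAWorker and that a scheduler is already running at the illustrative address below):

from dask_cuda import CUDAWorker

# One Nanny per GPU in CUDA_VISIBLE_DEVICES; each receives the host memory
# limit computed by parse_memory_limit(memory_limit, nthreads, total_cores=nprocs).
worker = CUDAWorker(
    "tcp://127.0.0.1:8786",
    memory_limit="auto",
    device_memory_limit="auto",
)
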
Example #6
    def __init__(
        self,
        CUDA_VISIBLE_DEVICES=None,
        n_workers=None,
        threads_per_worker=1,
        memory_limit="auto",
        device_memory_limit=0.8,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        ucx_net_devices=None,
        rmm_pool_size=None,
        rmm_managed_memory=False,
        rmm_async=False,
        rmm_log_directory=None,
        jit_unspill=None,
        log_spilling=False,
        worker_class=None,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        if threads_per_worker < 1:
            raise ValueError("threads_per_worker must be higher than 0.")

        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(
            map(parse_cuda_visible_device, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
        if n_workers < 1:
            raise ValueError("Number of workers cannot be less than 1.")
        self.host_memory_limit = parse_memory_limit(memory_limit,
                                                    threads_per_worker,
                                                    n_workers)
        self.device_memory_limit = parse_device_memory_limit(
            device_memory_limit,
            device_index=nvml_device_index(0, CUDA_VISIBLE_DEVICES))

        self.rmm_pool_size = rmm_pool_size
        self.rmm_managed_memory = rmm_managed_memory
        self.rmm_async = rmm_async
        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool or managed memory requested but module 'rmm' "
                    "is not available. For installation instructions, please "
                    "see https://github.com/rapidsai/rmm")  # pragma: no cover
            if rmm_async:
                raise ValueError(
                    "RMM pool and managed memory are incompatible with asynchronous "
                    "allocator")
            if self.rmm_pool_size is not None:
                self.rmm_pool_size = parse_bytes(self.rmm_pool_size)
        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`. Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details")

        self.rmm_log_directory = rmm_log_directory

        if not kwargs.pop("processes", True):
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if jit_unspill is None:
            self.jit_unspill = dask.config.get("jit-unspill", default=False)
        else:
            self.jit_unspill = jit_unspill

        data = kwargs.pop("data", None)
        if data is None:
            if self.jit_unspill:
                data = (
                    ProxifyHostFile,
                    {
                        "device_memory_limit": self.device_memory_limit,
                    },
                )
            else:
                data = (
                    DeviceHostFile,
                    {
                        "device_memory_limit":
                        self.device_memory_limit,
                        "memory_limit":
                        self.host_memory_limit,
                        "local_directory":
                        local_directory
                        or dask.config.get("temporary-directory")
                        or os.getcwd(),
                        "log_spilling":
                        log_spilling,
                    },
                )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

        if ucx_net_devices == "auto":
            if _ucx_111:
                warnings.warn(
                    "Starting with UCX 1.11, `ucx_net_devices='auto' is deprecated, "
                    "it should now be left unspecified for the same behavior. "
                    "Please make sure to read the updated UCX Configuration section in "
                    "https://docs.rapids.ai/api/dask-cuda/nightly/ucx.html, "
                    "where significant performance considerations for InfiniBand with "
                    "UCX 1.11 and above is documented.", )
            else:
                try:
                    from ucp._libs.topological_distance import (  # NOQA
                        TopologicalDistance, )
                except ImportError:
                    raise ValueError(
                        "ucx_net_devices set to 'auto' but UCX-Py is not "
                        "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband
        self.host = kwargs.get("host", None)

        initialize(
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_nvlink=enable_nvlink,
            enable_infiniband=enable_infiniband,
            enable_rdmacm=enable_rdmacm,
            net_devices=ucx_net_devices,
            cuda_device_index=0,
        )

        if worker_class is not None:
            from functools import partial

            worker_class = partial(
                LoggedNanny if log_spilling is True else Nanny,
                worker_class=worker_class,
            )

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=self.host_memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            worker_class=worker_class,
            config={
                "ucx":
                get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_nvlink=enable_nvlink,
                    enable_infiniband=enable_infiniband,
                    enable_rdmacm=enable_rdmacm,
                )
            },
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)
Example #7
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    **kwargs,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(
        tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
    )

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            host=host,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": parse_memory_limit(
                        memory_limit, nthreads, total_cores=nprocs
                    ),
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #8
    def __init__(
        self,
        scheduler=None,
        host=None,
        nthreads=1,
        name=None,
        memory_limit="auto",
        device_memory_limit="auto",
        rmm_pool_size=None,
        rmm_maximum_pool_size=None,
        rmm_managed_memory=False,
        rmm_async=False,
        rmm_log_directory=None,
        pid_file=None,
        resources=None,
        dashboard=True,
        dashboard_address=":0",
        local_directory=None,
        shared_filesystem=None,
        scheduler_file=None,
        interface=None,
        preload=[],
        dashboard_prefix=None,
        security=None,
        enable_tcp_over_ucx=None,
        enable_infiniband=None,
        enable_nvlink=None,
        enable_rdmacm=None,
        net_devices=None,
        jit_unspill=None,
        worker_class=None,
        **kwargs,
    ):
        # Required by RAPIDS libraries (e.g., cuDF) to ensure no context
        # initialization happens before we can set CUDA_VISIBLE_DEVICES
        os.environ["RAPIDS_NO_INITIALIZE"] = "True"

        enable_proctitle_on_current()
        enable_proctitle_on_children()

        try:
            nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
        except KeyError:
            nprocs = get_n_gpus()

        if nthreads < 1:
            raise ValueError("nthreads must be higher than 0.")

        memory_limit = parse_memory_limit(memory_limit,
                                          nthreads,
                                          total_cores=nprocs)

        if pid_file:
            with open(pid_file, "w") as f:
                f.write(str(os.getpid()))

            def del_pid_file():
                if os.path.exists(pid_file):
                    os.remove(pid_file)

            atexit.register(del_pid_file)

        if resources:
            resources = resources.replace(",", " ").split()
            resources = dict(pair.split("=") for pair in resources)
            resources = valmap(float, resources)
        else:
            resources = None

        loop = IOLoop.current()

        preload_argv = kwargs.pop("preload_argv", [])
        kwargs = {"worker_port": None, "listen_address": None, **kwargs}

        if (not scheduler and not scheduler_file
                and dask.config.get("scheduler-address", None) is None):
            raise ValueError("Need to provide scheduler address like\n"
                             "dask-worker SCHEDULER_ADDRESS:8786")

        if isinstance(scheduler, Cluster):
            scheduler = scheduler.scheduler_address

        if interface and host:
            raise ValueError("Can not specify both interface and host")

        if rmm_pool_size is not None or rmm_managed_memory:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool requested but module 'rmm' is not available. "
                    "For installation instructions, please see "
                    "https://github.com/rapidsai/rmm")  # pragma: no cover
            if rmm_async:
                raise ValueError(
                    "RMM pool and managed memory are incompatible with asynchronous "
                    "allocator")
            if rmm_pool_size is not None:
                rmm_pool_size = parse_bytes(rmm_pool_size)
                if rmm_maximum_pool_size is not None:
                    rmm_maximum_pool_size = parse_bytes(rmm_maximum_pool_size)

        else:
            if enable_nvlink:
                warnings.warn(
                    "When using NVLink we recommend setting a "
                    "`rmm_pool_size`.  Please see: "
                    "https://dask-cuda.readthedocs.io/en/latest/ucx.html"
                    "#important-notes for more details")

        if enable_nvlink and rmm_managed_memory:
            raise ValueError(
                "RMM managed memory and NVLink are currently incompatible.")

        if _ucx_111 and net_devices == "auto":
            warnings.warn(
                "Starting with UCX 1.11, `ucx_net_devices='auto' is deprecated, "
                "it should now be left unspecified for the same behavior. "
                "Please make sure to read the updated UCX Configuration section in "
                "https://docs.rapids.ai/api/dask-cuda/nightly/ucx.html, "
                "where significant performance considerations for InfiniBand with "
                "UCX 1.11 and above is documented.", )

        # Ensure this parent dask-cuda-worker process uses the same UCX
        # configuration as child worker processes created by it.
        initialize(
            create_cuda_context=False,
            enable_tcp_over_ucx=enable_tcp_over_ucx,
            enable_infiniband=enable_infiniband,
            enable_nvlink=enable_nvlink,
            enable_rdmacm=enable_rdmacm,
            net_devices=net_devices,
            cuda_device_index=0,
        )

        if jit_unspill is None:
            self.jit_unspill = dask.config.get("jit-unspill", default=False)
        else:
            self.jit_unspill = jit_unspill

        if self.jit_unspill:
            data = lambda i: (
                ProxifyHostFile,
                {
                    "device_memory_limit":
                    parse_device_memory_limit(device_memory_limit,
                                              device_index=i),
                    "memory_limit":
                    memory_limit,
                    "local_directory":
                    local_directory,
                    "shared_filesystem":
                    shared_filesystem,
                },
            )
        else:
            data = lambda i: (
                DeviceHostFile,
                {
                    "device_memory_limit":
                    parse_device_memory_limit(device_memory_limit,
                                              device_index=i),
                    "memory_limit":
                    memory_limit,
                    "local_directory":
                    local_directory,
                },
            )

        self.nannies = [
            Nanny(
                scheduler,
                scheduler_file=scheduler_file,
                nthreads=nthreads,
                dashboard=dashboard,
                dashboard_address=dashboard_address,
                http_prefix=dashboard_prefix,
                loop=loop,
                resources=resources,
                memory_limit=memory_limit,
                interface=_get_interface(interface, host, i, net_devices),
                host=host,
                preload=(list(preload) or []) + ["dask_cuda.initialize"],
                preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
                security=security,
                env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
                plugins={
                    CPUAffinity(
                        get_cpu_affinity(
                            nvml_device_index(i, cuda_visible_devices(i)))),
                    RMMSetup(
                        rmm_pool_size,
                        rmm_maximum_pool_size,
                        rmm_managed_memory,
                        rmm_async,
                        rmm_log_directory,
                    ),
                },
                name=name if nprocs == 1 or name is None else str(name) + "-" +
                str(i),
                local_directory=local_directory,
                config={
                    "distributed.comm.ucx":
                    get_ucx_config(
                        enable_tcp_over_ucx=enable_tcp_over_ucx,
                        enable_infiniband=enable_infiniband,
                        enable_nvlink=enable_nvlink,
                        enable_rdmacm=enable_rdmacm,
                        net_devices=net_devices,
                        cuda_device_index=i,
                    )
                },
                data=data(nvml_device_index(i, cuda_visible_devices(i))),
                worker_class=worker_class,
                **kwargs,
            ) for i in range(nprocs)
        ]
Example #9
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    preload_argv,
    bokeh_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
):
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {"prefix": bokeh_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host:
        addr = uri_from_host_port(host, 0, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    local_dir = kwargs.get("local_dir", "dask-worker-space")
    with warn_on_duration(
            "1s",
            "Creating scratch directories is taking a surprisingly long time. "
            "This is often due to running workers on a network file system. "
            "Consider specifying a local-directory to point workers to write "
            "scratch data to a local disk.",
    ):
        _workspace = WorkSpace(os.path.abspath(local_dir))
        _workdir = _workspace.new_work_dir(prefix="worker-")
        local_dir = _workdir.dir_path

    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            reconnect=reconnect,
            local_dir=local_directory,
            death_timeout=death_timeout,
            preload=(preload or []) + ["dask_cuda.initialize_context"],
            preload_argv=preload_argv,
            security=sec,
            contact_address=None,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            data=(
                DeviceHostFile,
                {
                    "device_memory_limit":
                    get_device_total_memory(index=i) if
                    (device_memory_limit == "auto" or device_memory_limit
                     == int(0)) else parse_bytes(device_memory_limit),
                    "memory_limit":
                    parse_memory_limit(memory_limit,
                                       nthreads,
                                       total_cores=nprocs),
                    "local_dir":
                    local_dir,
                },
            ),
            **kwargs,
        ) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n._start(addr) for n in nannies]
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
Example #10
    def __init__(
        self,
        n_workers=None,
        threads_per_worker=1,
        processes=True,
        memory_limit="auto",
        device_memory_limit=None,
        CUDA_VISIBLE_DEVICES=None,
        data=None,
        local_directory=None,
        protocol=None,
        enable_tcp_over_ucx=False,
        enable_infiniband=False,
        enable_nvlink=False,
        enable_rdmacm=False,
        ucx_net_devices=None,
        rmm_pool_size=None,
        **kwargs,
    ):
        if CUDA_VISIBLE_DEVICES is None:
            CUDA_VISIBLE_DEVICES = cuda_visible_devices(0)
        if isinstance(CUDA_VISIBLE_DEVICES, str):
            CUDA_VISIBLE_DEVICES = CUDA_VISIBLE_DEVICES.split(",")
        CUDA_VISIBLE_DEVICES = list(map(int, CUDA_VISIBLE_DEVICES))
        if n_workers is None:
            n_workers = len(CUDA_VISIBLE_DEVICES)
        self.host_memory_limit = parse_memory_limit(memory_limit,
                                                    threads_per_worker,
                                                    n_workers)
        self.device_memory_limit = device_memory_limit

        self.rmm_pool_size = rmm_pool_size
        if rmm_pool_size is not None:
            try:
                import rmm  # noqa F401
            except ImportError:
                raise ValueError(
                    "RMM pool requested but module 'rmm' is not available. "
                    "For installation instructions, please see "
                    "https://github.com/rapidsai/rmm")  # pragma: no cover
            self.rmm_pool_size = parse_bytes(self.rmm_pool_size)

        if not processes:
            raise ValueError(
                "Processes are necessary in order to use multiple GPUs with Dask"
            )

        if self.device_memory_limit is None:
            self.device_memory_limit = get_device_total_memory(0)
        elif isinstance(self.device_memory_limit, str):
            self.device_memory_limit = parse_bytes(self.device_memory_limit)

        if data is None:
            data = (
                DeviceHostFile,
                {
                    "device_memory_limit":
                    self.device_memory_limit,
                    "memory_limit":
                    self.host_memory_limit,
                    "local_directory":
                    local_directory or dask.config.get("temporary-directory")
                    or os.getcwd(),
                },
            )

        if enable_tcp_over_ucx or enable_infiniband or enable_nvlink:
            if protocol is None:
                protocol = "ucx"
            elif protocol != "ucx":
                raise TypeError(
                    "Enabling InfiniBand or NVLink requires protocol='ucx'")

        if ucx_net_devices == "auto":
            try:
                from ucp._libs.topological_distance import TopologicalDistance  # noqa
            except ImportError:
                raise ValueError(
                    "ucx_net_devices set to 'auto' but UCX-Py is not "
                    "installed or it's compiled without hwloc support")
        elif ucx_net_devices == "":
            raise ValueError("ucx_net_devices can not be an empty string")
        self.ucx_net_devices = ucx_net_devices
        self.set_ucx_net_devices = enable_infiniband
        self.host = kwargs.get("host", None)

        super().__init__(
            n_workers=0,
            threads_per_worker=threads_per_worker,
            memory_limit=self.host_memory_limit,
            processes=True,
            data=data,
            local_directory=local_directory,
            protocol=protocol,
            config={
                "ucx":
                get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_nvlink=enable_nvlink,
                    enable_infiniband=enable_infiniband,
                    enable_rdmacm=enable_rdmacm,
                )
            },
            **kwargs,
        )

        self.new_spec["options"]["preload"] = self.new_spec["options"].get(
            "preload", []) + ["dask_cuda.initialize"]
        self.new_spec[
            "options"]["preload_argv"] = self.new_spec["options"].get(
                "preload_argv", []) + ["--create-cuda-context"]

        self.cuda_visible_devices = CUDA_VISIBLE_DEVICES
        self.scale(n_workers)
        self.sync(self._correct_state)