Example #1
File: ucx.py  Project: d-v-b/distributed
def init_once():
    global ucp, cuda_array
    if ucp is not None:
        return

    import ucp as _ucp

    ucp = _ucp

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `cuda_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):
            cuda_array = lambda n: rmm.DeviceBuffer(size=n)
        else:  # pre-0.11.0
            cuda_array = lambda n: rmm.device_array(n, dtype=np.uint8)
    except ImportError:
        try:
            import numba.cuda

            cuda_array = lambda n: numba.cuda.device_array(
                (n, ), dtype=np.uint8)
        except ImportError:

            def cuda_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(pool_allocator=True,
                         managed_memory=False,
                         initial_pool_size=pool_size)
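
A minimal usage sketch of the allocator chosen above, assuming rmm is installed and a GPU is present; the 1 MiB size is purely illustrative:

import rmm

cuda_array = lambda n: rmm.DeviceBuffer(size=n)  # the RMM branch selected above

nbytes = 1 << 20                 # 1 MiB, arbitrary size for illustration
recv_buf = cuda_array(nbytes)    # device memory a transport could receive into
assert recv_buf.size == nbytes   # DeviceBuffer reports its length in bytes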
Example #2
def test_mr_devicebuffer_lifetime():
    # Test ensures the MR/Stream lifetime is longer than the DeviceBuffer's,
    # even if all references go out of scope.
    # Create new Pool MR
    rmm.mr.set_current_device_resource(
        rmm.mr.PoolMemoryResource(rmm.mr.get_current_device_resource()))

    # Creates a new non-default stream
    stream = rmm._cuda.stream.Stream()

    # Allocate DeviceBuffer with Pool and Stream
    a = rmm.DeviceBuffer(size=10, stream=stream)

    # Change current MR. Will cause Pool to go out of scope
    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    # Force collection to ensure objects are cleaned up
    gc.collect()

    # Delete a. This used to crash; the pool MR should still be alive.
    del a
Example #3
def test_rmm_device_buffer(size):
    b = rmm.DeviceBuffer(size=size)

    # Test some properties
    if size:
        assert b.ptr != 0
        assert b.size == size
    else:
        assert b.ptr == 0
        assert b.size == 0
    assert len(b) == b.size
    assert b.nbytes == b.size
    assert b.capacity() >= b.size
    assert b.__sizeof__() == b.size

    # Test `__cuda_array_interface__`
    keyset = {"data", "shape", "strides", "typestr", "version"}
    assert isinstance(b.__cuda_array_interface__, dict)
    assert set(b.__cuda_array_interface__) == keyset
    assert b.__cuda_array_interface__["data"] == (b.ptr, False)
    assert b.__cuda_array_interface__["shape"] == (b.size, )
    assert b.__cuda_array_interface__["strides"] is None
    assert b.__cuda_array_interface__["typestr"] == "|u1"
    assert b.__cuda_array_interface__["version"] == 0

    # Test conversion to bytes
    s = b.tobytes()
    assert isinstance(s, bytes)
    assert len(s) == len(b)

    # Test conversion from bytes
    b2 = rmm.DeviceBuffer.to_device(s)
    assert isinstance(b2, rmm.DeviceBuffer)
    assert len(b2) == len(s)

    # Test resizing
    b.resize(2)
    assert b.size == 2
    assert b.capacity() >= b.size
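
A short round-trip sketch built only from the DeviceBuffer APIs exercised in this test (assumes rmm and a working GPU):

import rmm

src = b"hello device"                    # host bytes
dbuf = rmm.DeviceBuffer.to_device(src)   # copy host -> device
assert len(dbuf) == len(src)
assert dbuf.tobytes() == src             # copy device -> host and compare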
Example #4
def register_am_allocators(args, worker):
    if not args.enable_am:
        return

    import numpy as np

    worker.register_am_allocator(
        lambda n: np.empty(n, dtype=np.uint8), ucx_api.AllocatorType.HOST
    )

    if args.object_type == "cupy":
        import cupy as cp

        worker.register_am_allocator(
            lambda n: cp.empty(n, dtype=cp.uint8), ucx_api.AllocatorType.CUDA
        )
    elif args.object_type == "rmm":
        import rmm

        worker.register_am_allocator(
            lambda n: rmm.DeviceBuffer(size=n), ucx_api.AllocatorType.CUDA
        )
Example #5
def test_dev_buf_circle_ref_dealloc():
    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    dbuf1 = rmm.DeviceBuffer(size=1_000_000)

    # Make dbuf1 part of a reference cycle:
    l1 = [dbuf1]
    l1.append(l1)

    # due to the reference cycle, the device buffer doesn't actually get
    # cleaned up until later, when we invoke `gc.collect()`:
    del dbuf1, l1

    rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())

    # by now, the only remaining reference to the *original* memory
    # resource should be in `dbuf1`. However, the cyclic garbage collector
    # will eliminate that reference when it clears the object via its
    # `tp_clear` method. Later, when `tp_dealloc` attempts to actually
    # deallocate `dbuf1` (which needs the MR alive), a segfault occurs.

    gc.collect()
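
A GPU-free sketch of the reference-cycle pattern this test relies on; the Node class is purely illustrative:

import gc

class Node:
    pass

n = Node()
cycle = [n]
cycle.append(cycle)   # the list now references itself, forming a cycle
del n, cycle          # nothing is reclaimed immediately because of the cycle
gc.collect()          # the cyclic collector frees both objects here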
Example #6
def test_statistics_resource_adaptor():

    cuda_mr = rmm.mr.CudaMemoryResource()

    mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr)

    rmm.mr.set_current_device_resource(mr)

    buffers = [rmm.DeviceBuffer(size=1000) for _ in range(10)]

    for i in range(9, 0, -2):
        del buffers[i]

    assert mr.allocation_counts == {
        "current_bytes": 5000,
        "current_count": 5,
        "peak_bytes": 10000,
        "peak_count": 10,
        "total_bytes": 10000,
        "total_count": 10,
    }

    # Push a new statistics adaptor on top of the current one
    mr2 = rmm.mr.StatisticsResourceAdaptor(mr)
    rmm.mr.set_current_device_resource(mr2)

    for _ in range(2):
        buffers.append(rmm.DeviceBuffer(size=1000))

    assert mr2.allocation_counts == {
        "current_bytes": 2000,
        "current_count": 2,
        "peak_bytes": 2000,
        "peak_count": 2,
        "total_bytes": 2000,
        "total_count": 2,
    }
    assert mr.allocation_counts == {
        "current_bytes": 7000,
        "current_count": 7,
        "peak_bytes": 10000,
        "peak_count": 10,
        "total_bytes": 12000,
        "total_count": 12,
    }

    del buffers
    gc.collect()

    assert mr2.allocation_counts == {
        "current_bytes": 0,
        "current_count": 0,
        "peak_bytes": 2000,
        "peak_count": 2,
        "total_bytes": 2000,
        "total_count": 2,
    }
    assert mr.allocation_counts == {
        "current_bytes": 0,
        "current_count": 0,
        "peak_bytes": 10000,
        "peak_count": 10,
        "total_bytes": 12000,
        "total_count": 12,
    }
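
A quick, GPU-free sanity check of the deletion loop above: range(9, 0, -2) removes indices 9, 7, 5, 3 and 1, so five of the ten buffers survive, matching current_count == 5 in the first assertion.

deleted = list(range(9, 0, -2))
assert deleted == [9, 7, 5, 3, 1]    # five deletions leave 5 of 10 buffers alive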
Example #7
def test_reinitialize_max_pool_size_exceeded():
    rmm.reinitialize(pool_allocator=True,
                     initial_pool_size=0,
                     maximum_pool_size=1 << 23)
    with pytest.raises(MemoryError):
        rmm.DeviceBuffer().resize(1 << 24)
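
For reference, the bit shifts used here, spelled out (pure arithmetic, no GPU required):

MiB = 1 << 20
assert (1 << 23) == 8 * MiB    # maximum_pool_size: 8 MiB
assert (1 << 24) == 16 * MiB   # requested resize: 16 MiB, which exceeds the cap and raises MemoryError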
Example #8
def test_reinitialize_max_pool_size():
    rmm.reinitialize(pool_allocator=True,
                     initial_pool_size=0,
                     maximum_pool_size=1 << 23)
    rmm.DeviceBuffer().resize((1 << 23) - 1)
Example #9
def cuda_array(size):
    return rmm.DeviceBuffer(size=size)
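
A hedged usage sketch of this one-line helper (the 256-byte size is arbitrary; assumes rmm and a GPU are available):

buf = cuda_array(256)    # returns an rmm.DeviceBuffer backed by 256 bytes of device memory
assert buf.size == 256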
Example #10
def init_once():
    global ucp, device_array
    global ucx_create_endpoint, ucx_create_listener
    global pre_existing_cuda_context, cuda_context_created

    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if dask.config.get("distributed.comm.ucx.create-cuda-context") is True or (
            "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]):
        try:
            import numba.cuda
        except ImportError:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management")

        cuda_visible_device = int(
            os.environ.get("CUDA_VISIBLE_DEVICES", "0").split(",")[0])
        pre_existing_cuda_context = has_cuda_context()
        if pre_existing_cuda_context is not False:
            warnings.warn(
                f"A CUDA context for device {pre_existing_cuda_context} already exists on process "
                f"ID {os.getpid()}. This is often the result of a CUDA-enabled library calling a "
                "CUDA runtime function before Dask-CUDA can spawn worker processes. Please make "
                "sure any such function calls don't happen at import time or in the global scope "
                "of a program.")

        numba.cuda.current_context()

        cuda_context_created = has_cuda_context()
        if (cuda_context_created is not False
                and cuda_context_created != cuda_visible_device):
            warnings.warn(
                f"Worker with process ID {os.getpid()} should have a CUDA context assigned to "
                f"device {cuda_visible_device}, but instead the CUDA context is on device "
                "{cuda_context_created}. This is often the result of a CUDA-enabled library "
                "calling a CUDA runtime function before Dask-CUDA can spawn worker processes. "
                "Please make sure any such function calls don't happen at import time or in "
                "the global scope of a program.")

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        device_array = lambda n: rmm.DeviceBuffer(size=n)
    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n, ), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array
        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("distributed.rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(pool_allocator=True,
                         managed_memory=False,
                         initial_pool_size=pool_size)
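
A pure-Python sketch of the CUDA_VISIBLE_DEVICES parsing above; the example value is hypothetical:

env_value = "3,1,0"                          # hypothetical CUDA_VISIBLE_DEVICES setting
first_device = int(env_value.split(",")[0])
assert first_device == 3                     # the worker expects its CUDA context on the first listed device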
Example #11
def init_once():
    global ucp, host_array, device_array, ucx_create_endpoint, ucx_create_listener
    if ucp is not None:
        return

    import ucp as _ucp

    ucp = _ucp

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        host_array = lambda n: numpy.empty((n, ), dtype="u1")
    except ImportError:
        host_array = lambda n: bytearray(n)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):
            device_array = lambda n: rmm.DeviceBuffer(size=n)
        else:  # pre-0.11.0
            import numba.cuda

            def rmm_device_array(n):
                a = rmm.device_array(n, dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = rmm_device_array
    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n, ), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array
        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(pool_allocator=True,
                         managed_memory=False,
                         initial_pool_size=pool_size)

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        if dask.config.get("ucx.reuse-endpoints"):
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener
        else:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
Example #12
File: ucx.py  Project: oshadura/distributed
def init_once():
    global ucp, host_array, device_array, ucx_create_endpoint, ucx_create_listener
    if ucp is not None:
        return

    # remove/process dask.ucx flags for valid ucx options
    ucx_config = _scrub_ucx_config()

    # We ensure the CUDA context is created before initializing UCX. This can't
    # be safely handled externally because communications in Dask start before
    # preload scripts run.
    if "TLS" in ucx_config and "cuda_copy" in ucx_config["TLS"]:
        try:
            import numba.cuda
        except ImportError:
            raise ImportError(
                "CUDA support with UCX requires Numba for context management")

        numba.cuda.current_context()

    import ucp as _ucp

    ucp = _ucp

    ucp.init(options=ucx_config, env_takes_precedence=True)

    # Find the function, `host_array()`, to use when allocating new host arrays
    try:
        import numpy

        host_array = lambda n: numpy.empty((n, ), dtype="u1")
    except ImportError:
        host_array = lambda n: bytearray(n)

    # Find the function, `device_array()`, to use when allocating new CUDA arrays
    try:
        import rmm

        if hasattr(rmm, "DeviceBuffer"):
            device_array = lambda n: rmm.DeviceBuffer(size=n)
        else:  # pre-0.11.0
            import numba.cuda

            def rmm_device_array(n):
                a = rmm.device_array(n, dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = rmm_device_array
    except ImportError:
        try:
            import numba.cuda

            def numba_device_array(n):
                a = numba.cuda.device_array((n, ), dtype="u1")
                weakref.finalize(a, numba.cuda.current_context)
                return a

            device_array = numba_device_array
        except ImportError:

            def device_array(n):
                raise RuntimeError(
                    "In order to send/recv CUDA arrays, Numba or RMM is required"
                )

    pool_size_str = dask.config.get("rmm.pool-size")
    if pool_size_str is not None:
        pool_size = parse_bytes(pool_size_str)
        rmm.reinitialize(pool_allocator=True,
                         managed_memory=False,
                         initial_pool_size=pool_size)

    try:
        from ucp.endpoint_reuse import EndpointReuse
    except ImportError:
        ucx_create_endpoint = ucp.create_endpoint
        ucx_create_listener = ucp.create_listener
    else:
        reuse_endpoints = dask.config.get("ucx.reuse-endpoints")
        if (reuse_endpoints is None and ucp.get_ucx_version() >=
            (1, 11, 0)) or reuse_endpoints is False:
            ucx_create_endpoint = ucp.create_endpoint
            ucx_create_listener = ucp.create_listener
        else:
            ucx_create_endpoint = EndpointReuse.create_endpoint
            ucx_create_listener = EndpointReuse.create_listener
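
A small sketch of the version-tuple comparison that drives the endpoint-reuse default above; the version value is illustrative:

ucx_version = (1, 12, 0)             # e.g. a hypothetical ucp.get_ucx_version() result
assert ucx_version >= (1, 11, 0)     # tuple comparison is lexicographic, so endpoint reuse stays disabled by default here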
Example #13
    def _concat(cls, objs, dtype=None):
        from cudf.core.series import Series
        from cudf.core.column import (
            StringColumn,
            CategoricalColumn,
            NumericalColumn,
        )

        if len(objs) == 0:
            dtype = pd.api.types.pandas_dtype(dtype)
            if is_categorical_dtype(dtype):
                dtype = CategoricalDtype()
            return column_empty(0, dtype=dtype, masked=True)

        # If all columns are `NumericalColumn` with different dtypes,
        # we cast them to a common dtype.
        # Note that we can always cast pure-null columns.
        not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
        if len(not_null_cols) > 0 and (len([
                o for o in not_null_cols if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
        ]) == 0):
            col_dtypes = [o.dtype for o in not_null_cols]
            # Use NumPy to find a common dtype
            common_dtype = np.find_common_type(col_dtypes, [])
            # Cast all columns to the common dtype
            for i in range(len(objs)):
                objs[i] = objs[i].astype(common_dtype)

        # Find the first non-null column:
        head = objs[0]
        for i, obj in enumerate(objs):
            if len(obj) != obj.null_count:
                head = obj
                break

        for i, obj in enumerate(objs):
            # Check that all columns are the same type:
            if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
                # if all null, cast to appropriate dtype
                if len(obj) == obj.null_count:
                    from cudf.core.column import column_empty_like

                    objs[i] = column_empty_like(head,
                                                dtype=head.dtype,
                                                masked=True,
                                                newsize=len(obj))

        # Handle categories for categoricals
        if all(isinstance(o, CategoricalColumn) for o in objs):
            cats = (Series(ColumnBase._concat([o.categories for o in objs
                                               ])).drop_duplicates()._column)
            objs = [
                o.cat()._set_categories(cats, is_unique=True) for o in objs
            ]

        head = objs[0]
        for obj in objs:
            if not (obj.dtype == head.dtype):
                raise ValueError("All series must be of same type")

        newsize = sum(map(len, objs))
        if newsize > libcudfxx.MAX_COLUMN_SIZE:
            raise MemoryError("Result of concat cannot have "
                              "size > {}".format(
                                  libcudfxx.MAX_COLUMN_SIZE_STR))

        # Handle strings separately
        if all(isinstance(o, StringColumn) for o in objs):
            result_nbytes = sum(o._nbytes for o in objs)
            if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
                raise MemoryError(
                    "Result of concat cannot have > {}  bytes".format(
                        libcudfxx.MAX_STRING_COLUMN_BYTES_STR))
            objs = [o.nvstrings for o in objs]
            return as_column(nvstrings.from_strings(*objs))

        # Filter out inputs that have 0 length
        objs = [o for o in objs if len(o) > 0]
        nulls = any(col.nullable for col in objs)

        if is_categorical_dtype(head):
            data = None
            data_dtype = head.codes.dtype
            children = (column_empty(newsize,
                                     dtype=head.codes.dtype,
                                     masked=True), )
        else:
            data_dtype = head.dtype
            mem = rmm.DeviceBuffer(size=newsize * data_dtype.itemsize)
            data = Buffer(mem)
            children = None

        # Allocate the output mask only if there are nulls in the input objects
        mask = None
        if nulls:
            mask = Buffer(utils.make_mask(newsize))

        col = build_column(data=data,
                           dtype=head.dtype,
                           mask=mask,
                           children=children)

        # Perform the actual concatenation
        if newsize > 0:
            col = libcudf.concat._column_concat(objs, col)

        return col
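
A tiny sketch of the byte-size arithmetic used for the output allocation above (the dtype and row count are illustrative):

import numpy as np

newsize = 7                                   # hypothetical total number of rows
data_dtype = np.dtype("int64")                # hypothetical common dtype
assert newsize * data_dtype.itemsize == 56    # bytes requested via rmm.DeviceBuffer(size=...)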