예제 #1
0
    def _initialize_read(self):
        """Reset the read state and stage the stored buffers for reading.

        The ``(headers, buffers)`` pair registered in ``_id_to_buffers`` under
        ``self._object_id`` is looked up and every buffer is staged in
        ``self._buffers``:

        * ``cupy.ndarray`` entries and non-empty ``Buffer`` entries are
          wrapped in an ``UnownedMemory`` built from their pointer and size;
        * ``Buffer`` entries whose size is 0 are staged as ``None``;
        * anything else is staged unchanged.

        ``self._headers`` receives ``headers.copy()`` extended with a
        ``'buffer_types'`` list holding one ``[kind, size]`` pair per buffer,
        where ``kind`` is ``'cuda'`` or ``'memory'``.
        """
        from cudf.core import Buffer
        from cupy.cuda.memory import UnownedMemory

        def _as_unowned(ptr, size):
            # a fresh Buffer over the same pointer is passed as the owner
            return UnownedMemory(ptr, size, Buffer(ptr, size))

        self._offset = 0
        self._has_read_headers = False
        self._buffers = []
        headers, buffers = _id_to_buffers[self._object_id]
        self._headers = headers = headers.copy()

        buffer_types = []
        for item in buffers:
            if isinstance(item, cupy.ndarray):
                kind = 'cuda'
                ptr, size = item.data.ptr, item.size
                staged = _as_unowned(ptr, size)
            elif isinstance(item, Buffer):
                kind = 'cuda'
                ptr, size = item.ptr, item.size
                # empty buffer cannot construct a UnownedMemory
                staged = None if size == 0 else _as_unowned(ptr, size)
            else:
                kind = 'memory'
                size = getattr(item, 'size', len(item))
                staged = item
            self._buffers.append(staged)
            buffer_types.append([kind, size])
        headers['buffer_types'] = buffer_types
예제 #2
0
파일: utils.py 프로젝트: h8f/mars
 def _write_cuda_buffer(ptr):  # pragma: no cover
     """Copy device memory starting at ``ptr`` to ``writer`` chunk by chunk.

     ``buffer.nbytes`` bytes are transferred, at most ``CUDA_CHUNK_SIZE`` per
     iteration. ``buffer``, ``writer``, ``CPBuffer`` and ``CUDA_CHUNK_SIZE``
     are not defined in this block and are resolved from an outer scope.
     """
     step = CUDA_CHUNK_SIZE
     position = 0
     total = buffer.nbytes
     while position < total:
         # the final chunk may be shorter than a full step
         length = min(total - position, step)
         device_chunk = CPBuffer(ptr + position, size=length)
         # host_serialize() yields the chunk as host memory; write its frame
         writer.write(device_chunk.host_serialize()[1][0])
         position += length
예제 #3
0
async def read_buffers(header: Dict, reader: StreamReader):
    """Read the payload buffers described by ``header`` from ``reader``.

    When ``header['serializer']`` is ``'cudf'`` or ``'cupy'``, one device
    buffer is allocated per entry of ``header['lengths']`` and filled from
    the stream in chunks of at most ``CUDA_CHUNK_SIZE`` bytes. Otherwise the
    sizes stored under ``BUFFER_SIZES_NAME`` are popped from ``header`` and
    one ``bytes`` object is read per size.

    ``StreamReader.read(n)`` may return fewer than ``n`` bytes, so every read
    uses ``readexactly``. A short read would otherwise leave part of a device
    buffer unfilled, or return a truncated host buffer, while the offset
    still advanced by the requested size.

    Parameters
    ----------
    header : Dict
        Serialization header; mutated in the non-CUDA case (sizes are popped).
    reader : StreamReader
        Stream positioned at the first payload byte.

    Returns
    -------
    list
        Device buffers (CUDA case) or ``bytes`` objects, in header order.

    Raises
    ------
    asyncio.IncompleteReadError
        If the stream ends before a buffer has been read completely.
    """
    try:
        from cudf.core import Buffer as CPBuffer
        from cupy.cuda.memory import UnownedMemory as CPUnownedMemory, \
            MemoryPointer as CPMemoryPointer
    except ImportError:
        CPBuffer = CPUnownedMemory = CPMemoryPointer = None

    serializer = header.get('serializer')
    if serializer == 'cudf' or serializer == 'cupy':  # pragma: no cover
        # construct an empty cuda buffer and copy from host
        lengths = header.get('lengths')
        buffers = []
        for length in lengths:
            cuda_buffer = CPBuffer.empty(length)
            cupy_memory = CPUnownedMemory(cuda_buffer.ptr, length, cuda_buffer)
            offset = 0
            while offset < length:
                read_size = min(CUDA_CHUNK_SIZE, length - offset)
                # readexactly guarantees len(content) == read_size
                content = await reader.readexactly(read_size)
                source_mem = np.frombuffer(
                    content, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
                cupy_pointer = CPMemoryPointer(cupy_memory, offset)
                cupy_pointer.copy_from(source_mem, len(content))
                offset += read_size
            buffers.append(cuda_buffer)
        return buffers

    buffer_sizes = header.pop(BUFFER_SIZES_NAME)
    return [await reader.readexactly(size) for size in buffer_sizes]
예제 #4
0
파일: cuda.py 프로젝트: edgar87/mars
    def _read_init(self):
        """Prepare the reader state from ``self._object_id``.

        Takes the first pointer of the object id and the ``'size'`` header,
        stores the size in ``self._size``, wraps the pointer in a ``Buffer``
        (``self._buffer``) and exposes it as a cupy ``UnownedMemory``
        (``self._cupy_memory``) with that buffer as owner.
        """
        from cupy.cuda.memory import UnownedMemory

        object_id = self._object_id
        ptr = object_id.ptrs[0]
        size = object_id.headers['size']
        self._size = size
        buffer = Buffer(ptr, size)
        self._buffer = buffer
        self._cupy_memory = UnownedMemory(ptr, size, buffer)
예제 #5
0
파일: cuda.py 프로젝트: edgar87/mars
 def read(self, size=-1) -> Buffer:
     """Return a ``Buffer`` over the next ``size`` bytes and advance.

     The reader is lazily initialised through ``_read_init`` on first use.
     A negative ``size`` means "everything that is left". The request is
     clamped to the bytes remaining between ``self._offset`` and
     ``self._size``, so the returned buffer never extends past the
     underlying allocation; at the end a zero-sized ``Buffer`` is returned.

     Parameters
     ----------
     size : int, optional
         Maximum number of bytes to read; negative reads to the end.

     Returns
     -------
     Buffer
         A buffer over the device memory at the current offset (no copy is
         made in this method).
     """
     if not self._initialized:
         self._read_init()
         self._initialized = True
     # Previously the requested size was used as-is, so read(-1) after a
     # partial read, or an oversized request, ran past the allocation.
     remaining = max(self._size - self._offset, 0)
     size = remaining if size < 0 else min(size, remaining)
     cupy_pointer = MemoryPointer(self._cupy_memory, self._offset)
     self._offset += size
     return Buffer(cupy_pointer.ptr, size=size)
예제 #6
0
파일: array.py 프로젝트: teju85/cuml
 def serialize(self):
     """Serialize via the parent class and attach reconstruction metadata.

     The parent's header gains a ``"constructor-kwargs"`` entry holding this
     array's dtype string, shape and order, and every frame returned by the
     parent is wrapped in a ``Buffer``.

     Returns
     -------
     tuple
         ``(header, frames)``.
     """
     header, parent_frames = super(CumlArray, self).serialize()
     header["constructor-kwargs"] = dict(
         dtype=self.dtype.str,
         shape=self.shape,
         order=self.order,
     )
     wrapped_frames = [Buffer(frame) for frame in parent_frames]
     return header, wrapped_frames
예제 #7
0
    async def open_writer(self, size=None) -> StorageFileObject:
        """Allocate a device buffer of ``size`` and return a writer over it.

        The buffer is registered in ``self._id_to_buffers`` under a new
        ``CudaObjectId`` whose headers record the size and whose pointer list
        holds the buffer's pointer. The returned ``StorageFileObject`` wraps
        a ``CudaFileObject`` opened in ``'w'`` mode on that buffer.
        """
        from cudf.core.buffer import Buffer

        device_buffer = Buffer.empty(size)
        object_id = CudaObjectId({'size': size}, [device_buffer.ptr])
        self._id_to_buffers[object_id] = device_buffer
        file_object = CudaFileObject(
            object_id, cuda_buffer=device_buffer, mode='w', size=size)
        return StorageFileObject(file_object, object_id=object_id)
예제 #8
0
    def read(self, size=-1):
        """Return a ``Buffer`` over the next ``size`` bytes and advance.

        The reader is lazily initialised through ``_read_init`` on first use.
        A negative ``size`` means "everything that is left". The request is
        clamped to the bytes remaining between ``self._offset`` and
        ``self._size``, so the returned buffer never extends past the
        underlying allocation; at the end a zero-sized ``Buffer`` is returned.

        Parameters
        ----------
        size : int, optional
            Maximum number of bytes to read; negative reads to the end.

        Returns
        -------
        Buffer
            A buffer over the device memory at the current offset (no copy is
            made in this method).
        """
        from cudf.core import Buffer
        from cupy.cuda import MemoryPointer

        if not self._initialized:
            self._read_init()
            self._initialized = True
        # Previously the requested size was used as-is, so read(-1) after a
        # partial read, or an oversized request, ran past the allocation.
        remaining = max(self._size - self._offset, 0)
        size = remaining if size < 0 else min(size, remaining)
        cupy_pointer = MemoryPointer(self._cupy_memory, self._offset)
        self._offset += size
        return Buffer(cupy_pointer.ptr, size=size)
예제 #9
0
    def read(self, size: int):
        """Return the next piece of the serialized object.

        The first call returns the pickled headers. Later calls walk through
        ``self._buffers`` one buffer at a time, so the returned piece can be
        shorter than ``size``: a read never crosses a buffer boundary.

        * ``None`` entries (empty cuda buffers) yield ``Buffer.empty(0)``.
        * ``UnownedMemory`` entries yield a ``Buffer`` over the device memory
          at the current offset.
        * Any other entry is sliced as host memory.

        Parameters
        ----------
        size : int
            Maximum number of bytes to take from the current buffer.

        Returns
        -------
        bytes, Buffer or str
            The pickled headers, a piece of the current buffer, or ``''``
            once every buffer has been consumed.
        """
        # we read cuda_header first and then read cuda buffers one by one,
        # the return value's size is not exactly the specified size.
        from cudf.core import Buffer
        from cupy.cuda import MemoryPointer
        from cupy.cuda.memory import UnownedMemory

        if not self._has_read_headers:
            self._has_read_headers = True
            return pickle.dumps(self._headers)
        if len(self._buffers) == 0:
            # NOTE(review): a str sentinel, unlike the bytes/Buffer returned
            # elsewhere; kept as-is because callers may compare against it.
            return ''
        cur_buf = self._buffers[0]
        if cur_buf is None:
            # empty cuda buffer
            content = Buffer.empty(0)
            self._offset = 0
            self._buffers.pop(0)
            return content

        is_cuda = isinstance(cur_buf, UnownedMemory)
        # Host buffers such as bytes have no ``.size``; fall back to len(),
        # mirroring how the sizes in 'buffer_types' are computed.
        total = cur_buf.size if hasattr(cur_buf, 'size') else len(cur_buf)
        start = self._offset
        remaining = total - start

        if size >= remaining:
            # current buf read to end
            if is_cuda:
                cupy_pointer = MemoryPointer(cur_buf, start)
                content = Buffer(cupy_pointer.ptr, size=remaining)
            else:
                content = cur_buf[start:start + size]
            self._offset = 0
            self._buffers.pop(0)
            return content

        # Partial read: take the piece at the *current* offset, then advance.
        # The host branch used to advance first and index with a tuple
        # (``cur_buf[a, b]``), which raises TypeError for bytes.
        self._offset = start + size
        if is_cuda:
            cupy_pointer = MemoryPointer(cur_buf, start)
            return Buffer(cupy_pointer.ptr, size=size)
        return cur_buf[start:start + size]
예제 #10
0
    async def get(self, object_id: str, **kwargs) -> object:
        """Rebuild the object stored under ``object_id``.

        The ``(headers, buffers)`` pair is looked up in ``_id_to_buffers``.
        ``cupy.ndarray`` buffers are replaced by an rmm ``DeviceBuffer`` built
        from their pointer and size, cudf ``Buffer`` objects are re-created
        with such a ``DeviceBuffer`` as owner, and any other buffer is passed
        through unchanged before everything is handed to ``deserialize``.
        """
        from cudf.core import Buffer
        from rmm import DeviceBuffer

        def _rewrap(buf):
            if isinstance(buf, cupy.ndarray):
                return DeviceBuffer(ptr=buf.data.ptr, size=buf.size)
            if isinstance(buf, cudf.core.Buffer):
                return Buffer(buf.ptr, buf.size,
                              DeviceBuffer(ptr=buf.ptr, size=buf.size))
            return buf

        headers, stored = _id_to_buffers[object_id]
        return deserialize(headers, [_rewrap(buf) for buf in stored])
예제 #11
0
    async def get(self, object_id: CudaObjectId, **kwargs) -> object:
        """Rebuild the object described by ``object_id``.

        ``object_id.headers['data_type']`` selects how the device pointers in
        ``object_id.ptrs`` are wrapped before ``deserialize`` is called:

        * ``'cupy'``: the first pointer becomes a single rmm ``DeviceBuffer``
          of ``headers['lengths'][0]`` bytes;
        * ``'cudf'``: every ``(ptr, length)`` pair becomes a cudf ``Buffer``
          owned by a ``DeviceBuffer``.

        The headers are copied before ``'data_type'`` is popped. Popping from
        ``object_id.headers`` directly would mutate the id, so a second
        ``get`` for the same id would fail with ``KeyError``.

        Raises
        ------
        TypeError
            If ``data_type`` is neither ``'cupy'`` nor ``'cudf'``.
        """
        from cudf.core.buffer import Buffer
        from rmm import DeviceBuffer

        # work on a copy so the caller's object id stays reusable
        headers = dict(object_id.headers)
        ptrs = object_id.ptrs
        data_type = headers.pop('data_type')
        if data_type == 'cupy':
            ptr = ptrs[0]
            size = headers['lengths'][0]
            cuda_buf = DeviceBuffer(ptr=ptr, size=size)
            buffers = [cuda_buf]
        elif data_type == 'cudf':
            buffers = [Buffer(ptr, length, DeviceBuffer(ptr=ptr, size=length))
                       for ptr, length in zip(ptrs, headers['lengths'])]
        else:
            raise TypeError(f'Unknown data type {data_type}')
        return deserialize(headers, buffers)
예제 #12
0
    def write(self, content):
        """Consume one written chunk: the pickled headers first, then data.

        The first call unpickles ``content`` into ``self._headers`` and
        allocates one destination per ``headers['buffer_types']`` entry: a
        device ``Buffer`` for ``'cuda'`` entries and a ``BytesIO`` otherwise.
        Every later call copies ``content`` into the destination at
        ``self._cur_buffer_index``. Once the bytes written reach the size
        recorded in the headers, a ``BytesIO`` destination is replaced by its
        value and the writer moves on to the next buffer.
        """
        from cudf.core import Buffer
        from cupy.cuda import MemoryPointer
        from cupy.cuda.memory import UnownedMemory

        if not self._has_write_headers:
            # NOTE(review): pickle.loads must only ever see trusted input.
            self._headers = headers = pickle.loads(content)
            for kind, nbytes in headers['buffer_types']:
                destination = Buffer.empty(nbytes) if kind == 'cuda' \
                    else BytesIO()
                self._buffers.append(destination)
            self._has_write_headers = True
            return

        index = self._cur_buffer_index
        target = self._buffers[index]
        capacity = self._headers['buffer_types'][index][1]

        if not isinstance(target, Buffer):
            # host destination: plain append to the BytesIO
            written = len(content)
            target.write(content)
        else:
            target_memory = UnownedMemory(target.ptr, len(target), target)
            destination = MemoryPointer(target_memory, self._offset)
            if isinstance(content, bytes):
                # host bytes -> device: expose the bytes as a raw C pointer
                written = len(content)
                source = np.frombuffer(
                    content, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
            else:
                # device -> device: wrap the incoming buffer without copying
                source = MemoryPointer(
                    UnownedMemory(content.ptr, len(content), content), 0)
                written = source.mem.size
            destination.copy_from(source, written)

        if written + self._offset < capacity:
            self._offset += written
            return
        # current destination is full: finalize it and move to the next one
        if isinstance(target, BytesIO):
            self._buffers[index] = target.getvalue()
        self._cur_buffer_index += 1
        self._offset = 0
예제 #13
0
파일: utils.py 프로젝트: qinxuye/mars
async def read_buffers(header: Dict, reader: StreamReader):
    """Read every buffer announced in ``header`` from ``reader``.

    ``header['is_cuda_buffers']`` and the sizes popped from
    ``header[BUFFER_SIZES_NAME]`` are walked in lockstep. Host buffers, and
    CUDA buffers of size 0, are read with a single ``readexactly`` and
    returned as ``bytes``. Non-empty CUDA buffers are allocated on the
    device and filled from the stream in chunks of at most
    ``CUDA_CHUNK_SIZE`` bytes.

    Returns
    -------
    list
        One entry per announced buffer, in order.
    """
    if cupy is None or cudf is None:
        CPBuffer = CPUnownedMemory = CPMemoryPointer = None
    else:
        from cudf.core import Buffer as CPBuffer
        from cupy.cuda.memory import UnownedMemory as CPUnownedMemory, \
            MemoryPointer as CPMemoryPointer

    cuda_flags = header.get('is_cuda_buffers')
    sizes = header.pop(BUFFER_SIZES_NAME)

    results = []
    for on_device, nbytes in zip(cuda_flags, sizes):
        if not on_device or nbytes == 0:
            # host data, or nothing to copy to the device
            results.append(await reader.readexactly(nbytes))
            continue

        # pragma: no cover
        # construct an empty cuda buffer and copy from host
        device_buffer = CPBuffer.empty(nbytes)
        device_memory = CPUnownedMemory(device_buffer.ptr, nbytes,
                                        device_buffer)
        step = CUDA_CHUNK_SIZE
        position = 0
        while position < nbytes:
            wanted = min(nbytes - position, step)
            chunk = await reader.readexactly(wanted)
            host_pointer = np.frombuffer(
                chunk, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
            CPMemoryPointer(device_memory, position).copy_from(
                host_pointer, len(chunk))
            position += wanted
        results.append(device_buffer)
    return results
예제 #14
0
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to use as identifier variables. A string is treated as a
        single column name.
        default: None
    value_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to unpivot. A string is treated as a single column name.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'
    col_level : None
        Not supported; must be left as None.

    Returns
    -------
    out : DataFrame
        Melted result

    Raises
    ------
    KeyError
        If `id_vars` or `value_vars` name columns missing from `frame`, or
        if the two overlap.
    NotImplementedError
        If any selected column is categorical.
    ValueError
        If the `value_vars` columns do not all share one dtype.

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B variable value
    0    1    1        C   1.0
    1    1    3        C
    2    5    6        C   4.0
    3    1    1        D   2.0
    4    1    3        D   5.0
    5    5    6        D   6.0
    """
    assert col_level in (None, ), "'col_level' is not supported"

    # Arg cleaning
    # Import the submodule explicitly: a bare ``import collections`` does not
    # guarantee that the ``collections.abc`` attribute has been loaded.
    import collections.abc

    def _as_column_list(columns):
        # A string is iterable but names ONE column: splitting 'AB' into
        # ['A', 'B'] would be wrong. Any other iterable (list, tuple,
        # ndarray, Index, ...) is expanded; everything else is a scalar.
        if isinstance(columns, (str, bytes)) or not isinstance(
                columns, collections.abc.Iterable):
            return [columns]
        return list(columns)

    def _require_present(columns, arg_name):
        # Fail early, listing every requested column the frame lacks.
        missing = set(columns) - set(frame.columns)
        if missing:
            raise KeyError("The following '{arg_name}' are not present"
                           " in the DataFrame: {missing}"
                           "".format(arg_name=arg_name,
                                     missing=list(missing)))

    # id_vars
    if id_vars is not None:
        id_vars = _as_column_list(id_vars)
        _require_present(id_vars, 'id_vars')
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        value_vars = _as_column_list(value_vars)
        _require_present(value_vars, 'value_vars')
    else:
        # then all remaining columns in frame
        value_vars = list(frame.columns.drop(id_vars))

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError("Categorical columns are not yet "
                                  "supported for function")

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if dtypes and any(t != dtypes[0] for t in dtypes):
        raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if overlap:
        raise KeyError("'value_vars' and 'id_vars' cannot have overlap."
                       " The following 'value_vars' are ALSO present"
                       " in 'id_vars': {overlap}"
                       "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        # Repeat series A `reps` times end to end; empty when reps <= 0.
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    # One block of N codes per value column; code i refers to value_vars[i].
    # NOTE(review): codes are int8, so more than 127 value columns would
    # overflow -- confirm against CategoricalColumn's expected code dtype.
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(
            Series(Buffer(cudautils.full(size=N, value=i, dtype=np.int8))))
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        CategoricalColumn(categories=value_vars,
                          data=temp._column.data,
                          ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(objs=[frame[val] for val in value_vars],
                                       index=None)

    return DataFrame(mdata)
예제 #15
0
 def _tile(A, reps):
     """Repeat series ``A`` ``reps`` times end to end.

     Returns the concatenation of ``reps`` copies of ``A`` when ``reps`` is
     positive; otherwise a ``Series`` over a null ``Buffer`` of ``A``'s dtype.
     """
     copies = [A] * reps
     if not reps > 0:
         return Series(Buffer.null(dtype=A.dtype))
     return Series._concat(objs=copies, index=None)