def _initialize_read(self):
    """Reset the read state and stage the stored buffers for reading.

    Looks up ``(headers, buffers)`` for ``self._object_id`` in
    ``_id_to_buffers``. cupy arrays and non-empty cudf buffers are wrapped
    as ``UnownedMemory`` objects, empty cudf buffers are staged as ``None``
    and every other buffer is staged unchanged. A ``[kind, size]`` entry
    per buffer is stored under ``'buffer_types'`` in a copy of the headers.
    """
    from cudf.core import Buffer
    from cupy.cuda.memory import UnownedMemory

    self._offset = 0
    self._has_read_headers = False
    self._buffers = []

    stored_headers, stored_buffers = _id_to_buffers[self._object_id]
    # shallow copy: adding 'buffer_types' must not touch the stored dict
    self._headers = headers = stored_headers.copy()

    kinds = []
    for item in stored_buffers:
        if isinstance(item, cupy.ndarray):
            address, length = item.data.ptr, item.size
            staged = UnownedMemory(address, length, Buffer(address, length))
            kind = 'cuda'
        elif isinstance(item, Buffer):
            address, length = item.ptr, item.size
            if length == 0:
                # empty buffer cannot construct a UnownedMemory
                staged = None
            else:
                staged = UnownedMemory(
                    address, length, Buffer(address, length))
            kind = 'cuda'
        else:
            length = getattr(item, 'size', len(item))
            staged = item
            kind = 'memory'
        self._buffers.append(staged)
        kinds.append([kind, length])

    headers['buffer_types'] = kinds
def _write_cuda_buffer(ptr):  # pragma: no cover
    """Write the cuda buffer starting at ``ptr`` to ``writer`` in chunks.

    ``buffer``, ``writer``, ``CPBuffer`` and ``CUDA_CHUNK_SIZE`` come from
    the enclosing scope; each chunk covers at most ``CUDA_CHUNK_SIZE`` of
    the ``buffer.nbytes`` range.
    """
    step = CUDA_CHUNK_SIZE
    position = 0
    total = buffer.nbytes
    while position < total:
        # the last chunk only covers what is left
        length = min(step, total - position)
        piece = CPBuffer(ptr + position, size=length)
        # copy chunk to host memoryview
        writer.write(piece.host_serialize()[1][0])
        position += length
async def read_buffers(header: Dict, reader: StreamReader):
    """Read the buffers described by ``header`` from ``reader``.

    When ``header['serializer']`` is ``'cudf'`` or ``'cupy'``, one cuda
    buffer is allocated per entry of ``header['lengths']`` and filled from
    the stream in chunks of at most ``CUDA_CHUNK_SIZE``. Otherwise the sizes
    are popped from ``header[BUFFER_SIZES_NAME]`` and each buffer is read
    into host memory.

    Parameters
    ----------
    header : Dict
        Serialization header; ``BUFFER_SIZES_NAME`` is removed from it on
        the host path.
    reader : StreamReader
        Stream positioned at the start of the first buffer.

    Returns
    -------
    list
        The buffers, in header order.

    Raises
    ------
    asyncio.IncompleteReadError
        If the stream ends before a buffer has been read completely.
    """
    try:
        from cudf.core import Buffer as CPBuffer
        from cupy.cuda.memory import UnownedMemory as CPUnownedMemory, \
            MemoryPointer as CPMemoryPointer
    except ImportError:
        CPBuffer = CPUnownedMemory = CPMemoryPointer = None

    serializer = header.get('serializer')
    if serializer == 'cudf' or serializer == 'cupy':  # pragma: no cover
        # construct a empty cuda buffer and copy from host
        lengths = header.get('lengths')
        buffers = []
        for length in lengths:
            cuda_buffer = CPBuffer.empty(length)
            cupy_memory = CPUnownedMemory(cuda_buffer.ptr, length, cuda_buffer)
            offset = 0
            while offset < length:
                read_size = min(CUDA_CHUNK_SIZE, length - offset)
                # read() may return fewer bytes than requested, which would
                # leave a gap because the offset advances by read_size;
                # readexactly() guarantees a full chunk
                content = await reader.readexactly(read_size)
                source_mem = np.frombuffer(
                    content, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
                cupy_pointer = CPMemoryPointer(cupy_memory, offset)
                cupy_pointer.copy_from(source_mem, len(content))
                offset += read_size
            buffers.append(cuda_buffer)
        return buffers

    buffer_sizes = header.pop(BUFFER_SIZES_NAME)
    # readexactly() so a short read cannot silently truncate a buffer
    return [await reader.readexactly(size) for size in buffer_sizes]
def _read_init(self):
    """Set up ``_size``, ``_buffer`` and ``_cupy_memory`` for reading.

    The pointer is the first entry of ``self._object_id.ptrs`` and the size
    comes from ``self._object_id.headers['size']``.
    """
    from cupy.cuda.memory import UnownedMemory

    address = self._object_id.ptrs[0]
    self._size = self._object_id.headers['size']
    backing = Buffer(address, self._size)
    self._buffer = backing
    # the Buffer is handed to UnownedMemory as its owner argument
    self._cupy_memory = UnownedMemory(address, self._size, self._buffer)
def read(self, size=-1) -> Buffer:
    """Return a ``Buffer`` over the next part of the stored cuda memory.

    Parameters
    ----------
    size : int, optional
        Amount to read, in the same units as ``self._size``. A negative
        value reads everything from the current offset to the end. Requests
        larger than what is left are truncated.

    Returns
    -------
    Buffer
        A buffer starting at the current offset; its size is 0 once the
        end has been reached.
    """
    if not self._initialized:
        self._read_init()
        self._initialized = True

    # Never hand out a range past the end of the underlying memory:
    # previously read(-1) after a partial read returned ``self._size``
    # starting at ``self._offset``, and explicit sizes were not bounded.
    remaining = max(self._size - self._offset, 0)
    size = remaining if size < 0 else min(size, remaining)

    cupy_pointer = MemoryPointer(self._cupy_memory, self._offset)
    self._offset += size
    return Buffer(cupy_pointer.ptr, size=size)
def serialize(self):
    """Serialize via the parent class and add the constructor arguments.

    Returns
    -------
    tuple
        ``(header, frames)`` where ``header['constructor-kwargs']`` holds
        ``dtype`` (as ``self.dtype.str``), ``shape`` and ``order``, and
        every frame returned by the parent is wrapped in a ``Buffer``.
    """
    header, parent_frames = super(CumlArray, self).serialize()
    ctor_kwargs = dict(
        dtype=self.dtype.str,
        shape=self.shape,
        order=self.order,
    )
    header["constructor-kwargs"] = ctor_kwargs
    return header, [Buffer(frame) for frame in parent_frames]
async def open_writer(self, size=None) -> StorageFileObject:
    """Allocate a cuda buffer of ``size`` and return a writer for it.

    The buffer is registered in ``self._id_to_buffers`` under a new
    ``CudaObjectId`` whose headers carry ``size`` and whose pointer list
    holds the buffer's pointer.
    """
    from cudf.core.buffer import Buffer

    device_buffer = Buffer.empty(size)
    writer_id = CudaObjectId({'size': size}, [device_buffer.ptr])
    self._id_to_buffers[writer_id] = device_buffer
    file_obj = CudaFileObject(
        writer_id, cuda_buffer=device_buffer, mode='w', size=size)
    return StorageFileObject(file_obj, object_id=writer_id)
def read(self, size=-1):
    """Return a ``Buffer`` over the next part of the stored cuda memory.

    Parameters
    ----------
    size : int, optional
        Amount to read, in the same units as ``self._size``. A negative
        value reads everything from the current offset to the end. Requests
        larger than what is left are truncated.

    Returns
    -------
    cudf.core.Buffer
        A buffer starting at the current offset; its size is 0 once the
        end has been reached.
    """
    from cudf.core import Buffer
    from cupy.cuda import MemoryPointer

    if not self._initialized:
        self._read_init()
        self._initialized = True

    # Never hand out a range past the end of the underlying memory:
    # previously read(-1) after a partial read returned ``self._size``
    # starting at ``self._offset``, and explicit sizes were not bounded.
    remaining = max(self._size - self._offset, 0)
    size = remaining if size < 0 else min(size, remaining)

    cupy_pointer = MemoryPointer(self._cupy_memory, self._offset)
    self._offset += size
    return Buffer(cupy_pointer.ptr, size=size)
def read(self, size: int):
    """Return the next piece of the staged headers and buffers.

    The pickled headers are returned first, then the buffers one by one.
    A call never crosses a buffer boundary, so the returned piece may be
    smaller than ``size``.

    Parameters
    ----------
    size : int
        Upper bound for the piece taken from the current buffer.

    Returns
    -------
    bytes, Buffer or slice of a host buffer
        The pickled headers on the first call; afterwards a ``Buffer`` for
        cuda memory or a slice of the host buffer. ``''`` is returned when
        no buffer is left.
    """
    # we read cuda_header first and then read cuda buffers one by one,
    # the return value's size is not exactly the specified size.
    from cudf.core import Buffer
    from cupy.cuda import MemoryPointer
    from cupy.cuda.memory import UnownedMemory

    if not self._has_read_headers:
        self._has_read_headers = True
        return pickle.dumps(self._headers)
    if len(self._buffers) == 0:
        # NOTE(review): a str, while the other returns are bytes-like;
        # kept unchanged for callers that may compare against ''
        return ''

    cur_buf = self._buffers[0]
    if cur_buf is None:
        # empty cuda buffer
        content = Buffer.empty(0)
        self._offset = 0
        self._buffers.pop(0)
        return content

    is_cuda = isinstance(cur_buf, UnownedMemory)
    # host buffers such as bytes have no ``size`` attribute; fall back to
    # len() like the staging code does
    buf_size = getattr(cur_buf, 'size', None)
    if buf_size is None:
        buf_size = len(cur_buf)

    start = self._offset
    remaining = buf_size - start
    if size >= remaining:
        # current buf read to end
        if is_cuda:
            cupy_pointer = MemoryPointer(cur_buf, start)
            content = Buffer(cupy_pointer.ptr, size=remaining)
        else:
            content = cur_buf[start:start + size]
        self._offset = 0
        self._buffers.pop(0)
        return content

    # Partial read: take the piece at the current offset, then advance.
    # The host branch used to advance first and index with a tuple
    # (``cur_buf[a, b]``), which fails for bytes-like buffers.
    self._offset = start + size
    if is_cuda:
        cupy_pointer = MemoryPointer(cur_buf, start)
        return Buffer(cupy_pointer.ptr, size=size)
    return cur_buf[start:start + size]
async def get(self, object_id: str, **kwargs) -> object:
    """Deserialize the object stored under ``object_id``.

    The ``(headers, buffers)`` pair is taken from ``_id_to_buffers``. cupy
    arrays are replaced by a ``DeviceBuffer`` built from their pointer and
    size, cudf buffers by a ``Buffer`` whose owner is such a
    ``DeviceBuffer``; other buffers are passed through unchanged.
    """
    from cudf.core import Buffer
    from rmm import DeviceBuffer

    headers, stored = _id_to_buffers[object_id]

    def _rewrap(item):
        if isinstance(item, cupy.ndarray):
            return DeviceBuffer(ptr=item.data.ptr, size=item.size)
        if isinstance(item, cudf.core.Buffer):
            owner = DeviceBuffer(ptr=item.ptr, size=item.size)
            return Buffer(item.ptr, item.size, owner)
        return item

    return deserialize(headers, [_rewrap(item) for item in stored])
async def get(self, object_id: CudaObjectId, **kwargs) -> object:
    """Deserialize the object described by ``object_id``.

    Parameters
    ----------
    object_id : CudaObjectId
        Carries the serialization ``headers`` (including ``'data_type'``
        and ``'lengths'``) and the device pointers ``ptrs``.

    Returns
    -------
    object
        The result of ``deserialize(headers, buffers)``.

    Raises
    ------
    TypeError
        If ``headers['data_type']`` is neither ``'cupy'`` nor ``'cudf'``.
    """
    from cudf.core.buffer import Buffer
    from rmm import DeviceBuffer

    # Pop from a copy: removing 'data_type' from the headers held by
    # object_id made a second get() on the same id fail with KeyError.
    headers = dict(object_id.headers)
    ptrs = object_id.ptrs
    data_type = headers.pop('data_type')
    if data_type == 'cupy':
        # only the first pointer/length pair is used for cupy data
        ptr = ptrs[0]
        size = headers['lengths'][0]
        buffers = [DeviceBuffer(ptr=ptr, size=size)]
    elif data_type == 'cudf':
        buffers = [Buffer(ptr, length, DeviceBuffer(ptr=ptr, size=length))
                   for ptr, length in zip(ptrs, headers['lengths'])]
    else:
        raise TypeError(f'Unknown data type {data_type}')
    return deserialize(headers, buffers)
def write(self, content):
    """Consume one piece of a stream of pickled headers and buffers.

    The first call unpickles the headers and allocates one target per
    ``headers['buffer_types']`` entry: a cuda ``Buffer`` for ``'cuda'``
    and a ``BytesIO`` otherwise. Later calls copy ``content`` into the
    current target at ``self._offset`` and move on to the next target once
    the recorded size has been reached.
    """
    from cudf.core import Buffer
    from cupy.cuda import MemoryPointer
    from cupy.cuda.memory import UnownedMemory

    if not self._has_write_headers:
        # NOTE: pickle.loads must only ever see trusted input
        self._headers = headers = pickle.loads(content)
        for kind, length in headers['buffer_types']:
            target = Buffer.empty(length) if kind == 'cuda' else BytesIO()
            self._buffers.append(target)
        self._has_write_headers = True
        return

    index = self._cur_buffer_index
    target = self._buffers[index]
    capacity = self._headers['buffer_types'][index][1]

    if not isinstance(target, Buffer):
        written = len(content)
        target.write(content)
    else:
        target_memory = UnownedMemory(target.ptr, len(target), target)
        destination = MemoryPointer(target_memory, self._offset)
        if isinstance(content, bytes):
            written = len(content)
            source = np.frombuffer(
                content, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
        else:
            source = MemoryPointer(
                UnownedMemory(content.ptr, len(content), content), 0)
            written = source.mem.size
        destination.copy_from(source, written)

    if written + self._offset < capacity:
        # current target still has room
        self._offset += written
        return

    # current target is complete: freeze host data and advance
    if isinstance(target, BytesIO):
        self._buffers[index] = target.getvalue()
    self._cur_buffer_index += 1
    self._offset = 0
async def read_buffers(header: Dict, reader: StreamReader):
    """Read the buffers described by ``header`` from ``reader``.

    ``header['is_cuda_buffers']`` is paired with the sizes popped from
    ``header[BUFFER_SIZES_NAME]``. Non-empty cuda buffers are allocated
    with ``CPBuffer.empty`` and filled in chunks of at most
    ``CUDA_CHUNK_SIZE``; every other buffer is read with ``readexactly``.
    Returns the list of buffers in header order.
    """
    if cupy is not None and cudf is not None:
        from cudf.core import Buffer as CPBuffer
        from cupy.cuda.memory import UnownedMemory as CPUnownedMemory, \
            MemoryPointer as CPMemoryPointer
    else:
        CPBuffer = CPUnownedMemory = CPMemoryPointer = None

    cuda_flags = header.get('is_cuda_buffers')
    sizes = header.pop(BUFFER_SIZES_NAME)

    result = []
    for on_device, nbytes in zip(cuda_flags, sizes):
        if not on_device or nbytes == 0:
            # host buffers and empty cuda buffers are read as they are
            result.append(await reader.readexactly(nbytes))
            continue

        # construct a empty cuda buffer and copy from host
        device_buffer = CPBuffer.empty(nbytes)  # pragma: no cover
        device_memory = CPUnownedMemory(device_buffer.ptr, nbytes,
                                        device_buffer)
        position = 0
        while position < nbytes:
            step = min(CUDA_CHUNK_SIZE, nbytes - position)
            chunk = await reader.readexactly(step)
            host_ptr = np.frombuffer(
                chunk, dtype='uint8').ctypes.data_as(ctypes.c_void_p)
            CPMemoryPointer(device_memory, position).copy_from(
                host_ptr, len(chunk))
            position += step
        result.append(device_buffer)
    return result
def melt(
    frame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
):
    """Unpivots a DataFrame from wide format to long format,
    optionally leaving identifier variables set.

    Parameters
    ----------
    frame : DataFrame
    id_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to use as identifier variables. A single string is
        treated as one column name.
        default: None
    value_vars : scalar, tuple, list, or ndarray, optional
        Column(s) to unpivot. A single string is treated as one column
        name.
        default: all columns that are not set as `id_vars`.
    var_name : scalar
        Name to use for the `variable` column.
        default: 'variable'
    value_name : str
        Name to use for the `value` column.
        default: 'value'

    Returns
    -------
    out : DataFrame
        Melted result

    Raises
    ------
    KeyError
        If `id_vars` or `value_vars` name columns missing from `frame`,
        or if the two overlap.
    NotImplementedError
        If any selected column is categorical.
    ValueError
        If the `value_vars` columns do not share one dtype.

    Difference from pandas:
     * Does not support 'col_level' because cuDF does not have multi-index

    Examples
    --------
    >>> import cudf
    >>> import numpy as np
    >>> df = cudf.DataFrame({'A': {0: 1, 1: 1, 2: 5},
    ...                      'B': {0: 1, 1: 3, 2: 6},
    ...                      'C': {0: 1.0, 1: np.nan, 2: 4.0},
    ...                      'D': {0: 2.0, 1: 5.0, 2: 6.0}})
    >>> cudf.melt(frame=df, id_vars=['A', 'B'], value_vars=['C', 'D'])
         A    B  variable value
    0    1    1         C   1.0
    1    1    3         C
    2    5    6         C   4.0
    3    1    1         D   2.0
    4    1    3         D   5.0
    5    5    6         D   6.0
    """
    assert col_level in (None, )

    # Arg cleaning
    # importing the submodule guarantees ``collections.abc`` is available
    import collections.abc

    def _as_label_list(labels):
        # A str/bytes value is ONE column label even though it is a
        # Sequence; iterating it would split the name into characters.
        if isinstance(labels, (str, bytes)):
            return [labels]
        # ndarray is accepted but is not registered as a Sequence.
        if isinstance(labels, (collections.abc.Sequence, np.ndarray)):
            return list(labels)
        return [labels]

    # id_vars
    if id_vars is not None:
        id_vars = _as_label_list(id_vars)
        missing = set(id_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError("The following 'id_vars' are not present"
                           " in the DataFrame: {missing}"
                           "".format(missing=list(missing)))
    else:
        id_vars = []

    # value_vars
    if value_vars is not None:
        value_vars = _as_label_list(value_vars)
        missing = set(value_vars) - set(frame.columns)
        if not len(missing) == 0:
            raise KeyError("The following 'value_vars' are not present"
                           " in the DataFrame: {missing}"
                           "".format(missing=list(missing)))
    else:
        # then all remaining columns in frame
        value_vars = frame.columns.drop(id_vars)
        value_vars = list(value_vars)

    # Error for unimplemented support for datatype
    dtypes = [frame[col].dtype for col in id_vars + value_vars]
    if any(is_categorical_dtype(t) for t in dtypes):
        raise NotImplementedError("Categorical columns are not yet "
                                  "supported for function")

    # Check dtype homogeneity in value_var
    # Because heterogeneous concat is unimplemented
    dtypes = [frame[col].dtype for col in value_vars]
    if len(dtypes) > 0:
        dtype = dtypes[0]
        if any(t != dtype for t in dtypes):
            raise ValueError("all cols in value_vars must have the same dtype")

    # overlap
    overlap = set(id_vars).intersection(set(value_vars))
    if not len(overlap) == 0:
        raise KeyError("'value_vars' and 'id_vars' cannot have overlap."
                       " The following 'value_vars' are ALSO present"
                       " in 'id_vars': {overlap}"
                       "".format(overlap=list(overlap)))

    N = len(frame)
    K = len(value_vars)

    def _tile(A, reps):
        series_list = [A] * reps
        if reps > 0:
            return Series._concat(objs=series_list, index=None)
        else:
            return Series(Buffer.null(dtype=A.dtype))

    # Step 1: tile id_vars
    mdata = collections.OrderedDict()
    for col in id_vars:
        mdata[col] = _tile(frame[col], K)

    # Step 2: add variable
    # each value_var contributes N copies of its position as category code
    var_cols = []
    for i, var in enumerate(value_vars):
        var_cols.append(
            Series(Buffer(cudautils.full(size=N, value=i, dtype=np.int8))))
    temp = Series._concat(objs=var_cols, index=None)

    if not var_name:
        var_name = "variable"

    mdata[var_name] = Series(
        CategoricalColumn(categories=value_vars,
                          data=temp._column.data,
                          ordered=False))

    # Step 3: add values
    mdata[value_name] = Series._concat(objs=[frame[val] for val in value_vars],
                                       index=None)

    return DataFrame(mdata)
def _tile(A, reps):
    """Return ``A`` concatenated with itself ``reps`` times.

    For ``reps <= 0`` a Series built from ``Buffer.null`` with the dtype of
    ``A`` is returned instead.
    """
    if reps <= 0:
        return Series(Buffer.null(dtype=A.dtype))
    return Series._concat(objs=[A] * reps, index=None)