def _concat(cls, objs):
    """Concatenate a sequence of type-equivalent columns into a new column.

    Raises ``ValueError`` if any input is not type-equivalent to the first.
    NOTE(review): assumes *objs* is non-empty — ``objs[0]`` would raise
    IndexError otherwise; confirm callers guarantee this.
    """
    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")
    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    # True if any surviving input carries nulls (truthy null_count)
    nulls = any(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Allocate uninitialized device memory for the concatenated data
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)
    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))
    # Libcudf calculates the null_count so we don't need to set it
    col = head.replace(data=data, mask=mask)
    # Perform the actual concatenation (skipped when all inputs are empty)
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
def column_empty_like(column, dtype, masked):
    """Allocate a new column like the given *column*

    The new column has the same row count as *column* but uses *dtype*;
    its device memory is left uninitialized.  When *masked* is true a
    validity mask is attached and ``null_count`` is set to the full size.
    """
    dev_mem = rmm.device_array(shape=len(column), dtype=dtype)
    kwargs = {"data": Buffer(dev_mem)}
    if masked:
        raw_mask = utils.make_mask(dev_mem.size)
        kwargs["mask"] = Buffer(raw_mask)
        kwargs["null_count"] = dev_mem.size
    return Column(**kwargs)
def compact_mask_bytes(boolbytes):
    """Convert booleans (in bytes) to a bitmask

    Packs one-byte boolean flags into a bit-per-element validity mask.
    """
    packed = make_mask(boolbytes.size)
    if packed.size == 0:
        # Nothing to pack; return the (empty) mask as-is.
        return packed
    # Zero-initialize the mask before setting bits.
    gpu_fill_value.forall(packed.size)(packed, 0)
    # Pack each boolean byte into its bit position.
    gpu_compact_mask_bytes.forall(packed.size)(boolbytes, packed)
    return packed
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of type-equivalent columns.

    *dtype* is only consulted when *objs* is empty, to choose the dtype of
    the empty result column.  Raises ``ValueError`` when inputs are not
    type-equivalent to one another.
    """
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    # Empty input: return an empty column of the requested dtype.
    if len(objs) == 0:
        if pd.api.types.is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype('int8'))),
                null_count=0,
                ordered=False)
        elif dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        else:
            dtype = np.dtype(dtype)
            return Column(Buffer.null(dtype))

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Handle categories for categoricals: union all category sets, then
    # recode each input against the combined categories.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple(set([val for o in objs for val in o]))
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    # Total null count across surviving inputs
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Allocate uninitialized device memory for the concatenated data
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation (skipped when all inputs are empty)
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
def make_empty_mask(size):
    """Allocate a validity mask for *size* elements, zero-filled."""
    mask = make_mask(size)
    if mask.size:
        # Clear all bits so every element starts out null/invalid.
        gpu_fill_value.forall(mask.size)(mask, 0)
    return mask
def mask_from_devary(ary):
    """Build a validity bitmask from the device array *ary*."""
    mask = make_mask(len(ary))
    if mask.size:
        # Zero-initialize, then set bits derived from the array contents.
        gpu_fill_value.forall(mask.size)(mask, 0)
        gpu_mask_from_devary.forall(mask.size)(ary, mask)
    return mask
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into a new column.

    *dtype* is only consulted when *objs* is empty.  Numerical inputs with
    differing dtypes are first cast to a NumPy common dtype; all-null
    inputs are recast to match the first non-null column.  Raises
    ``ValueError`` on mixed dtypes and ``MemoryError`` when the result
    would exceed libcudf size limits.
    """
    from cudf.core.series import Series
    from cudf.core.column import (
        StringColumn,
        CategoricalColumn,
        NumericalColumn,
    )

    # Empty input: return an empty (masked) column of the requested dtype.
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.core.column import column_empty_like
                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals: concatenate and deduplicate the
    # category sets, then recode each input against the combined categories.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (
            Series(ColumnBase._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.dtype == head.dtype):
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))
    # Guard against exceeding libcudf's maximum column length.
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        # Guard against exceeding libcudf's maximum string-column bytes.
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                )
            )
        objs = [o.nvstrings for o in objs]
        return as_column(nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    # True if any surviving input can hold nulls
    nulls = any(col.nullable for col in objs)

    if is_categorical_dtype(head):
        # Categorical output: data lives in the codes child column.
        data_dtype = head.codes.dtype
        data = None
        children = (column_empty(newsize, dtype=head.codes.dtype),)
    else:
        data_dtype = head.dtype
        data = Buffer.empty(size=newsize * data_dtype.itemsize)
        children = ()

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation (skipped when all inputs are empty)
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)
    return col
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of type-equivalent columns.

    *dtype* is only consulted when *objs* is empty.  Numerical inputs with
    differing dtypes are first cast to a NumPy common dtype; all-null
    inputs are recast to match the first non-null column.  Raises
    ``ValueError`` when inputs are not type-equivalent.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    # Empty input: return an empty column of the requested dtype.
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (len([
        o for o in not_null_cols
        if not isinstance(o, NumericalColumn)
        or np.issubdtype(o.dtype, np.datetime64)
    ]) == 0):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not objs[i].is_type_equivalent(head):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.dataframe.columnops import column_empty_like
                objs[i] = column_empty_like(head,
                                            dtype=head.dtype,
                                            masked=True,
                                            newsize=len(obj))

    # Handle categories for categoricals: concatenate and deduplicate the
    # category sets, then recode each input against the combined categories.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (Series(Column._concat([o.categories for o in objs
                                       ])).drop_duplicates()._column)
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.is_type_equivalent(head)):
            raise ValueError("All series must be of same type")

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    # Total null count across surviving inputs
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Allocate uninitialized device memory for the concatenated data
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation (skipped when all inputs are empty)
    if newsize > 0:
        col = _column_concat(objs, col)
    return col