def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into one new column.

    Parameters
    ----------
    objs : sequence of Column
        Columns to concatenate.  All must be type-equivalent, otherwise
        ``ValueError`` is raised.
    dtype : dtype-like, optional
        Only consulted when *objs* is empty, to decide what kind of
        empty column to return.

    Returns
    -------
    Column
        A StringColumn, CategoricalColumn, or plain Column depending on
        the inputs (or on *dtype* when *objs* is empty).
    """
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn

    # Nothing to concatenate: build an empty column of the requested dtype.
    if len(objs) == 0:
        if pd.api.types.is_categorical_dtype(dtype):
            return CategoricalColumn(data=Column(
                Buffer.null(np.dtype('int8'))), null_count=0, ordered=False)
        elif dtype == np.dtype('object'):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        else:
            dtype = np.dtype(dtype)
            return Column(Buffer.null(dtype))

    # Handle strings separately: nvstrings does the concatenation itself.
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Handle categories for categoricals: union all categories, then recode
    # every input against that shared set.
    # NOTE(review): building the union via set() makes the resulting category
    # order nondeterministic across runs — confirm callers don't rely on it.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        new_cats = tuple(set([val for o in objs for val in o]))
        objs = [o.cat()._set_categories(new_cats) for o in objs]

    # All inputs must be mutually type-equivalent.
    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Allocate the output device buffer up front.
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation (skipped when every input was empty).
    if newsize > 0:
        col = _gdf._column_concat(objs, col)
    return col
def _sortjoin(self, other, how='left', return_indexers=False):
    """Join with another column using the sort-merge method.

    When the column is an index, set *return_indexers* to obtain the
    indices for shuffling the remaining columns.

    Raises
    ------
    TypeError
        If *other* is not type-equivalent to this column.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    # Sort both sides; the argsort permutations are needed later to map
    # join positions back to the callers' original row order.
    lkey, largsort = self.sort_by_values(True)
    rkey, rargsort = other.sort_by_values(True)
    # All work (including the returns below) stays inside this context so
    # lidx/ridx are consumed before apply_join releases them on exit.
    with _gdf.apply_join(
            [lkey], [rkey], how=how, method='sort') as (lidx, ridx):
        if lidx.size > 0:
            raw_index = cudautils.gather_joined_index(
                lkey.to_gpu_array(),
                rkey.to_gpu_array(),
                lidx,
                ridx,
            )
            buf_index = Buffer(raw_index)
        else:
            # Empty join result: null buffer of the key dtype.
            buf_index = Buffer.null(dtype=self.dtype)

        joined_index = lkey.replace(data=buf_index)

        if return_indexers:
            def gather(idxrange, idx):
                # -1 entries mark unmatched rows; mask them out and
                # refill with -1 so they survive the take().
                mask = (Series(idx) != -1).as_mask()
                return idxrange.take(idx).set_mask(mask).fillna(-1)

            if len(joined_index) > 0:
                indexers = (
                    gather(Series(largsort), lidx),
                    gather(Series(rargsort), ridx),
                )
            else:
                indexers = (
                    Series(Buffer.null(dtype=np.intp)),
                    Series(Buffer.null(dtype=np.intp))
                )
            return joined_index, indexers
        else:
            return joined_index
def _hashjoin(self, other, how='left', return_indexers=False):
    """Join with another column using the hash-join method.

    Set *return_indexers* to also get the row positions for shuffling
    the remaining columns of each side.

    Raises
    ------
    TypeError
        If *other* is not type-equivalent to this column.
    """
    from cudf.dataframe.series import Series

    if not self.is_type_equivalent(other):
        raise TypeError('*other* is not compatible')

    # Keep everything (including the returns) inside the context manager
    # so the joined position arrays are consumed before they are released.
    with _gdf.apply_join(
            [self], [other], how=how, method='hash') as (lidx, ridx):
        if lidx.size == 0:
            # Empty join result: null buffer of the key dtype.
            index_buf = Buffer.null(dtype=self.dtype)
        else:
            gathered = cudautils.gather_joined_index(
                self.to_gpu_array(),
                other.to_gpu_array(),
                lidx,
                ridx,
            )
            index_buf = Buffer(gathered)
        joined_index = self.replace(data=index_buf)

        if not return_indexers:
            return joined_index

        def take_with_mask(positions, idx):
            # -1 entries mark unmatched rows; null them out, then refill
            # with -1 so they survive the take().
            valid = (Series(idx) != -1).as_mask()
            return positions.take(idx).set_mask(valid).fillna(-1)

        if len(joined_index) == 0:
            indexers = (
                Series(Buffer.null(dtype=np.intp)),
                Series(Buffer.null(dtype=np.intp)),
            )
        else:
            indexers = (
                take_with_mask(Series(range(0, len(self))), lidx),
                take_with_mask(Series(range(0, len(other))), ridx),
            )
        return joined_index, indexers
def _concat(cls, objs, dtype=None):
    """Concatenate a sequence of columns into one new column.

    Numeric inputs are first promoted to a NumPy common dtype; all-null
    inputs are recast to match the first non-null column; categorical
    inputs have their category sets unioned before concatenation.

    Parameters
    ----------
    objs : list of Column
        Columns to concatenate.  May be mutated in place (elements are
        replaced by cast copies).
    dtype : dtype-like, optional
        Only consulted when *objs* is empty, to decide what kind of
        empty column to return.

    Raises
    ------
    ValueError
        If, after casting, the inputs are not all type-equivalent.
    """
    from cudf.dataframe.series import Series
    from cudf.dataframe.string import StringColumn
    from cudf.dataframe.categorical import CategoricalColumn
    from cudf.dataframe.numerical import NumericalColumn

    # Nothing to concatenate: build an empty column of the requested dtype.
    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if dtype.type in (np.object_, np.str_):
            return StringColumn(data=nvstrings.to_device([]), null_count=0)
        elif is_categorical_dtype(dtype):
            return CategoricalColumn(
                data=Column(Buffer.null(np.dtype("int8"))),
                null_count=0,
                ordered=False,
            )
        else:
            return Column(Buffer.null(dtype))

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (len([
            o for o in not_null_cols
            if not isinstance(o, NumericalColumn)
            or np.issubdtype(o.dtype, np.datetime64)
    ]) == 0):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not objs[i].is_type_equivalent(head):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.dataframe.columnops import column_empty_like
                objs[i] = column_empty_like(head,
                                            dtype=head.dtype,
                                            masked=True,
                                            newsize=len(obj))

    # Handle categories for categoricals: union all category sets
    # (deduplicated), then recode every input against the shared set.
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (Series(Column._concat([o.categories for o in objs
                                       ])).drop_duplicates()._column)
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    # After casting/recoding, all inputs must be mutually type-equivalent.
    head = objs[0]
    for obj in objs:
        if not (obj.is_type_equivalent(head)):
            raise ValueError("All series must be of same type")

    # Handle strings separately: nvstrings does the concatenation itself.
    if all(isinstance(o, StringColumn) for o in objs):
        objs = [o._data for o in objs]
        return StringColumn(data=nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = sum(o.null_count for o in objs)
    newsize = sum(map(len, objs))
    # Allocate the output device buffer up front.
    mem = rmm.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem, size=newsize)

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = head.replace(data=data, mask=mask, null_count=nulls)

    # Perform the actual concatenation (skipped when every input was empty).
    if newsize > 0:
        col = _column_concat(objs, col)
    return col