def _compute_validity_mask(self, index, row_tuple, max_length):
    """Computes the valid set of indices of values in the lookup"""
    from cudf import DataFrame
    from cudf import Series
    from cudf import concat
    from cudf.utils.cudautils import arange

    lookup = DataFrame()
    for idx, row in enumerate(row_tuple):
        if row == slice(None):
            continue
        lookup[index._source_data.columns[idx]] = Series(row)
    data_table = concat(
        [
            index._source_data,
            DataFrame({"idx": Series(arange(len(index._source_data)))}),
        ],
        axis=1,
    )
    result = lookup.merge(data_table)["idx"]
    # Avoid computing levels unless the result of the merge is empty,
    # which suggests that a KeyError should be raised.
    if len(result) == 0:
        for idx, row in enumerate(row_tuple):
            if row == slice(None):
                continue
            if row not in index.levels[idx]._column:
                raise KeyError(row)
    return result

def _loc_to_iloc(self, arg):
    from cudf.core.series import Series
    from cudf.core.index import Index

    if isinstance(
        arg, (list, np.ndarray, pd.Series, range, Index, DeviceNDArray)
    ):
        if len(arg) == 0:
            arg = Series(np.array([], dtype="int32"))
        else:
            arg = Series(arg)
    if isinstance(arg, Series):
        # Use (bool, np.bool_) rather than the removed np.bool alias
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            return indices_from_labels(self._sr, arg)
    elif is_scalar(arg):
        found_index = self._sr.index.find_label_range(arg, None)[0]
        return found_index
    elif isinstance(arg, slice):
        start_index, stop_index = self._sr.index.find_label_range(
            arg.start, arg.stop
        )
        return slice(start_index, stop_index, arg.step)
    else:
        raise NotImplementedError(
            ".loc not implemented for label type {}".format(
                type(arg).__name__
            )
        )

def factorize(values, sort=False, na_sentinel=-1, size_hint=None):
    """Encode the input values as integer labels

    Parameters
    ----------
    values : Series, Index, or CuPy array
        The data to be factorized.
    na_sentinel : number, default -1
        Value to indicate missing category.

    Returns
    -------
    (labels, cats) : (Series, Series)
        - *labels* contains the encoded values
        - *cats* contains the categories in order such that the N-th
          item corresponds to the (N-1) code.

    Examples
    --------
    >>> import cudf
    >>> data = cudf.Series(['a', 'c', 'c'])
    >>> codes, uniques = cudf.factorize(data)
    >>> codes
    0    0
    1    1
    2    1
    dtype: int8
    >>> uniques
    0    a
    1    c
    dtype: object

    See Also
    --------
    cudf.core.series.Series.factorize : Encode the input values of Series.
    """
    if sort:
        raise NotImplementedError(
            "Sorting not yet supported during factorization."
        )
    if na_sentinel is None:
        raise NotImplementedError("na_sentinel can not be None.")

    if size_hint:
        warn("size_hint is not applicable for cudf.factorize")

    return_cupy_array = isinstance(values, cp.core.core.ndarray)

    values = Series(values)

    cats = values._column.dropna().unique().astype(values.dtype)

    name = values.name  # label_encoding mutates self.name
    labels = values.label_encoding(cats=cats, na_sentinel=na_sentinel).values
    values.name = name

    return labels, cats.values if return_cupy_array else Index(cats)

def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function
    deep copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor
    is 1D or 2D.
    """
    res = libdlpack.from_dlpack(pycapsule_obj)

    if res._num_columns == 1:
        return Series(res._data[0])
    else:
        return DataFrame(data=res._data)

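# Hedged usage sketch (not part of the library source): assumes CuPy is
# available and that this function is exposed as `cudf.from_dlpack`, as in
# released cuDF versions. A 1D tensor comes back as a Series.
import cupy as cp
import cudf

gpu_array = cp.asarray([1.0, 2.0, 3.0])
capsule = gpu_array.toDlpack()        # PyCapsule wrapping the DLPack tensor
series = cudf.from_dlpack(capsule)    # deep-copies into a cudf.Series
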
def to_series(self, index=None, name=None):
    """
    Create a Series with both index and values equal to the index keys.
    Useful with map for returning an indexer based on an index.

    Parameters
    ----------
    index : Index, optional
        Index of resulting Series. If None, defaults to original index.
    name : str, optional
        Name of resulting Series. If None, defaults to name of original
        index.

    Returns
    -------
    Series
        The dtype will be based on the type of the Index values.
    """
    from cudf.core.series import Series

    return Series(
        self._values,
        index=self.copy(deep=False) if index is None else index,
        name=self.name if name is None else name,
    )

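# Hedged usage sketch (not part of the library source): assumes the generic
# `cudf.Index` constructor accepts a list, as in current cuDF; the resulting
# Series has the index keys as both its values and its index.
import cudf

idx = cudf.Index([10, 20, 30], name="ids")
ser = idx.to_series()   # values [10, 20, 30] indexed by [10, 20, 30]
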
def run(self, df, **launch_params):
    # Get input columns
    if isinstance(self.incols, dict):
        inputs = {
            v: df[k]._column.data_array_view
            for (k, v) in self.incols.items()
        }
    else:
        inputs = {k: df[k]._column.data_array_view for k in self.incols}
    # Allocate output columns
    outputs = {}
    for k, dt in self.outcols.items():
        outputs[k] = column.column_empty(len(df), dt, False).data_array_view
    # Bind arguments
    args = {}
    for dct in [inputs, outputs, self.kwargs]:
        args.update(dct)
    bound = self.sig.bind(**args)
    # Launch kernel
    self.launch_kernel(df, bound.args, **launch_params)
    # Prepare pessimistic nullmask
    if self.pessimistic_nulls:
        out_mask = make_aggregate_nullmask(df, columns=self.incols)
    else:
        out_mask = None
    # Prepare output frame
    outdf = df.copy()
    for k in sorted(self.outcols):
        outdf[k] = Series(outputs[k], index=outdf.index, nan_as_null=False)
        if out_mask is not None:
            outdf[k] = outdf[k].set_mask(out_mask.data_array_view)
    return outdf

def _get_column_major(self, df, row_tuple):
    from cudf import Series
    from cudf import DataFrame

    valid_indices = self._get_valid_indices_by_tuple(
        df.columns, row_tuple, len(df._cols)
    )
    result = df._take_columns(valid_indices)

    if isinstance(row_tuple, (numbers.Number, slice)):
        row_tuple = [row_tuple]
    if len(result) == 0 and len(result.columns) == 0:
        result_columns = df.columns.copy(deep=False)
        clear_codes = DataFrame()
        for name in df.columns.names:
            clear_codes[name] = Series([])
        result_columns._codes = clear_codes
        result_columns._source_data = clear_codes
        result.columns = result_columns
    elif len(row_tuple) < len(self.levels) and (
        slice(None) not in row_tuple
        and not isinstance(row_tuple[0], slice)
    ):
        columns = self._popn(len(row_tuple))
        result.columns = columns.take(valid_indices)
    else:
        result.columns = self.take(valid_indices)
    if len(result.columns.levels) == 1:
        columns = []
        for code in result.columns.codes[result.columns.codes.columns[0]]:
            columns.append(result.columns.levels[0][code])
        name = result.columns.names[0]
        result.columns = as_index(columns, name=name)
    if len(row_tuple) == len(self.levels) and len(result.columns) == 1:
        result = list(result._cols.values())[0]
    return result

def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
    from cudf.utils.cudautils import arange
    from cudf import Series

    # Instructions for Slicing
    # if tuple, get first and last elements of tuple
    # if open beginning tuple, get 0 to highest valid_index
    # if open ending tuple, get highest valid_index to len()
    # if not open end or beginning, get range lowest beginning index
    # to highest ending index
    if isinstance(row_tuple, slice):
        if (
            isinstance(row_tuple.start, numbers.Number)
            or isinstance(row_tuple.stop, numbers.Number)
            or row_tuple == slice(None)
        ):
            stop = row_tuple.stop or max_length
            start, stop, step = row_tuple.indices(stop)
            return arange(start, stop, step)
        start_values = self._compute_validity_mask(
            index, row_tuple.start, max_length
        )
        stop_values = self._compute_validity_mask(
            index, row_tuple.stop, max_length
        )
        return Series(arange(start_values.min(), stop_values.max() + 1))
    elif isinstance(row_tuple, numbers.Number):
        return row_tuple
    return self._compute_validity_mask(index, row_tuple, max_length)

def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function
    deep copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor
    is 1D or 2D.

    Notes
    -----
    cuDF from_dlpack() assumes column-major (Fortran order) input. If the
    input tensor is row-major, transpose it before passing it to this
    function.
    """
    data, _ = libdlpack.from_dlpack(pycapsule_obj)

    if len(data) == 1:
        return Series._from_data(data)
    else:
        return DataFrame._from_data(data)

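# Hedged sketch of the column-major note above (not library source): a
# row-major 2D CuPy array is converted to Fortran order before export so
# that each cuDF column maps onto one column of the tensor. Assumes the
# public `cudf.from_dlpack` entry point.
import cupy as cp
import cudf

mat = cp.asarray([[1, 2], [3, 4], [5, 6]])   # C-order (row-major)
mat_f = cp.asfortranarray(mat)               # column-major copy
df = cudf.from_dlpack(mat_f.toDlpack())      # 2D tensor -> cudf.DataFrame
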
def _loc_to_iloc(self, arg):
    from cudf.core.column import column
    from cudf.core.series import Series

    if is_scalar(arg):
        try:
            found_index = self._sr.index._values.find_first_value(
                arg, closest=False
            )
            return found_index
        except (TypeError, KeyError, IndexError, ValueError):
            raise KeyError("label scalar is out of bound")
    elif isinstance(arg, slice):
        return get_label_range_or_mask(
            self._sr.index, arg.start, arg.stop, arg.step
        )
    elif isinstance(arg, (cudf.MultiIndex, pd.MultiIndex)):
        if isinstance(arg, pd.MultiIndex):
            arg = cudf.MultiIndex.from_pandas(arg)
        return indices_from_labels(self._sr, arg)
    else:
        arg = Series(column.as_column(arg))
        if arg.dtype in (bool, np.bool_):
            return arg
        else:
            indices = indices_from_labels(self._sr, arg)
            if indices.null_count > 0:
                raise KeyError("label scalar is out of bound")
            return indices

def _group_dataframe(self, df, levels):
    """Group dataframe.

    The output dataframe has the same number of rows as the input
    dataframe. The rows are shuffled so that the groups are moved
    together in ascending order based on the multi-level index.

    Parameters
    ----------
    df : DataFrame
    levels : list[str]
        Column names for the multi-level index.

    Returns
    -------
    (df, segs) : namedtuple
        * df : DataFrame
            The grouped dataframe.
        * segs : Series
            Group starting index.
    """
    sorted_cols, offsets = libcudf.groupby.groupby_without_aggregations(
        df._columns, df[levels]._columns
    )
    outdf = cudf.DataFrame._from_columns(sorted_cols)
    segs = Series(offsets)
    outdf.columns = df.columns
    return _dfsegs_pack(df=outdf, segs=segs)

def _get_row_major(self, df, row_tuple):
    from cudf import Series

    valid_indices = self._get_valid_indices_by_tuple(
        df.index, row_tuple, len(df.index)
    )
    indices = Series(valid_indices)
    result = df.take(indices)
    final = self._index_and_downcast(result, result.index, row_tuple)
    return final

def _apply_op(self, fn, other=None):
    from cudf.core.series import Series

    idx_series = Series(self, name=self.name)
    op = getattr(idx_series, fn)
    if other is not None:
        return as_index(op(other))
    else:
        return as_index(op())

def wrapper(*args, **kwargs):
    ret = passed_attr(*args, **kwargs)
    if isinstance(ret, nvstrings.nvstrings):
        ret = Series(
            column.as_column(ret),
            index=self._index,
            name=self._name,
        )
    return ret

def searchsorted(self, value, side="left"):
    """Find indices where elements should be inserted to maintain order

    Parameters
    ----------
    value : Column
        Column of values to search for
    side : str {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is
        given. If 'right', return the last such index.

    Returns
    -------
    An index series of insertion points with the same shape as value
    """
    from cudf.core.series import Series

    idx_series = Series(self, name=self.name)
    result = idx_series.searchsorted(value, side)
    return as_index(result)

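# Hedged usage sketch (not part of the library source): assumes a sorted
# cudf.Index and an array-like `value`, per the docstring above; the
# insertion points come back wrapped in an index.
import cudf

idx = cudf.Index([10, 20, 30, 40])
where = idx.searchsorted([25, 35], side="left")   # positions 2 and 3
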
def _get_row_major(self, df, row_tuple):
    from cudf import Series

    if pd.api.types.is_bool_dtype(row_tuple):
        return df[row_tuple]
    valid_indices = self._get_valid_indices_by_tuple(
        df.index, row_tuple, len(df.index)
    )
    indices = Series(valid_indices)
    result = df.take(indices)
    final = self._index_and_downcast(result, result.index, row_tuple)
    return final

def _getitem_tuple_arg(self, arg):
    from cudf.core.dataframe import Series, DataFrame
    from cudf.core.column import column
    from cudf.core.index import as_index
    from cudf.utils.cudautils import arange
    from cudf import MultiIndex

    # Step 1: Gather columns
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        if isinstance(columns_df, Series):
            return columns_df
    else:
        columns = self._get_column_selection(arg[1])
        columns_df = DataFrame(index=self._df.index)
        for i, col in enumerate(columns):
            columns_df.insert(i, col, self._df[col])

    # Step 2: Gather rows
    if isinstance(columns_df.index, MultiIndex):
        return columns_df.index._get_row_major(columns_df, arg[0])
    else:
        if isinstance(self._df.columns, MultiIndex):
            if isinstance(arg[0], slice):
                start, stop, step = arg[0].indices(len(columns_df))
                indices = arange(start, stop, step)
                df = columns_df.take(indices)
            else:
                df = columns_df.take(arg[0])
        else:
            df = DataFrame()
            for col in columns_df.columns:
                # need Series() in case a scalar is returned
                df[col] = Series(columns_df[col].loc[arg[0]])
            df.columns = columns_df.columns

    # Step 3: Gather index
    if df.shape[0] == 1:  # we have a single row
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = self._df.index[0]
            df.index = as_index(start)
        else:
            row_selection = column.as_column(arg[0])
            if pd.api.types.is_bool_dtype(row_selection.dtype):
                df.index = self._df.index.take(row_selection)
            else:
                df.index = as_index(row_selection)

    # Step 4: Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)
    return df

def _index_and_downcast(self, result, index, index_key):
    from cudf import DataFrame
    from cudf import Series

    if isinstance(index_key, (numbers.Number, slice)):
        index_key = [index_key]
    if (
        len(index_key) > 0 and not isinstance(index_key, tuple)
    ) or isinstance(index_key[0], slice):
        index_key = index_key[0]

    slice_access = False
    if isinstance(index_key, slice):
        slice_access = True
    out_index = DataFrame()
    # Select the last n-k columns where n is the number of _source_data
    # columns and k is the length of the indexing tuple
    size = 0
    if not isinstance(index_key, (numbers.Number, slice)):
        size = len(index_key)
    for k in range(size, len(index._source_data.columns)):
        if index.names is None:
            name = k
        else:
            name = index.names[k]
        out_index.add_column(
            name, index._source_data[index._source_data.columns[k]]
        )

    if len(result) == 1 and size == 0 and slice_access is False:
        # If the final result is one row and it was not mapped into
        # directly, return a Series with a tuple as name.
        result = result.T
        result = result[result.columns[0]]
    elif len(result) == 0 and slice_access is False:
        # Pandas returns an empty Series with a tuple as name for
        # the one expected result column
        series_name = []
        for idx, code in enumerate(index._source_data.columns):
            series_name.append(index._source_data[code][0])
        result = Series([])
        result.name = tuple(series_name)
    elif len(out_index.columns) == 1:
        # If there's only one column remaining in the output index,
        # convert it into an Index and name the final index values
        # according to the _source_data column names
        last_column = index._source_data.columns[-1]
        out_index = index._source_data[last_column]
        out_index = as_index(out_index)
        out_index.name = index.names[len(index.names) - 1]
        index = out_index
    elif len(out_index.columns) > 1:
        # Otherwise pop the leftmost levels, names, and codes from the
        # source index until it has the correct number of columns (n-k)
        result.reset_index(drop=True)
        index = index._popn(size)
    if isinstance(index_key, tuple):
        result = result.set_index(index)
    return result

def lower(self):
    """
    Convert strings in the Series/Index to lowercase.

    Returns
    -------
    Series/Index of str dtype
        A copy of the object with all strings converted to lowercase.
    """
    from cudf.core import Series

    return Series(
        self._parent.nvstrings.lower(), index=self._index, name=self._name
    )

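# Hedged usage sketch (not part of the library source): `.str.lower()` on a
# string Series, mirroring the pandas string accessor; nulls pass through.
import cudf

s = cudf.Series(["AbC", "GPU", None])
lowered = s.str.lower()   # ['abc', 'gpu', None]
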
def _getitem_tuple_arg(self, arg):
    from cudf import MultiIndex
    from cudf.core.dataframe import DataFrame, Series
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns_df = self._get_column_selection(arg[1])
    columns_df._index = self._df._index

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        if isinstance(arg[0], slice):
            df = columns_df[arg[0]]
        else:
            df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        df = DataFrame()
        for i, col in enumerate(columns_df._columns):
            # need Series() in case a scalar is returned
            df[i] = Series(col[arg[0]])
        df.index = as_index(columns_df.index[arg[0]])
        df.columns = columns_df.columns

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        return self._downcast_to_series(df, arg)

    if df.shape[0] == 0 and df.shape[1] == 0 and isinstance(arg[0], slice):
        from cudf.core.index import RangeIndex

        slice_len = len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df

def from_dlpack(pycapsule_obj):
    """Converts from a DLPack tensor to a cuDF object.

    DLPack is an open-source memory tensor structure:
    `dmlc/dlpack <https://github.com/dmlc/dlpack>`_.

    This function takes a PyCapsule object which contains a pointer to
    a DLPack tensor as input, and returns a cuDF object. This function
    deep copies the data in the DLPack tensor into a cuDF object.

    Parameters
    ----------
    pycapsule_obj : PyCapsule
        Input DLPack tensor pointer which is encapsulated in a PyCapsule
        object.

    Returns
    -------
    A cuDF DataFrame or Series depending on if the input DLPack tensor
    is 1D or 2D.
    """
    try:
        res, valids = cpp_dlpack.from_dlpack(pycapsule_obj)
    except GDFError as err:
        if str(err) == "b'GDF_DATASET_EMPTY'":
            raise ValueError(
                "Cannot create a cuDF Object from a DLPack tensor of 0 size"
            )
        else:
            raise err
    cols = []
    for idx in range(len(valids)):
        mask = None
        if valids[idx]:
            mask = Buffer(valids[idx])
        cols.append(
            column.build_column(
                Buffer(res[idx]), dtype=res[idx].dtype, mask=mask
            )
        )
    if len(cols) == 1:
        return Series(cols[0])
    else:
        df = DataFrame()
        for idx, col in enumerate(cols):
            df[idx] = col
        return df

def _to_frame(self):
    from cudf import DataFrame, Series

    # for each column of codes
    # replace column with mapping from integers to levels
    df = self.codes.copy(deep=False)
    for idx, col in enumerate(df.columns):
        # use merge as a replace fn
        level = DataFrame(
            {
                "idx": Series(
                    cupy.arange(len(self.levels[idx]), dtype=df[col].dtype)
                ),
                "level": self.levels[idx],
            }
        )
        code = DataFrame({"idx": df[col]})
        df[col] = code.merge(level).level
    return df

def extract(self, pat, flags=0, expand=True):
    """
    Extract capture groups in the regex `pat` as columns in a DataFrame.

    For each subject string in the Series, extract groups from the first
    match of regular expression `pat`.

    Parameters
    ----------
    pat : str
        Regular expression pattern with capturing groups.
    expand : bool, default True
        If True, return DataFrame with one column per capture group.
        If False, return a Series/Index if there is one capture group or
        DataFrame if there are multiple capture groups.

    Returns
    -------
    DataFrame or Series/Index
        A DataFrame with one row for each subject string, and one column
        for each group. If `expand=False` and `pat` has only one capture
        group, then return a Series/Index.

    Notes
    -----
    The `flags` parameter is not yet supported and will raise a
    NotImplementedError if anything other than the default value is
    passed.
    """
    if flags != 0:
        raise NotImplementedError("`flags` parameter is not yet supported")

    from cudf.core import DataFrame, Series

    out = self._parent.nvstrings.extract(pat)
    if len(out) == 1 and expand is False:
        return Series(out[0], index=self._index, name=self._name)
    else:
        out_df = DataFrame(index=self._index)
        for idx, val in enumerate(out):
            out_df[idx] = val
        return out_df

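# Hedged usage sketch (not part of the library source): extracting two
# capture groups into a two-column DataFrame with the default expand=True;
# rows that do not match come back as nulls.
import cudf

s = cudf.Series(["a1", "b2", "c3"])
groups = s.str.extract(r"([ab])(\d)")   # columns 0 and 1 of the matches
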
def len(self):
    """
    Computes the length of each element in the Series/Index.

    Returns
    -------
    Series or Index of int
        A Series or Index of integer values indicating the length of each
        element in the Series or Index.
    """
    from cudf.core.series import Series

    out_dev_arr = rmm.device_array(len(self._parent), dtype="int32")
    ptr = libcudf.cudf.get_ctype_ptr(out_dev_arr)
    self._parent.nvstrings.len(ptr)

    mask = None
    if self._parent.has_nulls:
        mask = self._parent.mask

    col = column.build_column(
        Buffer(out_dev_arr), np.dtype("int32"), mask=mask
    )
    return Series(col, index=self._index, name=self._name)

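# Hedged usage sketch (not part of the library source): element-wise string
# lengths; null entries stay null because the parent's null mask is reused.
import cudf

s = cudf.Series(["dog", "", None, "cudf"])
lengths = s.str.len()   # 3, 0, <NA>, 4
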
def cat(self, others=None, sep=None, na_rep=None):
    """
    Concatenate strings in the Series/Index with given separator.

    If *others* is specified, this function concatenates the Series/Index
    and elements of others element-wise. If others is not passed, then all
    values in the Series/Index are concatenated into a single string with
    a given sep.

    Parameters
    ----------
    others : Series or List of str
        Strings to be appended.
        The number of strings must match size() of this instance.
        This must be either a Series of string dtype or a Python
        list of strings.

    sep : str
        If specified, this separator will be appended to each string
        before appending the others.

    na_rep : str
        This character will take the place of any null strings
        (not empty strings) in either list.

        - If `na_rep` is None, and `others` is None, missing values in
          the Series/Index are omitted from the result.
        - If `na_rep` is None, and `others` is not None, a row containing
          a missing value in any of the columns (before concatenation)
          will have a missing value in the result.

    Returns
    -------
    concat : str or Series/Index of str dtype
        If `others` is None, `str` is returned, otherwise a `Series/Index`
        (same type as caller) of str dtype is returned.
    """
    from cudf.core import Series, Index

    if isinstance(others, Series):
        assert others.dtype == np.dtype("object")
        others = others._column.nvstrings
    elif isinstance(others, Index):
        assert others.dtype == np.dtype("object")
        others = others.as_column().nvstrings
    elif isinstance(others, StringMethods):
        # If others is a StringMethods then raise an exception
        msg = "series.str is an accessor, not an array-like of strings."
        raise ValueError(msg)
    elif is_list_like(others) and others:
        # If others is a list-like object (in our case lists & tuples)
        # just another Series/Index, great go ahead with concatenation.

        # Picking first element and checking if it really adheres to
        # list like conditions, if not we switch to next case.
        #
        # Note: We have made a call not to iterate over the entire list
        # as it could be more expensive if it was of very large size.
        # Thus only doing a sanity check on just the first element of
        # the list.
        first = others[0]

        if is_list_like(first) or isinstance(
            first, (Series, Index, pd.Series, pd.Index)
        ):
            # Internal elements in others list should also be
            # list-like and not a regular string/byte
            first = None
            for frame in others:
                if not isinstance(frame, Series):
                    # Make sure all inputs to .cat function call
                    # are of type nvstrings so creating a Series object.
                    frame = Series(frame, dtype="str")

                if first is None:
                    # extracting nvstrings pointer since `frame` is of
                    # type Series/Index and first isn't yet initialized.
                    first = frame._column.nvstrings
                else:
                    assert frame.dtype == np.dtype("object")
                    frame = frame._column.nvstrings
                    first = first.cat(frame, sep=sep, na_rep=na_rep)

            others = first
        elif not is_list_like(first):
            # Picking first element and checking if it really adheres to
            # non-list like conditions.
            #
            # Note: We have made a call not to iterate over the entire
            # list as it could be more expensive if it was of very large
            # size. Thus only doing a sanity check on just the first
            # element of the list.
            others = Series(others)
            others = others._column.nvstrings
    elif isinstance(others, (pd.Series, pd.Index)):
        others = Series(others)
        others = others._column.nvstrings

    data = self._parent.nvstrings.cat(
        others=others, sep=sep, na_rep=na_rep
    )
    out = Series(data, index=self._index, name=self._name)
    if len(out) == 1 and others is None:
        out = out[0]
    return out

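# Hedged usage sketch (not part of the library source): concatenating a
# string Series with another list of strings element-wise, and collapsing
# to a single string when `others` is omitted.
import cudf

s = cudf.Series(["a", "b", "c"])
joined = s.str.cat(["1", "2", "3"], sep="-")   # ['a-1', 'b-2', 'c-3']
single = s.str.cat(sep=",")                    # 'a,b,c'
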
def _getitem_tuple_arg(self, arg):
    from cudf import MultiIndex
    from cudf.core.dataframe import DataFrame, Series
    from cudf.core.column import column_empty
    from cudf.core.index import as_index

    # Iloc Step 1:
    # Gather the columns specified by the second tuple arg
    columns = self._get_column_selection(arg[1])
    if isinstance(self._df.columns, MultiIndex):
        columns_df = self._df.columns._get_column_major(self._df, arg[1])
        if (
            len(columns_df) == 0
            and len(columns_df.columns) == 0
            and not isinstance(arg[0], slice)
        ):
            result = Series(column_empty(0, dtype="float64"), name=arg[0])
            result._index = columns_df.columns.copy(deep=False)
            return result
    else:
        if isinstance(arg[0], slice):
            columns_df = DataFrame()
            for i, col in enumerate(columns):
                columns_df.insert(i, col, self._df[col])
            columns_df._index = self._df._index
        else:
            columns_df = self._df._columns_view(columns)

    # Iloc Step 2:
    # Gather the rows specified by the first tuple arg
    if isinstance(columns_df.index, MultiIndex):
        df = columns_df.index._get_row_major(columns_df, arg[0])
        if (len(df) == 1 and len(columns_df) >= 1) and not (
            isinstance(arg[0], slice) or isinstance(arg[1], slice)
        ):
            # Pandas returns a numpy scalar in this case
            return df[0]
        if self._can_downcast_to_series(df, arg):
            return self._downcast_to_series(df, arg)
        return df
    else:
        df = DataFrame()
        for i, col in enumerate(columns_df._columns):
            # need Series() in case a scalar is returned
            df[i] = Series(col[arg[0]])
        df.index = as_index(columns_df.index[arg[0]])
        df.columns = columns_df.columns

    # Iloc Step 3:
    # Reindex
    if df.shape[0] == 1:  # we have a single row without an index
        if isinstance(arg[0], slice):
            start = arg[0].start
            if start is None:
                start = 0
            df.index = as_index(self._df.index[start])
        else:
            df.index = as_index(self._df.index[arg[0]])

    # Iloc Step 4:
    # Downcast
    if self._can_downcast_to_series(df, arg):
        if isinstance(df.columns, MultiIndex):
            if len(df) > 0 and not (
                isinstance(arg[0], slice) or isinstance(arg[1], slice)
            ):
                return list(df._data.values())[0][0]
            elif df.shape[1] > 1:
                result = self._downcast_to_series(df, arg)
                result.index = df.columns
                return result
            elif not isinstance(arg[0], slice):
                if len(df._data) == 0:
                    return Series(
                        column_empty(0, dtype="float64"),
                        index=df.columns,
                        name=arg[0],
                    )
                else:
                    result_series = df[df.columns[0]]
                    result_series.index = df.columns
                    result_series.name = arg[0]
                    return result_series
            else:
                return df[df.columns[0]]
        return self._downcast_to_series(df, arg)

    if df.shape[0] == 0 and df.shape[1] == 0:
        from cudf.core.index import RangeIndex

        slice_len = arg[0].stop or len(self._df)
        start, stop, step = arg[0].indices(slice_len)
        df._index = RangeIndex(start, stop)
    return df

def to_series(self):
    from cudf.core.series import Series

    return Series(self._values)

def __init__(
    self, levels=None, codes=None, labels=None, names=None, **kwargs
):
    from cudf.core.series import Series
    from cudf import DataFrame

    super().__init__()

    self._name = None

    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        source_data = kwargs["source_data"].copy(deep=False)
        source_data.reset_index(drop=True, inplace=True)

        if isinstance(source_data, pd.DataFrame):
            nan_as_null = kwargs.get("nan_as_null", None)
            source_data = DataFrame.from_pandas(
                source_data, nan_as_null=nan_as_null
            )
        names = names if names is not None else source_data._data.names
        # if names are unique
        # try using those as the source_data column names:
        if len(dict.fromkeys(names)) == len(names):
            source_data.columns = names
        self._data = source_data._data
        self.names = names
        self._codes = codes
        self._levels = levels
        return

    # name setup
    if isinstance(
        names,
        (
            Sequence,
            pd.core.indexes.frozen.FrozenNDArray,
            pd.core.indexes.frozen.FrozenList,
        ),
    ):
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")

    if not isinstance(codes, DataFrame) and not isinstance(
        codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")

    if isinstance(codes, DataFrame):
        self._codes = codes
    elif len(levels) == len(codes):
        self._codes = DataFrame()
        for i, codes in enumerate(codes):
            name = column_names[i] or i
            codes = column.as_column(codes)
            self._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )

    self._levels = [Series(level) for level in levels]
    self._validate_levels_and_codes(self._levels, self._codes)

    source_data = DataFrame()
    for i, name in enumerate(self._codes.columns):
        codes = as_index(self._codes[name]._column)
        if -1 in self._codes[name].values:
            # Must account for null(s) in _source_data column
            level = DataFrame(
                {name: [None] + list(self._levels[i])},
                index=range(-1, len(self._levels[i])),
            )
        else:
            level = DataFrame({name: self._levels[i]})

        import cudf._lib as libcudf

        source_data[name] = libcudf.copying.gather(
            level, codes._data.columns[0]
        )._data[name]

    self._data = source_data._data
    self.names = names

def __init__(
    self, levels=None, codes=None, labels=None, names=None, **kwargs
):
    from cudf.core.series import Series

    self.name = None
    self.names = names
    self._source_data = None
    column_names = []
    if labels:
        warnings.warn(
            "the 'labels' keyword is deprecated, use 'codes' instead",
            FutureWarning,
        )
    if labels and not codes:
        codes = labels

    # early termination enables lazy evaluation of codes
    if "source_data" in kwargs:
        self._source_data = kwargs["source_data"].reset_index(drop=True)
        self._codes = codes
        self._levels = levels
        return

    # name setup
    if isinstance(
        names,
        (
            Sequence,
            pd.core.indexes.frozen.FrozenNDArray,
            pd.core.indexes.frozen.FrozenList,
        ),
    ):
        if sum(x is None for x in names) > 1:
            column_names = list(range(len(codes)))
        else:
            column_names = names
    elif names is None:
        column_names = list(range(len(codes)))
    else:
        column_names = names

    if len(levels) == 0:
        raise ValueError("Must pass non-zero number of levels/codes")

    from cudf import DataFrame

    if not isinstance(codes, DataFrame) and not isinstance(
        codes[0], (Sequence, pd.core.indexes.frozen.FrozenNDArray)
    ):
        raise TypeError("Codes is not a Sequence of sequences")

    if isinstance(codes, DataFrame):
        self._codes = codes
    elif len(levels) == len(codes):
        self._codes = DataFrame()
        for i, codes in enumerate(codes):
            name = column_names[i] or i
            codes = column.as_column(codes)
            self._codes[name] = codes.astype(np.int64)
    else:
        raise ValueError(
            "MultiIndex has unequal number of levels and "
            "codes and is inconsistent!"
        )

    self._levels = [Series(level) for level in levels]
    self._validate_levels_and_codes(self._levels, self._codes)

    self._source_data = DataFrame()
    for i, name in enumerate(self._codes.columns):
        codes = as_index(self._codes[name]._column)
        if -1 in self._codes[name].values:
            # Must account for null(s) in _source_data column
            level = DataFrame(
                {name: [None] + list(self._levels[i])},
                index=range(-1, len(self._levels[i])),
            )
        else:
            level = DataFrame({name: self._levels[i]})
        level = DataFrame(index=codes).join(level)
        self._source_data[name] = level[name].reset_index(drop=True)

    self.names = [None] * len(self._levels) if names is None else names

def _concat(cls, objs, dtype=None):
    from cudf.core.series import Series
    from cudf.core.column import (
        StringColumn,
        CategoricalColumn,
        NumericalColumn,
    )

    if len(objs) == 0:
        dtype = pd.api.types.pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            dtype = CategoricalDtype()
        return column_empty(0, dtype=dtype, masked=True)

    # If all columns are `NumericalColumn` with different dtypes,
    # we cast them to a common dtype.
    # Notice, we can always cast pure null columns
    not_null_cols = list(filter(lambda o: len(o) != o.null_count, objs))
    if len(not_null_cols) > 0 and (
        len(
            [
                o
                for o in not_null_cols
                if not isinstance(o, NumericalColumn)
                or np.issubdtype(o.dtype, np.datetime64)
            ]
        )
        == 0
    ):
        col_dtypes = [o.dtype for o in not_null_cols]
        # Use NumPy to find a common dtype
        common_dtype = np.find_common_type(col_dtypes, [])
        # Cast all columns to the common dtype
        for i in range(len(objs)):
            objs[i] = objs[i].astype(common_dtype)

    # Find the first non-null column:
    head = objs[0]
    for i, obj in enumerate(objs):
        if len(obj) != obj.null_count:
            head = obj
            break

    for i, obj in enumerate(objs):
        # Check that all columns are the same type:
        if not pd.api.types.is_dtype_equal(objs[i].dtype, head.dtype):
            # if all null, cast to appropriate dtype
            if len(obj) == obj.null_count:
                from cudf.core.column import column_empty_like

                objs[i] = column_empty_like(
                    head, dtype=head.dtype, masked=True, newsize=len(obj)
                )

    # Handle categories for categoricals
    if all(isinstance(o, CategoricalColumn) for o in objs):
        cats = (
            Series(ColumnBase._concat([o.categories for o in objs]))
            .drop_duplicates()
            ._column
        )
        objs = [
            o.cat()._set_categories(cats, is_unique=True) for o in objs
        ]

    head = objs[0]
    for obj in objs:
        if not (obj.dtype == head.dtype):
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))
    if newsize > libcudfxx.MAX_COLUMN_SIZE:
        raise MemoryError(
            "Result of concat cannot have "
            "size > {}".format(libcudfxx.MAX_COLUMN_SIZE_STR)
        )

    # Handle strings separately
    if all(isinstance(o, StringColumn) for o in objs):
        result_nbytes = sum(o._nbytes for o in objs)
        if result_nbytes > libcudfxx.MAX_STRING_COLUMN_BYTES:
            raise MemoryError(
                "Result of concat cannot have > {} bytes".format(
                    libcudfxx.MAX_STRING_COLUMN_BYTES_STR
                )
            )
        objs = [o.nvstrings for o in objs]
        return as_column(nvstrings.from_strings(*objs))

    # Filter out inputs that have 0 length
    objs = [o for o in objs if len(o) > 0]
    nulls = any(col.nullable for col in objs)

    if is_categorical_dtype(head):
        data_dtype = head.codes.dtype
        data = None
        children = (column_empty(newsize, dtype=head.codes.dtype),)
    else:
        data_dtype = head.dtype
        data = Buffer.empty(size=newsize * data_dtype.itemsize)
        children = ()

    # Allocate output mask only if there's nulls in the input objects
    mask = None
    if nulls:
        mask = Buffer(utils.make_mask(newsize))

    col = build_column(
        data=data, dtype=head.dtype, mask=mask, children=children
    )

    # Perform the actual concatenation
    if newsize > 0:
        col = libcudf.concat._column_concat(objs, col)
    return col
