def to_pandas(self, index: pd.Index = None, **kwargs) -> pd.Series:
    """Convert this categorical column into a categorical ``pd.Series``.

    Parameters
    ----------
    index : pd.Index, optional
        Index passed straight through to the ``pd.Series`` constructor.
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    pd.Series
        A Series wrapping a ``pd.Categorical`` built from this column's
        codes and categories.
    """
    source = self
    if self.categories.dtype.kind == "f":
        # Float categories: rebuild the column with a mask derived from
        # ``self.notnull()`` before extracting codes (presumably so that
        # NaN-backed entries are treated as nulls -- confirm).
        validity = bools_to_mask(self.notnull())
        source = column.build_categorical_column(
            categories=self.categories,
            codes=column.as_column(self.codes, dtype=self.codes.dtype),
            mask=validity,
            ordered=self.dtype.ordered,
            size=self.codes.size,
        )

    # Codes are cast to a signed type chosen from the number of categories
    # so that nulls can be encoded as -1, the sentinel that
    # ``pd.Categorical.from_codes`` expects for missing values.
    code_dtype = min_signed_type(len(source.categories))
    device_codes = source.cat().codes.astype(code_dtype)
    host_codes = device_codes.fillna(-1).to_array()

    host_categories = source.categories.dropna(drop_nan=True).to_pandas()

    categorical = pd.Categorical.from_codes(
        host_codes, categories=host_categories, ordered=source.ordered
    )
    return pd.Series(categorical, index=index)
def pandas_categorical_as_column(categorical, codes=None):
    """Build a categorical column out of a ``pandas.Categorical``.

    Parameters
    ----------
    categorical : pandas.Categorical
        Supplies the categories, the ``ordered`` flag and, unless ``codes``
        is given, the codes.
    codes : optional
        Codes to use in place of ``categorical.codes``.

    Returns
    -------
    The result of ``column.build_categorical_column``. Entries whose code
    equals -1 are marked invalid through a mask; when no code equals -1,
    no mask is attached.
    """
    if codes is None:
        codes = categorical.codes
    codes_col = column.as_column(codes)

    # -1 is the "missing" sentinel; compare using the codes' own dtype.
    missing_code = codes_col.dtype.type(-1)
    is_valid = codes_col != missing_code

    # Only materialize a mask when at least one entry is missing.
    mask = None if is_valid.all() else bools_to_mask(is_valid)

    return column.build_categorical_column(
        categories=categorical.categories,
        codes=column.as_column(codes_col.base_data, dtype=codes_col.dtype),
        size=codes_col.size,
        mask=mask,
        ordered=categorical.ordered,
    )
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    dtypes=None,
    nulls_frequency=0,
    seed=None,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    dtypes : dict
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        If none is provided, this defaults to
        ``{"name": "category", "id": int, "x": float, "y": float}``
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    nulls_frequency : float
        Fill the series with the specified proportion of nulls. Default is 0.
    seed : int (optional)
        Randomstate seed

    Returns
    -------
    The object produced by ``cudf.from_pandas`` for the generated pandas
    DataFrame, with every column's mask replaced by a randomly drawn one.
    The index is named ``"timestamp"`` and columns are sorted by name.

    Examples
    --------
    >>> import cudf as gd
    >>> gdf = gd.datasets.timeseries()
    >>> gdf.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    """
    if dtypes is None:
        dtypes = {"name": "category", "id": int, "x": float, "y": float}

    index = pd.DatetimeIndex(
        pd.date_range(start, end, freq=freq, name="timestamp")
    )
    state = np.random.RandomState(seed)
    columns = {k: make[dt](len(index), state) for k, dt in dtypes.items()}
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))

    # Drop the last row when it lands exactly on ``end``. The length check
    # keeps an empty range (e.g. ``start`` later than ``end``) from raising
    # IndexError on ``df.index[-1]``.
    # NOTE(review): when ``end`` is a string this compares a Timestamp with
    # a str -- confirm whether the trim is ever triggered in that case.
    if len(df) > 0 and df.index[-1] == end:
        df = df.iloc[:-1]

    gdf = cudf.from_pandas(df)

    # True marks a valid entry, False a null; the probabilities are the same
    # for every column, so build them once.
    validity_probabilities = [1 - nulls_frequency, nulls_frequency]
    for col in gdf:
        # The mask is drawn with ``len(index)`` entries (unchanged from the
        # original so seeded output stays reproducible); it can be one entry
        # longer than ``gdf`` when the trailing row was trimmed above.
        mask = state.choice(
            [True, False], size=len(index), p=validity_probabilities,
        )
        mask_buf = bools_to_mask(cudf.core.column.as_column(mask))
        masked_col = gdf[col]._column.set_mask(mask_buf)
        gdf[col] = cudf.Series._from_data(
            ColumnAccessor({None: masked_col}), index=gdf.index
        )

    return gdf