def test_is_dataframe_like():
    df = cudf.DataFrame({'x': [1, 2, 3]})

    assert is_dataframe_like(df)
    assert is_series_like(df.x)
    assert is_index_like(df.index)

    assert not is_dataframe_like(df.x)
    assert not is_series_like(df)
    assert not is_index_like(df)
def test_is_dataframe_like(monkeypatch, frame_value_counts):
    # When we drop support for pandas 1.0, this compat check can
    # be dropped
    if frame_value_counts:
        monkeypatch.setattr(pd.DataFrame, "value_counts", lambda x: None, raising=False)

    df = pd.DataFrame({"x": [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    assert is_dataframe_like(df)
    assert is_dataframe_like(ddf)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(ddf.x)
    assert not is_dataframe_like(df.index)
    assert not is_dataframe_like(ddf.index)
    assert not is_dataframe_like(pd.DataFrame)

    assert not is_series_like(df)
    assert not is_series_like(ddf)
    assert is_series_like(df.x)
    assert is_series_like(ddf.x)
    assert not is_series_like(df.index)
    assert not is_series_like(ddf.index)
    assert not is_series_like(pd.Series)

    assert not is_index_like(df)
    assert not is_index_like(ddf)
    assert not is_index_like(df.x)
    assert not is_index_like(ddf.x)
    assert is_index_like(df.index)
    assert is_index_like(ddf.index)
    assert not is_index_like(pd.Index)

    # The following checks support for class wrappers, which
    # requires the comparison of `x.__class__` instead of `type(x)`
    class DataFrameWrapper:
        __class__ = pd.DataFrame

    wrap = DataFrameWrapper()
    wrap.dtypes = None
    wrap.columns = None
    assert is_dataframe_like(wrap)

    class SeriesWrapper:
        __class__ = pd.Series

    wrap = SeriesWrapper()
    wrap.dtype = None
    wrap.name = None
    assert is_series_like(wrap)

    class IndexWrapper:
        __class__ = pd.Index

    wrap = IndexWrapper()
    wrap.dtype = None
    wrap.name = None
    assert is_index_like(wrap)
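# A minimal sketch of the duck-typing check the wrapper tests above exercise,
# assuming attribute probing against `x.__class__` so wrappers that fake
# __class__ pass. This is an illustration only; the real
# dask.dataframe.utils.is_dataframe_like may differ in detail.
def _is_dataframe_like_sketch(df):
    if isinstance(df, type):
        return False  # reject the class itself, e.g. pd.DataFrame
    typ = df.__class__  # honors wrappers that fake __class__
    return (
        all(hasattr(typ, name) for name in ("groupby", "head", "merge"))
        and all(hasattr(df, name) for name in ("dtypes", "columns"))
        and not any(hasattr(typ, name) for name in ("name", "dtype"))
    )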
def test_is_dataframe_like():
    df = pd.DataFrame({'x': [1, 2, 3]})

    assert is_dataframe_like(df)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(df.index)

    assert not is_series_like(df)
    assert is_series_like(df.x)
    assert not is_series_like(df.index)

    assert not is_index_like(df)
    assert not is_index_like(df.x)
    assert is_index_like(df.index)

    ddf = dd.from_pandas(df, npartitions=1)
    assert is_dataframe_like(ddf)
def unique(x, series_name=None):
    out = x.unique()
    # `out` is either an np.ndarray or already a series-like object.
    # When it is an np.ndarray, it must be wrapped.
    if not (is_series_like(out) or is_index_like(out)):
        out = pd.Series(out, name=series_name)
    return out
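# A quick usage sketch for the helper above. pandas' Series.unique() returns
# an np.ndarray, so it gets wrapped; Index.unique() returns an Index, which
# passes through unchanged.
import pandas as pd

unique(pd.Series([1, 2, 2, 3]), series_name="x")  # ndarray -> Series named "x"
unique(pd.Index([1, 2, 2, 3]))                    # Index.unique() -> Index, as-is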
def test_is_dataframe_like():
    df = pd.DataFrame({'x': [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    assert is_dataframe_like(df)
    assert is_dataframe_like(ddf)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(ddf.x)
    assert not is_dataframe_like(df.index)
    assert not is_dataframe_like(ddf.index)
    assert not is_dataframe_like(pd.DataFrame)

    assert not is_series_like(df)
    assert not is_series_like(ddf)
    assert is_series_like(df.x)
    assert is_series_like(ddf.x)
    assert not is_series_like(df.index)
    assert not is_series_like(ddf.index)
    assert not is_series_like(pd.Series)

    assert not is_index_like(df)
    assert not is_index_like(ddf)
    assert not is_index_like(df.x)
    assert not is_index_like(ddf.x)
    assert is_index_like(df.index)
    assert is_index_like(ddf.index)
    assert not is_index_like(pd.Index)
def describe_numeric_aggregate(
    stats, name=None, is_timedelta_col=False, is_datetime_col=False
):
    assert len(stats) == 6
    count, mean, std, min, q, max = stats

    if is_series_like(count):
        typ = type(count.to_frame())
    else:
        typ = type(q)

    if is_timedelta_col:
        mean = pd.to_timedelta(mean)
        std = pd.to_timedelta(std)
        min = pd.to_timedelta(min)
        max = pd.to_timedelta(max)
        q = q.apply(lambda x: pd.to_timedelta(x))

    if is_datetime_col:
        # mean is not implemented for datetime
        min = pd.to_datetime(min)
        max = pd.to_datetime(max)
        q = q.apply(lambda x: pd.to_datetime(x))

    if is_datetime_col:
        part1 = typ([count, min], index=["count", "min"])
    else:
        part1 = typ([count, mean, std, min], index=["count", "mean", "std", "min"])

    q.index = [f"{l * 100:g}%" for l in tolist(q.index)]

    if is_series_like(q) and typ != type(q):
        q = q.to_frame()
    part3 = typ([max], index=["max"])

    result = concat([part1, q, part3], sort=False)

    if is_series_like(result):
        result.name = name

    return result
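# A minimal sketch of the inputs this aggregator expects, built with plain
# pandas. In dask these six statistics are computed lazily per column before
# being assembled here.
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
q = s.quantile([0.25, 0.5, 0.75])
stats = [s.count(), s.mean(), s.std(), s.min(), q, s.max()]
# describe_numeric_aggregate(stats, name="x") then assembles the familiar
# describe() layout: count, mean, std, min, 25%, 50%, 75%, max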
def _loc(self, iindexer, cindexer):
    """Helper function for the .loc accessor"""
    if isinstance(iindexer, Series):
        return self._loc_series(iindexer, cindexer)
    elif isinstance(iindexer, Array):
        return self._loc_array(iindexer, cindexer)
    elif callable(iindexer):
        return self._loc(iindexer(self.obj), cindexer)

    if self.obj.known_divisions:
        iindexer = self._maybe_partial_time_string(iindexer)
        if isinstance(iindexer, slice):
            return self._loc_slice(iindexer, cindexer)
        elif isinstance(iindexer, (list, np.ndarray)):
            return self._loc_list(iindexer, cindexer)
        elif is_series_like(iindexer) and not is_bool_dtype(iindexer.dtype):
            return self._loc_list(iindexer.values, cindexer)
        else:
            # element should raise KeyError
            return self._loc_element(iindexer, cindexer)
    else:
        if isinstance(iindexer, (list, np.ndarray)) or (
            is_series_like(iindexer) and not is_bool_dtype(iindexer.dtype)
        ):
            # applying map_partitions to each partition
            # results in duplicated NaN rows
            msg = (
                "Cannot index with list against unknown division. "
                "Try setting divisions using ``ddf.set_index``"
            )
            raise KeyError(msg)
        elif not isinstance(iindexer, slice):
            iindexer = slice(iindexer, iindexer)

        meta = self._make_meta(iindexer, cindexer)
        return self.obj.map_partitions(methods.try_loc, iindexer, cindexer, meta=meta)
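# A usage sketch of the dispatch above, via the public .loc accessor.
# With known divisions, slices, lists, and single labels are supported;
# against unknown divisions, list indexing raises the KeyError built above.
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"x": range(10)})
ddf = dd.from_pandas(df, npartitions=2)  # known divisions

ddf.loc[2:5].compute()        # slice   -> _loc_slice
ddf.loc[[1, 3, 7]].compute()  # list    -> _loc_list
ddf.loc[4].compute()          # element -> _loc_element

unknown = ddf.clear_divisions()
# unknown.loc[[1, 3]]  # would raise KeyError: "Cannot index with list ..."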
def from_array(x, chunksize=50000, columns=None, meta=None):
    """Read any sliceable array into a Dask Dataframe

    Uses getitem syntax to pull slices out of the array. The array need not
    be a NumPy array but must support slicing syntax

        x[50000:100000]

    and have 2 dimensions:

        x.ndim == 2

    or have a record dtype:

        x.dtype == [('name', 'O'), ('balance', 'i8')]

    Parameters
    ----------
    x : array_like
    chunksize : int, optional
        The number of rows per partition to use.
    columns : list or string, optional
        list of column names if DataFrame, single string if Series
    meta : object, optional
        An optional `meta` parameter can be passed for dask to specify the
        concrete dataframe type to use for partitions of the Dask dataframe.
        By default, pandas DataFrame is used.

    Returns
    -------
    dask.DataFrame or dask.Series
        A dask DataFrame/Series
    """
    if isinstance(x, da.Array):
        return from_dask_array(x, columns=columns, meta=meta)

    meta = _meta_from_array(x, columns, meta=meta)

    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1,)
    token = tokenize(x, chunksize, columns)
    name = "from_array-" + token

    dsk = {}
    for i in range(0, int(ceil(len(x) / chunksize))):
        data = (getitem, x, slice(i * chunksize, (i + 1) * chunksize))
        if is_series_like(meta):
            dsk[name, i] = (type(meta), data, None, meta.dtype, meta.name)
        else:
            dsk[name, i] = (type(meta), data, None, meta.columns)
    return new_dd_object(dsk, name, meta, divisions)
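# Usage sketch: a record-dtype NumPy array becomes a DataFrame, a plain 1-D
# array becomes a Series (dd.from_array is the public entry point).
import numpy as np
import dask.dataframe as dd

rec = np.array(
    [(1, 100.0), (2, 200.0), (3, 300.0)],
    dtype=[("name", "i8"), ("balance", "f8")],
)
dd.from_array(rec, chunksize=2).compute()            # DataFrame, 2 partitions
dd.from_array(np.arange(10), chunksize=4).compute()  # 1-D -> Series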
def from_dask_array(x, columns=None, index=None, meta=None):
    """Create a Dask DataFrame from a Dask Array.

    Converts a 2d array into a DataFrame and a 1d array into a Series.

    Parameters
    ----------
    x : da.Array
    columns : list or string
        list of column names if DataFrame, single string if Series
    index : dask.dataframe.Index, optional
        An optional *dask* Index to use for the output Series or DataFrame.

        The default output index depends on whether `x` has any unknown
        chunks. If there are any unknown chunks, the output has ``None``
        for all the divisions (one per chunk). If all the chunks are known,
        a default index with known divisions is created.

        Specifying `index` can be useful if you're conforming a Dask Array
        to an existing dask Series or DataFrame, and you would like the
        indices to match.
    meta : object, optional
        An optional `meta` parameter can be passed for dask to specify the
        concrete dataframe type to be returned. By default, pandas DataFrame
        is used.

    Examples
    --------
    >>> import dask.array as da
    >>> import dask.dataframe as dd
    >>> x = da.ones((4, 2), chunks=(2, 2))
    >>> df = dd.io.from_dask_array(x, columns=['a', 'b'])
    >>> df.compute()
         a    b
    0  1.0  1.0
    1  1.0  1.0
    2  1.0  1.0
    3  1.0  1.0

    See Also
    --------
    dask.bag.to_dataframe: from dask.bag
    dask.dataframe._Frame.values: Reverse conversion
    dask.dataframe._Frame.to_records: Reverse conversion
    """
    meta = _meta_from_array(x, columns, index, meta=meta)

    name = "from-dask-array-" + tokenize(x, columns)
    graph_dependencies = [x]
    arrays_and_indices = [x.name, "ij" if x.ndim == 2 else "i"]
    numblocks = {x.name: x.numblocks}

    if index is not None:
        # An index is explicitly given by the caller, so we can pass it
        # through to the initializer after a few checks.
        if index.npartitions != x.numblocks[0]:
            msg = (
                "The index and array have different numbers of blocks. "
                "({} != {})".format(index.npartitions, x.numblocks[0])
            )
            raise ValueError(msg)
        divisions = index.divisions
        graph_dependencies.append(index)
        arrays_and_indices.extend([index._name, "i"])
        numblocks[index._name] = (index.npartitions,)

    elif np.isnan(sum(x.shape)):
        # The shape of the incoming array is not known in at least one
        # dimension. As such, we can't create an index for the entire output
        # DataFrame and we set the divisions to None to represent that.
        divisions = [None] * (len(x.chunks[0]) + 1)

    else:
        # The shape of the incoming array is known and we don't have an
        # explicit index. Create a mapping of chunk number in the incoming
        # array to (start row, stop row) tuples. These tuples will be used
        # to create a sequential RangeIndex later on that is continuous over
        # the whole DataFrame.
        divisions = [0]
        stop = 0
        index_mapping = {}
        for i, increment in enumerate(x.chunks[0]):
            stop += increment
            index_mapping[(i,)] = (divisions[i], stop)
            divisions.append(stop)
        divisions[-1] -= 1

        arrays_and_indices.extend([BlockwiseDepDict(mapping=index_mapping), "i"])

    if is_series_like(meta):
        kwargs = {"dtype": x.dtype, "name": meta.name, "initializer": type(meta)}
    else:
        kwargs = {"columns": meta.columns, "initializer": type(meta)}

    blk = blockwise(
        _partition_from_array,
        name,
        "i",
        *arrays_and_indices,
        numblocks=numblocks,
        concatenate=True,
        # kwargs passed through to the DataFrame/Series initializer
        **kwargs,
    )

    graph = HighLevelGraph.from_collections(name, blk, dependencies=graph_dependencies)
    return new_dd_object(graph, name, meta, divisions)
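# A sketch of the unknown-chunks branch above: boolean masking makes the
# first dimension's chunk sizes unknown (nan), so every division comes back
# None.
import dask.array as da
import dask.dataframe as dd

x = da.ones((4, 2), chunks=(2, 2))
y = x[x[:, 0] > 0]                   # chunk sizes now unknown
df = dd.from_dask_array(y, columns=["a", "b"])
df.divisions                         # (None, None, None)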
def cummax_aggregate(x, y):
    if is_series_like(x) or is_dataframe_like(x):
        return x.where((x > y) | x.isnull(), y, axis=x.ndim - 1)
    else:  # scalar
        return x if x > y else y
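# Quick usage sketch: series-like inputs take the elementwise .where path,
# scalars fall through to the plain comparison.
import pandas as pd

cummax_aggregate(pd.Series([1, 5, 3]), pd.Series([4, 2, 9]))  # -> [4, 5, 9]
cummax_aggregate(7, 3)                                        # -> 7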