Example #1
def test_is_dataframe_like():
    df = cudf.DataFrame({'x': [1, 2, 3]})
    assert is_dataframe_like(df)
    assert is_series_like(df.x)
    assert is_index_like(df.index)
    assert not is_dataframe_like(df.x)
    assert not is_series_like(df)
    assert not is_index_like(df)
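The snippets on this page omit their imports. Below is a minimal, CPU-only sketch of the setup they assume; the import path is the one used by dask's own test suite (`dask.dataframe.utils`) and may differ between dask versions, and the cuDF example above additionally requires a GPU with cudf installed. The helpers duck-type on dataframe-like attributes rather than requiring a pandas subclass, which is why cuDF objects pass the same checks.

import pandas as pd
from dask.dataframe.utils import is_dataframe_like, is_index_like, is_series_like

df = pd.DataFrame({"x": [1, 2, 3]})
assert is_dataframe_like(df)
assert is_series_like(df["x"])
assert is_index_like(df.index)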
Example #2
def test_is_dataframe_like(monkeypatch, frame_value_counts):
    # When we drop support for pandas 1.0, this compat check can
    # be dropped
    if frame_value_counts:
        monkeypatch.setattr(pd.DataFrame, "value_counts", lambda x: None, raising=False)

    df = pd.DataFrame({"x": [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    assert is_dataframe_like(df)
    assert is_dataframe_like(ddf)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(ddf.x)
    assert not is_dataframe_like(df.index)
    assert not is_dataframe_like(ddf.index)
    assert not is_dataframe_like(pd.DataFrame)

    assert not is_series_like(df)
    assert not is_series_like(ddf)
    assert is_series_like(df.x)
    assert is_series_like(ddf.x)
    assert not is_series_like(df.index)
    assert not is_series_like(ddf.index)
    assert not is_series_like(pd.Series)

    assert not is_index_like(df)
    assert not is_index_like(ddf)
    assert not is_index_like(df.x)
    assert not is_index_like(ddf.x)
    assert is_index_like(df.index)
    assert is_index_like(ddf.index)
    assert not is_index_like(pd.Index)

    # The following checks support for class wrappers, which
    # requires comparing `x.__class__` instead of `type(x)`
    class DataFrameWrapper:
        __class__ = pd.DataFrame

    wrap = DataFrameWrapper()
    wrap.dtypes = None
    wrap.columns = None
    assert is_dataframe_like(wrap)

    class SeriesWrapper:
        __class__ = pd.Series

    wrap = SeriesWrapper()
    wrap.dtype = None
    wrap.name = None
    assert is_series_like(wrap)

    class IndexWrapper:
        __class__ = pd.Index

    wrap = IndexWrapper()
    wrap.dtype = None
    wrap.name = None
    assert is_index_like(wrap)
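The wrapper classes above exist because some libraries hand out proxy objects whose `__class__` reports a pandas class even though `type(x)` is still the proxy type; the helpers therefore compare `x.__class__` rather than `type(x)`. A minimal, dask-independent illustration of that difference:

import pandas as pd

class DataFrameProxy:
    # Proxy whose reported class is pd.DataFrame
    __class__ = pd.DataFrame

proxy = DataFrameProxy()
assert proxy.__class__ is pd.DataFrame   # what the dask helpers compare against
assert type(proxy) is DataFrameProxy     # what a type(x) check would see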
Example #3
def test_is_dataframe_like():
    df = pd.DataFrame({'x': [1, 2, 3]})
    assert is_dataframe_like(df)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(df.index)

    assert not is_series_like(df)
    assert is_series_like(df.x)
    assert not is_series_like(df.index)

    assert not is_index_like(df)
    assert not is_index_like(df.x)
    assert is_index_like(df.index)

    ddf = dd.from_pandas(df, npartitions=1)
    assert is_dataframe_like(ddf)
Example #4
def unique(x, series_name=None):
    out = x.unique()
    # `out` can be an np.ndarray or may already be a series-like
    # object; when it is an np.ndarray, it must be wrapped in a Series.
    if not (is_series_like(out) or is_index_like(out)):
        out = pd.Series(out, name=series_name)
    return out
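A small usage sketch for the `unique` helper above (assuming the snippet and its pandas/dask imports are in scope): `pandas.Series.unique()` returns an np.ndarray, so the result gets re-wrapped into a named Series.

import pandas as pd

s = pd.Series([1, 2, 2, 3])
u = unique(s, series_name="x")   # s.unique() -> ndarray -> wrapped in a Series
assert isinstance(u, pd.Series) and u.name == "x"
assert u.tolist() == [1, 2, 3]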
Example #5
def test_is_dataframe_like(monkeypatch, frame_value_counts):
    # When we drop support for pandas 1.0, this compat check can
    # be dropped
    if frame_value_counts:
        monkeypatch.setattr(pd.DataFrame,
                            "value_counts",
                            lambda x: None,
                            raising=False)

    df = pd.DataFrame({"x": [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=1)

    assert is_dataframe_like(df)
    assert is_dataframe_like(ddf)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(ddf.x)
    assert not is_dataframe_like(df.index)
    assert not is_dataframe_like(ddf.index)
    assert not is_dataframe_like(pd.DataFrame)

    assert not is_series_like(df)
    assert not is_series_like(ddf)
    assert is_series_like(df.x)
    assert is_series_like(ddf.x)
    assert not is_series_like(df.index)
    assert not is_series_like(ddf.index)
    assert not is_series_like(pd.Series)

    assert not is_index_like(df)
    assert not is_index_like(ddf)
    assert not is_index_like(df.x)
    assert not is_index_like(ddf.x)
    assert is_index_like(df.index)
    assert is_index_like(ddf.index)
    assert not is_index_like(pd.Index)
Example #6
def test_is_dataframe_like():
    df = pd.DataFrame({'x': [1, 2, 3]})
    ddf = dd.from_pandas(df, npartitions=1)
    assert is_dataframe_like(df)
    assert is_dataframe_like(ddf)
    assert not is_dataframe_like(df.x)
    assert not is_dataframe_like(ddf.x)
    assert not is_dataframe_like(df.index)
    assert not is_dataframe_like(ddf.index)
    assert not is_dataframe_like(pd.DataFrame)

    assert not is_series_like(df)
    assert not is_series_like(ddf)
    assert is_series_like(df.x)
    assert is_series_like(ddf.x)
    assert not is_series_like(df.index)
    assert not is_series_like(ddf.index)
    assert not is_series_like(pd.Series)

    assert not is_index_like(df)
    assert not is_index_like(ddf)
    assert not is_index_like(df.x)
    assert not is_index_like(ddf.x)
    assert is_index_like(df.index)
    assert is_index_like(ddf.index)
    assert not is_index_like(pd.Index)
Example #7
def describe_numeric_aggregate(stats,
                               name=None,
                               is_timedelta_col=False,
                               is_datetime_col=False):
    assert len(stats) == 6
    count, mean, std, min, q, max = stats

    if is_series_like(count):
        typ = type(count.to_frame())
    else:
        typ = type(q)

    if is_timedelta_col:
        mean = pd.to_timedelta(mean)
        std = pd.to_timedelta(std)
        min = pd.to_timedelta(min)
        max = pd.to_timedelta(max)
        q = q.apply(lambda x: pd.to_timedelta(x))

    if is_datetime_col:
        # mean is not implemented for datetime
        min = pd.to_datetime(min)
        max = pd.to_datetime(max)
        q = q.apply(lambda x: pd.to_datetime(x))

    if is_datetime_col:
        part1 = typ([count, min], index=["count", "min"])
    else:
        part1 = typ([count, mean, std, min],
                    index=["count", "mean", "std", "min"])

    q.index = [f"{l * 100:g}%" for l in tolist(q.index)]
    if is_series_like(q) and typ != type(q):
        q = q.to_frame()
    part3 = typ([max], index=["max"])

    result = concat([part1, q, part3], sort=False)

    if is_series_like(result):
        result.name = name

    return result
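For orientation, a sketch of the `stats` tuple this aggregate expects, built here with plain pandas (the real call sites pass dask-computed pieces and rely on dask's `concat`/`tolist` helpers, so the function itself is not called below); the eight rows it assembles mirror `Series.describe()`.

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])
stats = (
    s.count(),                      # count
    s.mean(),                       # mean
    s.std(),                        # std
    s.min(),                        # min
    s.quantile([0.25, 0.5, 0.75]),  # q: a Series indexed by the quantiles
    s.max(),                        # max
)
print(s.describe())                 # same eight rows the helper assembles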
Example #8
    def _loc(self, iindexer, cindexer):
        """Helper function for the .loc accessor"""
        if isinstance(iindexer, Series):
            return self._loc_series(iindexer, cindexer)
        elif isinstance(iindexer, Array):
            return self._loc_array(iindexer, cindexer)
        elif callable(iindexer):
            return self._loc(iindexer(self.obj), cindexer)

        if self.obj.known_divisions:
            iindexer = self._maybe_partial_time_string(iindexer)

            if isinstance(iindexer, slice):
                return self._loc_slice(iindexer, cindexer)
            elif isinstance(iindexer, (list, np.ndarray)):
                return self._loc_list(iindexer, cindexer)
            elif is_series_like(iindexer) and not is_bool_dtype(
                    iindexer.dtype):
                return self._loc_list(iindexer.values, cindexer)
            else:
                # element should raise KeyError
                return self._loc_element(iindexer, cindexer)
        else:
            if isinstance(
                    iindexer,
                (list, np.ndarray)) or (is_series_like(iindexer)
                                        and not is_bool_dtype(iindexer.dtype)):
                # applying map_partitions to each partition
                # results in duplicated NaN rows
                msg = ("Cannot index with list against unknown division. "
                       "Try setting divisions using ``ddf.set_index``")
                raise KeyError(msg)
            elif not isinstance(iindexer, slice):
                iindexer = slice(iindexer, iindexer)

            meta = self._make_meta(iindexer, cindexer)
            return self.obj.map_partitions(methods.try_loc,
                                           iindexer,
                                           cindexer,
                                           meta=meta)
Example #9
def from_array(x, chunksize=50000, columns=None, meta=None):
    """Read any sliceable array into a Dask Dataframe

    Uses getitem syntax to pull slices out of the array.  The array need not be
    a NumPy array but must support slicing syntax

        x[50000:100000]

    and have 2 dimensions:

        x.ndim == 2

    or have a record dtype:

        x.dtype == [('name', 'O'), ('balance', 'i8')]

    Parameters
    ----------
    x : array_like
    chunksize : int, optional
        The number of rows per partition to use.
    columns : list or string, optional
        list of column names if DataFrame, single string if Series
    meta : object, optional
        An optional `meta` parameter can be passed for dask
        to specify the concrete dataframe type to use for partitions of
        the Dask dataframe. By default, pandas DataFrame is used.

    Returns
    -------
    dask.DataFrame or dask.Series
        A dask DataFrame/Series
    """
    if isinstance(x, da.Array):
        return from_dask_array(x, columns=columns, meta=meta)

    meta = _meta_from_array(x, columns, meta=meta)

    divisions = tuple(range(0, len(x), chunksize))
    divisions = divisions + (len(x) - 1, )
    token = tokenize(x, chunksize, columns)
    name = "from_array-" + token

    dsk = {}
    for i in range(0, int(ceil(len(x) / chunksize))):
        data = (getitem, x, slice(i * chunksize, (i + 1) * chunksize))
        if is_series_like(meta):
            dsk[name, i] = (type(meta), data, None, meta.dtype, meta.name)
        else:
            dsk[name, i] = (type(meta), data, None, meta.columns)
    return new_dd_object(dsk, name, meta, divisions)
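A minimal usage sketch for `from_array` as documented above, with a plain NumPy input (`dd.from_array` is the public entry point; the graph internals in the snippet belong to the dask version it was taken from):

import numpy as np
import dask.dataframe as dd

x = np.arange(10).reshape(5, 2)
ddf = dd.from_array(x, chunksize=2, columns=["a", "b"])
assert ddf.npartitions == 3        # ceil(5 / 2) partitions
print(ddf.compute())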
Example #10
def from_dask_array(x, columns=None, index=None, meta=None):
    """Create a Dask DataFrame from a Dask Array.

    Converts a 2d array into a DataFrame and a 1d array into a Series.

    Parameters
    ----------
    x : da.Array
    columns : list or string
        list of column names if DataFrame, single string if Series
    index : dask.dataframe.Index, optional
        An optional *dask* Index to use for the output Series or DataFrame.

        The default output index depends on whether `x` has any unknown
        chunks. If there are any unknown chunks, the output has ``None``
        for all the divisions (one per chunk). If all the chunks are known,
        a default index with known divisions is created.

        Specifying `index` can be useful if you're conforming a Dask Array
        to an existing dask Series or DataFrame, and you would like the
        indices to match.
    meta : object, optional
        An optional `meta` parameter can be passed for dask
        to specify the concrete dataframe type to be returned.
        By default, pandas DataFrame is used.

    Examples
    --------
    >>> import dask.array as da
    >>> import dask.dataframe as dd
    >>> x = da.ones((4, 2), chunks=(2, 2))
    >>> df = dd.io.from_dask_array(x, columns=['a', 'b'])
    >>> df.compute()
         a    b
    0  1.0  1.0
    1  1.0  1.0
    2  1.0  1.0
    3  1.0  1.0

    See Also
    --------
    dask.bag.to_dataframe: from dask.bag
    dask.dataframe._Frame.values: Reverse conversion
    dask.dataframe._Frame.to_records: Reverse conversion
    """
    meta = _meta_from_array(x, columns, index, meta=meta)

    name = "from-dask-array-" + tokenize(x, columns)
    graph_dependencies = [x]
    arrays_and_indices = [x.name, "ij" if x.ndim == 2 else "i"]
    numblocks = {x.name: x.numblocks}

    if index is not None:
        # An index is explicitly given by the caller, so we can pass it through to the
        # initializer after a few checks.
        if index.npartitions != x.numblocks[0]:
            msg = ("The index and array have different numbers of blocks. "
                   "({} != {})".format(index.npartitions, x.numblocks[0]))
            raise ValueError(msg)
        divisions = index.divisions
        graph_dependencies.append(index)
        arrays_and_indices.extend([index._name, "i"])
        numblocks[index._name] = (index.npartitions, )

    elif np.isnan(sum(x.shape)):
        # The shape of the incoming array is not known in at least one dimension. As
        # such, we can't create an index for the entire output DataFrame and we set
        # the divisions to None to represent that.
        divisions = [None] * (len(x.chunks[0]) + 1)
    else:
        # The shape of the incoming array is known and we don't have an explicit index.
        # Create a mapping of chunk number in the incoming array to
        # (start row, stop row) tuples. These tuples will be used to create a sequential
        # RangeIndex later on that is continuous over the whole DataFrame.
        divisions = [0]
        stop = 0
        index_mapping = {}
        for i, increment in enumerate(x.chunks[0]):
            stop += increment
            index_mapping[(i, )] = (divisions[i], stop)
            divisions.append(stop)
        divisions[-1] -= 1
        arrays_and_indices.extend(
            [BlockwiseDepDict(mapping=index_mapping), "i"])

    if is_series_like(meta):
        kwargs = {
            "dtype": x.dtype,
            "name": meta.name,
            "initializer": type(meta)
        }
    else:
        kwargs = {"columns": meta.columns, "initializer": type(meta)}

    blk = blockwise(
        _partition_from_array,
        name,
        "i",
        *arrays_and_indices,
        numblocks=numblocks,
        concatenate=True,
        # kwargs passed through to the DataFrame/Series initializer
        **kwargs,
    )

    graph = HighLevelGraph.from_collections(name,
                                            blk,
                                            dependencies=graph_dependencies)
    return new_dd_object(graph, name, meta, divisions)
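The docstring example covers the default RangeIndex; below is a small sketch of the `index=` path described above, conforming the array to an existing dask Series' index (assumes a dask version recent enough to accept `index=`):

import dask.array as da
import dask.dataframe as dd
import pandas as pd

s = dd.from_pandas(pd.Series(range(4), index=[10, 11, 12, 13]), npartitions=2)
x = da.ones((4, 2), chunks=(2, 2))              # block structure matches s
df = dd.from_dask_array(x, columns=["a", "b"], index=s.index)
assert df.compute().index.tolist() == [10, 11, 12, 13]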
Example #11
def cummax_aggregate(x, y):
    if is_series_like(x) or is_dataframe_like(x):
        return x.where((x > y) | x.isnull(), y, axis=x.ndim - 1)
    else:  # scalar
        return x if x > y else y
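A quick sketch of the combine behaviour this implements, with plain pandas inputs (assumes the snippet above and the dask `is_series_like`/`is_dataframe_like` helpers are in scope):

import pandas as pd

x = pd.Series([1.0, 5.0, None, 2.0])
y = pd.Series([3.0, 4.0, 7.0, 2.0])
print(cummax_aggregate(x, y))   # keeps x where x > y or x is NaN, else y -> [3.0, 5.0, NaN, 2.0]
print(cummax_aggregate(3, 7))   # scalar path -> 7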