Example No. 1
def test_concat_categorical(known, cat_index, divisions):
    frames = [pd.DataFrame({'w': list('xxxxx'),
                            'x': np.arange(5),
                            'y': list('abcbc'),
                            'z': np.arange(5, dtype='f8')}),
              pd.DataFrame({'w': list('yyyyy'),
                            'x': np.arange(5, 10),
                            'y': list('abbba'),
                            'z': np.arange(5, 10, dtype='f8')}),
              pd.DataFrame({'w': list('zzzzz'),
                            'x': np.arange(10, 15),
                            'y': list('bcbcc'),
                            'z': np.arange(10, 15, dtype='f8')})]
    for df in frames:
        df.w = df.w.astype('category')
        df.y = df.y.astype('category')

    if cat_index:
        frames = [df.set_index(df.y) for df in frames]

    dframes = [dd.from_pandas(p, npartitions=2, sort=divisions) for p in frames]

    if not known:
        dframes[0]._meta = clear_known_categories(dframes[0]._meta, ['y'],
                                                  index=True)

    def check_and_return(ddfs, dfs, join):
        sol = concat(dfs, join=join)
        res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
        assert_eq(res, sol)
        if known:
            parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                             res.__dask_keys__())
            for p in [i.iloc[:0] for i in parts]:
                res._meta == p  # will error if schemas don't align
        assert not cat_index or has_known_categories(res.index) == known
        return res

    for join in ['inner', 'outer']:
        # Frame
        res = check_and_return(dframes, frames, join)
        assert has_known_categories(res.w)
        assert has_known_categories(res.y) == known

        # Series
        res = check_and_return([i.y for i in dframes],
                               [i.y for i in frames], join)
        assert has_known_categories(res) == known

        # Non-cat series with cat index
        if cat_index:
            res = check_and_return([i.x for i in dframes],
                                   [i.x for i in frames], join)

        # Partition missing columns
        res = check_and_return([dframes[0][['x', 'y']]] + dframes[1:],
                               [frames[0][['x', 'y']]] + frames[1:], join)
        assert not hasattr(res, 'w') or has_known_categories(res.w)
        assert has_known_categories(res.y) == known
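
The assertions above hinge on how known and unknown categories arise and propagate. A minimal sketch outside the test harness (illustrative only, not part of the test suite; assumes dask and pandas are installed):

# Illustrative sketch: known vs. unknown categories through dd.concat.
import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import has_known_categories

pdf = pd.DataFrame({'y': pd.Categorical(list('abcbc'))})
ddf = dd.from_pandas(pdf, npartitions=2)

# Categories supplied up front stay known through a concat.
res = dd.concat([ddf, ddf], interleave_partitions=True)
assert has_known_categories(res.y)

# astype('category') on a lazy collection leaves the categories unknown;
# .cat.as_known() scans the data once to resolve them.
obj = dd.from_pandas(pd.DataFrame({'y': list('abcbc')}), npartitions=2)
unknown = obj.y.astype('category')
assert not has_known_categories(unknown)
assert has_known_categories(unknown.cat.as_known())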
Example No. 2
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    with filetexts(files, mode='b'):
        df = dd_read('2014-01-*.csv', include_path_column=True)
        assert df.path.dtype == 'category'
        assert has_known_categories(df.path)

        dfs = dd_read('2014-01-*.csv', include_path_column=True, collection=False)
        result = dfs[0].compute()
        assert result.path.dtype == 'category'
        assert has_known_categories(result.path)
Example No. 3
def test_read_csv_include_path_column_is_dtype_category(dd_read, files):
    with filetexts(files, mode="b"):
        df = dd_read("2014-01-*.csv", include_path_column=True)
        assert df.path.dtype == "category"
        assert has_known_categories(df.path)

        dfs = dd_read("2014-01-*.csv", include_path_column=True)
        result = dfs.compute()
        assert result.path.dtype == "category"
        assert has_known_categories(result.path)
Example No. 4
def test_read_csv_include_path_column_with_multiple_partitions_per_file(dd_read, files):
    with filetexts(files, mode="b"):
        df = dd_read("2014-01-*.csv", blocksize="10B", include_path_column=True)
        assert df.npartitions > 3
        assert df.path.dtype == "category"
        assert has_known_categories(df.path)

        dfs = dd_read("2014-01-*.csv", blocksize="10B", include_path_column=True)
        result = dfs.compute()
        assert result.path.dtype == "category"
        assert has_known_categories(result.path)
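
Outside the test fixtures, the same contract reads as follows (a hedged sketch; the glob is a placeholder for real files on disk):

# include_path_column adds a categorical 'path' column whose categories
# come from the glob itself, so they are known without reading any data.
import dask.dataframe as dd
from dask.dataframe.utils import has_known_categories

df = dd.read_csv('2014-01-*.csv', include_path_column=True)
assert df.path.dtype == 'category'
assert has_known_categories(df.path)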
Example No. 5
def test_categorical_dtypes():
    text1 = normalize_text("""
    fruit,count
    apple,10
    apple,25
    pear,100
    orange,15
    """)

    text2 = normalize_text("""
    fruit,count
    apple,200
    banana,300
    orange,400
    banana,10
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        df = dd.read_csv('foo.*.csv',
                         dtype={'fruit': 'category'},
                         blocksize=25)
        assert df.fruit.dtype == 'category'
        assert not has_known_categories(df.fruit)
        res = df.compute()
        assert res.fruit.dtype == 'category'
        assert (sorted(
            res.fruit.cat.categories) == ['apple', 'banana', 'orange', 'pear'])
Example No. 6
def test_categorical_dtypes():
    text1 = normalize_text("""
    fruit,count
    apple,10
    apple,25
    pear,100
    orange,15
    """)

    text2 = normalize_text("""
    fruit,count
    apple,200
    banana,300
    orange,400
    banana,10
    """)

    with filetexts({"foo.1.csv": text1, "foo.2.csv": text2}):
        df = dd.read_csv("foo.*.csv",
                         dtype={"fruit": "category"},
                         blocksize=25)
        assert df.fruit.dtype == "category"
        assert not has_known_categories(df.fruit)
        res = df.compute()
        assert res.fruit.dtype == "category"
        assert sorted(
            res.fruit.cat.categories) == ["apple", "banana", "orange", "pear"]
Example No. 7
def test_categorical_dtypes():
    text1 = normalize_text("""
    fruit,count
    apple,10
    apple,25
    pear,100
    orange,15
    """)

    text2 = normalize_text("""
    fruit,count
    apple,200
    banana,300
    orange,400
    banana,10
    """)

    with filetexts({'foo.1.csv': text1, 'foo.2.csv': text2}):
        df = dd.read_csv('foo.*.csv', dtype={'fruit': 'category'}, blocksize=25)
        assert df.fruit.dtype == 'category'
        assert not has_known_categories(df.fruit)
        res = df.compute()
        assert res.fruit.dtype == 'category'
        assert (sorted(res.fruit.cat.categories) ==
                ['apple', 'banana', 'orange', 'pear'])
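
All three variants assert the same contract. A hedged sketch of how a user would resolve the unknown categories without computing the whole frame (assumes the foo.*.csv files from the tests exist on disk):

# dtype={'fruit': 'category'} defers the category set; .cat.as_known()
# makes a single pass over the column to fill it in.
import dask.dataframe as dd
from dask.dataframe.utils import has_known_categories

df = dd.read_csv('foo.*.csv', dtype={'fruit': 'category'}, blocksize=25)
assert not has_known_categories(df.fruit)
fruit = df.fruit.cat.as_known()
assert has_known_categories(fruit)
print(sorted(fruit.cat.categories))  # ['apple', 'banana', 'orange', 'pear']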
Example No. 8
def check_and_return(ddfs, dfs, join):
    sol = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, sol)
    if known:
        parts = compute_as_if_collection(dd.DataFrame, res.dask,
                                         res.__dask_keys__())
        for p in [i.iloc[:0] for i in parts]:
            res._meta == p  # will error if schemas don't align
    assert not cat_index or has_known_categories(res.index) == known
    return res
Example No. 9
def test_append_categorical():
    frames = [pd.DataFrame({'x': np.arange(5, 10),
                            'y': list('abbba'),
                            'z': np.arange(5, 10, dtype='f8')}),
              pd.DataFrame({'x': np.arange(10, 15),
                            'y': list('bcbcc'),
                            'z': np.arange(10, 15, dtype='f8')})]
    frames2 = []
    for df in frames:
        df.y = df.y.astype('category')
        df2 = df.copy()
        df2.y = df2.y.cat.set_categories(list('abc'))
        df.index = df.y
        frames2.append(df2.set_index(df2.y))

    df1, df2 = frames2

    for known in [True, False]:
        dframes = [dd.from_pandas(p, npartitions=2, sort=False) for p in frames]
        if not known:
            dframes[0]._meta = clear_known_categories(dframes[0]._meta,
                                                      ['y'], index=True)
        ddf1, ddf2 = dframes

        res = ddf1.append(ddf2)
        assert_eq(res, df1.append(df2))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res.y) == known

        res = ddf1.y.append(ddf2.y)
        assert_eq(res, df1.y.append(df2.y))
        assert has_known_categories(res.index) == known
        assert has_known_categories(res) == known

        res = ddf1.index.append(ddf2.index)
        assert_eq(res, df1.index.append(df2.index))
        assert has_known_categories(res) == known
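
A minimal illustrative append, assuming both sides share the same known categories and non-overlapping indexes; known-ness of the inputs carries through to the result:

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import has_known_categories

cats = list('abc')
a = dd.from_pandas(pd.DataFrame(
    {'y': pd.Categorical(list('ab'), categories=cats)}, index=[0, 1]),
    npartitions=1)
b = dd.from_pandas(pd.DataFrame(
    {'y': pd.Categorical(list('bc'), categories=cats)}, index=[2, 3]),
    npartitions=1)

res = a.append(b)
assert has_known_categories(res.y)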
Example No. 10
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
    """Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    """
    meta = df._meta
    if columns is None:
        columns = list(meta.select_dtypes(["object", "category"]).columns)
    elif is_scalar(columns):
        columns = [columns]

    # Filter out known categorical columns
    columns = [
        c for c in columns if not (
            is_categorical_dtype(meta[c]) and has_known_categories(meta[c]))
    ]

    if index is not False:
        if is_categorical_dtype(meta.index):
            index = not has_known_categories(meta.index)
        elif index is None:
            index = meta.index.dtype == object

    # Nothing to do
    if not len(columns) and index is False:
        return df

    if split_every is None:
        split_every = 16
    elif split_every is False:
        split_every = df.npartitions
    elif not isinstance(split_every, Integral) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token = tokenize(df, columns, index, split_every)
    a = "get-categories-chunk-" + token
    dsk = {(a, i): (_get_categories, key, columns, index)
           for (i, key) in enumerate(df.__dask_keys__())}

    prefix = "get-categories-agg-" + token
    k = df.npartitions
    depth = 0
    while k > split_every:
        b = prefix + str(depth)
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            dsk[(b, part_i)] = (_get_categories_agg, [(a, i) for i in inds])
        k = part_i + 1
        a = b
        depth += 1

    dsk[(prefix, 0)] = (_get_categories_agg, [(a, i) for i in range(k)])
    dsk.update(df.dask)

    # Compute the categories
    categories, index = compute_as_if_collection(df.__class__, dsk,
                                                 (prefix, 0), **kwargs)

    # some operations like get_dummies() rely on the order of categories
    categories = {k: v.sort_values() for k, v in categories.items()}

    # Categorize each partition
    return df.map_partitions(_categorize_block, categories, index)
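
A hedged usage sketch of the DataFrame method this function backs: categorize() eagerly computes the category sets via the tree-reduction above and returns a frame with known categories.

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import has_known_categories

ddf = dd.from_pandas(pd.DataFrame({'x': list('aabbc')}), npartitions=2)
cat = ddf.categorize(columns=['x'])  # computes the categories eagerly
assert cat.x.dtype == 'category'
assert has_known_categories(cat.x)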
Example No. 11
@property
def known(self):
    """Whether the categories are fully known"""
    return has_known_categories(self._series)
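
Illustrative use of the accessor property above, which surfaces as Series.cat.known:

import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(pd.Series(list('abca')), npartitions=2).astype('category')
assert not s.cat.known             # astype leaves the categories unknown
assert s.cat.as_known().cat.known  # as_known() computes and fixes them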
Example No. 12
def pivot_table(df, index=None, columns=None, values=None, aggfunc="mean"):
    """
    Create a spreadsheet-style pivot table as a DataFrame. Target ``columns``
    must have category dtype to infer result's ``columns``.
    ``index``, ``columns``, and ``aggfunc`` must be all scalar.
    ``values`` can be scalar or list-like.

    Parameters
    ----------
    df : DataFrame
    index : scalar
        column to use as the index
    columns : scalar
        column whose unique values become the result's columns
    values : scalar or list(scalar)
        column(s) to aggregate
    aggfunc : {'mean', 'sum', 'count', 'first', 'last'}, default 'mean'

    Returns
    -------
    table : DataFrame

    See Also
    --------
    pandas.DataFrame.pivot_table
    """

    if not is_scalar(index) or index is None:
        raise ValueError("'index' must be the name of an existing column")
    if not is_scalar(columns) or columns is None:
        raise ValueError("'columns' must be the name of an existing column")
    if not methods.is_categorical_dtype(df[columns]):
        raise ValueError("'columns' must be category dtype")
    if not has_known_categories(df[columns]):
        raise ValueError("'columns' must have known categories. Please use "
                         "`df[columns].cat.as_known()` beforehand to ensure "
                         "known categories")
    if not (is_list_like(values) and all([is_scalar(v) for v in values])
            or is_scalar(values)):
        raise ValueError(
            "'values' must refer to an existing column or columns")

    available_aggfuncs = ["mean", "sum", "count", "first", "last"]

    if not is_scalar(aggfunc) or aggfunc not in available_aggfuncs:
        raise ValueError("aggfunc must be either " +
                         ", ".join(f"'{x}'" for x in available_aggfuncs))

    # _emulate can't work for empty data
    # the result must have CategoricalIndex columns

    columns_contents = pd.CategoricalIndex(df[columns].cat.categories,
                                           name=columns)
    if is_scalar(values):
        new_columns = columns_contents
    else:
        new_columns = pd.MultiIndex.from_product(
            (sorted(values), columns_contents), names=[None, columns])

    if aggfunc in ["first", "last"]:
        # Infer datatype as non-numeric values are allowed
        if is_scalar(values):
            meta = pd.DataFrame(
                columns=new_columns,
                dtype=df[values].dtype,
                index=pd.Index(df._meta[index]),
            )
        else:
            meta = pd.DataFrame(
                columns=new_columns,
                index=pd.Index(df._meta[index]),
            )
            for value_col in values:
                meta[value_col] = meta[value_col].astype(
                    df[values].dtypes[value_col])
    else:
        # Use float64 as other aggregate functions require numerical data
        meta = pd.DataFrame(columns=new_columns,
                            dtype=np.float64,
                            index=pd.Index(df._meta[index]))

    kwargs = {"index": index, "columns": columns, "values": values}

    if aggfunc in ["sum", "mean"]:
        pv_sum = apply_concat_apply(
            [df],
            chunk=methods.pivot_sum,
            aggregate=methods.pivot_agg,
            meta=meta,
            token="pivot_table_sum",
            chunk_kwargs=kwargs,
        )

    if aggfunc in ["count", "mean"]:
        pv_count = apply_concat_apply(
            [df],
            chunk=methods.pivot_count,
            aggregate=methods.pivot_agg,
            meta=meta,
            token="pivot_table_count",
            chunk_kwargs=kwargs,
        )

    if aggfunc == "sum":
        return pv_sum
    elif aggfunc == "count":
        return pv_count
    elif aggfunc == "mean":
        return pv_sum / pv_count
    elif aggfunc == "first":
        return apply_concat_apply(
            [df],
            chunk=methods.pivot_first,
            aggregate=methods.pivot_agg_first,
            meta=meta,
            token="pivot_table_first",
            chunk_kwargs=kwargs,
        )
    elif aggfunc == "last":
        return apply_concat_apply(
            [df],
            chunk=methods.pivot_last,
            aggregate=methods.pivot_agg_last,
            meta=meta,
            token="pivot_table_last",
            chunk_kwargs=kwargs,
        )
    else:
        raise ValueError
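
A hedged usage sketch (the frame and column names are illustrative); per the checks above, the pivoted column must be categorical with known categories:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({'i': [0, 0, 1, 1],
                    'c': pd.Categorical(list('xyxy')),
                    'v': [1.0, 2.0, 3.0, 4.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
table = dd.pivot_table(ddf, index='i', columns='c', values='v', aggfunc='sum')
print(table.compute())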
Example No. 13
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=np.uint8,
    **kwargs,
):
    """
    Convert categorical variable into dummy/indicator variables.

    Data must have category dtype to infer result's ``columns``.

    Parameters
    ----------
    data : Series, or DataFrame
        For Series, the dtype must be categorical.
        For DataFrame, at least one column must be categorical.
    prefix : string, list of strings, or dict of strings, default None
        String to prepend to DataFrame column names.
        Pass a list with length equal to the number of columns
        when calling get_dummies on a DataFrame. Alternatively, `prefix`
        can be a dictionary mapping column names to prefixes.
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use. Or pass a
        list or dictionary as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy columns should be sparse or not.  Returns
        SparseDataFrame if `data` is a Series or if all columns are included.
        Otherwise returns a DataFrame with some SparseBlocks.

        .. versionadded:: 0.18.2

    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.

    dtype : dtype, default np.uint8
        Data type for new columns. Only a single dtype is allowed.

        .. versionadded:: 0.18.2

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    Dask's version only works with Categorical data, as this is the only way to
    know the output shape without computing all the data.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> s = dd.from_pandas(pd.Series(list('abca')), npartitions=2)
    >>> dd.get_dummies(s)
    Traceback (most recent call last):
        ...
    NotImplementedError: `get_dummies` with non-categorical dtypes is not supported...

    With categorical data:

    >>> s = dd.from_pandas(pd.Series(list('abca'), dtype='category'), npartitions=2)
    >>> dd.get_dummies(s)  # doctest: +NORMALIZE_WHITESPACE
    Dask DataFrame Structure:
                       a      b      c
    npartitions=2
    0              uint8  uint8  uint8
    2                ...    ...    ...
    3                ...    ...    ...
    Dask Name: get_dummies, 4 tasks
    >>> dd.get_dummies(s).compute()  # doctest: +ELLIPSIS
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    See Also
    --------
    pandas.get_dummies
    """
    if isinstance(data, (pd.Series, pd.DataFrame)):
        return pd.get_dummies(
            data,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            columns=columns,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
            **kwargs,
        )

    not_cat_msg = ("`get_dummies` with non-categorical dtypes is not "
                   "supported. Please use `df.categorize()` beforehand to "
                   "convert to categorical dtype.")

    unknown_cat_msg = ("`get_dummies` with unknown categories is not "
                       "supported. Please use `column.cat.as_known()` or "
                       "`df.categorize()` beforehand to ensure known "
                       "categories")

    if isinstance(data, Series):
        if not methods.is_categorical_dtype(data):
            raise NotImplementedError(not_cat_msg)
        if not has_known_categories(data):
            raise NotImplementedError(unknown_cat_msg)
    elif isinstance(data, DataFrame):
        if columns is None:
            if (data.dtypes == "object").any():
                raise NotImplementedError(not_cat_msg)
            columns = data._meta.select_dtypes(include=["category"]).columns
        else:
            if not all(methods.is_categorical_dtype(data[c]) for c in columns):
                raise NotImplementedError(not_cat_msg)

        if not all(has_known_categories(data[c]) for c in columns):
            raise NotImplementedError(unknown_cat_msg)

    package_name = data._meta.__class__.__module__.split(".")[0]
    dummies = sys.modules[package_name].get_dummies

    return map_partitions(
        dummies,
        data,
        prefix=prefix,
        prefix_sep=prefix_sep,
        dummy_na=dummy_na,
        columns=columns,
        sparse=sparse,
        drop_first=drop_first,
        dtype=dtype,
        **kwargs,
    )
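
An illustrative DataFrame-level usage, pairing get_dummies with categorize() so the output columns can be inferred without computing the data:

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({'k': list('abca'), 'v': range(4)}),
                     npartitions=2)
dummies = dd.get_dummies(ddf.categorize(columns=['k']))
print(dummies.compute())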