Пример #1
0
 def as_unknown(self):
     """Return the underlying series with its categories marked as unknown.

     When ``self.known`` is false the wrapped series is handed back
     unchanged (the same object, not a copy).  Otherwise a copy is made and
     its ``_meta`` is replaced by the result of ``clear_known_categories``,
     leaving the original series untouched.
     """
     if self.known:
         # Work on a copy so the wrapped series keeps its known categories.
         unknown = self._series.copy()
         unknown._meta = clear_known_categories(unknown._meta)
         return unknown
     return self._series
Пример #2
0
def test_concat_categorical(known, cat_index, divisions):
    """Compare ``dd.concat`` with ``concat`` on frames holding categoricals.

    Parameters (presumably supplied by a pytest parametrization that is not
    visible here -- confirm against the decorator):

    known : bool
        When false, the known categories of ``y`` (and the index) are
        cleared on the first dask frame's metadata.
    cat_index : bool
        When true, every frame is indexed by its categorical ``y`` column.
    divisions : bool
        Forwarded as ``sort=`` to ``dd.from_pandas`` and as
        ``interleave_partitions=`` to ``dd.concat``.
    """
    # (w letters, start of the x/z range, y letters) for each source frame.
    layouts = [('xxxxx', 0, 'abcbc'),
               ('yyyyy', 5, 'abbba'),
               ('zzzzz', 10, 'bcbcc')]
    frames = [pd.DataFrame({'w': list(w_chars),
                            'x': np.arange(start, start + 5),
                            'y': list(y_chars),
                            'z': np.arange(start, start + 5, dtype='f8')})
              for w_chars, start, y_chars in layouts]
    for pdf in frames:
        pdf.w = pdf.w.astype('category')
        pdf.y = pdf.y.astype('category')

    if cat_index:
        frames = [pdf.set_index(pdf.y) for pdf in frames]

    dframes = [dd.from_pandas(pdf, npartitions=2, sort=divisions)
               for pdf in frames]

    if not known:
        first = dframes[0]
        first._meta = clear_known_categories(first._meta, ['y'], index=True)

    def check_and_return(ddfs, dfs, join):
        """Concatenate both ways, compare, and return the dask result."""
        expected = concat(dfs, join=join)
        actual = dd.concat(ddfs, join=join, interleave_partitions=divisions)
        assert_eq(actual, expected)
        if known:
            computed = compute_as_if_collection(dd.DataFrame, actual.dask,
                                                actual.__dask_keys__())
            for part in computed:
                # will error if schemas don't align
                actual._meta == part.iloc[:0]
        assert not cat_index or has_known_categories(actual.index) == known
        return actual

    for join in ('inner', 'outer'):
        # Frame
        frame_res = check_and_return(dframes, frames, join)
        assert has_known_categories(frame_res.w)
        assert has_known_categories(frame_res.y) == known

        # Series
        series_res = check_and_return([d.y for d in dframes],
                                      [p.y for p in frames], join)
        assert has_known_categories(series_res) == known

        # Non-cat series with cat index
        if cat_index:
            check_and_return([d.x for d in dframes],
                             [p.x for p in frames], join)

        # Partition missing columns
        narrow_ddfs = [dframes[0][['x', 'y']]] + dframes[1:]
        narrow_dfs = [frames[0][['x', 'y']]] + frames[1:]
        partial_res = check_and_return(narrow_ddfs, narrow_dfs, join)
        assert not hasattr(partial_res, 'w') or has_known_categories(partial_res.w)
        assert has_known_categories(partial_res.y) == known
Пример #3
0
def test_categorize():
    """Exercise ``categorize`` on a frame with known and unknown categoricals.

    The dask frame is assembled from ``frames3`` with metadata derived from
    ``frames4[0]`` (both defined outside this function).  After setup, ``w``
    has known categories while ``y_`` and the index do not.
    """
    # rename y to y_ to avoid pandas future warning about ambiguous
    # levels
    renames = {'y': 'y_'}
    meta = clear_known_categories(frames4[0]).rename(columns=renames)
    graph = {('unknown', n): part for n, part in enumerate(frames3)}
    ddf = dd.DataFrame(graph, 'unknown', meta, [None] * 4).rename(columns=renames)
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y_.cat.known
    assert not ddf.index.cat.known
    pdf = ddf.compute()

    for index in (None, True, False):
        index_known = index is not False

        # By default categorize object and unknown cat columns;
        # specifying split_every works as well
        for options in ({}, {'split_every': 2}):
            out = ddf.categorize(index=index, **options)
            assert out.y_.cat.known
            assert out.v.cat.known
            assert out.index.cat.known == index_known
            assert_eq(out, pdf.astype({'v': 'category'}),
                      check_categorical=False)

        # Specifying one column doesn't affect others
        only_v = ddf.categorize('v', index=index)
        assert not only_v.y_.cat.known
        assert only_v.v.cat.known
        assert only_v.index.cat.known == index_known
        assert_eq(only_v, pdf.astype({'v': 'category'}),
                  check_categorical=False)

        only_y = ddf.categorize('y_', index=index)
        assert only_y.y_.cat.known
        assert only_y.v.dtype == 'object'
        assert only_y.index.cat.known == index_known
        assert_eq(only_y, pdf)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, pdf)

    # Specifying known categorical or no columns is a no-op:
    for cols in (['w'], []):
        assert ddf.categorize(cols, index=False) is ddf
    for cols in (['w'], []):
        assert ddf_known_index.categorize(cols) is ddf_known_index

    # Bad split_every fails
    for bad_split in (1, 'foo'):
        with pytest.raises(ValueError):
            ddf.categorize(split_every=bad_split)
Пример #4
0
def test_categorize():
    """Verify which columns and index ``categorize`` turns into known categoricals.

    Uses ``frames3`` as the partitions and ``frames4[0]`` (with its known
    categories cleared) as the metadata; both come from outside this
    function.
    """
    # rename y to y_ to avoid pandas future warning about ambiguous
    # levels
    cleared_meta = clear_known_categories(frames4[0])
    meta = cleared_meta.rename(columns={'y': 'y_'})
    tasks = {}
    for pos, part in enumerate(frames3):
        tasks[('unknown', pos)] = part
    ddf = dd.DataFrame(tasks, 'unknown', meta, [None] * 4)
    ddf = ddf.rename(columns={'y': 'y_'})
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y_.cat.known
    assert not ddf.index.cat.known
    computed = ddf.compute()

    for index in [None, True, False]:
        expect_known_index = index is not False

        # By default categorize object and unknown cat columns
        default_res = ddf.categorize(index=index)
        assert default_res.y_.cat.known
        assert default_res.v.cat.known
        assert default_res.index.cat.known == expect_known_index
        assert_eq(default_res, computed.astype({'v': 'category'}),
                  check_categorical=False)

        # Specifying split_every works
        split_res = ddf.categorize(index=index, split_every=2)
        assert split_res.y_.cat.known
        assert split_res.v.cat.known
        assert split_res.index.cat.known == expect_known_index
        assert_eq(split_res, computed.astype({'v': 'category'}),
                  check_categorical=False)

        # Specifying one column doesn't affect others
        v_res = ddf.categorize('v', index=index)
        assert not v_res.y_.cat.known
        assert v_res.v.cat.known
        assert v_res.index.cat.known == expect_known_index
        assert_eq(v_res, computed.astype({'v': 'category'}),
                  check_categorical=False)

        y_res = ddf.categorize('y_', index=index)
        assert y_res.y_.cat.known
        assert y_res.v.dtype == 'object'
        assert y_res.index.cat.known == expect_known_index
        assert_eq(y_res, computed)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, computed)

    # Specifying known categorical or no columns is a no-op:
    same = ddf.categorize(['w'], index=False)
    assert same is ddf
    same = ddf.categorize([], index=False)
    assert same is ddf
    same = ddf_known_index.categorize(['w'])
    assert same is ddf_known_index
    same = ddf_known_index.categorize([])
    assert same is ddf_known_index

    # Bad split_every fails
    with pytest.raises(ValueError):
        ddf.categorize(split_every=1)
    with pytest.raises(ValueError):
        ddf.categorize(split_every='foo')
Пример #5
0
def test_categorize():
    """Check ``categorize`` for default, single-column and no-op invocations.

    The frame under test is built from ``frames3`` with metadata taken from
    ``frames4[0]`` after clearing its known categories (both names are
    defined outside this function).
    """
    meta = clear_known_categories(frames4[0])
    layers = {('unknown', n): part for n, part in enumerate(frames3)}
    ddf = dd.DataFrame(layers, 'unknown', meta, [None] * 4)
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y.cat.known
    assert not ddf.index.cat.known
    pdf = ddf.compute()

    for index in (None, True, False):
        index_known = index is not False

        # By default categorize object and unknown cat columns
        everything = ddf.categorize(index=index)
        assert everything.y.cat.known
        assert everything.v.cat.known
        assert everything.index.cat.known == index_known
        assert_eq(everything, pdf.astype({'v': 'category'}),
                  check_categorical=False)

        # Specifying one column doesn't affect others
        only_v = ddf.categorize('v', index=index)
        assert not only_v.y.cat.known
        assert only_v.v.cat.known
        assert only_v.index.cat.known == index_known
        assert_eq(only_v, pdf.astype({'v': 'category'}),
                  check_categorical=False)

        only_y = ddf.categorize('y', index=index)
        assert only_y.y.cat.known
        assert only_y.v.dtype == 'object'
        assert only_y.index.cat.known == index_known
        assert_eq(only_y, pdf)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, pdf)

    # Specifying known categorical or no columns is a no-op:
    for cols in (['w'], []):
        assert ddf.categorize(cols, index=False) is ddf
    for cols in (['w'], []):
        assert ddf_known_index.categorize(cols) is ddf_known_index
Пример #6
0
def test_categorize():
    """Ensure ``categorize`` only converts the requested columns and index.

    Partitions come from ``frames3`` and the metadata from ``frames4[0]``
    with known categories cleared; both are defined outside this function.
    """
    meta = clear_known_categories(frames4[0])
    tasks = {}
    for pos, part in enumerate(frames3):
        tasks[('unknown', pos)] = part
    ddf = dd.DataFrame(tasks, 'unknown', meta, [None] * 4)
    ddf = ddf.assign(w=ddf.w.cat.set_categories(['x', 'y', 'z']))
    assert ddf.w.cat.known
    assert not ddf.y.cat.known
    assert not ddf.index.cat.known
    computed = ddf.compute()

    for index in [None, True, False]:
        expect_known_index = index is not False

        # By default categorize object and unknown cat columns
        default_res = ddf.categorize(index=index)
        assert default_res.y.cat.known
        assert default_res.v.cat.known
        assert default_res.index.cat.known == expect_known_index
        assert_eq(default_res, computed.astype({'v': 'category'}),
                  check_categorical=False)

        # Specifying one column doesn't affect others
        v_res = ddf.categorize('v', index=index)
        assert not v_res.y.cat.known
        assert v_res.v.cat.known
        assert v_res.index.cat.known == expect_known_index
        assert_eq(v_res, computed.astype({'v': 'category'}),
                  check_categorical=False)

        y_res = ddf.categorize('y', index=index)
        assert y_res.y.cat.known
        assert y_res.v.dtype == 'object'
        assert y_res.index.cat.known == expect_known_index
        assert_eq(y_res, computed)

    ddf_known_index = ddf.categorize(columns=[], index=True)
    assert ddf_known_index.index.cat.known
    assert_eq(ddf_known_index, computed)

    # Specifying known categorical or no columns is a no-op:
    same = ddf.categorize(['w'], index=False)
    assert same is ddf
    same = ddf.categorize([], index=False)
    assert same is ddf
    same = ddf_known_index.categorize(['w'])
    assert same is ddf_known_index
    same = ddf_known_index.categorize([])
    assert same is ddf_known_index
Пример #7
0
def test_append_categorical():
    """Compare dask ``append`` with pandas ``append`` on categorical data.

    Covers appending whole frames, the categorical ``y`` series and the
    categorical index, once with known categories and once with the first
    dask frame's categories cleared.
    """
    # (start of the x/z range, y letters) for each source frame.
    layouts = [(5, 'abbba'), (10, 'bcbcc')]
    frames = [pd.DataFrame({'x': np.arange(start, start + 5),
                            'y': list(letters),
                            'z': np.arange(start, start + 5, dtype='f8')})
              for start, letters in layouts]

    # Expected pandas frames: same data, but `y` carries the full a/b/c
    # category set.  The copy is taken before the source frame is re-indexed.
    expected_frames = []
    for source in frames:
        source.y = source.y.astype('category')
        widened = source.copy()
        widened.y = widened.y.cat.set_categories(list('abc'))
        source.index = source.y
        expected_frames.append(widened.set_index(widened.y))

    df1, df2 = expected_frames

    for known in (True, False):
        ddf1, ddf2 = [dd.from_pandas(pdf, npartitions=2, sort=False)
                      for pdf in frames]
        if not known:
            ddf1._meta = clear_known_categories(ddf1._meta, ['y'], index=True)

        frame_res = ddf1.append(ddf2)
        assert_eq(frame_res, df1.append(df2))
        assert has_known_categories(frame_res.index) == known
        assert has_known_categories(frame_res.y) == known

        series_res = ddf1.y.append(ddf2.y)
        assert_eq(series_res, df1.y.append(df2.y))
        assert has_known_categories(series_res.index) == known
        assert has_known_categories(series_res) == known

        index_res = ddf1.index.append(ddf2.index)
        assert_eq(index_res, df1.index.append(df2.index))
        assert has_known_categories(index_res) == known
Пример #8
0
def text_blocks_to_pandas(
    reader,
    block_lists,
    header,
    head,
    kwargs,
    enforce=False,
    specified_dtypes=None,
    path=None,
    blocksize=None,
    urlpath=None,
):
    """Convert blocks of bytes to a dask.dataframe

    This accepts a list of lists of values of bytes where each list corresponds
    to one file, and the value of bytes concatenate to comprise the entire
    file, in order.

    Parameters
    ----------
    reader : callable
        ``pd.read_csv`` or ``pd.read_table``.
    block_lists : list of lists of delayed values of bytes
        The lists of bytestrings where each list corresponds to one logical file
    header : bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head : pd.DataFrame
        An example Pandas DataFrame to be used for metadata.
    kwargs : dict
        Keyword arguments to pass down to ``reader``
    enforce : bool, optional
        Forwarded unchanged to ``CSVFunctionWrapper`` and included in the
        task token.  NOTE(review): presumably controls whether dtype/column
        mismatches raise while parsing -- confirm against the wrapper.
    specified_dtypes : Mapping, optional
        User-provided column dtypes.  A categorical column of ``head`` keeps
        its concrete dtype only if this mapping gives it a
        ``CategoricalDtype`` whose ``categories`` is not ``None``; every other
        categorical column is treated as having unknown categories.
    path : tuple, optional
        A tuple containing column name for path and the path_converter if provided
    blocksize : optional
        Only used as an input to the task token.
    urlpath : optional
        Only used as an input to the task token.

    Returns
    -------
    A dask.dataframe
    """
    dtypes = head.dtypes.to_dict()
    # dtypes contains only instances of CategoricalDtype, which causes issues
    # in coerce_dtypes for non-uniform categories across partitions.
    # We will modify `dtype` (which is inferred) to
    # 1. contain instances of CategoricalDtypes for user-provided types
    # 2. contain 'category' for data inferred types
    categoricals = head.select_dtypes(include=["category"]).columns

    if isinstance(specified_dtypes, Mapping):
        known_categoricals = [
            k for k in categoricals
            if isinstance(specified_dtypes.get(k), CategoricalDtype)
            and specified_dtypes.get(k).categories is not None
        ]
        unknown_categoricals = categoricals.difference(known_categoricals)
    else:
        unknown_categoricals = categoricals

    # Fixup the dtypes
    for k in unknown_categoricals:
        dtypes[k] = "category"

    columns = list(head.columns)

    blocks = tuple(flatten(block_lists))
    # Create mask of first blocks from nested block_lists
    is_first = tuple(block_mask(block_lists))
    is_last = tuple(block_mask_last(block_lists))

    colname, paths = None, None
    if path:
        colname, path_converter = path
        paths = [b[1].path for b in blocks]
        if path_converter:
            paths = [path_converter(p) for p in paths]
        # De-duplicate while keeping first-seen order.  A plain ``set`` has no
        # stable iteration order across interpreter runs (string hashing is
        # randomized), which made the metadata categories non-deterministic.
        unique_paths = list(dict.fromkeys(paths))
        head = head.assign(
            **{
                colname:
                pd.Categorical.from_codes(np.zeros(len(head), dtype=int),
                                          unique_paths)
            })

    # ``unknown_categoricals`` is a pandas Index, whose truth value is
    # ambiguous, hence the explicit ``len``.
    if len(unknown_categoricals):
        head = clear_known_categories(head, cols=unknown_categoricals)

    # Define parts: one [block, path-or-None, is_first, is_last] entry per block
    parts = [
        [block, paths[i] if paths else None, is_first[i], is_last[i]]
        for i, block in enumerate(blocks)
    ]

    # Construct the output collection with from_map
    return from_map(
        CSVFunctionWrapper(
            columns,
            None,
            colname,
            head,
            header,
            reader,
            dtypes,
            enforce,
            kwargs,
        ),
        parts,
        meta=head,
        label="read-csv",
        token=tokenize(reader, urlpath, columns, enforce, head, blocksize),
        enforce_metadata=False,
        produces_tasks=True,
    )