Example #1
import pandas as pd
import dask.dataframe as dd
from pandas.api.types import is_categorical_dtype  # one common home for this helper

def test_is_categorical_dtype():
    df = pd.DataFrame({"cat": pd.Categorical([1, 2, 3, 4]), "x": [1, 2, 3, 4]})

    assert is_categorical_dtype(df["cat"])
    assert not is_categorical_dtype(df["x"])

    ddf = dd.from_pandas(df, 2)

    assert is_categorical_dtype(ddf["cat"])
    assert not is_categorical_dtype(ddf["x"])
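
The check operates on the dtype, so it behaves identically on pandas and dask objects. A minimal standalone sketch (pandas only, using the import assumed above):

import pandas as pd
from pandas.api.types import is_categorical_dtype

s = pd.Series(["a", "b", "a"], dtype="category")
assert is_categorical_dtype(s)        # accepts a Series...
assert is_categorical_dtype(s.dtype)  # ...or the dtype itself
assert not is_categorical_dtype(pd.Series([1, 2, 3]))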
Example #2
# Assumes the imports from Example #1, plus `import pytest`.
def test_from_bcolz():
    bcolz = pytest.importorskip('bcolz')

    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert is_categorical_dtype(d.dtypes['a'])
    assert list(d.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(d.a.compute(scheduler='sync')) == ['a', 'b', 'a']
    L = list(d.index.compute(scheduler='sync'))
    assert L == [0, 1, 2]

    d = dd.from_bcolz(t, chunksize=2, index='x')
    L = list(d.index.compute(scheduler='sync'))
    assert L == [1, 2, 3] or L == [1, 3, 2]

    # Names
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) ==
            sorted(dd.from_bcolz(t, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
            sorted(dd.from_bcolz(t, chunksize=3).dask))

    dsk = dd.from_bcolz(t, chunksize=3).dask

    t.append((4, 4., 'b'))
    t.flush()

    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
            sorted(dsk))
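
The ``# Names`` assertions hold because dask derives task keys from a deterministic token of the inputs: the same table and chunksize reproduce the same graph keys, while a different chunksize (or mutated data) yields new ones. A tiny illustration with ``dask.base.tokenize`` (not part of the original test):

from dask.base import tokenize

assert tokenize([1, 2, 3], 2) == tokenize([1, 2, 3], 2)  # same inputs, same token
assert tokenize([1, 2, 3], 2) != tokenize([1, 2, 3], 3)  # changing an input changes it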
Example #3
# From dask.dataframe.partitionquantiles; relies on the module-level imports
# (`numpy as np`, `pandas as pd`) and sibling helpers such as
# `sample_percentiles`, `percentiles_to_weights`, and `is_cupy_type`.
def percentiles_summary(df, num_old, num_new, upsample, state):
    """Summarize data using percentiles and derived weights.

    These summaries can be merged, compressed, and converted back into
    approximate percentiles.

    Parameters
    ----------
    df: pandas.Series
        Data to summarize
    num_old: int
        Number of partitions of the current object
    num_new: int
        Number of partitions of the new object
    upsample: float
        Scale factor to increase the number of percentiles calculated in
        each partition.  Use to improve accuracy.
    state: int
        Seed for the random number generator used to sample the percentiles.
    """
    from dask.array.dispatch import percentile_lookup as _percentile

    length = len(df)
    if length == 0:
        return ()
    random_state = np.random.RandomState(state)
    qs = sample_percentiles(num_old, num_new, length, upsample, random_state)
    data = df
    interpolation = "linear"

    if is_categorical_dtype(data):
        data = data.cat.codes
        interpolation = "nearest"
    elif isinstance(data.dtype, pd.DatetimeTZDtype) or np.issubdtype(
        data.dtype, np.integer
    ):
        interpolation = "nearest"
    vals, n = _percentile(data, qs, interpolation=interpolation)
    if (
        is_cupy_type(data)
        and interpolation == "linear"
        and np.issubdtype(data.dtype, np.integer)
    ):
        vals = np.round(vals).astype(data.dtype)
        if qs[0] == 0:
            # Ensure the 0th quantile is the minimum value of the data
            vals[0] = data.min()
    vals_and_weights = percentiles_to_weights(qs, vals, length)
    return vals_and_weights
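
A hedged usage sketch of the function above, assuming it and its helpers live in ``dask.dataframe.partitionquantiles`` as in the dask source this is taken from:

import pandas as pd
from dask.dataframe.partitionquantiles import percentiles_summary

s = pd.Series([3, 1, 4, 1, 5, 9, 2, 6])
# Summarize one of 4 old partitions while targeting 2 new partitions.
vals, weights = percentiles_summary(s, num_old=4, num_new=2, upsample=1.0, state=0)
# `vals` are percentile values sampled from this partition; `weights` record
# how much data each value represents, so per-partition summaries can be merged.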
Example #4
    # Inner helper from a threaded variant of the bcolz test; `bcolz`, `dd`,
    # and `is_categorical_dtype` come from the enclosing test module.
    def check(i):
        t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                         names=['x', 'y', 'a'])
        d = dd.from_bcolz(t, chunksize=2)
        assert d.npartitions == 2
        assert is_categorical_dtype(d.dtypes['a'])
        assert list(d.x.compute(scheduler='sync')) == [1, 2, 3]
        assert list(d.a.compute(scheduler='sync')) == ['a', 'b', 'a']

        d = dd.from_bcolz(t, chunksize=2, index='x')
        L = list(d.index.compute(scheduler='sync'))
        assert L == [1, 2, 3] or L == [1, 3, 2]

        # Names
        assert (sorted(dd.from_bcolz(t, chunksize=2).dask) ==
                sorted(dd.from_bcolz(t, chunksize=2).dask))
        assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
                sorted(dd.from_bcolz(t, chunksize=3).dask))
Example #5
# From dask.dataframe.partitionquantiles: capture a column's dtype plus the
# extra information needed to rebuild a categorical dtype later.
def dtype_info(df):
    info = None
    if is_categorical_dtype(df):
        data = df.values
        info = (data.categories, data.ordered)
    return df.dtype, info
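
For a categorical column the second element captures what is needed to rebuild the dtype later (see ``process_val_weights`` below); for anything else it is ``None``. A small sketch, assuming ``pd`` and ``is_categorical_dtype`` are in scope:

dtype, info = dtype_info(pd.Series(pd.Categorical(["a", "b", "a"])))
# dtype -> CategoricalDtype; info -> (Index(['a', 'b'], dtype='object'), False)

dtype, info = dtype_info(pd.Series([1, 2, 3]))
# dtype -> dtype('int64'); info -> None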
Example #6
# Also from dask.dataframe.partitionquantiles; consumes summaries produced by
# `percentiles_summary` after they have been merged and compressed.
def process_val_weights(vals_and_weights, npartitions, dtype_info):
    """Calculate final approximate percentiles given weighted vals

    ``vals_and_weights`` is assumed to be sorted.  We take a cumulative
    sum of the weights, which makes them percentile-like (their scale is
    [0, N] instead of [0, 100]).  Next we find the divisions to create
    partitions of approximately equal size.

    It is possible for adjacent values of the result to be the same.  Since
    these determine the divisions of the new partitions, some partitions
    may be empty.  This can happen if we under-sample the data, or if there
    aren't enough unique values in the column.  Increasing the ``upsample``
    keyword argument in ``df.set_index`` may help.
    """
    dtype, info = dtype_info

    if not vals_and_weights:
        try:
            return np.array(None, dtype=dtype)
        except Exception:
            # dtype does not support None value so allow it to change
            return np.array(None, dtype=np.float64)

    vals, weights = vals_and_weights
    vals = np.array(vals)
    weights = np.array(weights)

    # We want to create exactly `npartitions` groups of `vals` that
    # are approximately the same weight and non-empty if possible.  We use a
    # simple approach (more accurate algorithms exist):
    # 1. Remove all the values with weights larger than the relative
    #    percentile width from consideration (these are `jumbo`s)
    # 2. Calculate percentiles with "interpolation=left" of percentile-like
    #    weights of the remaining values.  These are guaranteed to be unique.
    # 3. Concatenate the values from (1) and (2), sort, and return.
    #
    # We assume that all values are unique, which is ensured by the previous
    # step, `merge_and_compress_summaries`.

    if len(vals) == npartitions + 1:
        rv = vals
    elif len(vals) < npartitions + 1:
        # The data is under-sampled
        if np.issubdtype(vals.dtype, np.number) and not is_categorical_dtype(dtype):
            # Interpolate extra divisions
            q_weights = np.cumsum(weights)
            q_target = np.linspace(q_weights[0], q_weights[-1], npartitions + 1)
            rv = np.interp(q_target, q_weights, vals)
        else:
            # Distribute the empty partitions
            duplicated_index = np.linspace(
                0, len(vals) - 1, npartitions - len(vals) + 1, dtype=int
            )
            duplicated_vals = vals[duplicated_index]
            rv = np.concatenate([vals, duplicated_vals])
            rv.sort()
    else:
        target_weight = weights.sum() / npartitions
        jumbo_mask = weights >= target_weight
        jumbo_vals = vals[jumbo_mask]

        trimmed_vals = vals[~jumbo_mask]
        trimmed_weights = weights[~jumbo_mask]
        trimmed_npartitions = npartitions - len(jumbo_vals)

        # percentile-like, but scaled by weights
        q_weights = np.cumsum(trimmed_weights)
        q_target = np.linspace(0, q_weights[-1], trimmed_npartitions + 1)

        left = np.searchsorted(q_weights, q_target, side="left")
        right = np.searchsorted(q_weights, q_target, side="right") - 1
        # stay inbounds
        np.maximum(right, 0, out=right)
        lower = np.minimum(left, right)
        trimmed = trimmed_vals[lower]

        rv = np.concatenate([trimmed, jumbo_vals])
        rv.sort()

    if is_categorical_dtype(dtype):
        rv = pd.Categorical.from_codes(rv, info[0], info[1])
    elif is_datetime64tz_dtype(dtype):
        rv = pd.DatetimeIndex(rv).tz_localize(dtype.tz)
    elif "datetime64" in str(dtype):
        rv = pd.DatetimeIndex(rv, dtype=dtype)
    elif rv.dtype != dtype:
        rv = rv.astype(dtype)
    return rv
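
To make the under-sampled numeric branch concrete, a toy walk-through with invented values and weights (plain numpy):

import numpy as np

vals = np.array([10.0, 20.0, 30.0])  # only 3 sampled values...
weights = np.array([1.0, 2.0, 1.0])
npartitions = 4                      # ...but npartitions + 1 = 5 divisions are needed

q_weights = np.cumsum(weights)       # [1., 3., 4.]: percentile-like positions
q_target = np.linspace(q_weights[0], q_weights[-1], npartitions + 1)
divisions = np.interp(q_target, q_weights, vals)
# -> [10., 13.75, 17.5, 22.5, 30.]: the missing divisions are interpolated
#    between sampled values, weighted by how much data each one represents.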