def test_is_categorical_dtype(): df = pd.DataFrame({"cat": pd.Categorical([1, 2, 3, 4]), "x": [1, 2, 3, 4]}) assert is_categorical_dtype(df["cat"]) assert not is_categorical_dtype(df["x"]) ddf = dd.from_pandas(df, 2) assert is_categorical_dtype(ddf["cat"]) assert not is_categorical_dtype(ddf["x"])
def test_from_bcolz():
    bcolz = pytest.importorskip('bcolz')

    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert is_categorical_dtype(d.dtypes['a'])
    assert list(d.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(d.a.compute(scheduler='sync')) == ['a', 'b', 'a']
    L = list(d.index.compute(scheduler='sync'))
    assert L == [0, 1, 2]

    d = dd.from_bcolz(t, chunksize=2, index='x')
    L = list(d.index.compute(scheduler='sync'))
    assert L == [1, 2, 3] or L == [1, 3, 2]

    # Names
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) ==
            sorted(dd.from_bcolz(t, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
            sorted(dd.from_bcolz(t, chunksize=3).dask))

    dsk = dd.from_bcolz(t, chunksize=3).dask
    t.append((4, 4., 'b'))
    t.flush()
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
            sorted(dsk))
def test_from_bcolz(): bcolz = pytest.importorskip("bcolz") t = bcolz.ctable([[1, 2, 3], [1.0, 2.0, 3.0], ["a", "b", "a"]], names=["x", "y", "a"]) d = dd.from_bcolz(t, chunksize=2) assert d.npartitions == 2 assert is_categorical_dtype(d.dtypes["a"]) assert list(d.x.compute(scheduler="sync")) == [1, 2, 3] assert list(d.a.compute(scheduler="sync")) == ["a", "b", "a"] L = list(d.index.compute(scheduler="sync")) assert L == [0, 1, 2] d = dd.from_bcolz(t, chunksize=2, index="x") L = list(d.index.compute(scheduler="sync")) assert L == [1, 2, 3] or L == [1, 3, 2] # Names assert sorted(dd.from_bcolz(t, chunksize=2).dask) == sorted( dd.from_bcolz(t, chunksize=2).dask) assert sorted(dd.from_bcolz(t, chunksize=2).dask) != sorted( dd.from_bcolz(t, chunksize=3).dask) dsk = dd.from_bcolz(t, chunksize=3).dask t.append((4, 4.0, "b")) t.flush() assert sorted(dd.from_bcolz(t, chunksize=2).dask) != sorted(dsk)
def percentiles_summary(df, num_old, num_new, upsample, state):
    """Summarize data using percentiles and derived weights.

    These summaries can be merged, compressed, and converted back into
    approximate percentiles.

    Parameters
    ----------
    df: pandas.Series
        Data to summarize
    num_old: int
        Number of partitions of the current object
    num_new: int
        Number of partitions of the new object
    upsample: float
        Scale factor to increase the number of percentiles calculated in
        each partition.  Use to improve accuracy.
    state: int or array-like
        Seed for the random number generator used to sample percentiles.
    """
    from dask.array.dispatch import percentile_lookup as _percentile

    length = len(df)
    if length == 0:
        return ()
    random_state = np.random.RandomState(state)
    qs = sample_percentiles(num_old, num_new, length, upsample, random_state)
    data = df
    interpolation = "linear"
    if is_categorical_dtype(data):
        data = data.cat.codes
        interpolation = "nearest"
    elif isinstance(
        data.dtype, pd.core.dtypes.dtypes.DatetimeTZDtype
    ) or np.issubdtype(data.dtype, np.integer):
        interpolation = "nearest"
    vals, n = _percentile(data, qs, interpolation=interpolation)
    if (
        is_cupy_type(data)
        and interpolation == "linear"
        and np.issubdtype(data.dtype, np.integer)
    ):
        vals = np.round(vals).astype(data.dtype)
        if qs[0] == 0:
            # Ensure the 0th quantile is the minimum value of the data
            vals[0] = data.min()
    vals_and_weights = percentiles_to_weights(qs, vals, length)
    return vals_and_weights
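# Hedged usage sketch (not part of the original source): how a single
# partition might be summarized with `percentiles_summary`. The Series
# contents, partition counts, and seed are illustrative assumptions; only
# the signature comes from the function above.
def _demo_percentiles_summary():
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    part = pd.Series(rng.randint(0, 100, size=1000))
    # Summarize this partition as if repartitioning 4 old partitions into 4.
    vals, weights = percentiles_summary(part, num_old=4, num_new=4,
                                        upsample=1.0, state=0)
    # Each sampled percentile value carries a derived weight; summaries from
    # all partitions are later merged and compressed before divisions are
    # computed.
    assert len(vals) == len(weights)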
def check(i):
    t = bcolz.ctable([[1, 2, 3], [1., 2., 3.], ['a', 'b', 'a']],
                     names=['x', 'y', 'a'])
    d = dd.from_bcolz(t, chunksize=2)
    assert d.npartitions == 2
    assert is_categorical_dtype(d.dtypes['a'])
    assert list(d.x.compute(scheduler='sync')) == [1, 2, 3]
    assert list(d.a.compute(scheduler='sync')) == ['a', 'b', 'a']

    d = dd.from_bcolz(t, chunksize=2, index='x')
    L = list(d.index.compute(scheduler='sync'))
    assert L == [1, 2, 3] or L == [1, 3, 2]

    # Names
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) ==
            sorted(dd.from_bcolz(t, chunksize=2).dask))
    assert (sorted(dd.from_bcolz(t, chunksize=2).dask) !=
            sorted(dd.from_bcolz(t, chunksize=3).dask))
def dtype_info(df):
    info = None
    if is_categorical_dtype(df):
        data = df.values
        info = (data.categories, data.ordered)
    return df.dtype, info
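# Hedged sketch (illustrative inputs, not from the original source):
# `dtype_info` returns just the dtype for ordinary columns, and additionally
# captures the categories and ordered flag for categoricals so the dtype can
# be reconstructed later in `process_val_weights` below.
def _demo_dtype_info():
    import pandas as pd

    dtype, info = dtype_info(pd.Series([1, 2, 3]))
    assert info is None  # non-categorical: nothing extra to record

    dtype, info = dtype_info(pd.Series(pd.Categorical(["a", "b", "a"])))
    categories, ordered = info
    assert list(categories) == ["a", "b"] and ordered is False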
def process_val_weights(vals_and_weights, npartitions, dtype_info):
    """Calculate final approximate percentiles given weighted vals

    ``vals_and_weights`` is assumed to be sorted.  We take a cumulative
    sum of the weights, which makes them percentile-like (their scale is
    [0, N] instead of [0, 100]).  Next we find the divisions to create
    partitions of approximately equal size.

    It is possible for adjacent values of the result to be the same.
    Since these determine the divisions of the new partitions, some
    partitions may be empty.  This can happen if we under-sample the
    data, or if there aren't enough unique values in the column.
    Increasing the ``upsample`` keyword argument in ``df.set_index``
    may help.
    """
    dtype, info = dtype_info

    if not vals_and_weights:
        try:
            return np.array(None, dtype=dtype)
        except Exception:
            # dtype does not support None value so allow it to change
            return np.array(None, dtype=np.float_)

    vals, weights = vals_and_weights
    vals = np.array(vals)
    weights = np.array(weights)

    # We want to create exactly `npartitions` groups of `vals` that are
    # approximately the same weight and non-empty if possible.  We use a
    # simple approach (more accurate algorithms exist):
    # 1. Remove all the values with weights larger than the relative
    #    percentile width from consideration (these are `jumbo`s)
    # 2. Calculate percentiles with "interpolation=left" of percentile-like
    #    weights of the remaining values.  These are guaranteed to be unique.
    # 3. Concatenate the values from (1) and (2), sort, and return.
    #
    # We assume that all values are unique, which happens in the previous
    # step `merge_and_compress_summaries`.
    if len(vals) == npartitions + 1:
        rv = vals
    elif len(vals) < npartitions + 1:
        # The data is under-sampled
        if np.issubdtype(vals.dtype, np.number) and not is_categorical_dtype(dtype):
            # Interpolate extra divisions
            q_weights = np.cumsum(weights)
            q_target = np.linspace(q_weights[0], q_weights[-1], npartitions + 1)
            rv = np.interp(q_target, q_weights, vals)
        else:
            # Distribute the empty partitions
            duplicated_index = np.linspace(
                0, len(vals) - 1, npartitions - len(vals) + 1, dtype=int
            )
            duplicated_vals = vals[duplicated_index]
            rv = np.concatenate([vals, duplicated_vals])
            rv.sort()
    else:
        target_weight = weights.sum() / npartitions
        jumbo_mask = weights >= target_weight
        jumbo_vals = vals[jumbo_mask]

        trimmed_vals = vals[~jumbo_mask]
        trimmed_weights = weights[~jumbo_mask]
        trimmed_npartitions = npartitions - len(jumbo_vals)

        # percentile-like, but scaled by weights
        q_weights = np.cumsum(trimmed_weights)
        q_target = np.linspace(0, q_weights[-1], trimmed_npartitions + 1)

        left = np.searchsorted(q_weights, q_target, side="left")
        right = np.searchsorted(q_weights, q_target, side="right") - 1
        # stay inbounds
        np.maximum(right, 0, right)
        lower = np.minimum(left, right)
        trimmed = trimmed_vals[lower]

        rv = np.concatenate([trimmed, jumbo_vals])
        rv.sort()

    if is_categorical_dtype(dtype):
        rv = pd.Categorical.from_codes(rv, info[0], info[1])
    elif is_datetime64tz_dtype(dtype):
        rv = pd.DatetimeIndex(rv).tz_localize(dtype.tz)
    elif "datetime64" in str(dtype):
        rv = pd.DatetimeIndex(rv, dtype=dtype)
    elif rv.dtype != dtype:
        rv = rv.astype(dtype)
    return rv
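# Hedged end-to-end sketch (made-up inputs, not from the original source):
# `process_val_weights` turns merged (value, weight) summaries into
# npartitions + 1 division boundaries.  The arrays below stand in for the
# output of `merge_and_compress_summaries`.
def _demo_process_val_weights():
    import numpy as np

    vals_and_weights = (
        np.arange(0, 100, 10, dtype="int64"),  # 10 sorted, unique values
        np.full(10, 5.0),                      # equal weights for simplicity
    )
    divisions = process_val_weights(
        vals_and_weights, npartitions=4, dtype_info=(np.dtype("int64"), None)
    )
    # 4 partitions need 5 boundaries; with equal weights the picks are
    # spread evenly across the candidate values.
    assert len(divisions) == 5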