def check_and_return(ddfs, dfs, join):
    """Concatenate ``ddfs`` with ``dd.concat`` and compare with ``concat(dfs)``.

    ``divisions``, ``known`` and ``cat_index`` are not parameters; they are
    read from the surrounding scope (not visible in this block).

    Parameters
    ----------
    ddfs : collections passed to ``dd.concat``
    dfs : objects passed to ``concat`` to build the expected result
    join : forwarded as ``join=`` to both concat calls

    Returns
    -------
    The object returned by ``dd.concat``.
    """
    expected = concat(dfs, join=join)
    res = dd.concat(ddfs, join=join, interleave_partitions=divisions)
    assert_eq(res, expected)

    if known:
        computed = compute_as_if_collection(
            dd.DataFrame, res.dask, res.__dask_keys__()
        )
        # Zero-row slices keep each partition's schema without its data.
        empties = [part.iloc[:0] for part in computed]
        for empty in empties:
            res._meta == empty  # will error if schemas don't align

    if cat_index:
        assert has_known_categories(res.index) == known
    return res
def _get_categories_agg(parts): res = defaultdict(list) res_ind = [] for p in parts: for k, v in p[0].items(): res[k].append(v) res_ind.append(p[1]) res = { k: methods.concat(v, ignore_index=True).drop_duplicates() for k, v in res.items() } if res_ind[0] is None: return res, None return res, res_ind[0].append(res_ind[1:]).drop_duplicates()
def _tail_timedelta(prevs, current, before):
    """Return the concatenated rows of each dataframe in ``prevs`` whose
    index is after the first observation in ``current`` - ``before``.

    Parameters
    ----------
    prevs : list of DataFrame objects
    current : DataFrame
    before : timedelta

    Returns
    -------
    overlapped : DataFrame
    """
    # The cutoff depends only on ``current`` and ``before``, so compute it
    # once instead of once per frame in ``prevs``.
    cutoff = current.index.min() - before
    selected = methods.concat([prev[prev.index > cutoff] for prev in prevs])
    return selected
def test_concat_datetimeindex():
    """Concatenating categorical frames must keep a datetime64[ns] index.

    Regression test for https://github.com/dask/dask/issues/2932
    """
    categories = ['a', 'c']
    early = pd.DataFrame(
        {'x': ['a']},
        index=pd.DatetimeIndex(['2015-03-24 00:00:16'], dtype='datetime64[ns]'),
    )
    late = pd.DataFrame(
        {'x': ['c']},
        index=pd.DatetimeIndex(['2015-03-29 00:00:44'], dtype='datetime64[ns]'),
    )
    # Give both frames the same categorical dtype for column ``x``.
    early['x'] = early.x.astype('category').cat.set_categories(categories)
    late['x'] = late.x.astype('category').cat.set_categories(categories)

    dask_early = dd.from_pandas(early, 1)
    dask_late = dd.from_pandas(late, 1)

    # Zero-row slices must still concatenate to a datetime index.
    empty_result = concat([early.iloc[:0], late.iloc[:0]])
    assert empty_result.index.dtype == '<M8[ns]'

    assert_eq(dd.concat([dask_early, dask_late]), pd.concat([early, late]))
def _combined_parts(prev_part, current_part, next_part, before, after): msg = ("Partition size is less than overlapping " "window size. Try using ``df.repartition`` " "to increase the partition size.") if prev_part is not None and isinstance(before, Integral): if prev_part.shape[0] != before: raise NotImplementedError(msg) if next_part is not None and isinstance(after, Integral): if next_part.shape[0] != after: raise NotImplementedError(msg) parts = [p for p in (prev_part, current_part, next_part) if p is not None] combined = methods.concat(parts) return CombinedOutput(( combined, len(prev_part) if prev_part is not None else None, len(next_part) if next_part is not None else None, ))
def overlap_chunk(
    func, prev_part, current_part, next_part, before, after, args, kwargs
):
    """Apply ``func`` to a partition padded with neighbouring rows, then trim.

    The neighbouring slices are concatenated around ``current_part``,
    ``func`` is called on the combined frame, and the rows that correspond
    to the borrowed slices are removed from the result.

    Parameters
    ----------
    func : callable
        Called as ``func(combined, *args, **kwargs)``.
    prev_part, next_part : frame-like or None
        Rows borrowed from the neighbouring partitions; ``None`` when there
        is no neighbour on that side.
    current_part : frame-like
    before, after : Integral or datetime.timedelta
        Overlap sizes.  Integral values must equal the row count of the
        matching neighbour; timedelta values are replaced by that row count.
    args, kwargs : tuple, dict
        Extra arguments forwarded to ``func``.

    Returns
    -------
    The output of ``func`` without the leading / trailing overlap rows.

    Raises
    ------
    NotImplementedError
        If an integral overlap does not match the neighbour's row count.
    """
    msg = (
        "Partition size is less than overlapping "
        "window size. Try using ``df.repartition`` "
        "to increase the partition size."
    )
    if prev_part is not None and isinstance(before, Integral):
        if prev_part.shape[0] != before:
            raise NotImplementedError(msg)
    if next_part is not None and isinstance(after, Integral):
        if next_part.shape[0] != after:
            raise NotImplementedError(msg)

    parts = [p for p in (prev_part, current_part, next_part) if p is not None]
    combined = methods.concat(parts)
    out = func(combined, *args, **kwargs)

    if prev_part is None:
        before = None
    if isinstance(before, datetime.timedelta):
        before = len(prev_part)

    # If ``func`` returned an integer multiple of the input rows, scale the
    # number of rows to trim by the same factor.
    expansion = None
    if combined.shape[0] != 0:
        expansion = out.shape[0] // combined.shape[0]
    if before and expansion:
        before *= expansion

    if next_part is None:
        return out.iloc[before:]
    if isinstance(after, datetime.timedelta):
        after = len(next_part)
    if after and expansion:
        after *= expansion
    if after == 0:
        # ``out.iloc[before:-0]`` is ``out.iloc[before:0]`` (always empty);
        # with nothing to trim at the end, keep every row after ``before``.
        return out.iloc[before:]
    return out.iloc[before:-after]