def _aca_agg(self, token, func, aggfunc=None):
    """Run a groupby aggregation via apply-concat-apply.

    ``func`` is applied to each partition's groupby; ``aggfunc``
    (defaulting to ``func``) combines the concatenated chunk results.
    """
    aggfunc = func if aggfunc is None else aggfunc
    full_token = self._token_prefix + token

    if isinstance(self.index, Series):
        # Grouping key is itself a dask Series: ship it alongside the
        # frame so each partition groups on its own slice of the index.
        def chunk(df, index, func=func, key=self.key):
            grouped = df.groupby(index)
            if isinstance(df, pd.Series):
                return func(grouped)
            return func(grouped[key])

        aggregate = lambda df: aggfunc(df.groupby(level=0))
        return aca([self.df, self.index], chunk=chunk,
                   aggregate=aggregate, columns=self.key,
                   token=full_token)

    # Grouping key is a column name (or list of names): group within
    # each partition, then re-group the concatenated result by level.
    def chunk(df, index=self.index, func=func, key=self.key):
        return func(df.groupby(index)[key])

    if isinstance(self.index, list):
        # Multiple grouping columns produce a MultiIndex in the chunks;
        # re-group on every level when aggregating.
        levels = list(range(len(self.index)))
    else:
        levels = 0

    aggregate = lambda df: aggfunc(df.groupby(level=levels))
    return aca(self.df, chunk=chunk, aggregate=aggregate,
               columns=self.key, token=full_token)
def test_deterministic_apply_concat_apply_names():
    """aca-derived graphs must have stable, argument-sensitive keys."""
    frame = pd.DataFrame({"x": [1, 2, 3, 4], "y": [5, 6, 7, 8]})
    ddf = dd.from_pandas(frame, npartitions=2)

    def keys(obj):
        return sorted(obj.dask)

    # Identical operations produce identical graph keys...
    assert keys(ddf.x.nlargest(2)) == keys(ddf.x.nlargest(2))
    # ...while differing arguments produce differing keys.
    assert keys(ddf.x.nlargest(2)) != keys(ddf.x.nlargest(3))
    assert keys(ddf.x.drop_duplicates()) == keys(ddf.x.drop_duplicates())
    assert keys(ddf.groupby("x").y.mean()) == keys(ddf.groupby("x").y.mean())

    # aca stays deterministic even without an explicit token string.
    top5 = lambda s: s.nlargest(5)
    top3 = lambda s: s.nlargest(3)
    assert keys(aca(ddf.x, top5, top5, ddf.x.name)) != \
        keys(aca(ddf.x, top3, top3, ddf.x.name))
    assert keys(aca(ddf.x, top5, top5, ddf.x.name)) == \
        keys(aca(ddf.x, top5, top5, ddf.x.name))
def nunique(self):
    """Count the number of unique values per group.

    Chunk step: within each partition, drop duplicate (group, value)
    pairs, keeping a possibly-duplicated group-label index so the
    aggregate step can re-group.  Aggregate step: count the survivors
    per group.

    The inner functions deliberately avoid closing over ``self``:
    capturing the groupby object would make dask's tokenization of the
    chunk/aggregate callables depend on object identity, breaking
    graph-key determinism and embedding ``self`` in the graph.  State
    is bound through default arguments / precomputed locals instead.
    """
    key = self.key
    is_series = isinstance(self.df, Series)

    def chunk(df, index, key=key):
        # we force a possibly duplicate index here so the reduce step
        # can re-group on it
        if isinstance(df, pd.DataFrame):
            grouped = df.groupby(index).apply(
                pd.DataFrame.drop_duplicates, subset=key)
            grouped.index = grouped.index.get_level_values(level=0)
        else:
            if isinstance(index, np.ndarray):
                assert len(index) == len(df)
                index = pd.Series(index, index=df.index)
            grouped = pd.concat([df, index], axis=1).drop_duplicates()
        return grouped

    def agg(df, key=key, is_series=is_series):
        if is_series:
            # columns are [values, group labels] after the concat above
            return df.groupby(df.columns[1])[df.columns[0]].nunique()
        return df.groupby(level=0)[key].nunique()

    return aca([self.df, self.index], chunk=chunk, aggregate=agg,
               columns=key, token='series-groupby-nunique')
def nunique(self):
    """Number of unique values in each group."""
    key = self.key
    is_series = isinstance(self.df, Series)

    def chunk(df, index, key):
        # Force a (possibly duplicated) group-label index so that the
        # aggregate step can re-group on it.
        if not isinstance(df, pd.DataFrame):
            if isinstance(index, np.ndarray):
                assert len(index) == len(df)
                index = pd.Series(index, index=df.index)
            return pd.concat([df, index], axis=1).drop_duplicates()
        deduped = df.groupby(index).apply(pd.DataFrame.drop_duplicates,
                                          subset=key)
        deduped.index = deduped.index.get_level_values(level=0)
        return deduped

    def agg(df):
        if is_series:
            # After the concat above, columns are [values, labels].
            return df.groupby(df.columns[1])[df.columns[0]].nunique()
        return df.groupby(level=0)[key].nunique()

    return aca([self.df, self.index, self.key], chunk=chunk,
               aggregate=agg, columns=self.key,
               token='series-groupby-nunique')
def test_deterministic_apply_concat_apply_names():
    """Graph keys from aca-based operations are deterministic."""
    pdf = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    ddf = dd.from_pandas(pdf, npartitions=2)

    # Repeating the same operation yields identical key sets; changing
    # an argument yields a different key set.
    assert sorted(ddf.x.nlargest(2).dask) == sorted(ddf.x.nlargest(2).dask)
    assert sorted(ddf.x.nlargest(2).dask) != sorted(ddf.x.nlargest(3).dask)
    assert (sorted(ddf.x.drop_duplicates().dask)
            == sorted(ddf.x.drop_duplicates().dask))
    assert (sorted(ddf.groupby('x').y.mean().dask)
            == sorted(ddf.groupby('x').y.mean().dask))

    # aca with no explicit token string is still deterministic.
    f = lambda s: s.nlargest(5)
    f2 = lambda s: s.nlargest(3)
    assert (sorted(aca(ddf.x, f, f, ddf.x.name).dask)
            != sorted(aca(ddf.x, f2, f2, ddf.x.name).dask))
    assert (sorted(aca(ddf.x, f, f, ddf.x.name).dask)
            == sorted(aca(ddf.x, f, f, ddf.x.name).dask))
def nunique(self):
    """Number of unique values in each group."""
    name = self._pd.obj.name
    token = 'series-groupby-nunique'

    if isinstance(self.obj, DataFrame):
        # DataFrame groupby: the chunk helper keeps a group-level index,
        # so the aggregate re-groups on level 0 and counts ``name``.
        chunk = _nunique_df_chunk

        def aggregate(df):
            return df.groupby(level=0)[name].nunique()
    else:
        # Series groupby: the chunk helper produces a two-column frame
        # of [values, group labels].
        chunk = _nunique_series_chunk

        def aggregate(df):
            return df.groupby(df.columns[1])[df.columns[0]].nunique()

    return aca([self.obj, self.index], chunk=chunk,
               aggregate=aggregate, columns=name, token=token)
def var(self, ddof=1):
    """Grouped variance.

    Parameters
    ----------
    ddof : int, default 1
        Delta degrees of freedom; the divisor used is ``N - ddof``.

    Returns
    -------
    Aggregated variance per group; a Series is unwrapped from the
    intermediate single-column frame, and ``self._slice`` (if set)
    selects the output columns.
    """
    from functools import partial

    # Build the metadata prototype with the *same* ddof as the real
    # computation.  Previously this was hardcoded to ddof=1, so the
    # meta disagreed with the requested computation for ddof != 1.
    meta = self.obj._pd
    if isinstance(meta, pd.Series):
        meta = meta.to_frame()
    meta = meta.groupby(self.index).var(ddof=ddof)

    result = aca([self.obj, self.index], _var_chunk,
                 partial(_var_agg, ddof=ddof), meta,
                 token=self._token_prefix + 'var')

    if isinstance(self.obj, Series):
        # Series input came back as a one-column frame; unwrap it.
        result = result[result.columns[0]]
    if self._slice:
        result = result[self._slice]
    return result
def _aca_agg(self, token, func, aggfunc=None):
    """Run a generic groupby aggregation via apply-concat-apply."""
    if aggfunc is None:
        aggfunc = func

    # Apply func to the empty metadata object to discover the output
    # schema (a Series name or a frame's columns).
    dummy = func(self._pd)
    if isinstance(dummy, pd.Series):
        columns = dummy.name
    else:
        columns = dummy.columns

    # Grouping by several columns yields a MultiIndex in the chunk
    # results; re-group on all of its levels when aggregating.
    if isinstance(self.index, list):
        levels = list(range(len(self.index)))
    else:
        levels = 0

    def aggregate(df):
        return aggfunc(df.groupby(level=levels))

    return aca([self.obj, self.index, func, columns],
               chunk=_apply_chunk, aggregate=aggregate,
               columns=dummy, token=self._token_prefix + token)