def _shuffle_group(df, columns, stage, k, npartitions, ignore_index):
    """Split ``df`` into at most ``k`` groups for one stage of a
    multi-stage hash shuffle.

    Rows are hashed on ``columns`` and mapped onto ``npartitions``
    output partitions; the base-``k`` digit of that partition id
    belonging to the current ``stage`` selects each row's group.
    Returns whatever ``group_split_dispatch`` produces (a mapping of
    group id -> sub-frame).
    """
    # Row-wise hash of the shuffle columns; the index is excluded so
    # identical rows hash identically regardless of position.
    hashed = hash_object_dispatch(df[columns], index=False)
    # Smallest integer dtype wide enough for the partition ids.
    dtype = np.min_scalar_type(npartitions * 2)
    part_ids = np.mod(hashed, npartitions).astype(dtype, copy=False)
    if stage > 0:
        # Drop the digits already consumed by earlier shuffle stages.
        np.floor_divide(part_ids, k**stage, out=part_ids)
    if k < int(npartitions / (k**stage)):
        # More stages follow: keep only the current base-k digit.
        np.mod(part_ids, k, out=part_ids)
    return group_split_dispatch(
        df, part_ids.astype(np.int32), k, ignore_index=ignore_index
    )
def _hash_series(s):
    """Return a row-wise hash of the Series ``s``."""
    if isinstance(s, pd.Series):
        # Pandas hashing does not produce the same result as
        # cudf.Series.hash_values(), so hash-based data transformations
        # will differ between CPU and GPU. TODO: Fix this (maybe use
        # murmurhash3 manually on CPU).
        return hash_object_dispatch(s).values
    # Non-pandas (e.g. cudf) path: for list dtypes, hash the flattened
    # leaf elements; otherwise hash the values directly.
    if _is_list_dtype(s):
        return s.list.leaves.hash_values()
    return s.hash_values()
def _shuffle_group_2(df, cols, ignore_index, nparts):
    """Hash-split ``df`` into up to ``nparts`` groups.

    Returns a 2-tuple ``(groups, leftover)`` where ``groups`` maps
    group id -> sub-frame and ``leftover`` is an empty frame carrying
    ``df``'s schema.  An empty ``df`` yields ``({}, df)``.
    """
    if not len(df):
        # Nothing to split.
        return {}, df
    # Hash either the selected columns or, when none given, the whole frame.
    target = df[cols] if cols else df
    part_ids = (hash_object_dispatch(target, index=False) % int(nparts)).astype(
        np.int32
    )
    # Only materialize as many groups as actually occur.
    n_groups = part_ids.max() + 1
    groups = group_split_dispatch(
        df, part_ids.values, n_groups, ignore_index=ignore_index
    )
    return groups, df.iloc[:0]
def _shuffle_group(df, columns, stage, k, npartitions, ignore_index, nfinal):
    """Split ``df`` into at most ``k`` groups for one stage of a
    multi-stage hash shuffle.

    Like the staged splitter, but when ``nfinal`` differs from
    ``npartitions`` the hash is first reduced modulo ``nfinal`` so the
    staged shuffle starts from the final output mapping.
    """
    hashed = hash_object_dispatch(df[columns], index=False)
    if nfinal and nfinal != npartitions:
        # Start from the final partition mapping here.
        hashed = hashed % int(nfinal)
    part_ids = hashed.values
    # Smallest integer dtype wide enough for the partition ids.
    dtype = np.min_scalar_type(npartitions * 2)
    part_ids = np.mod(part_ids, npartitions).astype(dtype, copy=False)
    if stage > 0:
        # Drop the digits consumed by earlier shuffle stages.
        np.floor_divide(part_ids, k**stage, out=part_ids)
    if k < int(npartitions / (k**stage)):
        # More stages follow: keep only the current base-k digit.
        np.mod(part_ids, k, out=part_ids)
    return group_split_dispatch(
        df, part_ids.astype(np.int32), k, ignore_index=ignore_index
    )
def set_partitions_hash(df, columns, npartitions):
    """Return the target partition id (in ``[0, npartitions)``) for
    each row of ``df``, hashed on ``columns`` with the index excluded.
    """
    hashed = hash_object_dispatch(df[columns], index=False)
    return np.mod(hashed, npartitions)