def merge_and_compress_summaries(vals_and_weights):
    """Merge and sort percentile summaries that are already sorted.

    Each item is a tuple like ``(vals, weights)`` where vals and weights
    are lists.  We sort both by vals.

    Equal values will be combined, their weights summed together.

    Parameters
    ----------
    vals_and_weights: iterable of (vals, weights) tuples
        Each summary's ``vals`` must already be sorted.  Falsy items
        (e.g. ``()`` from an empty summary) are skipped.

    Returns
    -------
    (vals, weights) tuple of lists, or ``()`` if no non-empty summaries.
    """
    # Stdlib ``heapq.merge`` performs the same k-way merge of already-sorted
    # iterables that the third-party ``merge_sorted`` (toolz) did, so this
    # block no longer needs the external helper.
    from heapq import merge as _merge_sorted

    vals_and_weights = [x for x in vals_and_weights if x]
    if not vals_and_weights:
        return ()
    it = _merge_sorted(*[zip(x, y) for x, y in vals_and_weights])
    vals = []
    weights = []
    vals_append = vals.append          # hoist attribute lookups out of the loop
    weights_append = weights.append
    # ``prev_*`` track the current run of equal values being combined.
    val, weight = prev_val, prev_weight = next(it)
    for val, weight in it:
        if val == prev_val:
            # Same value seen again: combine by summing weights.
            prev_weight += weight
        else:
            vals_append(prev_val)
            weights_append(prev_weight)
            prev_val, prev_weight = val, weight
    # Flush the final pending group.  For ordinary (non-NaN) values this
    # condition always holds after the loop; it only filters values that
    # compare unequal to themselves.
    if val == prev_val:
        vals_append(prev_val)
        weights_append(prev_weight)
    return vals, weights
def create_merge_tree(func, keys, token):
    """Create a task tree that merges all the keys with a reduction function.

    Parameters
    ----------
    func: callable
        Reduction function that accepts a single list of values to reduce.
    keys: iterable
        Keys to reduce from the source dask graph.
    token: object
        Included in each key of the returned dict.

    The tree is k-ary, where k is chosen per level by ``tree_width`` and is
    larger far from the root.  That keeps the total number of nodes low
    (reducing scheduler overhead) while retaining the benefits of a
    tree-shaped reduction: for N < 1e5 keys the node count is roughly
    ``N**0.78``; for 1e5 < N < 2e5 roughly ``N**0.8``.
    """
    graph = {}
    level = 0
    width = len(keys)
    pending = iter(keys)
    while width > 1:
        # Number of outputs at this level and how many inputs feed each one.
        n_out = tree_width(width)
        fan_in_counts = tree_groups(width, n_out)
        out_keys = [(token, level, i) for i in range(n_out)]
        for count, out_key in zip(fan_in_counts, out_keys):
            graph[out_key] = (func, list(take(count, pending)))
        # This level's outputs become the next level's inputs.
        width = n_out
        pending = iter(out_keys)
        level += 1
    return graph
def partition_quantiles(df, npartitions, upsample=1.0, random_state=None):
    """Approximate quantiles of a Series, used for repartitioning.

    Builds a dask graph that summarizes percentiles per input partition,
    merges the summaries in a tree, and reduces them to ``npartitions + 1``
    quantile values returned as a single-partition Series.
    """
    assert isinstance(df, Series)
    # currently, only Series has quantile method
    # Index.quantile(list-like) must be pd.Series, not pd.Index
    return_type = Series

    quantile_grid = np.linspace(0, 1, npartitions + 1)
    token = tokenize(df, quantile_grid, upsample)
    if random_state is None:
        # The token is a hex digest, so it yields a deterministic seed.
        random_state = int(token, 16) % np.iinfo(np.int32).max
    partition_seeds = random_state_data(df.npartitions, random_state)
    partition_keys = df.__dask_keys__()

    # Layer 0: capture the dtype of the first partition.
    name_dtype = "re-quantiles-0-" + token
    dtype_layer = {(name_dtype, 0): (dtype_info, partition_keys[0])}

    # Layer 1: one percentile summary per input partition.
    name_summary = "re-quantiles-1-" + token
    summary_layer = {}
    for i, (seed, key) in enumerate(zip(partition_seeds, partition_keys)):
        summary_layer[(name_summary, i)] = (
            percentiles_summary,
            key,
            df.npartitions,
            npartitions,
            upsample,
            seed,
        )

    # Layer 2: tree-merge the summaries.
    name_merge = "re-quantiles-2-" + token
    merge_layer = create_merge_tree(
        merge_and_compress_summaries, sorted(summary_layer), name_merge
    )
    if not merge_layer:
        # Compress the data even if we only have one partition
        merge_layer = {
            (name_merge, 0, 0): (
                merge_and_compress_summaries,
                [list(summary_layer)[0]],
            )
        }
    root_key = max(merge_layer)

    # Layer 3: reduce the merged summary to the final quantile Series.
    name_final = "re-quantiles-3-" + token
    final_layer = {
        (name_final, 0): (
            pd.Series,
            (process_val_weights, root_key, npartitions, (name_dtype, 0)),
            quantile_grid,
            None,
            df.name,
        )
    }

    dsk = merge(df.dask, dtype_layer, summary_layer, merge_layer, final_layer)
    # One output partition spanning the whole [0, 1] quantile range.
    new_divisions = [0.0, 1.0]
    return return_type(dsk, name_final, df._meta, new_divisions)
def partition_quantiles(df, npartitions, upsample=1.0, random_state=None):
    """ Approximate quantiles of Series used for repartitioning
    """
    # NOTE(review): this file defines ``partition_quantiles`` twice; this
    # later copy shadows the earlier one and looks like merge residue —
    # consider deleting one of them.  The two defects the earlier copy had
    # already addressed are fixed here as well (see BUGFIX comments below).
    assert isinstance(df, Series)
    # currently, only Series has quantile method
    # Index.quantile(list-like) must be pd.Series, not pd.Index
    return_type = Series

    qs = np.linspace(0, 1, npartitions + 1)
    token = tokenize(df, qs, upsample)
    if random_state is None:
        # BUGFIX: was ``hash(token)``, but ``hash`` of a str is salted per
        # process (PYTHONHASHSEED), so the seed — and therefore the sampled
        # quantiles — differed between runs.  The token is a hex digest, so
        # parse it as an integer for a deterministic seed.
        random_state = int(token, 16) % np.iinfo(np.int32).max
    state_data = random_state_data(df.npartitions, random_state)
    # BUGFIX: was the old private ``df._keys()``; use the public
    # ``__dask_keys__`` collection protocol instead.
    df_keys = df.__dask_keys__()

    # Layer 0: dtype of the first partition, consumed by process_val_weights.
    name0 = 're-quantiles-0-' + token
    dtype_dsk = {(name0, 0): (dtype_info, df_keys[0])}

    # Layer 1: a percentile summary per input partition, each independently
    # seeded.
    name1 = 're-quantiles-1-' + token
    val_dsk = {(name1, i): (percentiles_summary, key, df.npartitions,
                            npartitions, upsample, state)
               for i, (state, key) in enumerate(zip(state_data, df_keys))}

    # Layer 2: tree-merge the per-partition summaries.
    name2 = 're-quantiles-2-' + token
    merge_dsk = create_merge_tree(merge_and_compress_summaries,
                                  sorted(val_dsk), name2)
    if not merge_dsk:
        # Compress the data even if we only have one partition
        merge_dsk = {(name2, 0, 0): (merge_and_compress_summaries,
                                     [list(val_dsk)[0]])}
    merged_key = max(merge_dsk)

    # Layer 3: reduce the merged summary into the final quantile Series.
    name3 = 're-quantiles-3-' + token
    last_dsk = {(name3, 0): (pd.Series,
                             (process_val_weights, merged_key, npartitions,
                              (name0, 0)),
                             qs, None, df.name)}

    dsk = merge(df.dask, dtype_dsk, val_dsk, merge_dsk, last_dsk)
    # Single output partition covering the full [0, 1] quantile range.
    new_divisions = [0.0, 1.0]
    return return_type(dsk, name3, df._meta, new_divisions)