def test_df_merge_sorted_ignore_index(keys, na_position, ascending):
    size = 100
    nparts = 3
    keys_1 = keys or ["timestamp"]
    # Null values NOT currently supported with Categorical data
    # or when `ascending=False`
    add_null = keys_1[0] not in ("name",)
    df, dfs = _prepare_merge_sorted_test(
        size,
        nparts,
        keys_1,
        add_null=add_null,
        na_position=na_position,
        ascending=ascending,
    )
    expect = df.sort_values(
        keys_1, na_position=na_position, ascending=ascending
    )
    result = cudf.merge_sorted(
        dfs,
        keys=keys,
        na_position=na_position,
        ascending=ascending,
        ignore_index=True,
    )
    if keys:
        expect = expect[keys]
        result = result[keys]
    assert_eq(expect.reset_index(drop=True), result)
def merge_quantiles(finalq, qs, vals):
    """Combine several quantile calculations of different data.

    [NOTE: Same logic as dask.array merge_percentiles]
    """
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = np.array(finalq)
    qs = list(map(list, qs))
    vals = list(vals)
    vals, Ns = zip(*vals)
    Ns = list(Ns)

    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
    if not L:
        raise ValueError("No non-trivial arrays found")
    qs, vals, Ns = L

    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError("qs, vals, and Ns parameters must be the same length")

    # transform qs and Ns into number of observations between quantiles
    counts = []
    for q, N in zip(qs, Ns):
        count = np.empty(len(q))
        count[1:] = np.diff(q)
        count[0] = q[0]
        count *= N
        counts.append(count)

    def _append_counts(val, count):
        val["_counts"] = count
        return val

    # Sort by calculated quantile values, then number of observations.
    combined_vals_counts = gd.merge_sorted(
        [*map(_append_counts, vals, counts)]
    )
    combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values)
    combined_vals = combined_vals_counts.drop(columns=["_counts"])

    # quantile-like, but scaled by total number of observations
    combined_q = np.cumsum(combined_counts)

    # rescale finalq quantiles to match combined_q
    desired_q = finalq * sum(Ns)

    # TODO: Support other interpolation methods
    # For now - Always use "nearest" for interpolation
    left = np.searchsorted(combined_q, desired_q, side="left")
    right = np.searchsorted(combined_q, desired_q, side="right") - 1
    np.minimum(left, len(combined_vals) - 1, left)  # don't exceed max index
    lower = np.minimum(left, right)
    upper = np.maximum(left, right)
    lower_residual = np.abs(combined_q[lower] - desired_q)
    upper_residual = np.abs(combined_q[upper] - desired_q)
    mask = lower_residual > upper_residual
    index = lower  # alias; we no longer need lower
    index[mask] = upper[mask]
    rv = combined_vals.iloc[index]
    return rv.reset_index(drop=True)
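# --- Hypothetical usage sketch (not part of the source) ---
# Illustrates how merge_quantiles combines per-partition quantile estimates
# into approximate global quantiles. The partition frames, column name "x",
# row counts, and quantile fractions below are made up for illustration;
# the real inputs come from the surrounding per-partition quantile code.
import numpy as np
import cudf as gd

# Per-partition quantile values (each frame must already be sorted).
part1 = gd.DataFrame({"x": [10.0, 20.0, 30.0]})
part2 = gd.DataFrame({"x": [15.0, 25.0, 35.0]})

qs = [[0.25, 0.5, 0.75], [0.25, 0.5, 0.75]]  # fractions used per partition
vals = [(part1, 100), (part2, 200)]          # (quantile values, partition size)

# Approximate global median, weighted by the 100/200 partition sizes.
# Should come back as a one-row cudf DataFrame holding the merged value
# nearest the weighted 0.5 quantile ("nearest" interpolation).
approx_median = merge_quantiles(np.array([0.5]), qs, vals)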
def test_df_merge_sorted_index(nparts, index, ascending):
    size = 100
    df, dfs = _prepare_merge_sorted_test(
        size, nparts, index, ascending=ascending, index=True
    )
    expect = df.sort_index(ascending=ascending)
    result = cudf.merge_sorted(dfs, by_index=True, ascending=ascending)
    assert_eq(expect.index, result.index)
def test_series_merge_sorted(nparts, key, na_position, ascending):
    size = 100
    df, dfs = _prepare_merge_sorted_test(
        size,
        nparts,
        [key],
        na_position=na_position,
        ascending=ascending,
        series=True,
    )
    expect = df.sort_values(na_position=na_position, ascending=ascending)
    result = cudf.merge_sorted(
        dfs, na_position=na_position, ascending=ascending
    )
    assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True))