Example #1
def test_df_merge_sorted_ignore_index(keys, na_position, ascending):
    size = 100
    nparts = 3
    keys_1 = keys or ["timestamp"]
    # Null values NOT currently supported with Categorical data
    # or when `ascending=False`
    add_null = keys_1[0] not in ("name",)
    df, dfs = _prepare_merge_sorted_test(
        size,
        nparts,
        keys_1,
        add_null=add_null,
        na_position=na_position,
        ascending=ascending,
    )

    expect = df.sort_values(
        keys_1, na_position=na_position, ascending=ascending
    )
    result = cudf.merge_sorted(
        dfs,
        keys=keys,
        na_position=na_position,
        ascending=ascending,
        ignore_index=True,
    )
    if keys:
        expect = expect[keys]
        result = result[keys]

    assert_eq(expect.reset_index(drop=True), result)
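
A minimal sketch of the call pattern being tested, assuming a cudf release that still ships merge_sorted and hypothetical toy frames that are each pre-sorted on "timestamp":

import cudf

a = cudf.DataFrame({"timestamp": [1, 4, 7], "v": [10, 40, 70]})
b = cudf.DataFrame({"timestamp": [2, 3, 9], "v": [20, 30, 90]})

# Merge the pre-sorted frames into one sorted frame with a fresh index.
merged = cudf.merge_sorted([a, b], keys=["timestamp"], ignore_index=True)
# merged["timestamp"] -> 1, 2, 3, 4, 7, 9
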
Example #2
from collections.abc import Iterator

import cupy
import numpy as np

import cudf as gd


def merge_quantiles(finalq, qs, vals):
    """Combine several quantile calculations of different data.

    [NOTE: Same logic as dask.array merge_percentiles]
    """
    if isinstance(finalq, Iterator):
        finalq = list(finalq)
    finalq = np.array(finalq)
    qs = list(map(list, qs))
    vals = list(vals)
    # Each item in `vals` is a (values, N) pair, where N is the number
    # of observations behind that partial quantile calculation.
    vals, Ns = zip(*vals)
    Ns = list(Ns)

    # Drop entries that were computed from empty data (N == 0).
    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
    if not L:
        raise ValueError("No non-trivial arrays found")
    qs, vals, Ns = L

    if len(vals) != len(qs) or len(Ns) != len(qs):
        raise ValueError("qs, vals, and Ns parameters must be the same length")

    # transform qs and Ns into number of observations between quantiles
    counts = []
    for q, N in zip(qs, Ns):
        count = np.empty(len(q))
        count[1:] = np.diff(q)
        count[0] = q[0]
        count *= N
        counts.append(count)

    def _append_counts(val, count):
        val["_counts"] = count
        return val

    # Sort by calculated quantile values, then number of observations.
    combined_vals_counts = gd.merge_sorted(
        [*map(_append_counts, vals, counts)]
    )
    combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values)
    combined_vals = combined_vals_counts.drop(columns=["_counts"])

    # quantile-like, but scaled by total number of observations
    combined_q = np.cumsum(combined_counts)

    # rescale finalq quantiles to match combined_q
    desired_q = finalq * sum(Ns)

    # TODO: Support other interpolation methods
    # For now - Always use "nearest" for interpolation
    left = np.searchsorted(combined_q, desired_q, side="left")
    right = np.searchsorted(combined_q, desired_q, side="right") - 1
    np.minimum(left, len(combined_vals) - 1, out=left)  # don't exceed max index
    lower = np.minimum(left, right)
    upper = np.maximum(left, right)
    lower_residual = np.abs(combined_q[lower] - desired_q)
    upper_residual = np.abs(combined_q[upper] - desired_q)
    mask = lower_residual > upper_residual
    index = lower  # alias; we no longer need lower
    index[mask] = upper[mask]
    rv = combined_vals.iloc[index]
    return rv.reset_index(drop=True)
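
For context, a small worked call, assuming the merge_quantiles definition above and hypothetical inputs: two partitions of 10 and 20 rows whose 0 / 0.5 / 1.0 quantiles were computed separately.

qs = [[0.0, 0.5, 1.0], [0.0, 0.5, 1.0]]
vals = [
    (gd.DataFrame({"x": [1.0, 5.0, 9.0]}), 10),  # partition of 10 rows
    (gd.DataFrame({"x": [2.0, 6.0, 8.0]}), 20),  # partition of 20 rows
]
# The combined median targets desired_q = 0.5 * 30 = 15 observations,
# which lands on x == 6.0 for these toy numbers.
print(merge_quantiles([0.5], qs, vals))
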
Example #3
def test_df_merge_sorted_index(nparts, index, ascending):
    size = 100
    df, dfs = _prepare_merge_sorted_test(
        size, nparts, index, ascending=ascending, index=True
    )

    expect = df.sort_index(ascending=ascending)
    result = cudf.merge_sorted(dfs, by_index=True, ascending=ascending)

    assert_eq(expect.index, result.index)
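
A minimal sketch of the by_index=True path, assuming the same cudf.merge_sorted API and hypothetical frames that are already sorted on their index:

import cudf

a = cudf.DataFrame({"v": [10, 30]}, index=[1, 3])
b = cudf.DataFrame({"v": [20, 40]}, index=[2, 4])

merged = cudf.merge_sorted([a, b], by_index=True)
# merged.index -> 1, 2, 3, 4
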
Example #4
def test_series_merge_sorted(nparts, key, na_position, ascending):
    size = 100
    df, dfs = _prepare_merge_sorted_test(
        size,
        nparts,
        [key],
        na_position=na_position,
        ascending=ascending,
        series=True,
    )

    expect = df.sort_values(na_position=na_position, ascending=ascending)
    result = cudf.merge_sorted(
        dfs, na_position=na_position, ascending=ascending
    )

    assert_eq(expect.reset_index(drop=True), result.reset_index(drop=True))
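
merge_sorted also accepts pre-sorted Series directly; a minimal sketch with hypothetical values:

import cudf

s1 = cudf.Series([1, 4, 7])
s2 = cudf.Series([2, 3, 9])

merged = cudf.merge_sorted([s1, s2])
# merged values -> 1, 2, 3, 4, 7, 9; the original indices are kept,
# hence the reset_index(drop=True) calls in the assertion above.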