示例#1
0
def _get_aggregate_funcs(
    df: DataFrame, aggregates: Dict[str, Dict[str, Any]],
) -> Dict[str, NamedAgg]:
    """
    Build pandas named aggregations from a mapping of aggregate configs.

    Each config may carry a ``column`` key (defaults to the metric name), must
    carry an ``operator`` key naming a numpy function, and may carry an
    ``options`` mapping whose entries are bound to that function as keyword
    arguments.

    :param df: DataFrame whose columns the aggregates refer to.
    :param aggregates: Mapping from metric name to aggregate config.
    :return: Mapping from metric name to the ``NamedAgg`` describing it.
    :raises QueryObjectValidationError: If the referenced column is not in
            ``df``, if no operator is given, or if the operator is not an
            allowlisted attribute of numpy.
    """
    named_aggs: Dict[str, NamedAgg] = {}
    for metric, config in aggregates.items():
        # The metric name doubles as the source column unless overridden.
        source_column = config.get("column", metric)
        if source_column not in df:
            message = _(
                "Column referenced by aggregate is undefined: %(column)s",
                column=source_column,
            )
            raise QueryObjectValidationError(message)

        if "operator" not in config:
            message = _("Operator undefined for aggregator: %(name)s", name=metric)
            raise QueryObjectValidationError(message)

        numpy_name = config["operator"]
        # Accept only names that are both allowlisted and present on numpy.
        is_usable = numpy_name in ALLOWLIST_NUMPY_FUNCTIONS and hasattr(
            np, numpy_name
        )
        if not is_usable:
            message = _("Invalid numpy function: %(operator)s", operator=numpy_name)
            raise QueryObjectValidationError(message)

        # Bind the configured options so pandas can call it with one argument.
        bound_func = partial(getattr(np, numpy_name), **config.get("options", {}))
        named_aggs[metric] = NamedAgg(column=source_column, aggfunc=bound_func)

    return named_aggs
示例#2
0
    with option_context("compute.use_numba", True):
        result = grouped.agg(func_1, engine=None)
    tm.assert_frame_equal(expected, result)


@td.skip_if_no("numba", "0.46.0")
@pytest.mark.parametrize(
    "agg_func",
    [
        ["min", "max"],
        "min",
        {
            "B": ["min", "max"],
            "C": "sum"
        },
        NamedAgg(column="B", aggfunc="min"),
    ],
)
def test_multifunc_notimplimented(agg_func):
    data = DataFrame(
        {
            0: ["a", "a", "b", "b", "a"],
            1: [1.0, 2.0, 3.0, 4.0, 5.0]
        },
        columns=[0, 1],
    )
    grouped = data.groupby(0)
    with pytest.raises(NotImplementedError, match="Numba engine can"):
        grouped.agg(agg_func, engine="numba")

    with pytest.raises(NotImplementedError, match="Numba engine can"):
示例#3
0
def test_agg_misc():
    """
    Check ``agg`` with callables, named aggregation, per-column function lists
    and invalid specifications.

    Every section loops over ``cases``: a plain ``resample("2D")``, a resample
    using ``on="date"``, a resample using ``level="date"`` and a
    ``groupby(pd.Grouper(freq="2D"))``, all built from the same random data.
    """
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    # Same data with "date" as a regular column (for ``on=``) ...
    df_col = df.reset_index()
    # ... and with "date" as a level of a MultiIndex (for ``level=``).
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])

    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    # passed lambda
    for t in cases:
        # dict form: column -> callable
        result = t.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)})
        # expected pairs the sum of A from the plain resampler ``r`` with the
        # custom std of B computed on the current case ``t``
        rcustom = t["B"].apply(lambda x: np.std(x, ddof=1))
        expected = pd.concat([r["A"].sum(), rcustom], axis=1)
        tm.assert_frame_equal(result, expected, check_like=True)

        # keyword form with (column, func) tuples
        result = t.agg(A=("A", np.sum), B=("B", lambda x: np.std(x, ddof=1)))
        tm.assert_frame_equal(result, expected, check_like=True)

        # keyword form with NamedAgg objects
        result = t.agg(A=NamedAgg("A", np.sum),
                       B=NamedAgg("B", lambda x: np.std(x, ddof=1)))
        tm.assert_frame_equal(result, expected, check_like=True)

    # agg with renamers
    # NOTE(review): ``t`` here is the value left bound by the loop above (the
    # last element of ``cases``), and this ``expected`` is reassigned under
    # "agg with different hows" before any assertion reads it; the loop below
    # only checks that a KeyError is raised.
    expected = pd.concat(
        [t["A"].sum(), t["B"].sum(), t["A"].mean(), t["B"].mean()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("result1", "A"),
                                                  ("result1", "B"),
                                                  ("result2", "A"),
                                                  ("result2", "B")])

    # "result1"/"result2" are not columns of the selection, so each of the
    # three spellings (dict, tuples, NamedAgg) must raise KeyError.
    msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})

        with pytest.raises(KeyError, match=msg):
            t[["A", "B"]].agg(A=("result1", np.sum), B=("result2", np.mean))

        with pytest.raises(KeyError, match=msg):
            t[["A", "B"]].agg(A=NamedAgg("result1", np.sum),
                              B=NamedAgg("result2", np.mean))

    # agg with different hows
    # NOTE(review): as above, ``t`` is the last element of ``cases`` left over
    # from the preceding loop; this ``expected`` is shared by the next two loops.
    expected = pd.concat(
        [t["A"].sum(), t["A"].std(), t["B"].mean(), t["B"].std()], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "sum"), ("A", "std"),
                                                  ("B", "mean"), ("B", "std")])
    for t in cases:
        result = t.agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for t in cases:
        result = t[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    msg = "nested renamer is not supported"

    # series like aggs
    # a dict of lists passed to a single selected column must raise
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"]})

        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]})

    # errors
    # invalid names in the agg specification
    # only "A" is selected, so referring to "B" must raise KeyError
    msg = r"Column\(s\) \['B'\] do not exist"
    for t in cases:
        with pytest.raises(KeyError, match=msg):
            t[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
示例#4
0
def test_agg():
    """
    Check ``aggregate`` with function lists, dicts, keyword/NamedAgg forms and
    nested-renamer specifications.

    Every section loops over ``cases``: a plain ``resample("2D")``, a resample
    using ``on="date"``, a resample using ``level="date"`` and a
    ``groupby(pd.Grouper(freq="2D"))``, all built from the same random data.
    """
    # test with all three Resampler apis and TimeGrouper

    np.random.seed(1234)
    index = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    index.name = "date"
    df = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=index)
    # Same data with "date" as a regular column (for ``on=``) ...
    df_col = df.reset_index()
    # ... and with "date" as a level of a MultiIndex (for ``level=``).
    df_mult = df_col.copy()
    df_mult.index = pd.MultiIndex.from_arrays([range(10), df.index],
                                              names=["index", "date"])
    r = df.resample("2D")
    cases = [
        r,
        df_col.resample("2D", on="date"),
        df_mult.resample("2D", level="date"),
        df.groupby(pd.Grouper(freq="2D")),
    ]

    # Reference per-column results, all taken from the plain resampler ``r``;
    # the expected frames below are assembled from these.
    a_mean = r["A"].mean()
    a_std = r["A"].std()
    a_sum = r["A"].sum()
    b_mean = r["B"].mean()
    b_std = r["B"].std()
    b_sum = r["B"].sum()

    # list of functions applied to every column
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean",
                                                                "std"]])
    for t in cases:
        # A FutureWarning is expected only for cases[1:3], i.e. the
        # ``on="date"`` and ``level="date"`` resamplers; ``warn`` is None for
        # the other two cases.
        warn = FutureWarning if t in cases[1:3] else None
        with tm.assert_produces_warning(
                warn,
                match=r"\['date'\] did not aggregate successfully",
        ):
            # .var on dt64 column raises and is dropped
            result = t.aggregate([np.mean, np.std])
        tm.assert_frame_equal(result, expected)

    # one function per column: dict, (column, func) tuples and NamedAgg forms
    expected = pd.concat([a_mean, b_std], axis=1)
    for t in cases:
        result = t.aggregate({"A": np.mean, "B": np.std})
        tm.assert_frame_equal(result, expected, check_like=True)

        result = t.aggregate(A=("A", np.mean), B=("B", np.std))
        tm.assert_frame_equal(result, expected, check_like=True)

        result = t.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std))
        tm.assert_frame_equal(result, expected, check_like=True)

    # list of function names for a single column given via a dict
    expected = pd.concat([a_mean, a_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"]})
        tm.assert_frame_equal(result, expected)

    # selecting column "A" first: list form and keyword form give flat columns
    expected = pd.concat([a_mean, a_sum], axis=1)
    expected.columns = ["mean", "sum"]
    for t in cases:
        result = t["A"].aggregate(["mean", "sum"])
        tm.assert_frame_equal(result, expected)

        result = t["A"].aggregate(mean="mean", sum="sum")
        tm.assert_frame_equal(result, expected)

    # dict-of-dict ("nested renamer") specifications must raise
    msg = "nested renamer is not supported"
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({"A": {"mean": "mean", "sum": "sum"}})

    # NOTE(review): this ``expected`` is not read by the loop below (which
    # only checks for SpecificationError) and is reassigned right after it.
    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "sum"),
                                                  ("B", "mean2"),
                                                  ("B", "sum2")])
    for t in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            t.aggregate({
                "A": {
                    "mean": "mean",
                    "sum": "sum"
                },
                "B": {
                    "mean2": "mean",
                    "sum2": "sum"
                },
            })

    # lists of function names for several columns given via a dict
    expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std"),
                                                  ("B", "mean"), ("B", "std")])
    for t in cases:
        result = t.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # NOTE(review): no assertion in the visible function reads this final
    # ``expected``; confirm whether a check was dropped or this is leftover.
    expected = pd.concat([a_mean, a_sum, b_mean, b_sum], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([
        ("r1", "A", "mean"),
        ("r1", "A", "sum"),
        ("r2", "B", "mean"),
        ("r2", "B", "sum"),
    ])
示例#5
0
 def resolve(self, gdf):
     """Return a ``NamedAgg`` applying ``'min'`` to ``self.column``.

     ``gdf`` is accepted but not used by this implementation.
     """
     source_column = self.column
     spec = NamedAgg(column=source_column, aggfunc='min')
     return spec
示例#6
0
 def resolve(self, gdf):
     """Return a ``NamedAgg`` applying ``Series.nunique`` to ``self.column``.

     ``gdf`` is accepted but not used by this implementation.
     """
     source_column = self.column
     spec = NamedAgg(column=source_column, aggfunc=Series.nunique)
     return spec