def _get_aggregate_funcs(
    df: DataFrame,
    aggregates: Dict[str, Dict[str, Any]],
) -> Dict[str, NamedAgg]:
    """
    Converts a set of aggregate config objects into named aggregations that
    pandas can use as aggregators. Currently only numpy aggregators are supported.

    Each aggregate config may contain:

    - ``column``: column of ``df`` to aggregate; defaults to the config's key.
    - ``operator``: name of the numpy function to apply (required).
    - ``options``: keyword arguments bound to the numpy function; a missing or
      ``None`` value means no extra arguments.

    :param df: DataFrame on which to perform aggregate operation.
    :param aggregates: Mapping from output (metric) name to aggregate config.
    :return: Mapping from metric name to a ``NamedAgg`` pairing the source column
             with the configured numpy function.
    :raises QueryObjectValidationError: If the referenced column is not in ``df``,
            the operator is missing, or the operator is not an allowed numpy
            function.
    """
    agg_funcs: Dict[str, NamedAgg] = {}
    for name, agg_obj in aggregates.items():
        # The output name doubles as the source column unless one is given.
        column = agg_obj.get("column", name)
        if column not in df:
            raise QueryObjectValidationError(
                _(
                    "Column referenced by aggregate is undefined: %(column)s",
                    column=column,
                )
            )
        if "operator" not in agg_obj:
            raise QueryObjectValidationError(
                _("Operator undefined for aggregator: %(name)s", name=name,)
            )
        operator = agg_obj["operator"]
        # Only allow-listed names that numpy actually exposes may be resolved.
        if operator not in ALLOWLIST_NUMPY_FUNCTIONS or not hasattr(np, operator):
            raise QueryObjectValidationError(
                _("Invalid numpy function: %(operator)s", operator=operator,)
            )
        func = getattr(np, operator)
        # An explicit ``"options": None`` must behave like an absent key;
        # unpacking ``None`` with ``**`` would otherwise raise a TypeError.
        options = agg_obj.get("options")
        if options is None:
            options = {}
        agg_funcs[name] = NamedAgg(column=column, aggfunc=partial(func, **options))
    return agg_funcs
with option_context("compute.use_numba", True): result = grouped.agg(func_1, engine=None) tm.assert_frame_equal(expected, result) @td.skip_if_no("numba", "0.46.0") @pytest.mark.parametrize( "agg_func", [ ["min", "max"], "min", { "B": ["min", "max"], "C": "sum" }, NamedAgg(column="B", aggfunc="min"), ], ) def test_multifunc_notimplimented(agg_func): data = DataFrame( { 0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0] }, columns=[0, 1], ) grouped = data.groupby(0) with pytest.raises(NotImplementedError, match="Numba engine can"): grouped.agg(agg_func, engine="numba") with pytest.raises(NotImplementedError, match="Numba engine can"):
def test_agg_misc():
    """Check assorted ``agg`` specifications across equivalent groupings.

    The same two-day binning is built through all three Resampler apis
    (plain ``resample``, ``on=`` and ``level=``) and through a ``groupby``
    on a ``Grouper``; every specification is run against each of them.
    """
    np.random.seed(1234)
    dates = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    dates.name = "date"
    frame = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=dates)
    frame_col = frame.reset_index()
    frame_mult = frame_col.copy()
    frame_mult.index = pd.MultiIndex.from_arrays(
        [range(10), frame.index], names=["index", "date"]
    )

    resampled = frame.resample("2D")
    cases = [
        resampled,
        frame_col.resample("2D", on="date"),
        frame_mult.resample("2D", level="date"),
        frame.groupby(pd.Grouper(freq="2D")),
    ]
    # Several expectations below are derived from the final entry in ``cases``.
    last_case = cases[-1]

    def sample_std(values):
        # User-defined aggregator: standard deviation with ddof=1.
        return np.std(values, ddof=1)

    # A custom callable, passed as a dict, as tuples and as NamedAgg objects.
    for case in cases:
        result = case.agg({"A": np.sum, "B": sample_std})
        rcustom = case["B"].apply(sample_std)
        expected = pd.concat([resampled["A"].sum(), rcustom], axis=1)
        tm.assert_frame_equal(result, expected, check_like=True)

        result = case.agg(A=("A", np.sum), B=("B", sample_std))
        tm.assert_frame_equal(result, expected, check_like=True)

        result = case.agg(A=NamedAgg("A", np.sum), B=NamedAgg("B", sample_std))
        tm.assert_frame_equal(result, expected, check_like=True)

    # agg with renamers
    # NOTE: this frame is built but never compared against; the checks that
    # follow only assert that unknown column names raise a KeyError.
    expected = pd.concat(
        [
            last_case["A"].sum(),
            last_case["B"].sum(),
            last_case["A"].mean(),
            last_case["B"].mean(),
        ],
        axis=1,
    )
    expected.columns = pd.MultiIndex.from_tuples(
        [("result1", "A"), ("result1", "B"), ("result2", "A"), ("result2", "B")]
    )

    msg = r"Column\(s\) \['result1', 'result2'\] do not exist"
    for case in cases:
        with pytest.raises(KeyError, match=msg):
            case[["A", "B"]].agg({"result1": np.sum, "result2": np.mean})

        with pytest.raises(KeyError, match=msg):
            case[["A", "B"]].agg(A=("result1", np.sum), B=("result2", np.mean))

        with pytest.raises(KeyError, match=msg):
            case[["A", "B"]].agg(
                A=NamedAgg("result1", np.sum), B=NamedAgg("result2", np.mean)
            )

    # agg with different hows
    expected = pd.concat(
        [
            last_case["A"].sum(),
            last_case["A"].std(),
            last_case["B"].mean(),
            last_case["B"].std(),
        ],
        axis=1,
    )
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")]
    )
    for case in cases:
        result = case.agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # equivalent of using a selection list / or not
    for case in cases:
        result = case[["A", "B"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    msg = "nested renamer is not supported"

    # series like aggs
    for case in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            case["A"].agg({"A": ["sum", "std"]})

        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            case["A"].agg({"A": ["sum", "std"], "B": ["mean", "std"]})

    # errors
    # invalid names in the agg specification
    msg = r"Column\(s\) \['B'\] do not exist"
    for case in cases:
        with pytest.raises(KeyError, match=msg):
            case[["A"]].agg({"A": ["sum", "std"], "B": ["mean", "std"]})
def test_agg():
    """Check ``aggregate`` with list, dict and named specifications.

    The same two-day binning is built through all three Resampler apis
    (plain ``resample``, ``on=`` and ``level=``) and through a ``groupby``
    on a ``Grouper``; every specification is run against each of them.
    """
    np.random.seed(1234)
    dates = date_range(datetime(2005, 1, 1), datetime(2005, 1, 10), freq="D")
    dates.name = "date"
    frame = DataFrame(np.random.rand(10, 2), columns=list("AB"), index=dates)
    frame_col = frame.reset_index()
    frame_mult = frame_col.copy()
    frame_mult.index = pd.MultiIndex.from_arrays(
        [range(10), frame.index], names=["index", "date"]
    )

    resampled = frame.resample("2D")
    cases = [
        resampled,
        frame_col.resample("2D", on="date"),
        frame_mult.resample("2D", level="date"),
        frame.groupby(pd.Grouper(freq="2D")),
    ]

    # Reference per-column results, all taken from the plain resampler.
    mean_a = resampled["A"].mean()
    std_a = resampled["A"].std()
    sum_a = resampled["A"].sum()
    mean_b = resampled["B"].mean()
    std_b = resampled["B"].std()
    sum_b = resampled["B"].sum()

    # List of functions applied to every column.
    expected = pd.concat([mean_a, std_a, mean_b, std_b], axis=1)
    expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
    # Only the two cases built from the reset-index frames are expected to
    # emit the FutureWarning that mentions the 'date' column.
    warning_cases = cases[1:3]
    for case in cases:
        if case in warning_cases:
            warn = FutureWarning
        else:
            warn = None
        with tm.assert_produces_warning(
            warn,
            match=r"\['date'\] did not aggregate successfully",
        ):
            # .var on dt64 column raises and is dropped
            result = case.aggregate([np.mean, np.std])
        tm.assert_frame_equal(result, expected)

    # One function per column: dict, tuple and NamedAgg spellings.
    expected = pd.concat([mean_a, std_b], axis=1)
    for case in cases:
        result = case.aggregate({"A": np.mean, "B": np.std})
        tm.assert_frame_equal(result, expected, check_like=True)

        result = case.aggregate(A=("A", np.mean), B=("B", np.std))
        tm.assert_frame_equal(result, expected, check_like=True)

        result = case.aggregate(A=NamedAgg("A", np.mean), B=NamedAgg("B", np.std))
        tm.assert_frame_equal(result, expected, check_like=True)

    # Several functions for a single column.
    expected = pd.concat([mean_a, std_a], axis=1)
    expected.columns = pd.MultiIndex.from_tuples([("A", "mean"), ("A", "std")])
    for case in cases:
        result = case.aggregate({"A": ["mean", "std"]})
        tm.assert_frame_equal(result, expected)

    # Series-level aggregation: list form and keyword form.
    expected = pd.concat([mean_a, sum_a], axis=1)
    expected.columns = ["mean", "sum"]
    for case in cases:
        result = case["A"].aggregate(["mean", "sum"])
        tm.assert_frame_equal(result, expected)

        result = case["A"].aggregate(mean="mean", sum="sum")
        tm.assert_frame_equal(result, expected)

    # Nested dict renamers must be rejected.
    msg = "nested renamer is not supported"
    for case in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            case.aggregate({"A": {"mean": "mean", "sum": "sum"}})

    # NOTE: this frame is built but never compared against; the loop below
    # only asserts that the nested specification raises.
    expected = pd.concat([mean_a, sum_a, mean_b, sum_b], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "mean"), ("A", "sum"), ("B", "mean2"), ("B", "sum2")]
    )
    for case in cases:
        with pytest.raises(pd.core.base.SpecificationError, match=msg):
            case.aggregate(
                {
                    "A": {"mean": "mean", "sum": "sum"},
                    "B": {"mean2": "mean", "sum2": "sum"},
                }
            )

    # Lists of functions for several columns.
    expected = pd.concat([mean_a, std_a, mean_b, std_b], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [("A", "mean"), ("A", "std"), ("B", "mean"), ("B", "std")]
    )
    for case in cases:
        result = case.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]})
        tm.assert_frame_equal(result, expected, check_like=True)

    # NOTE: nothing is asserted against this final frame.
    expected = pd.concat([mean_a, sum_a, mean_b, sum_b], axis=1)
    expected.columns = pd.MultiIndex.from_tuples(
        [
            ("r1", "A", "mean"),
            ("r1", "A", "sum"),
            ("r2", "B", "mean"),
            ("r2", "B", "sum"),
        ]
    )
def resolve(self, gdf):
    """Return a ``NamedAgg`` that takes the ``'min'`` of ``self.column``.

    ``gdf`` is accepted but not consulted by this implementation.
    """
    source_column = self.column
    spec = NamedAgg(column=source_column, aggfunc='min')
    return spec
def resolve(self, gdf):
    """Return a ``NamedAgg`` applying ``Series.nunique`` to ``self.column``.

    ``gdf`` is accepted but not consulted by this implementation.
    """
    source_column = self.column
    spec = NamedAgg(column=source_column, aggfunc=Series.nunique)
    return spec