def test_apply_no_grouper(df):

    df = df[["x", "y"]]
    res = GroupBy(["a"]).apply(df, lambda x: x.sort_values("x"))

    assert_array_equal(res.columns, ["x", "y"])
    assert_array_equal(res["x"], df["x"].sort_values())
    assert_array_equal(res["y"], df.loc[np.argsort(df["x"]), "y"])
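# The GroupBy tests in this section share a small `df` fixture that is not
# shown here. The frame below is a reconstruction inferred from the
# assertions (e.g. the per-"a" maxima of "y" being .8 and .5); it assumes
# the usual imports (pandas as pd, numpy as np, pytest) and may differ
# cosmetically from the original fixture.
@pytest.fixture
def df():
    return pd.DataFrame(
        columns=["a", "b", "x", "y"],
        data=[
            ["a", "g", 1, .2],
            ["b", "h", 3, .5],
            ["a", "f", 2, .8],
            ["a", "h", 1, .3],
            ["b", "f", 2, .4],
        ],
    )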
def __call__(self, data: DataFrame, groupby: GroupBy, orient: str) -> DataFrame:
    # TODO where to ensure that other semantic variables are sorted properly?
    # TODO why are we not using the passed in groupby here?
    groupers = ["col", "row", orient]
    return GroupBy(groupers).apply(data, self._stack, orient)
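# `self._stack` is not included in this section. The sketch below is a
# reconstruction of what it plausibly does, inferred from the Stack tests
# further down (cumulative lengths within each position, with each bar's
# baseline raised to the previous running total, and a guard matching the
# RuntimeError asserted in test_baseline_homogeneity_check); it is not
# necessarily the original implementation.
def _stack(self, df, orient):
    if df["baseline"].nunique() > 1:
        err = "Stack move cannot be used when baselines are already heterogeneous"
        raise RuntimeError(err)

    other = {"x": "y", "y": "x"}[orient]
    stacked_lengths = (df[other] - df["baseline"]).dropna().cumsum()
    offsets = stacked_lengths.shift(1).fillna(0)

    df[other] = stacked_lengths
    df["baseline"] = df["baseline"] + offsets
    return df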
def test_agg_one_grouper(df):

    res = GroupBy(["a"]).agg(df, {"y": "max"})

    assert_array_equal(res.index, [0, 1])
    assert_array_equal(res.columns, ["a", "y"])
    assert_array_equal(res["a"], ["a", "b"])
    assert_array_equal(res["y"], [.8, .5])
def test_default_groups(self, df, orient):

    other = {"x": "y", "y": "x"}[orient]
    gb = GroupBy(["grp2"])
    res = Norm()(df, gb, orient)

    for _, grp in res.groupby("grp2"):
        assert grp[other].max() == pytest.approx(1)
def test_faceted_drop(self, toy_df_facets):

    groupby = GroupBy(["x", "grp", "col"])
    res = Dodge(empty="drop")(toy_df_facets, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3, 1, 2, 3])
    assert_array_almost_equal(res["x"], [-.2, .2, 1, 0, 1, 2])
    assert_array_almost_equal(res["width"], [.4] * 6)
def test_faceted_default(self, toy_df_facets):

    groupby = GroupBy(["x", "grp", "col"])
    res = Dodge()(toy_df_facets, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3, 1, 2, 3])
    assert_array_almost_equal(res["x"], [-.2, .2, .8, .2, .8, 2.2])
    assert_array_almost_equal(res["width"], [.4] * 6)
def test_widths_drop(self, toy_df_widths):

    groupby = GroupBy(["x", "grp"])
    res = Dodge(empty="drop")(toy_df_widths, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3])
    assert_array_almost_equal(res["x"], [-.08, .32, 1])
    assert_array_almost_equal(res["width"], [.64, .16, .2])
def test_gap(self, toy_df):

    groupby = GroupBy(["x", "grp"])
    res = Dodge(gap=.25)(toy_df, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3])
    assert_array_almost_equal(res["x"], [-.2, .2, 1.2])
    assert_array_almost_equal(res["width"], [.3, .3, .3])
def test_drop(self, toy_df):

    groupby = GroupBy(["x", "grp"])
    res = Dodge("drop")(toy_df, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3])
    assert_array_almost_equal(res["x"], [-.2, .2, 1])
    assert_array_almost_equal(res["width"], [.4, .4, .4])
def test_fill(self, toy_df):

    groupby = GroupBy(["x", "grp"])
    res = Dodge(empty="fill")(toy_df, groupby, "x")

    assert_array_equal(res["y"], [1, 2, 3])
    assert_array_almost_equal(res["x"], [-.2, .2, 1])
    assert_array_almost_equal(res["width"], [.4, .4, .8])
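# The Dodge tests above depend on three toy fixtures that are not part of
# this section. The definitions below are reconstructions inferred from
# the asserted positions and widths (e.g. in test_gap the lone bar at x=1
# lands at 1.2, which places it in the second group slot), so they satisfy
# the assertions but may differ cosmetically from the originals.
@pytest.fixture
def toy_df(self):
    data = [
        [0, 0, 1],
        [0, 1, 2],
        [1, 1, 3],
    ]
    df = pd.DataFrame(data, columns=["x", "grp", "y"])
    return df.assign(width=.8)


@pytest.fixture
def toy_df_widths(self, toy_df):
    toy_df["width"] = [.8, .2, .4]
    return toy_df


@pytest.fixture
def toy_df_facets(self):
    data = [
        [0, 0, 1],
        [0, 1, 2],
        [1, 0, 3],
        [0, 1, 1],
        [1, 0, 2],
        [2, 1, 3],
    ]
    df = pd.DataFrame(data, columns=["x", "grp", "y"])
    df["col"] = ["a", "a", "a", "b", "b", "b"]
    return df.assign(width=.8)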
@pytest.fixture
def triple_args(self):

    groupby = GroupBy(["group", "a", "s"])

    class Scale:
        scale_type = "continuous"

    return groupby, "x", {"x": Scale()}
def test_basic(self, toy_df):

    groupby = GroupBy(["color", "group"])
    res = Stack()(toy_df, groupby, "x")

    assert_array_equal(res["x"], [0, 0, 1])
    assert_array_equal(res["y"], [1, 3, 3])
    assert_array_equal(res["baseline"], [0, 1, 0])
def test_apply_one_grouper(df):

    res = GroupBy(["a"]).apply(df, lambda x: x.sort_values("x"))

    assert_array_equal(res.index, [0, 1, 2, 3, 4])
    assert_array_equal(res.columns, ["a", "b", "x", "y"])
    assert_array_equal(res["a"], ["a", "a", "a", "b", "b"])
    assert_array_equal(res["b"], ["g", "h", "f", "f", "h"])
    assert_array_equal(res["x"], [1, 1, 2, 2, 3])
def test_baseline_homogeneity_check(self, toy_df):

    toy_df["baseline"] = [0, 1, 2]
    groupby = GroupBy(["color", "group"])
    move = Stack()
    err = "Stack move cannot be used when baselines"
    with pytest.raises(RuntimeError, match=err):
        move(toy_df, groupby, "x")
def test_faceted(self, toy_df_facets):

    groupby = GroupBy(["color", "group"])
    res = Stack()(toy_df_facets, groupby, "x")

    assert_array_equal(res["x"], [0, 0, 1, 0, 1, 2])
    assert_array_equal(res["y"], [1, 3, 3, 1, 2, 3])
    assert_array_equal(res["baseline"], [0, 1, 0, 0, 0, 0])
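# The Stack tests use their own toy fixtures, with a flat baseline rather
# than a width column. As with the Dodge fixtures, the versions below are
# reconstructions consistent with the assertions, not necessarily the
# original definitions. Note that the "color"/"group" groupers passed in
# these tests never appear in the frames: per the TODOs in Stack.__call__
# above, the move regroups internally on ["col", "row", orient].
@pytest.fixture
def toy_df(self):
    data = [
        [0, 0, 1],
        [0, 1, 2],
        [1, 0, 3],
    ]
    df = pd.DataFrame(data, columns=["x", "grp", "y"])
    return df.assign(baseline=0)


@pytest.fixture
def toy_df_facets(self):
    data = [
        [0, 0, 1],
        [0, 1, 2],
        [1, 0, 3],
        [0, 1, 1],
        [1, 0, 2],
        [2, 1, 3],
    ]
    df = pd.DataFrame(data, columns=["x", "grp", "y"])
    df["col"] = ["a", "a", "a", "b", "b", "b"]
    return df.assign(baseline=0)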
def test_agg_two_groupers(df):

    res = GroupBy(["a", "x"]).agg(df, {"y": "min"})

    assert_array_equal(res.index, [0, 1, 2, 3, 4, 5])
    assert_array_equal(res.columns, ["a", "x", "y"])
    assert_array_equal(res["a"], ["a", "a", "a", "b", "b", "b"])
    assert_array_equal(res["x"], [1, 2, 3, 1, 2, 3])
    assert_array_equal(res["y"], [.2, .8, np.nan, np.nan, .4, .5])
def test_orient(self, toy_df):

    df = toy_df.assign(x=toy_df["y"], y=toy_df["x"])

    groupby = GroupBy(["y", "grp"])
    res = Dodge("drop")(df, groupby, "y")

    assert_array_equal(res["x"], [1, 2, 3])
    assert_array_almost_equal(res["y"], [-.2, .2, 1])
    assert_array_almost_equal(res["width"], [.4, .4, .4])
def test_no_grouper(self, df):

    groupby = GroupBy(["group"])
    res = PolyFit(order=1, gridsize=100)(df[["x", "y"]], groupby, "x", {})

    assert_array_equal(res.columns, ["x", "y"])

    grid = np.linspace(df["x"].min(), df["x"].max(), 100)
    assert_array_equal(res["x"], grid)
    assert_array_almost_equal(
        res["y"].diff().diff().dropna(), np.zeros(grid.size - 2)
    )
def test_agg_two_groupers_ordered(df):

    order = {"b": ["h", "g", "f"], "x": [3, 2, 1]}
    res = GroupBy(order).agg(df, {"a": "min", "y": lambda x: x.iloc[0]})

    assert_array_equal(res.index, [0, 1, 2, 3, 4, 5, 6, 7, 8])
    assert_array_equal(res.columns, ["a", "b", "x", "y"])
    assert_array_equal(res["b"], ["h", "h", "h", "g", "g", "g", "f", "f", "f"])
    assert_array_equal(res["x"], [3, 2, 1, 3, 2, 1, 3, 2, 1])

    T, F = True, False
    assert_array_equal(res["a"].isna(), [F, T, F, T, T, F, T, F, T])
    assert_array_equal(res["a"].dropna(), ["b", "a", "a", "a"])
    assert_array_equal(res["y"].dropna(), [.5, .3, .2, .8])
def __call__(
    self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
) -> DataFrame:

    var = {"x": "y", "y": "x"}.get(orient)
    res = (
        groupby
        .agg(data, {var: self.func})
        .dropna()
        .reset_index(drop=True)
    )
    return res
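# A minimal usage sketch of the aggregation above. It assumes this is the
# __call__ of an Agg-style stat whose constructor stores the aggregation
# as self.func (e.g. Agg("max")); with orient="x", the dependent variable
# "y" is reduced within each unique "x" position.
df = pd.DataFrame({"x": [0, 0, 1], "y": [1., 3., 2.]})
res = Agg("max")(df, GroupBy(["x"]), "x", {})
# res["x"] -> [0, 1]
# res["y"] -> [3.0, 2.0]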
def test_apply_replace_columns(df):

    def add_sorted_cumsum(df):
        x = df["x"].sort_values()
        z = df.loc[x.index, "y"].cumsum()
        return pd.DataFrame(dict(x=x.values, z=z.values))

    res = GroupBy(["a"]).apply(df, add_sorted_cumsum)

    assert_array_equal(res.index, df.index)
    assert_array_equal(res.columns, ["a", "x", "z"])
    assert_array_equal(res["a"], ["a", "a", "a", "b", "b"])
    assert_array_equal(res["x"], [1, 1, 2, 2, 3])
    assert_array_equal(res["z"], [.2, .5, 1.3, .4, .9])
def test_one_grouper(self, df):

    groupby = GroupBy(["group"])
    gridsize = 50
    res = PolyFit(gridsize=gridsize)(df, groupby, "x", {})

    assert res.columns.to_list() == ["x", "y", "group"]

    ngroups = df["group"].nunique()
    assert_array_equal(res.index, np.arange(ngroups * gridsize))

    for _, part in res.groupby("group"):
        grid = np.linspace(part["x"].min(), part["x"].max(), gridsize)
        assert_array_equal(part["x"], grid)
        assert part["y"].diff().diff().dropna().abs().gt(0).all()
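# PolyFit's internals are not shown in this section. The helper below is a
# plausible per-group implementation inferred from the two tests above (an
# order-`self.order` polynomial evaluated on an evenly spaced grid spanning
# the observed x range); the name _fit_predict is an assumption.
def _fit_predict(self, data):

    x, y = data["x"], data["y"]
    if x.nunique() <= self.order:
        # Not enough unique points to fit this order; contribute no rows
        xx = yy = []
    else:
        p = np.polyfit(x, y, self.order)
        xx = np.linspace(x.min(), x.max(), self.gridsize)
        yy = np.polyval(p, xx)

    return pd.DataFrame(dict(x=xx, y=yy))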
def test_single_semantic(self, df, grp):

    groupby = GroupBy(["x", grp])
    res = Dodge()(df, groupby, "x")

    levels = categorical_order(df[grp])
    w, n = 0.8, len(levels)

    shifts = np.linspace(0, w - w / n, n)
    shifts -= shifts.mean()

    assert_series_equal(res["y"], df["y"])
    assert_series_equal(res["width"], df["width"] / n)

    for val, shift in zip(levels, shifts):
        rows = df[grp] == val
        assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
def test_apply_mutate_columns(df):

    xx = np.arange(0, 5)
    hats = []

    def polyfit(df):
        fit = np.polyfit(df["x"], df["y"], 1)
        hat = np.polyval(fit, xx)
        hats.append(hat)
        return pd.DataFrame(dict(x=xx, y=hat))

    res = GroupBy(["a"]).apply(df, polyfit)

    assert_array_equal(res.index, np.arange(xx.size * 2))
    assert_array_equal(res.columns, ["a", "x", "y"])
    assert_array_equal(res["a"], ["a"] * xx.size + ["b"] * xx.size)
    assert_array_equal(res["x"], xx.tolist() + xx.tolist())
    assert_array_equal(res["y"], np.concatenate(hats))
def test_two_semantics(self, df):

    groupby = GroupBy(["x", "grp2", "grp3"])
    res = Dodge()(df, groupby, "x")

    levels = categorical_order(df["grp2"]), categorical_order(df["grp3"])
    w, n = 0.8, len(levels[0]) * len(levels[1])

    shifts = np.linspace(0, w - w / n, n)
    shifts -= shifts.mean()

    assert_series_equal(res["y"], df["y"])
    assert_series_equal(res["width"], df["width"] / n)

    for (v2, v3), shift in zip(product(*levels), shifts):
        rows = (df["grp2"] == v2) & (df["grp3"] == v3)
        assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
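# test_single_semantic and test_two_semantics (and test_default_groups for
# Norm) run against a larger randomized `df` fixture that is not included
# in this section. The sketch below has the right shape for these tests;
# the sample size, seed, and level values are assumptions.
@pytest.fixture
def rng(self):
    return np.random.default_rng(0)


@pytest.fixture
def df(self, rng):
    n = 50
    data = {
        "x": rng.choice([0., 1., 2., 1.5], n),
        "y": rng.normal(0, 1, n),
        "grp2": rng.choice(["a", "b"], n),
        "grp3": rng.choice(["x", "y", "z"], n),
        "width": .8,
        "baseline": 0,
    }
    return pd.DataFrame(data)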
def __call__(
    self, data: DataFrame, groupby: GroupBy, orient: str, scales: dict[str, Scale],
) -> DataFrame:

    boot_kws = {"n_boot": self.n_boot, "seed": self.seed}
    engine = EstimateAggregator(self.func, self.errorbar, **boot_kws)

    var = {"x": "y", "y": "x"}.get(orient)
    res = (
        groupby
        .apply(data, self._process, var, engine)
        .dropna(subset=["x", "y"])
        .reset_index(drop=True)
    )

    res = res.fillna({f"{var}min": res[var], f"{var}max": res[var]})
    return res
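# `self._process` is not part of this section. A minimal sketch consistent
# with how it is applied above: reduce each group to a single row holding
# the point estimate and its interval bounds (var, varmin, varmax), which
# the fillna above then patches when no interval could be computed.
def _process(self, data, var, engine):
    # engine(data, var) yields a Series keyed by var, f"{var}min", f"{var}max"
    res = engine(data, var)
    return pd.DataFrame([res])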
def __call__(self, data: DataFrame, groupby: GroupBy, orient: str) -> DataFrame:

    grouping_vars = [v for v in groupby.order if v in data]
    groups = groupby.agg(data, {"width": "max"})
    if self.empty == "fill":
        groups = groups.dropna()

    def groupby_pos(s):
        grouper = [groups[v] for v in [orient, "col", "row"] if v in data]
        return s.groupby(grouper, sort=False, observed=True)

    def scale_widths(w):
        # TODO what value to fill missing widths??? Hard problem...
        # TODO short circuit this if outer widths has no variance?
        empty = 0 if self.empty == "fill" else w.mean()
        filled = w.fillna(empty)
        scale = filled.max()
        norm = filled.sum()
        if self.empty == "keep":
            w = filled
        return w / norm * scale

    def widths_to_offsets(w):
        return w.shift(1).fillna(0).cumsum() + (w - w.sum()) / 2

    new_widths = groupby_pos(groups["width"]).transform(scale_widths)
    offsets = groupby_pos(new_widths).transform(widths_to_offsets)

    if self.gap:
        new_widths *= 1 - self.gap

    groups["_dodged"] = groups[orient] + offsets
    groups["width"] = new_widths

    out = (
        data
        .drop("width", axis=1)
        .merge(groups, on=grouping_vars, how="left")
        .drop(orient, axis=1)
        .rename(columns={"_dodged": orient})
    )

    return out
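# A quick worked example of the dodge logic above (assuming this is the
# __call__ of the Dodge move exercised by the tests): two groups sharing
# each x position split the available width and shift to either side.
df = pd.DataFrame({
    "x": [0, 0, 1, 1],
    "grp": ["a", "b", "a", "b"],
    "y": [1., 2., 3., 4.],
    "width": .8,
})
res = Dodge()(df, GroupBy(["x", "grp"]), "x")
# res["x"]     -> [-0.2, 0.2, 0.8, 1.2]
# res["width"] -> [0.4, 0.4, 0.4, 0.4]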
def __call__(self, data, groupby, orient, scales):

    # TODO better to do this as an isinstance check?
    # We are only asking about Nominal scales now,
    # but presumably would apply to Ordinal too?
    scale_type = scales[orient].scale_type
    grouping_vars = [v for v in data if v in groupby.order]
    if not grouping_vars or self.common_bins is True:
        bin_kws = self._define_bin_params(data, orient, scale_type)
        data = groupby.apply(data, self._eval, orient, bin_kws)
    else:
        if self.common_bins is False:
            bin_groupby = GroupBy(grouping_vars)
        else:
            bin_groupby = GroupBy(self.common_bins)
        data = bin_groupby.apply(
            data, self._get_bins_and_eval, orient, groupby, scale_type,
        )

    # TODO Make this an option?
    # (This needs to be tested if enabled, and maybe should be in _eval)
    # other = {"x": "y", "y": "x"}[orient]
    # data = data[data[other] > 0]

    if not grouping_vars or self.common_norm is True:
        data = self._normalize(data, orient)
    else:
        if self.common_norm is False:
            norm_grouper = grouping_vars
        else:
            norm_grouper = self.common_norm
        normalize = partial(self._normalize, orient=orient)
        data = GroupBy(norm_grouper).apply(data, normalize)

    return data
def get_groupby(self, df, orient):

    other = {"x": "y", "y": "x"}[orient]
    cols = [c for c in df if c != other]
    return GroupBy(cols)
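# Worked example: with orient="x", `other` is "y", so a frame with columns
# ["x", "y", "color", "col"] produces GroupBy(["x", "color", "col"]) --
# every column except the dependent axis becomes a grouping variable, in
# the frame's column order.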