def test_categorical_order():
    """Exercise categorical_order on lists, Series, categoricals, and NA data."""
    letters = pd.Series(["a", "c", "c", "b", "a", "d"])
    numbers = pd.Series([3, 2, 5, 1, 4])
    explicit = ["a", "b", "c", "d"]

    # With no order given, levels appear in order of first occurrence
    assert categorical_order(letters) == ["a", "c", "b", "d"]

    # An explicit order is passed through unchanged, even if partial
    assert categorical_order(letters, explicit) == explicit
    assert categorical_order(letters, ["b", "a"]) == ["b", "a"]

    # Numeric data are sorted rather than taken in appearance order
    assert categorical_order(numbers) == [1, 2, 3, 4, 5]
    assert categorical_order(pd.Series(numbers)) == [1, 2, 3, 4, 5]

    # A categorical dtype supplies its own level order
    numbers_cat = pd.Series(pd.Categorical(numbers, numbers))
    assert categorical_order(numbers_cat) == list(numbers)

    letters = pd.Series(letters).astype("category")
    assert categorical_order(letters) == list(letters.cat.categories)
    assert categorical_order(letters, ["b", "a"]) == ["b", "a"]

    # Missing values are dropped from the inferred order
    with_nan = pd.Series(["a", np.nan, "c", "c", "b", "a", "d"])
    assert categorical_order(with_nan) == ["a", "c", "b", "d"]
def get_mapping(self, scale: Scale, data: Series) -> Callable[[ArrayLike], ArrayLike]:
    """Return a function that maps each data value to True or False."""
    # TODO categorical_order is going to return [False, True] for booleans,
    # and [0, 1] for binary, but the default values order is [True, False].
    # We should special case this to handle it properly, or change
    # categorical_order to not "sort" booleans. Note that we need to sync with
    # what's going to happen upstream in the scale, so we can't just do it here.
    order = getattr(scale, "order", None)
    levels = categorical_order(data, order)

    if isinstance(scale.values, list):
        # Validate list length against the levels (consistent with the other
        # property mappings) before coercing entries to bool; an unchecked
        # short list would raise IndexError later inside `mapping`.
        values = [bool(x) for x in self._check_list_length(levels, scale.values)]
    elif isinstance(scale.values, dict):
        values = [bool(scale.values[x]) for x in levels]
    elif scale.values is None:
        values = self._default_values(len(levels))
    else:
        msg = " ".join([
            f"Scale values for {self.variable} must be passed in",
            f"a list or dict; not {type(scale.values)}."
        ])
        raise TypeError(msg)

    def mapping(x):
        """Look up level positions in `values`; non-finite inputs map to False."""
        ixs = np.asarray(x, np.intp)
        return [
            values[ix] if np.isfinite(x_i) else False
            for x_i, ix in zip(x, ixs)
        ]

    return mapping
def get_mapping(
    self, scale: Scale, data: Series,
) -> Callable[[ArrayLike], list]:
    """Define mapping as lookup into list of object values."""
    levels = categorical_order(data, getattr(scale, "order", None))
    n = len(levels)

    spec = scale.values
    if isinstance(spec, dict):
        self._check_dict_entries(levels, spec)
        values = [spec[level] for level in levels]
    elif isinstance(spec, list):
        values = self._check_list_length(levels, spec)
    elif spec is None:
        values = self._default_values(n)
    else:
        msg = " ".join([
            f"Scale values for a {self.variable} variable must be provided",
            f"in a dict or list; not {type(scale.values)}."
        ])
        raise TypeError(msg)

    # Normalize each entry to the property's canonical representation
    values = [self.standardize(v) for v in values]

    def mapping(x):
        """Look up level positions; non-finite inputs map to the null value."""
        positions = np.asarray(x, np.intp)
        return [
            values[pos] if np.isfinite(val) else self.null_value
            for val, pos in zip(x, positions)
        ]

    return mapping
def _get_categorical_mapping(
    self, scale: Nominal, data: ArrayLike) -> Callable[[ArrayLike], ArrayLike]:
    """Identify evenly-spaced values using interval or explicit mapping."""
    levels = categorical_order(data, scale.order)

    if isinstance(scale.values, dict):
        self._check_dict_entries(levels, scale.values)
        values = [scale.values[level] for level in levels]
    elif isinstance(scale.values, list):
        values = self._check_list_length(levels, scale.values)
    else:
        # Resolve the endpoints of the interval, then space levels within it
        if scale.values is None:
            vmin, vmax = self.default_range
        elif isinstance(scale.values, tuple):
            vmin, vmax = scale.values
        else:
            scale_class = scale.__class__.__name__
            err = " ".join([
                f"Values for {self.variable} variables with {scale_class} scale",
                f"must be a dict, list or tuple; not {type(scale.values)}",
            ])
            raise TypeError(err)

        # Space the levels evenly in transformed coordinates (first level
        # gets the largest value), then map back through the inverse.
        vmin, vmax = self._forward([vmin, vmax])
        values = self._inverse(np.linspace(vmax, vmin, len(levels)))

    def mapping(x):
        """Vectorized lookup; non-finite inputs yield NaN."""
        positions = np.asarray(x, np.intp)
        out = np.full(len(x), np.nan)
        finite = np.isfinite(x)
        out[finite] = np.take(values, positions[finite])
        return out

    return mapping
def test_two_semantics(self, df):
    """Dodging on two grouping variables shifts x by the product of levels."""
    groupby = GroupBy(["x", "grp2", "grp3"])
    res = Dodge()(df, groupby, "x")

    grp2_levels = categorical_order(df["grp2"])
    grp3_levels = categorical_order(df["grp3"])
    width = 0.8
    n = len(grp2_levels) * len(grp3_levels)

    # Expected offsets: evenly spaced within the width, centered on zero
    shifts = np.linspace(0, width - width / n, n)
    shifts -= shifts.mean()

    assert_series_equal(res["y"], df["y"])
    assert_series_equal(res["width"], df["width"] / n)

    level_pairs = product(grp2_levels, grp3_levels)
    for (v2, v3), shift in zip(level_pairs, shifts):
        rows = (df["grp2"] == v2) & (df["grp3"] == v3)
        assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
def test_single_semantic(self, df, grp):
    """Dodging on one grouping variable shifts x by evenly-spaced offsets."""
    groupby = GroupBy(["x", grp])
    res = Dodge()(df, groupby, "x")

    levels = categorical_order(df[grp])
    width = 0.8
    n = len(levels)

    # Expected offsets: evenly spaced within the width, centered on zero
    shifts = np.linspace(0, width - width / n, n)
    shifts -= shifts.mean()

    assert_series_equal(res["y"], df["y"])
    assert_series_equal(res["width"], df["width"] / n)

    for level, shift in zip(levels, shifts):
        rows = df[grp] == level
        assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
def _get_categorical_mapping(self, scale, data):
    """Define mapping as lookup in list of discrete color values."""
    levels = categorical_order(data, scale.order)
    n = len(levels)
    spec = scale.values

    if isinstance(spec, dict):
        self._check_dict_entries(levels, spec)
        # TODO where to ensure that dict values have consistent representation?
        colors = [spec[level] for level in levels]
    elif isinstance(spec, list):
        colors = self._check_list_length(levels, scale.values)
    elif isinstance(spec, tuple):
        colors = blend_palette(spec, n)
    elif isinstance(spec, str):
        colors = color_palette(spec, n)
    elif spec is None:
        if n <= len(get_color_cycle()):
            # Use current (global) default palette
            colors = color_palette(n_colors=n)
        else:
            colors = color_palette("husl", n)
    else:
        scale_class = scale.__class__.__name__
        msg = " ".join([
            f"Scale values for {self.variable} with a {scale_class} mapping",
            f"must be string, list, tuple, or dict; not {type(scale.values)}."
        ])
        raise TypeError(msg)

    # If color specified here has alpha channel, it will override alpha property
    colors = self._standardize_color_sequence(colors)

    def mapping(x):
        """Vectorized color lookup; non-finite inputs yield NaN rows."""
        positions = np.asarray(x, np.intp)
        finite = np.isfinite(x)
        out = np.full((len(positions), colors.shape[1]), np.nan)
        out[finite] = np.take(colors, positions[finite], axis=0)
        return out

    return mapping
def _get_groups(
    self, data: DataFrame
) -> tuple[str | list[str], Index | MultiIndex | None]:
    """
    Return grouper name(s) and index with Cartesian product of ordered
    grouping variable levels.

    Only grouping variables actually present in `data` are used. When a
    variable's requested order is None, its levels are inferred from the data.
    Returns ([], None) when no grouping variables are present, a MultiIndex
    for two or more, and a flat Index for exactly one.
    """
    # NOTE(review): original annotation said `-> MultiIndex`, but the function
    # returns a (grouper, groups) tuple as the local annotations below show.
    levels = {}
    for var, order in self.order.items():
        if var in data:
            if order is None:
                # Infer level order from the data when not specified
                order = categorical_order(data[var])
            levels[var] = order

    grouper: str | list[str]
    groups: Index | MultiIndex | None
    if not levels:
        grouper = []
        groups = None
    elif len(levels) > 1:
        # Cartesian product of all level lists, one index level per variable
        grouper = list(levels)
        groups = pd.MultiIndex.from_product(levels.values(), names=grouper)
    else:
        # Single grouping variable: flat index over its levels
        grouper, = list(levels)
        groups = pd.Index(levels[grouper], name=grouper)

    return grouper, groups
def _setup(
    self, data: Series, prop: Property, axis: Axis | None = None,
) -> Scale:
    """
    Configure a copy of this nominal scale for the given data and property.

    Builds the unit conversion (level -> position) and property-mapping
    pipeline on a copy of self, attaches a minimal matplotlib scale, and
    records legend entries when the property participates in legends.
    """
    new = copy(self)
    # Ensure tick/label params are initialized before building the scale
    if new._tick_params is None:
        new = new.tick()
    if new._label_params is None:
        new = new.label()

    # TODO flexibility over format() which isn't great for numbers / dates
    stringify = np.vectorize(format)

    # Ordered level values that seed the unit conversion and the legend
    units_seed = categorical_order(data, new.order)

    # TODO move to Nominal._get_scale?
    # TODO this needs some more complicated rethinking about how to pass
    # a unit dictionary down to these methods, along with how much we want
    # to invest in their API. What is it useful for tick() to do here?
    # (Ordinal may be different if we draw that contrast).
    # Any customization we do to allow, e.g., label wrapping will probably
    # require defining our own Formatter subclass.
    # We could also potentially implement auto-wrapping in an Axis subclass
    # (see Axis.draw ... it already is computing the bboxes).
    # major_locator, minor_locator = new._get_locators(**new._tick_params)
    # major_formatter = new._get_formatter(major_locator, **new._label_params)

    class CatScale(mpl.scale.LinearScale):
        # Minimal categorical scale: linear positioning, no default tickers
        name = None  # To work around mpl<3.4 compat issues

        def set_default_locators_and_formatters(self, axis):
            ...
            # axis.set_major_locator(major_locator)
            # if minor_locator is not None:
            #     axis.set_minor_locator(minor_locator)
            # axis.set_major_formatter(major_formatter)

    mpl_scale = CatScale(data.name)
    if axis is None:
        # No real matplotlib axis supplied: stand in with a PseudoAxis
        axis = PseudoAxis(mpl_scale)

        # TODO Currently just used in non-Coordinate contexts, but should
        # we use this to (A) set the padding we want for categorial plots
        # and (B) allow the values parameter for a Coordinate to set xlim/ylim
        axis.set_view_interval(0, len(units_seed) - 1)

    new._matplotlib_scale = mpl_scale

    # TODO array cast necessary to handle float/int mixture, which we need
    # to solve in a more systematic way probably
    # (i.e. if we have [1, 2.5], do we want [1.0, 2.5]? Unclear)
    axis.update_units(stringify(np.array(units_seed)))

    # TODO define this more centrally
    def convert_units(x):
        # Map known level values to axis positions; unknown values -> NaN
        # TODO only do this with explicit order?
        # (But also category dtype?)
        # TODO isin fails when units_seed mixes numbers and strings (numpy error?)
        # but np.isin also does not seem any faster? (Maybe not broadcasting in C)
        # keep = x.isin(units_seed)
        keep = np.array([x_ in units_seed for x_ in x], bool)
        out = np.full(len(x), np.nan)
        out[keep] = axis.convert_units(stringify(x[keep]))
        return out

    new._pipeline = [
        convert_units,
        prop.get_mapping(new, data),
        # TODO how to handle color representation consistency?
    ]

    def spacer(x):
        # Nominal levels are always unit-spaced
        return 1
    new._spacer = spacer

    if prop.legend:
        # Legend pairs raw level values with their string labels
        new._legend = units_seed, list(stringify(units_seed))

    return new
def setup(
    self, data: Series, prop: Property, axis: Axis | None = None,
) -> Scale:
    """
    Build a Scale for nominal data: unit conversion plus property mapping.

    Constructs a minimal matplotlib scale over the ordered levels of `data`,
    a forward pipeline that converts level values to positions and then to
    property values, and legend entries when the property supports a legend.
    """
    class CatScale(mpl.scale.LinearScale):
        # TODO turn this into a real thing I guess
        name = None  # To work around mpl<3.4 compat issues

        def set_default_locators_and_formatters(self, axis):
            pass

    # TODO flexibility over format() which isn't great for numbers / dates
    stringify = np.vectorize(format)

    # Ordered level values that seed the unit conversion and the legend
    units_seed = categorical_order(data, self.order)

    mpl_scale = CatScale(data.name)
    if axis is None:
        # No real matplotlib axis supplied: stand in with a PseudoAxis
        axis = PseudoAxis(mpl_scale)

        # TODO Currently just used in non-Coordinate contexts, but should
        # we use this to (A) set the padding we want for categorial plots
        # and (B) allow the values parameter for a Coordinate to set xlim/ylim
        axis.set_view_interval(0, len(units_seed) - 1)

    # TODO array cast necessary to handle float/int mixture, which we need
    # to solve in a more systematic way probably
    # (i.e. if we have [1, 2.5], do we want [1.0, 2.5]? Unclear)
    axis.update_units(stringify(np.array(units_seed)))

    # TODO define this more centrally
    def convert_units(x):
        # Map known level values to axis positions; unknown values -> NaN
        # TODO only do this with explicit order?
        # (But also category dtype?)
        # TODO isin fails when units_seed mixes numbers and strings (numpy error?)
        # but np.isin also does not seem any faster? (Maybe not broadcasting in C)
        # keep = x.isin(units_seed)
        keep = np.array([x_ in units_seed for x_ in x], bool)
        out = np.full(len(x), np.nan)
        out[keep] = axis.convert_units(stringify(x[keep]))
        return out

    forward_pipe = [
        convert_units,
        prop.get_mapping(self, data),
        # TODO how to handle color representation consistency?
    ]

    def spacer(x):
        # Nominal levels are always unit-spaced
        return 1

    if prop.legend:
        # Legend pairs raw level values with their string labels
        legend = units_seed, list(stringify(units_seed))
    else:
        legend = None

    scale_type = self.__class__.__name__.lower()
    scale = Scale(forward_pipe, spacer, legend, scale_type, mpl_scale)
    return scale