Example #1: test_categorical_order
import numpy as np
import pandas as pd

# Import path assumes the seaborn >= 0.12 internal layout
from seaborn._core.rules import categorical_order


def test_categorical_order():

    x = pd.Series(["a", "c", "c", "b", "a", "d"])
    y = pd.Series([3, 2, 5, 1, 4])
    order = ["a", "b", "c", "d"]

    out = categorical_order(x)
    assert out == ["a", "c", "b", "d"]

    out = categorical_order(x, order)
    assert out == order

    out = categorical_order(x, ["b", "a"])
    assert out == ["b", "a"]

    out = categorical_order(y)
    assert out == [1, 2, 3, 4, 5]

    out = categorical_order(pd.Series(y))
    assert out == [1, 2, 3, 4, 5]

    y_cat = pd.Series(pd.Categorical(y, y))
    out = categorical_order(y_cat)
    assert out == list(y)

    x = pd.Series(x).astype("category")
    out = categorical_order(x)
    assert out == list(x.cat.categories)

    out = categorical_order(x, ["b", "a"])
    assert out == ["b", "a"]

    x = pd.Series(["a", np.nan, "c", "c", "b", "a", "d"])
    out = categorical_order(x)
    assert out == ["a", "c", "b", "d"]
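
For orientation, here is a minimal sketch (not the actual seaborn implementation, which routes through its own variable-type inference) of the behaviour the assertions above pin down: an explicit order wins, categorical dtypes report their declared categories, numeric data is sorted, and nulls are dropped.

import pandas as pd

def categorical_order_sketch(vector, order=None):
    """Hypothetical stand-in for categorical_order, consistent with the test above."""
    if order is not None:
        return list(order)                      # an explicit order always wins
    if hasattr(vector, "cat"):                  # category dtype: use declared categories
        return list(vector.cat.categories)
    levels = [x for x in vector.unique() if pd.notnull(x)]
    if pd.api.types.is_numeric_dtype(vector):   # numeric data is sorted ascending
        levels = sorted(levels)
    return levels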
Example #2: get_mapping (boolean mapping)
    def get_mapping(self, scale: Scale,
                    data: Series) -> Callable[[ArrayLike], ArrayLike]:
        """Return a function that maps each data value to True or False."""
        # TODO categorical_order is going to return [False, True] for booleans,
        # and [0, 1] for binary, but the default values order is [True, False].
        # We should special case this to handle it properly, or change
        # categorical_order to not "sort" booleans. Note that we need to sync with
        # what's going to happen upstream in the scale, so we can't just do it here.
        order = getattr(scale, "order", None)
        levels = categorical_order(data, order)

        if isinstance(scale.values, list):
            values = [bool(x) for x in scale.values]
        elif isinstance(scale.values, dict):
            values = [bool(scale.values[x]) for x in levels]
        elif scale.values is None:
            values = self._default_values(len(levels))
        else:
            msg = " ".join([
                f"Scale values for {self.variable} must be passed in",
                f"a list or dict; not {type(scale.values)}."
            ])
            raise TypeError(msg)

        def mapping(x):
            ixs = np.asarray(x, np.intp)
            return [
                values[ix] if np.isfinite(x_i) else False
                for x_i, ix in zip(x, ixs)
            ]

        return mapping
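
A toy run of the returned mapping, assuming (as in the scale examples further down) that x already holds level positions encoded as floats; the values list here is made up for illustration.

import numpy as np

values = [True, False]            # one bool per level, as built above
x = np.array([0.0, 1.0, 0.0])     # level indices produced upstream by the scale
ixs = np.asarray(x, np.intp)
out = [values[ix] if np.isfinite(x_i) else False for x_i, ix in zip(x, ixs)]
# out == [True, False, True]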
Example #3: get_mapping (object-value lookup)
    def get_mapping(
        self,
        scale: Scale,
        data: Series,
    ) -> Callable[[ArrayLike], list]:
        """Define mapping as lookup into list of object values."""
        order = getattr(scale, "order", None)
        levels = categorical_order(data, order)
        n = len(levels)

        if isinstance(scale.values, dict):
            self._check_dict_entries(levels, scale.values)
            values = [scale.values[x] for x in levels]
        elif isinstance(scale.values, list):
            values = self._check_list_length(levels, scale.values)
        elif scale.values is None:
            values = self._default_values(n)
        else:
            msg = " ".join([
                f"Scale values for a {self.variable} variable must be provided",
                f"in a dict or list; not {type(scale.values)}."
            ])
            raise TypeError(msg)

        values = [self.standardize(x) for x in values]

        def mapping(x):
            ixs = np.asarray(x, np.intp)
            return [
                values[ix] if np.isfinite(x_i) else self.null_value
                for x_i, ix in zip(x, ixs)
            ]

        return mapping
Example #4: _get_categorical_mapping (evenly spaced interval values)
    def _get_categorical_mapping(
            self, scale: Nominal,
            data: ArrayLike) -> Callable[[ArrayLike], ArrayLike]:
        """Identify evenly-spaced values using interval or explicit mapping."""
        levels = categorical_order(data, scale.order)

        if isinstance(scale.values, dict):
            self._check_dict_entries(levels, scale.values)
            values = [scale.values[x] for x in levels]
        elif isinstance(scale.values, list):
            values = self._check_list_length(levels, scale.values)
        else:
            if scale.values is None:
                vmin, vmax = self.default_range
            elif isinstance(scale.values, tuple):
                vmin, vmax = scale.values
            else:
                scale_class = scale.__class__.__name__
                err = " ".join([
                    f"Values for {self.variable} variables with {scale_class} scale",
                    f"must be a dict, list or tuple; not {type(scale.values)}",
                ])
                raise TypeError(err)

            vmin, vmax = self._forward([vmin, vmax])
            values = self._inverse(np.linspace(vmax, vmin, len(levels)))

        def mapping(x):
            ixs = np.asarray(x, np.intp)
            out = np.full(len(x), np.nan)
            use = np.isfinite(x)
            out[use] = np.take(values, ixs[use])
            return out

        return mapping
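
A toy illustration of the interval branch above, assuming a default_range of (1, 5), identity forward/inverse transforms, and three levels: the first level gets the top of the range and the rest step evenly down to the bottom.

import numpy as np

vmin, vmax = 1, 5                        # stand-in for self.default_range
levels = ["a", "b", "c"]
values = np.linspace(vmax, vmin, len(levels))
# array([5., 3., 1.])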
Example #5: test_two_semantics (Dodge)
    def test_two_semantics(self, df):

        groupby = GroupBy(["x", "grp2", "grp3"])
        res = Dodge()(df, groupby, "x")

        levels = categorical_order(df["grp2"]), categorical_order(df["grp3"])
        w, n = 0.8, len(levels[0]) * len(levels[1])

        shifts = np.linspace(0, w - w / n, n)
        shifts -= shifts.mean()

        assert_series_equal(res["y"], df["y"])
        assert_series_equal(res["width"], df["width"] / n)

        for (v2, v3), shift in zip(product(*levels), shifts):
            rows = (df["grp2"] == v2) & (df["grp3"] == v3)
            assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
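
Worked arithmetic for the expected shifts, assuming grp2 and grp3 each have two levels (so n = 4) and the default total width w = 0.8:

import numpy as np

w, n = 0.8, 4
shifts = np.linspace(0, w - w / n, n)    # approx [0.0, 0.2, 0.4, 0.6]
shifts -= shifts.mean()                  # approx [-0.3, -0.1, 0.1, 0.3]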
Example #6: test_single_semantic (Dodge)
    def test_single_semantic(self, df, grp):

        groupby = GroupBy(["x", grp])
        res = Dodge()(df, groupby, "x")

        levels = categorical_order(df[grp])
        w, n = 0.8, len(levels)

        shifts = np.linspace(0, w - w / n, n)
        shifts -= shifts.mean()

        assert_series_equal(res["y"], df["y"])
        assert_series_equal(res["width"], df["width"] / n)

        for val, shift in zip(levels, shifts):
            rows = df[grp] == val
            assert_series_equal(res.loc[rows, "x"], df.loc[rows, "x"] + shift)
Example #7: _get_categorical_mapping (discrete color lookup)
    def _get_categorical_mapping(self, scale, data):
        """Define mapping as lookup in list of discrete color values."""
        levels = categorical_order(data, scale.order)
        n = len(levels)
        values = scale.values

        if isinstance(values, dict):
            self._check_dict_entries(levels, values)
            # TODO where to ensure that dict values have consistent representation?
            colors = [values[x] for x in levels]
        elif isinstance(values, list):
            colors = self._check_list_length(levels, scale.values)
        elif isinstance(values, tuple):
            colors = blend_palette(values, n)
        elif isinstance(values, str):
            colors = color_palette(values, n)
        elif values is None:
            if n <= len(get_color_cycle()):
                # Use current (global) default palette
                colors = color_palette(n_colors=n)
            else:
                colors = color_palette("husl", n)
        else:
            scale_class = scale.__class__.__name__
            msg = " ".join([
                f"Scale values for {self.variable} with a {scale_class} mapping",
                f"must be string, list, tuple, or dict; not {type(scale.values)}."
            ])
            raise TypeError(msg)

        # If color specified here has alpha channel, it will override alpha property
        colors = self._standardize_color_sequence(colors)

        def mapping(x):
            ixs = np.asarray(x, np.intp)
            use = np.isfinite(x)
            out = np.full((len(ixs), colors.shape[1]), np.nan)
            out[use] = np.take(colors, ixs[use], axis=0)
            return out

        return mapping
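
A toy run of the array-based lookup in mapping, assuming two RGBA levels and one missing input value; finite positions index into the color array and the missing row stays all-NaN (the nan_to_num below is only there to keep the demo free of the NaN-to-int cast warning).

import numpy as np

colors = np.array([[1., 0., 0., 1.],     # level 0 -> red
                   [0., 0., 1., 1.]])    # level 1 -> blue
x = np.array([0., 1., np.nan])
ixs = np.asarray(np.nan_to_num(x), np.intp)
use = np.isfinite(x)
out = np.full((len(ixs), colors.shape[1]), np.nan)
out[use] = np.take(colors, ixs[use], axis=0)
# rows: red, blue, all-NaN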
Example #8: _get_groups (Cartesian product of grouping levels)
    def _get_groups(
        self, data: DataFrame
    ) -> tuple[str | list[str], Index | MultiIndex | None]:
        """Return grouping name(s) and index of ordered level combinations."""
        levels = {}
        for var, order in self.order.items():
            if var in data:
                if order is None:
                    order = categorical_order(data[var])
                levels[var] = order

        grouper: str | list[str]
        groups: Index | MultiIndex | None
        if not levels:
            grouper = []
            groups = None
        elif len(levels) > 1:
            grouper = list(levels)
            groups = pd.MultiIndex.from_product(levels.values(), names=grouper)
        else:
            grouper, = list(levels)
            groups = pd.Index(levels[grouper], name=grouper)
        return grouper, groups
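
A toy illustration of the multi-variable branch: with two ordered grouping variables, groups is the full Cartesian product of their levels.

import pandas as pd

levels = {"grp2": ["a", "b"], "grp3": ["x", "y", "z"]}
grouper = list(levels)
groups = pd.MultiIndex.from_product(levels.values(), names=grouper)
# 6 entries: ('a', 'x'), ('a', 'y'), ..., ('b', 'z')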
Example #9: _setup (categorical scale setup)
    def _setup(
        self, data: Series, prop: Property, axis: Axis | None = None,
    ) -> Scale:

        new = copy(self)
        if new._tick_params is None:
            new = new.tick()
        if new._label_params is None:
            new = new.label()

        # TODO flexibility over format() which isn't great for numbers / dates
        stringify = np.vectorize(format)

        units_seed = categorical_order(data, new.order)

        # TODO move to Nominal._get_scale?
        # TODO this needs some more complicated rethinking about how to pass
        # a unit dictionary down to these methods, along with how much we want
        # to invest in their API. What is it useful for tick() to do here?
        # (Ordinal may be different if we draw that contrast).
        # Any customization we do to allow, e.g., label wrapping will probably
        # require defining our own Formatter subclass.
        # We could also potentially implement auto-wrapping in an Axis subclass
        # (see Axis.draw ... it already is computing the bboxes).
        # major_locator, minor_locator = new._get_locators(**new._tick_params)
        # major_formatter = new._get_formatter(major_locator, **new._label_params)

        class CatScale(mpl.scale.LinearScale):
            name = None  # To work around mpl<3.4 compat issues

            def set_default_locators_and_formatters(self, axis):
                ...
                # axis.set_major_locator(major_locator)
                # if minor_locator is not None:
                #     axis.set_minor_locator(minor_locator)
                # axis.set_major_formatter(major_formatter)

        mpl_scale = CatScale(data.name)
        if axis is None:
            axis = PseudoAxis(mpl_scale)

            # TODO Currently just used in non-Coordinate contexts, but should
            # we use this to (A) set the padding we want for categorical plots
            # and (B) allow the values parameter for a Coordinate to set xlim/ylim
            axis.set_view_interval(0, len(units_seed) - 1)

        new._matplotlib_scale = mpl_scale

        # TODO array cast necessary to handle float/int mixture, which we need
        # to solve in a more systematic way probably
        # (i.e. if we have [1, 2.5], do we want [1.0, 2.5]? Unclear)
        axis.update_units(stringify(np.array(units_seed)))

        # TODO define this more centrally
        def convert_units(x):
            # TODO only do this with explicit order?
            # (But also category dtype?)
            # TODO isin fails when units_seed mixes numbers and strings (numpy error?)
            # but np.isin also does not seem any faster? (Maybe not broadcasting in C)
            # keep = x.isin(units_seed)
            keep = np.array([x_ in units_seed for x_ in x], bool)
            out = np.full(len(x), np.nan)
            out[keep] = axis.convert_units(stringify(x[keep]))
            return out

        new._pipeline = [
            convert_units,
            prop.get_mapping(new, data),
            # TODO how to handle color representation consistency?
        ]

        def spacer(x):
            return 1

        new._spacer = spacer

        if prop.legend:
            new._legend = units_seed, list(stringify(units_seed))

        return new
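
A small check of the stringify helper and the array-cast caveat noted in the TODO above: once a mixed int/float seed is cast to a float array, every unit renders with a decimal point.

import numpy as np

stringify = np.vectorize(format)
units_seed = [1, 2.5, 3]
stringify(np.array(units_seed))
# array(['1.0', '2.5', '3.0'], dtype='<U3')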
Example #10: setup (categorical scale setup)
    def setup(
        self,
        data: Series,
        prop: Property,
        axis: Axis | None = None,
    ) -> Scale:
        class CatScale(mpl.scale.LinearScale):
            # TODO turn this into a real thing I guess
            name = None  # To work around mpl<3.4 compat issues

            def set_default_locators_and_formatters(self, axis):
                pass

        # TODO flexibility over format() which isn't great for numbers / dates
        stringify = np.vectorize(format)

        units_seed = categorical_order(data, self.order)

        mpl_scale = CatScale(data.name)
        if axis is None:
            axis = PseudoAxis(mpl_scale)

            # TODO Currently just used in non-Coordinate contexts, but should
            # we use this to (A) set the padding we want for categorical plots
            # and (B) allow the values parameter for a Coordinate to set xlim/ylim
            axis.set_view_interval(0, len(units_seed) - 1)

        # TODO array cast necessary to handle float/int mixture, which we need
        # to solve in a more systematic way probably
        # (i.e. if we have [1, 2.5], do we want [1.0, 2.5]? Unclear)
        axis.update_units(stringify(np.array(units_seed)))

        # TODO define this more centrally
        def convert_units(x):
            # TODO only do this with explicit order?
            # (But also category dtype?)
            # TODO isin fails when units_seed mixes numbers and strings (numpy error?)
            # but np.isin also does not seem any faster? (Maybe not broadcasting in C)
            # keep = x.isin(units_seed)
            keep = np.array([x_ in units_seed for x_ in x], bool)
            out = np.full(len(x), np.nan)
            out[keep] = axis.convert_units(stringify(x[keep]))
            return out

        forward_pipe = [
            convert_units,
            prop.get_mapping(self, data),
            # TODO how to handle color representation consistency?
        ]

        def spacer(x):
            return 1

        if prop.legend:
            legend = units_seed, list(stringify(units_seed))
        else:
            legend = None

        scale_type = self.__class__.__name__.lower()
        scale = Scale(forward_pipe, spacer, legend, scale_type, mpl_scale)
        return scale