# Example 1
 def test_categorical_1d_only(self):
     # Constructing a Categorical from a >1-dim ndarray is unsupported
     # and must raise NotImplementedError with an explanatory message.
     err = "> 1 ndim Categorical are not supported at this time"
     data = np.array([list("abcd")])  # shape (1, 4): two dimensions
     with pytest.raises(NotImplementedError, match=err):
         Categorical(data)
 def test_mode(self, values, categories, exp_mode):
     s = Categorical(values, categories=categories, ordered=True)
     res = s.mode()
     exp = Categorical(exp_mode, categories=categories, ordered=True)
     tm.assert_categorical_equal(res, exp)
# Example 3
def create_block(typestr, placement, item_shape=None, num_offset=0):
    """
    Build a single pandas internals Block of the requested dtype for testing.

    Parameters
    ----------
    typestr : str
        Dtype specification; see the supported list below.
    placement : array-like
        Row positions of the block within its owning manager; wrapped in
        a ``BlockPlacement``.
    item_shape : tuple of int, optional
        Per-item shape appended to ``(num_items,)``; defaults to ``(N,)``
        using the module-level constant ``N``.
    num_offset : int, default 0
        Added to the generated numeric values so that distinct blocks
        hold distinguishable data.

    Supported typestr:

        * float, f8, f4, f2
        * int, i8, i4, i2, i1
        * uint, u8, u4, u2, u1
        * complex, c16, c8
        * bool
        * object, string, O
        * datetime, dt, M8[ns], M8[ns, tz]
        * timedelta, td, m8[ns]
        * sparse (SparseArray with fill_value=0.0)
        * sparse_na (SparseArray with fill_value=np.nan)
        * category, category2

    """
    placement = BlockPlacement(placement)
    num_items = len(placement)

    if item_shape is None:
        item_shape = (N, )

    # Full block shape: one leading axis entry per placement position.
    shape = (num_items, ) + item_shape

    # Base numeric data — presumably an arange-like matrix of `shape`;
    # helper is defined elsewhere in this module (not shown here).
    mat = get_numeric_mat(shape)

    if typestr in (
            "float",
            "f8",
            "f4",
            "f2",
            "int",
            "i8",
            "i4",
            "i2",
            "i1",
            "uint",
            "u8",
            "u4",
            "u2",
            "u1",
    ):
        values = mat.astype(typestr) + num_offset
    elif typestr in ("complex", "c16", "c8"):
        values = 1.0j * (mat.astype(typestr) + num_offset)
    elif typestr in ("object", "string", "O"):
        values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset],
                            shape)
    elif typestr in ("b", "bool"):
        values = np.ones(shape, dtype=np.bool_)
    elif typestr in ("datetime", "dt", "M8[ns]"):
        # Treat the base values as seconds and convert to ns timestamps.
        values = (mat * 1e9).astype("M8[ns]")
    elif typestr.startswith("M8[ns"):
        # datetime with tz: extract the zone name from e.g. "M8[ns, UTC]"
        # or "M8[ns, US/Eastern]".
        m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr)
        assert m is not None, f"incompatible typestr -> {typestr}"
        tz = m.groups()[0]
        assert num_items == 1, "must have only 1 num items for a tz-aware"
        values = DatetimeIndex(np.arange(N) * 1e9, tz=tz)
    elif typestr in ("timedelta", "td", "m8[ns]"):
        values = (mat * 1).astype("m8[ns]")
    elif typestr in ("category", ):
        values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4])
    elif typestr in ("category2", ):
        values = Categorical(
            ["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"])
    elif typestr in ("sparse", "sparse_na"):
        # FIXME: doesn't support num_rows != 10
        assert shape[-1] == 10
        assert all(s == 1 for s in shape[:-1])
        if typestr.endswith("_na"):
            fill_value = np.nan
        else:
            fill_value = 0.0
        values = SparseArray(
            [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6],
            fill_value=fill_value,
        )
        # `arr` is a view on the SparseArray's stored (non-fill) values,
        # so this += shifts them in place by the requested offset.
        arr = values.sp_values.view()
        arr += num_offset - 1
    else:
        raise ValueError(f'Unsupported typestr: "{typestr}"')

    return make_block(values, placement=placement, ndim=len(shape))
 def test_qcut_index(self):
     result = qcut([0, 2], 2)
     intervals = [Interval(-0.001, 1), Interval(1, 2)]
     expected = Categorical(intervals, ordered=True)
     tm.assert_categorical_equal(result, expected)
 def test_repeat(self):
     # GH10183
     cat = Categorical(["a", "b"], categories=["a", "b"])
     exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"])
     res = cat.repeat(2)
     tm.assert_categorical_equal(res, exp)
# Example 6
class TestSeriesFillNA:
    """Tests for Series.fillna / ffill / bfill across dtypes.

    Covers datetime64 (naive and tz-aware), timedelta64, Period,
    Categorical and plain numeric data, plus invalid-usage errors.
    """

    def test_fillna_nat(self):
        """Pad/bfill and value-fills of NaT in datetime64 Series and DataFrame."""
        series = Series([0, 1, 2, NaT.value], dtype="M8[ns]")

        filled = series.fillna(method="pad")
        filled2 = series.fillna(value=series.values[2])

        expected = series.copy()
        expected.values[3] = expected.values[2]

        tm.assert_series_equal(filled, expected)
        tm.assert_series_equal(filled2, expected)

        df = DataFrame({"A": series})
        filled = df.fillna(method="pad")
        filled2 = df.fillna(value=series.values[2])
        expected = DataFrame({"A": expected})
        tm.assert_frame_equal(filled, expected)
        tm.assert_frame_equal(filled2, expected)

        # NaT at the front: bfill pulls the following value backward.
        series = Series([NaT.value, 0, 1, 2], dtype="M8[ns]")

        filled = series.fillna(method="bfill")
        filled2 = series.fillna(value=series[1])

        expected = series.copy()
        expected[0] = expected[1]

        tm.assert_series_equal(filled, expected)
        tm.assert_series_equal(filled2, expected)

        df = DataFrame({"A": series})
        filled = df.fillna(method="bfill")
        filled2 = df.fillna(value=series[1])
        expected = DataFrame({"A": expected})
        tm.assert_frame_equal(filled, expected)
        tm.assert_frame_equal(filled2, expected)

    def test_fillna(self, datetime_series):
        """Core fillna semantics: ffill/backfill/value, errors, dict/Series fills, limit."""
        ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5))

        # No NaNs yet, so ffill is a no-op.
        tm.assert_series_equal(ts, ts.fillna(method="ffill"))

        # NOTE(review): np.NaN alias was removed in NumPy 2.0; the rest of
        # this file uses np.nan.
        ts[2] = np.NaN

        exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(method="ffill"), exp)

        exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(method="backfill"), exp)

        exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index)
        tm.assert_series_equal(ts.fillna(value=5), exp)

        msg = "Must specify a fill 'value' or 'method'"
        with pytest.raises(ValueError, match=msg):
            ts.fillna()

        msg = "Cannot specify both 'value' and 'method'"
        with pytest.raises(ValueError, match=msg):
            datetime_series.fillna(value=0, method="ffill")

        # GH#5703: filling with another Series aligns on the index;
        # non-matching labels leave NaN untouched.
        s1 = Series([np.nan])
        s2 = Series([1])
        result = s1.fillna(s2)
        expected = Series([1.0])
        tm.assert_series_equal(result, expected)
        result = s1.fillna({})
        tm.assert_series_equal(result, s1)
        result = s1.fillna(Series((), dtype=object))
        tm.assert_series_equal(result, s1)
        result = s2.fillna(s1)
        tm.assert_series_equal(result, s2)
        result = s1.fillna({0: 1})
        tm.assert_series_equal(result, expected)
        result = s1.fillna({1: 1})
        tm.assert_series_equal(result, Series([np.nan]))
        result = s1.fillna({0: 1, 1: 1})
        tm.assert_series_equal(result, expected)
        result = s1.fillna(Series({0: 1, 1: 1}))
        tm.assert_series_equal(result, expected)
        result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5]))
        tm.assert_series_equal(result, s1)

        s1 = Series([0, 1, 2], list("abc"))
        s2 = Series([0, np.nan, 2], list("bac"))
        result = s2.fillna(s1)
        expected = Series([0, 0, 2.0], list("bac"))
        tm.assert_series_equal(result, expected)

        # limit: only the first `limit` NaNs (per direction) are filled
        ser = Series(np.nan, index=[0, 1, 2])
        result = ser.fillna(999, limit=1)
        expected = Series([999, np.nan, np.nan], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        result = ser.fillna(999, limit=2)
        expected = Series([999, 999, np.nan], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        # GH#9043
        # make sure a string representation of int/float values can be filled
        # correctly without raising errors or being converted
        vals = ["0", "1.5", "-0.3"]
        for val in vals:
            ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64")
            result = ser.fillna(val)
            expected = Series([0, 1, val, val, 4], dtype="object")
            tm.assert_series_equal(result, expected)

    def test_fillna_consistency(self):
        """Filling tz-naive datetimes with tz-aware (or str) values upcasts to object."""
        # GH#16402
        # fillna with a tz aware to a tz-naive, should result in object

        ser = Series([Timestamp("20130101"), NaT])

        result = ser.fillna(Timestamp("20130101", tz="US/Eastern"))
        expected = Series(
            [Timestamp("20130101"),
             Timestamp("2013-01-01", tz="US/Eastern")],
            dtype="object",
        )
        tm.assert_series_equal(result, expected)

        # where (we ignore the errors=)
        # NOTE(review): the errors= keyword of Series.where is deprecated in
        # newer pandas; this call is asserted twice, presumably intentionally
        # checking it is side-effect free — confirm.
        result = ser.where([True, False],
                           Timestamp("20130101", tz="US/Eastern"),
                           errors="ignore")
        tm.assert_series_equal(result, expected)

        result = ser.where([True, False],
                           Timestamp("20130101", tz="US/Eastern"),
                           errors="ignore")
        tm.assert_series_equal(result, expected)

        # with a non-datetime
        result = ser.fillna("foo")
        expected = Series([Timestamp("20130101"), "foo"])
        tm.assert_series_equal(result, expected)

        # assignment
        ser2 = ser.copy()
        ser2[1] = "foo"
        tm.assert_series_equal(ser2, expected)

    def test_fillna_downcast(self):
        """downcast='infer' converts float64 results back to int64 when lossless."""
        # GH#15277
        # infer int64 from float64
        ser = Series([1.0, np.nan])
        result = ser.fillna(0, downcast="infer")
        expected = Series([1, 0])
        tm.assert_series_equal(result, expected)

        # infer int64 from float64 when fillna value is a dict
        ser = Series([1.0, np.nan])
        result = ser.fillna({1: 0}, downcast="infer")
        expected = Series([1, 0])
        tm.assert_series_equal(result, expected)

    def test_timedelta_fillna(self, frame_or_series):
        """fillna on timedelta64 data with Timedelta/timedelta/np.timedelta64/NaT."""
        # GH#3371
        ser = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130102"),
            Timestamp("20130103 9:01:01"),
        ])
        # diff() of timestamps yields timedelta64 with a leading NaT
        td = ser.diff()
        obj = frame_or_series(td)

        # reg fillna
        result = obj.fillna(Timedelta(seconds=0))
        expected = Series([
            timedelta(0),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ])
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        # interpreted as seconds, deprecated
        with pytest.raises(TypeError, match="Passing integers to fillna"):
            obj.fillna(1)

        result = obj.fillna(Timedelta(seconds=1))
        expected = Series([
            timedelta(seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ])
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        result = obj.fillna(timedelta(days=1, seconds=1))
        expected = Series([
            timedelta(days=1, seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ])
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        # np.timedelta64(10**9) is 10**9 ns == 1 second
        result = obj.fillna(np.timedelta64(10**9))
        expected = Series([
            timedelta(seconds=1),
            timedelta(0),
            timedelta(1),
            timedelta(days=1, seconds=9 * 3600 + 60 + 1),
        ])
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        # filling with NaT keeps the m8[ns] dtype and leaves the NaT
        result = obj.fillna(NaT)
        expected = Series(
            [
                NaT,
                timedelta(0),
                timedelta(1),
                timedelta(days=1, seconds=9 * 3600 + 60 + 1),
            ],
            dtype="m8[ns]",
        )
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        # ffill
        td[2] = np.nan
        obj = frame_or_series(td)
        result = obj.ffill()
        expected = td.fillna(Timedelta(seconds=0))
        expected[0] = np.nan
        expected = frame_or_series(expected)

        tm.assert_equal(result, expected)

        # bfill
        td[2] = np.nan
        obj = frame_or_series(td)
        result = obj.bfill()
        expected = td.fillna(Timedelta(seconds=0))
        expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1)
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

    def test_datetime64_fillna(self):
        """ffill/bfill on datetime64 Series; backfill infers M8[ns] from strings."""

        ser = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130102"),
            Timestamp("20130103 9:01:01"),
        ])
        ser[2] = np.nan

        # ffill
        result = ser.ffill()
        expected = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130103 9:01:01"),
        ])
        tm.assert_series_equal(result, expected)

        # bfill
        result = ser.bfill()
        expected = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130103 9:01:01"),
            Timestamp("20130103 9:01:01"),
        ])
        tm.assert_series_equal(result, expected)

        # GH#6587
        # make sure that we are treating as integer when filling
        # this also tests inference of a datetime-like with NaT's
        ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
        expected = Series(
            [
                "2013-08-05 15:30:00.000001",
                "2013-08-05 15:30:00.000001",
                "2013-08-05 15:30:00.000001",
            ],
            dtype="M8[ns]",
        )
        result = ser.fillna(method="backfill")
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"])
    def test_datetime64_tz_fillna(self, tz):
        """fillna on naive and tz-aware datetime blocks with scalars, dicts, mixed tz."""
        # DatetimeBlock
        ser = Series([
            Timestamp("2011-01-01 10:00"),
            NaT,
            Timestamp("2011-01-03 10:00"),
            NaT,
        ])
        # Fixed NaT positions; re-checked after every fill to prove
        # `ser` itself was not mutated.
        null_loc = Series([False, True, False, True])

        result = ser.fillna(Timestamp("2011-01-02 10:00"))
        expected = Series([
            Timestamp("2011-01-01 10:00"),
            Timestamp("2011-01-02 10:00"),
            Timestamp("2011-01-03 10:00"),
            Timestamp("2011-01-02 10:00"),
        ])
        tm.assert_series_equal(expected, result)
        # check s is not changed
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz))
        expected = Series([
            Timestamp("2011-01-01 10:00"),
            Timestamp("2011-01-02 10:00", tz=tz),
            Timestamp("2011-01-03 10:00"),
            Timestamp("2011-01-02 10:00", tz=tz),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna("AAA")
        expected = Series(
            [
                Timestamp("2011-01-01 10:00"),
                "AAA",
                Timestamp("2011-01-03 10:00"),
                "AAA",
            ],
            dtype=object,
        )
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna({
            1: Timestamp("2011-01-02 10:00", tz=tz),
            3: Timestamp("2011-01-04 10:00"),
        })
        expected = Series([
            Timestamp("2011-01-01 10:00"),
            Timestamp("2011-01-02 10:00", tz=tz),
            Timestamp("2011-01-03 10:00"),
            Timestamp("2011-01-04 10:00"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna({
            1: Timestamp("2011-01-02 10:00"),
            3: Timestamp("2011-01-04 10:00")
        })
        expected = Series([
            Timestamp("2011-01-01 10:00"),
            Timestamp("2011-01-02 10:00"),
            Timestamp("2011-01-03 10:00"),
            Timestamp("2011-01-04 10:00"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        # DatetimeBlockTZ
        idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT],
                            tz=tz)
        ser = Series(idx)
        assert ser.dtype == f"datetime64[ns, {tz}]"
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna(Timestamp("2011-01-02 10:00"))
        expected = Series([
            Timestamp("2011-01-01 10:00", tz=tz),
            Timestamp("2011-01-02 10:00"),
            Timestamp("2011-01-03 10:00", tz=tz),
            Timestamp("2011-01-02 10:00"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz))
        idx = DatetimeIndex(
            [
                "2011-01-01 10:00",
                "2011-01-02 10:00",
                "2011-01-03 10:00",
                "2011-01-02 10:00",
            ],
            tz=tz,
        )
        expected = Series(idx)
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna(
            Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime())
        idx = DatetimeIndex(
            [
                "2011-01-01 10:00",
                "2011-01-02 10:00",
                "2011-01-03 10:00",
                "2011-01-02 10:00",
            ],
            tz=tz,
        )
        expected = Series(idx)
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna("AAA")
        expected = Series(
            [
                Timestamp("2011-01-01 10:00", tz=tz),
                "AAA",
                Timestamp("2011-01-03 10:00", tz=tz),
                "AAA",
            ],
            dtype=object,
        )
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna({
            1: Timestamp("2011-01-02 10:00", tz=tz),
            3: Timestamp("2011-01-04 10:00"),
        })
        expected = Series([
            Timestamp("2011-01-01 10:00", tz=tz),
            Timestamp("2011-01-02 10:00", tz=tz),
            Timestamp("2011-01-03 10:00", tz=tz),
            Timestamp("2011-01-04 10:00"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna({
            1: Timestamp("2011-01-02 10:00", tz=tz),
            3: Timestamp("2011-01-04 10:00", tz=tz),
        })
        expected = Series([
            Timestamp("2011-01-01 10:00", tz=tz),
            Timestamp("2011-01-02 10:00", tz=tz),
            Timestamp("2011-01-03 10:00", tz=tz),
            Timestamp("2011-01-04 10:00", tz=tz),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        # filling with a naive/other zone, coerce to object
        result = ser.fillna(Timestamp("20130101"))
        expected = Series([
            Timestamp("2011-01-01 10:00", tz=tz),
            Timestamp("2013-01-01"),
            Timestamp("2011-01-03 10:00", tz=tz),
            Timestamp("2013-01-01"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

        result = ser.fillna(Timestamp("20130101", tz="US/Pacific"))
        expected = Series([
            Timestamp("2011-01-01 10:00", tz=tz),
            Timestamp("2013-01-01", tz="US/Pacific"),
            Timestamp("2011-01-03 10:00", tz=tz),
            Timestamp("2013-01-01", tz="US/Pacific"),
        ])
        tm.assert_series_equal(expected, result)
        tm.assert_series_equal(isna(ser), null_loc)

    def test_fillna_dt64tz_with_method(self):
        """pad/bfill on tz-aware datetime Series preserve the timezone."""
        # with timezone
        # GH#15855
        ser = Series([Timestamp("2012-11-11 00:00:00+01:00"), NaT])
        exp = Series([
            Timestamp("2012-11-11 00:00:00+01:00"),
            Timestamp("2012-11-11 00:00:00+01:00"),
        ])
        tm.assert_series_equal(ser.fillna(method="pad"), exp)

        ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")])
        exp = Series([
            Timestamp("2012-11-11 00:00:00+01:00"),
            Timestamp("2012-11-11 00:00:00+01:00"),
        ])
        tm.assert_series_equal(ser.fillna(method="bfill"), exp)

    def test_fillna_pytimedelta(self):
        """A datetime.timedelta fill value is accepted for timedelta64 data."""
        # GH#8209
        ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"])

        result = ser.fillna(timedelta(1))
        expected = Series(Timedelta("1 days"), index=["A", "B"])
        tm.assert_series_equal(result, expected)

    def test_fillna_period(self):
        """Filling Period NaT with a Period keeps the Period[M] dtype."""
        # GH#13737
        ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")])

        res = ser.fillna(Period("2012-01", freq="M"))
        exp = Series(
            [Period("2011-01", freq="M"),
             Period("2012-01", freq="M")])
        tm.assert_series_equal(res, exp)
        assert res.dtype == "Period[M]"

    def test_fillna_dt64_timestamp(self, frame_or_series):
        """Scalar Timestamp fill on datetime64; filling with NaT is a no-op."""
        ser = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130102"),
            Timestamp("20130103 9:01:01"),
        ])
        ser[2] = np.nan
        obj = frame_or_series(ser)

        # reg fillna
        result = obj.fillna(Timestamp("20130104"))
        expected = Series([
            Timestamp("20130101"),
            Timestamp("20130101"),
            Timestamp("20130104"),
            Timestamp("20130103 9:01:01"),
        ])
        expected = frame_or_series(expected)
        tm.assert_equal(result, expected)

        result = obj.fillna(NaT)
        expected = obj
        tm.assert_equal(result, expected)

    def test_fillna_dt64_non_nao(self):
        """A non-nanosecond np.datetime64 fill value is cast to ns resolution."""
        # GH#27419
        ser = Series([Timestamp("2010-01-01"), NaT, Timestamp("2000-01-01")])
        val = np.datetime64("1975-04-05", "ms")

        result = ser.fillna(val)
        expected = Series([
            Timestamp("2010-01-01"),
            Timestamp("1975-04-05"),
            Timestamp("2000-01-01")
        ])
        tm.assert_series_equal(result, expected)

    def test_fillna_numeric_inplace(self):
        """inplace=True returns None and matches the out-of-place result."""
        x = Series([np.nan, 1.0, np.nan, 3.0, np.nan],
                   ["z", "a", "b", "c", "d"])
        y = x.copy()

        return_value = y.fillna(value=0, inplace=True)
        assert return_value is None

        expected = x.fillna(value=0)
        tm.assert_series_equal(y, expected)

    # ---------------------------------------------------------------
    # CategoricalDtype

    @pytest.mark.parametrize(
        "fill_value, expected_output",
        [
            ("a", ["a", "a", "b", "a", "a"]),
            ({
                1: "a",
                3: "b",
                4: "b"
            }, ["a", "a", "b", "b", "b"]),
            ({
                1: "a"
            }, ["a", "a", "b", np.nan, np.nan]),
            ({
                1: "a",
                3: "b"
            }, ["a", "a", "b", "b", np.nan]),
            (Series("a"), ["a", np.nan, "b", np.nan, np.nan]),
            (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]),
            (Series({
                1: "a",
                3: "b"
            }), ["a", "a", "b", "b", np.nan]),
            (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]),
        ],
    )
    def test_fillna_categorical(self, fill_value, expected_output):
        """Scalar/dict/Series fills on a Categorical Series keep the dtype."""
        # GH#17033
        # Test fillna for a Categorical series
        data = ["a", np.nan, "b", np.nan, np.nan]
        ser = Series(Categorical(data, categories=["a", "b"]))
        exp = Series(Categorical(expected_output, categories=["a", "b"]))
        result = ser.fillna(fill_value)
        tm.assert_series_equal(result, exp)

    @pytest.mark.parametrize(
        "fill_value, expected_output",
        [
            (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]),
            (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]),
            (
                Series(
                    Categorical(["b", "d", "a", "d", "a"],
                                categories=["b", "c", "d", "e", "a"])),
                ["a", "d", "b", "d", "a"],
            ),
        ],
    )
    def test_fillna_categorical_with_new_categories(self, fill_value,
                                                    expected_output):
        """Series fills drawing from the wider category set are accepted."""
        # GH#26215
        data = ["a", np.nan, "b", np.nan, np.nan]
        ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"]))
        exp = Series(
            Categorical(expected_output, categories=["a", "b", "c", "d", "e"]))
        result = ser.fillna(fill_value)
        tm.assert_series_equal(result, exp)

    def test_fillna_categorical_raises(self):
        """Fill values outside the categories, or of list-like type, raise."""
        data = ["a", np.nan, "b", np.nan, np.nan]
        ser = Series(Categorical(data, categories=["a", "b"]))

        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(ValueError, match=msg):
            ser.fillna("d")

        with pytest.raises(ValueError, match=msg):
            ser.fillna(Series("d"))

        with pytest.raises(ValueError, match=msg):
            ser.fillna({1: "d", 3: "a"})

        msg = '"value" parameter must be a scalar or dict, but you passed a "list"'
        with pytest.raises(TypeError, match=msg):
            ser.fillna(["a", "b"])

        msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"'
        with pytest.raises(TypeError, match=msg):
            ser.fillna(("a", "b"))

        msg = ('"value" parameter must be a scalar, dict '
               'or Series, but you passed a "DataFrame"')
        with pytest.raises(TypeError, match=msg):
            ser.fillna(DataFrame({1: ["a"], 3: ["b"]}))

    # ---------------------------------------------------------------
    # Invalid Usages

    def test_fillna_invalid_method(self, datetime_series):
        """An unknown method name is rejected with a message naming it."""
        try:
            datetime_series.fillna(method="ffil")
        except ValueError as inst:
            assert "ffil" in str(inst)

    def test_fillna_listlike_invalid(self):
        """List and tuple fill values raise TypeError."""
        ser = Series(np.random.randint(-100, 100, 50))
        msg = '"value" parameter must be a scalar or dict, but you passed a "list"'
        with pytest.raises(TypeError, match=msg):
            ser.fillna([1, 2])

        msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"'
        with pytest.raises(TypeError, match=msg):
            ser.fillna((1, 2))

    def test_fillna_method_and_limit_invalid(self):
        """Bad limit values and value+method combinations raise ValueError."""

        # related GH#9217, make sure limit is an int and greater than 0
        ser = Series([1, 2, 3, None])
        msg = (r"Cannot specify both 'value' and 'method'\.|"
               r"Limit must be greater than 0|"
               "Limit must be an integer")
        for limit in [-1, 0, 1.0, 2.0]:
            for method in ["backfill", "bfill", "pad", "ffill", None]:
                with pytest.raises(ValueError, match=msg):
                    ser.fillna(1, limit=limit, method=method)

    def test_fillna_datetime64_with_timezone_tzinfo(self):
        """A stdlib datetime with tzinfo fills a tz-aware NaT; result upcast to object."""
        # https://github.com/pandas-dev/pandas/issues/38851
        s = Series(date_range("2020", periods=3, tz="UTC"))
        expected = s.astype(object)
        s[1] = NaT
        result = s.fillna(datetime(2020, 1, 2, tzinfo=timezone.utc))
        tm.assert_series_equal(result, expected)
# Example 7
def test_observed(observed):
    """Groupby with categorical keys honors the ``observed`` flag.

    With ``observed=True`` only category combinations present in the data
    appear in the result; with ``observed=False`` the result is re-expanded
    to the full cartesian product of categories (via the module helper
    ``cartesian_product_for_groupers``).
    """
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df['C'] = ['foo', 'bar'] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(['A', 'B', 'C'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2, ['foo', 'bar'] * 2],
        names=['A', 'B', 'C'])
    expected = DataFrame({'values': Series(
        [1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2, ['foo', 'bar']],
            list('ABC'))

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(['A', 'B'], observed=observed)
    exp_index = pd.MultiIndex.from_arrays(
        [cat1, cat2],
        names=['A', 'B'])
    expected = DataFrame({'values': [1, 2, 3, 4]},
                         index=exp_index)
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [cat1, cat2],
            list('AB'))

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {'cat':
         pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"],
                        ordered=True),
         'ints': [1, 1, 2, 2],
         'val': [10, 20, 30, 40]}
    df = pd.DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = pd.CategoricalIndex(list('ab'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]},
                         index=exp_index)
    if not observed:
        index = pd.CategoricalIndex(list('abc'), name="cat",
                                    categories=list('abc'),
                                    ordered=True)
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg('mean')
    expected = DataFrame(
        {"val": [10, 30, 20, 40],
         "cat": pd.Categorical(['a', 'a', 'b', 'b'],
                               categories=['a', 'b', 'c'],
                               ordered=True),
         "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected,
            [df.cat.values, [1, 2]],
            ['cat', 'ints'])

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        # use tm.assert_frame_equal for consistency with the rest of this
        # test (the bare name was not otherwise in scope here)
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70],
         'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']}
    df = pd.DataFrame(d)
    cat = pd.cut(df['foo'], np.linspace(0, 10, 3))
    df['range'] = cat
    groups = df.groupby(['range', 'baz'], as_index=False, observed=observed)
    result = groups.agg('mean')

    groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed)
    expected = groups2.agg('mean').reset_index()
    tm.assert_frame_equal(result, expected)
# Example 8
 def test_constructor_string_and_tuples(self):
     # GH 21416
     c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
     expected_index = Index([("a", "b"), ("b", "a"), "c"])
     assert c.categories.equals(expected_index)
# Example 9
    def test_constructor_datetime64_non_nano(self):
        categories = np.arange(10).view("M8[D]")
        values = categories[::2].copy()

        cat = Categorical(values, categories=categories)
        assert (cat == values).all()
# Example 10
 def test_construction_with_ordered(self, ordered):
     # GH 9347, 9190
     cat = Categorical([0, 1, 2], ordered=ordered)
     assert cat.ordered == bool(ordered)
# Example 11
 def test_constructor_imaginary(self):
     values = [1, 2, 3 + 1j]
     c1 = Categorical(values)
     tm.assert_index_equal(c1.categories, Index(values))
     tm.assert_numpy_array_equal(np.array(c1), np.array(values))
# Example 12
    def test_from_codes_with_categorical_categories(self, klass):
        # GH17884
        expected = Categorical(["a", "b"], categories=["a", "b", "c"])

        result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
        tm.assert_categorical_equal(result, expected)
# Example 13
    def test_from_codes_empty(self):
        cat = ["a", "b", "c"]
        result = Categorical.from_codes([], categories=cat)
        expected = Categorical([], categories=cat)

        tm.assert_categorical_equal(result, expected)
# Example 14
 def test_constructor_np_strs(self):
     # GH#31499 Hastable.map_locations needs to work on np.str_ objects
     cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
     assert all(isinstance(x, np.str_) for x in cat.categories)
# Example 15
 def test_from_categorical_dtype_ordered(self):
     c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
     # override ordered
     result = CategoricalDtype._from_categorical_dtype(c1, ordered=False)
     assert result == CategoricalDtype([1, 2, 3], ordered=False)
# Example 16
 def test_constructor_empty_boolean(self):
     # see gh-22702
     cat = Categorical([], categories=[True, False])
     categories = sorted(cat.categories.tolist())
     assert categories == [False, True]
# Example 17
 def test_categorical_categories(self):
     # GH17884
     c1 = CategoricalDtype(Categorical(["a", "b"]))
     tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
     c1 = CategoricalDtype(CategoricalIndex(["a", "b"]))
     tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
# Example 18
 def test_constructor_tuples(self):
     values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
     result = Categorical(values)
     expected = Index([(1,), (1, 2)], tupleize_cols=False)
     tm.assert_index_equal(result.categories, expected)
     assert result.ordered is False
# Example 19
def union_categoricals(to_union,
                       sort_categories: bool = False,
                       ignore_order: bool = False):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.

    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        # Pull the underlying Categorical out of Index/Series wrappers;
        # anything else is rejected up front.
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    # Categories of every input must share one dtype before any union.
    if not all(
            is_dtype_equal(other.categories.dtype, first.categories.dtype)
            for other in to_union[1:]):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(
            first._categories_match_up_to_permutation(other)
            for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        # Re-express every input's codes in `first`'s category order,
        # then concatenate the raw codes.
        all_codes = [
            first._encode_with_my_categories(x)._codes for x in to_union
        ]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError(
                "Cannot use sort_categories=True with ordered Categoricals")

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            # Remap each code through `indexer` onto the sorted categories;
            # fill_value=-1 keeps missing (-1) codes missing.
            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories)
            for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(new_codes,
                       categories=categories,
                       ordered=ordered,
                       fastpath=True)
Exemplo n.º 20
0
 def test_is_boolean(self, categories, expected):
     cat = Categorical(categories)
     assert cat.dtype._is_boolean is expected
     assert is_bool_dtype(cat) is expected
     assert is_bool_dtype(cat.dtype) is expected
Exemplo n.º 21
0
def test_basic():
    """
    Smoke-test groupby with Categorical keys: unobserved categories in the
    result index, transform/filter passthrough, cut-based groupers, and
    describe() on a categorical grouper.
    """
    cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                       categories=["a", "b", "c", "d"], ordered=True)
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    # observed=False keeps the unused category 'd' as a NaN row
    exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True)
    expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"],
                       categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"],
                       categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True)
    expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)})
    result = gb.sum()
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                   [1, 'John P. Doe']],
                  columns=['person_id', 'person_name'])
    x['person_name'] = Categorical(x.person_name)

    g = x.groupby(['person_id'], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[['person_name']])

    result = x.drop_duplicates('person_name')
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates('person_name').iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name='person_id')
    expected['person_name'] = expected['person_name'].astype('object')
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.max(xs)),
        df[['a']])

    # Filter
    tm.assert_series_equal(
        df.a.groupby(c, observed=False).filter(np.all),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).filter(np.all),
        df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df['a'])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df['a'])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(sum),
        df[['a']])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)),
        df[['a']])

    # GH 9603
    df = DataFrame({'a': [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd')))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(
        c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = 'a'
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ['foo', 'bar', 'baz', 'qux']
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))

    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories,
                               ordered=True)
    expected = expected.reindex(exp_idx)

    # use tm.assert_frame_equal consistently (was a bare assert_frame_equal)
    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(ord_labels, ordered=True,
                           categories=['foo', 'bar', 'baz', 'qux'])
    expected = ord_data.groupby(
        exp_cats, sort=False, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8),
                                  levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(0)), exp)
    exp = Index(['count', 'mean', 'std', 'min', '25%', '50%',
                 '75%', 'max'] * 4)
    tm.assert_index_equal((desc_result.stack().index
                           .get_level_values(1)), exp)
Exemplo n.º 22
0
 def test_dtype_specific_categorical_dtype(self):
     expected = "datetime64[ns]"
     result = str(Categorical(DatetimeIndex([])).categories.dtype)
     assert result == expected
 def test_nbytes(self):
     cat = Categorical([1, 2, 3])
     exp = 3 + 3 * 8  # 3 int8s for values + 3 int64s for categories
     assert cat.nbytes == exp
Exemplo n.º 24
0
class TestCategoricalDtype(Base):
    """Tests for CategoricalDtype construction, equality, and introspection."""

    @pytest.fixture
    def dtype(self):
        """
        Class level fixture of dtype for TestCategoricalDtype
        """
        return CategoricalDtype()

    def test_hash_vs_equality(self, dtype):
        # equal dtypes must hash equal (dict/set correctness)
        dtype2 = CategoricalDtype()
        assert dtype == dtype2
        assert dtype2 == dtype
        assert hash(dtype) == hash(dtype2)

    def test_equality(self, dtype):
        # the string alias "category" compares equal to the bare dtype
        assert is_dtype_equal(dtype, "category")
        assert is_dtype_equal(dtype, CategoricalDtype())
        assert not is_dtype_equal(dtype, "foo")

    def test_construction_from_string(self, dtype):
        # only the exact alias "category" is accepted
        result = CategoricalDtype.construct_from_string("category")
        assert is_dtype_equal(dtype, result)
        msg = "Cannot construct a 'CategoricalDtype' from 'foo'"
        with pytest.raises(TypeError, match=msg):
            CategoricalDtype.construct_from_string("foo")

    def test_constructor_invalid(self):
        # a bare string is not a valid `categories` argument
        msg = "Parameter 'categories' must be list-like"
        with pytest.raises(TypeError, match=msg):
            CategoricalDtype("category")

    # Class-level fixtures shared by the parametrized tests below;
    # evaluated once at class-definition time.
    dtype1 = CategoricalDtype(["a", "b"], ordered=True)
    dtype2 = CategoricalDtype(["x", "y"], ordered=False)
    c = Categorical([0, 1], dtype=dtype1, fastpath=True)

    @pytest.mark.parametrize(
        "values, categories, ordered, dtype, expected",
        [
            [None, None, None, None,
             CategoricalDtype()],
            [None, ["a", "b"], True, None, dtype1],
            [c, None, None, dtype2, dtype2],
            [c, ["x", "y"], False, None, dtype2],
        ],
    )
    def test_from_values_or_dtype(self, values, categories, ordered, dtype,
                                  expected):
        # an explicit `dtype` wins over the dtype of `values`
        result = CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype)
        assert result == expected

    @pytest.mark.parametrize(
        "values, categories, ordered, dtype",
        [
            [None, ["a", "b"], True, dtype2],
            [None, ["a", "b"], None, dtype2],
            [None, None, True, dtype2],
        ],
    )
    def test_from_values_or_dtype_raises(self, values, categories, ordered,
                                         dtype):
        # categories/ordered are mutually exclusive with an explicit dtype
        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype._from_values_or_dtype(values, categories, ordered,
                                                   dtype)

    def test_from_values_or_dtype_invalid_dtype(self):
        # NOTE: the double negative ("Cannot not") matches pandas' actual
        # error text; do not "fix" the expected message here.
        msg = "Cannot not construct CategoricalDtype from <class 'object'>"
        with pytest.raises(ValueError, match=msg):
            CategoricalDtype._from_values_or_dtype(None, None, None, object)

    def test_is_dtype(self, dtype):
        assert CategoricalDtype.is_dtype(dtype)
        assert CategoricalDtype.is_dtype("category")
        assert CategoricalDtype.is_dtype(CategoricalDtype())
        assert not CategoricalDtype.is_dtype("foo")
        assert not CategoricalDtype.is_dtype(np.float64)

    def test_basic(self, dtype):
        # is_categorical_dtype works on dtypes, Series, and rejects others

        assert is_categorical_dtype(dtype)

        factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])

        s = Series(factor, name="A")

        # dtypes
        assert is_categorical_dtype(s.dtype)
        assert is_categorical_dtype(s)
        assert not is_categorical_dtype(np.dtype("float64"))

        with tm.assert_produces_warning(FutureWarning):
            # GH#33385 deprecated
            assert is_categorical(s.dtype)
            assert is_categorical(s)
            assert not is_categorical(np.dtype("float64"))
            assert not is_categorical(1.0)

    def test_tuple_categories(self):
        # tuples are valid scalar categories
        categories = [(1, "a"), (2, "b"), (3, "c")]
        result = CategoricalDtype(categories)
        assert all(result.categories == categories)

    @pytest.mark.parametrize(
        "categories, expected",
        [
            ([True, False], True),
            ([True, False, None], True),
            ([True, False, "a", "b'"], False),
            ([0, 1], False),
        ],
    )
    def test_is_boolean(self, categories, expected):
        # boolean-ness of the dtype must be visible via is_bool_dtype too
        cat = Categorical(categories)
        assert cat.dtype._is_boolean is expected
        assert is_bool_dtype(cat) is expected
        assert is_bool_dtype(cat.dtype) is expected

    def test_dtype_specific_categorical_dtype(self):
        # datetime categories keep their datetime64 dtype
        expected = "datetime64[ns]"
        result = str(Categorical(DatetimeIndex([])).categories.dtype)
        assert result == expected
    def test_isna(self):
        # NaN values show up as True in isna(), one flag per element
        exp = np.array([False, False, True])
        c = Categorical(["a", "b", np.nan])
        res = c.isna()

        tm.assert_numpy_array_equal(res, exp)
Exemplo n.º 26
0
 def test_from_categorical_dtype_identity(self):
     c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
     # Identity test for no changes
     c2 = CategoricalDtype._from_categorical_dtype(c1)
     assert c2 is c1
Exemplo n.º 27
0
def process_type(t, df, plot_args, benchmark_order):
    """
    Dispatch on plot type ``t``: transform ``df`` via the `prepare` module
    and pick a `draw` plot class, mutating ``plot_args`` in place.

    Parameters
    ----------
    t : str
        Plot type; one of "perf", "mem", "multi", "cache", "instr",
        "misc_stat", "ku_instr", "native_mem_access", "ipc".
        Anything else logs an error and exits the process.
    df : DataFrame
        Raw measurement data; columns depend on ``t``.
    plot_args : dict
        Updated in place with type-specific plotting options
        (not returned; dicts are passed by reference).
    benchmark_order : list-like
        Desired benchmark ordering, forwarded to
        ``prepare.reorder_and_rename_benchmarks``.

    Returns
    -------
    (plot, columns)
        ``plot`` is a `draw` plot instance; ``columns`` is the list of
        stacked-bar columns (empty for non-stacked plot types).
    """
    columns = []

    # type specific processing
    if t == "perf":
        # runtime overhead normalized to native
        df = prepare.calculate_overhead(df)
        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotOverhead()
        plot_args.update({
            "ylabel": "Normalized runtime\n(w.r.t. native)",
            "logy": True,
        })

    elif t == "mem":
        # peak memory ("maxsize") overhead normalized to native
        df = prepare.calculate_overhead(df, column="maxsize")
        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotOverhead()
        plot_args.update({
            "ylabel": "Memory overhead\n(w.r.t. native)",
            "logy": True,
        })

    elif t == "multi":
        # speedup of 8 threads over the 2-thread baseline
        df = prepare.calculate_multithreading_overhead(df, over=2)
        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        # Categorical with [8] keeps only the 8-thread rows for sorting
        df["threads"] = Categorical(df["threads"], [8])
        df.sort_values(["threads"], inplace=True)

        plot = draw.BarplotMultithreaded()
        plot_args.update({
            "ylabel": "Speedup of 8 threads \nw.r.t. 2 threads",
            "ylim": (0.9, 4.5),
            "logy": True,
        })

    elif t == "cache":
        # values over 1000 instructions
        columns = ["l1_dcache_loads", "l1_dcache_load_misses", "l1_dcache_stores", "l1_dcache_store_misses",
                   "llc_loads", "llc_load_misses", "llc_stores", "llc_store_misses"]
        df = prepare.calculate_ratio(df, columns, "instructions")
        for c in columns:
            df[c] *= 100

        # differences: derive per-level hit counts from the raw counters
        df["L1 load hits"] = df["l1_dcache_loads"] - df["l1_dcache_load_misses"]
        df["LLC load hits"] = df["llc_loads"] - df["llc_load_misses"]
        df["L2 load hits"] = df["l1_dcache_load_misses"] - df["LLC load hits"]
        df["LLC load misses"] = df["llc_load_misses"]

        # df["l1_dcache_store_hits"] = df["l1_dcache_stores"] - df["l1_dcache_store_misses"]
        df["L1 store hits"] = df["l1_dcache_stores"] - df["llc_store_misses"]
        df["LLC store misses"] = df["llc_store_misses"]

        # ordering
        prepare.reorder_compilers(df, t)
        df = df.dropna(subset=['compilertype', 'name'])

        labels = df["compilertype"].unique()

        columns = ["L1 load hits", "L1 store hits",
                   "L2 load hits",
                   "LLC load hits", "LLC load misses", "LLC store misses"]

        plot = draw.BarplotClusteredStacked()
        plot_args.update({
            "ylabel": "Cache hits and misses\n(w.r.t. all instructions, %)",
            "xlabels": labels,
            "ylim": (0, 100),
            "yticks": range(0, 150, 20),
            "df_callback": prepare.reorder_and_rename_benchmarks,
            "df_callback_args": (benchmark_order,)
        })

    elif t == "instr":
        # retired-instruction overhead normalized to native
        df = prepare.calculate_overhead(df, column="instructions")
        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotOverhead()
        plot_args.update({
            "ylabel": "Instruction overhead\n(w.r.t. native)",
            "logy": True,
        })

    elif t == "misc_stat":
        # values over 1000 instructions
        columns = ["dtlb_stores", "dtlb_store_misses", "dtlb_load_misses", "dtlb_loads", "branch_misses",
                   "branch_instructions"]
        df = prepare.calculate_ratio(df, columns, "instructions")
        for c in columns:
            df[c] *= 100

        # differences: convert totals into hit counts (total - misses)
        df["dtlb_stores"] -= df["dtlb_store_misses"]
        df["dtlb_loads"] -= df["dtlb_load_misses"]
        df["branch_instructions"] -= df["branch_misses"]

        prepare.reorder_compilers(df, t)
        df = df.dropna(subset=['compilertype', 'name'])

        labels = df["compilertype"].unique()

        plot = draw.BarplotClusteredStacked()
        plot_args.update({
            "ylabel": "Other statistics\n(w.r.t. all instructions, %)",
            "xlabels": labels,
            "ylim": (0, 100),
            "yticks": range(0, 150, 20),
            "df_callback": prepare.reorder_and_rename_benchmarks,
            "df_callback_args": (benchmark_order,)
        })

    elif t == "ku_instr":
        # kernel-side instruction overhead (perf "instructions:k" counter)
        df = prepare.calculate_overhead(df, column="instructions:k")
        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotOverhead()
        plot_args.update({
            "ylabel": "Kernel instruction overhead\n(w.r.t. native)",
            "logy": True,
        })

    elif t == "native_mem_access":
        # values over 1000 instructions
        columns = ["l1_dcache_loads", "l1_dcache_stores"]
        df = prepare.calculate_ratio(df, columns, "instructions")

        # loads + stores as a percentage of all instructions
        df["overhead"] = df["l1_dcache_loads"] + df["l1_dcache_stores"]
        df["overhead"] *= 100

        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotOverhead()
        plot_args.update({
            "ylabel": "Memory accesses\n(w.r.t. all instructions, %)",
            "ylim": (0, 115),
            "yticks": range(0, 101, 20),
        })

    elif t == "ipc":
        # IPC (instructions/cycle) is saved in "instructions" column
        df = prepare.calculate_ratio(df, ["instructions"], "cycles")
        df["overhead"] = df["instructions"]

        prepare.reorder_and_rename_benchmarks(df, benchmark_order)
        prepare.reorder_compilers(df, t)

        plot = draw.BarplotWithNative()
        plot_args.update({
            "ylabel": "Processor IPC\n(instructions/cycle)",
            "ylim": (0, 5.4),
            "yticks": range(0, 10, 1),
            "ncol": 6,
        })

    else:
        # NOTE(review): `exit` is the interactive site helper; sys.exit(1)
        # would be more robust in scripts — confirm before changing.
        logging.error("Unknown plot type")
        exit(-1)

    # no need to return plot_args, dict is mutable and is passed by reference
    return plot, columns
Exemplo n.º 28
0
 def test_from_categorical_dtype_categories(self):
     c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True)
     # override categories
     result = CategoricalDtype._from_categorical_dtype(c1,
                                                       categories=[2, 3])
     assert result == CategoricalDtype([2, 3], ordered=True)
Exemplo n.º 29
0
    def test_categorical_delegations(self):
        # Verify Series.cat delegation: invalid access raises, and the
        # categorical attributes/methods round-trip through the accessor.

        # invalid accessor: .cat only exists for 'category' dtype
        pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat)
        tm.assert_raises_regex(
            AttributeError,
            r"Can only use .cat accessor with a 'category' dtype",
            lambda: Series([1, 2, 3]).cat)
        pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat)
        pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat)
        pytest.raises(AttributeError,
                      lambda: Series([Timestamp('20130101')]).cat)

        # Series should delegate calls to '.categories', '.codes', '.ordered'
        # and the methods '.set_categories()' 'drop_unused_categories()' to the
        # categorical
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        exp_categories = Index(["a", "b", "c"])
        tm.assert_index_equal(s.cat.categories, exp_categories)
        # assigning to .categories relabels in place
        s.cat.categories = [1, 2, 3]
        exp_categories = Index([1, 2, 3])
        tm.assert_index_equal(s.cat.categories, exp_categories)

        exp_codes = Series([0, 1, 2, 0], dtype='int8')
        tm.assert_series_equal(s.cat.codes, exp_codes)

        # toggling ordered via the accessor
        assert s.cat.ordered
        s = s.cat.as_unordered()
        assert not s.cat.ordered
        s.cat.as_ordered(inplace=True)
        assert s.cat.ordered

        # reorder
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        exp_categories = Index(["c", "b", "a"])
        exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
        s = s.cat.set_categories(["c", "b", "a"])
        tm.assert_index_equal(s.cat.categories, exp_categories)
        tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
        tm.assert_numpy_array_equal(s.__array__(), exp_values)

        # remove unused categories
        s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c"
                                                                 ]))
        exp_categories = Index(["a", "b"])
        exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_)
        s = s.cat.remove_unused_categories()
        tm.assert_index_equal(s.cat.categories, exp_categories)
        tm.assert_numpy_array_equal(s.values.__array__(), exp_values)
        tm.assert_numpy_array_equal(s.__array__(), exp_values)

        # This method is likely to be confused, so test that it raises an error
        # on wrong inputs:
        def f():
            s.set_categories([4, 3, 2, 1])

        pytest.raises(Exception, f)
        # right: s.cat.set_categories([4,3,2,1])

        # GH18862 (let Series.cat.rename_categories take callables)
        s = Series(Categorical(["a", "b", "c", "a"], ordered=True))
        result = s.cat.rename_categories(lambda x: x.upper())
        expected = Series(Categorical(["A", "B", "C", "A"],
                                      categories=["A", "B", "C"],
                                      ordered=True))
        tm.assert_series_equal(result, expected)
Exemplo n.º 30
0
 def test_constructor_str_unknown(self):
     # Any dtype string other than "category" must be rejected.
     with pytest.raises(ValueError, match="Unknown dtype"):
         Categorical([1, 2], dtype="foo")