def test_categorical_1d_only(self): # ndim > 1 msg = "> 1 ndim Categorical are not supported at this time" with pytest.raises(NotImplementedError, match=msg): Categorical(np.array([list("abcd")]))
def test_mode(self, values, categories, exp_mode): s = Categorical(values, categories=categories, ordered=True) res = s.mode() exp = Categorical(exp_mode, categories=categories, ordered=True) tm.assert_categorical_equal(res, exp)
def create_block(typestr, placement, item_shape=None, num_offset=0): """ Supported typestr: * float, f8, f4, f2 * int, i8, i4, i2, i1 * uint, u8, u4, u2, u1 * complex, c16, c8 * bool * object, string, O * datetime, dt, M8[ns], M8[ns, tz] * timedelta, td, m8[ns] * sparse (SparseArray with fill_value=0.0) * sparse_na (SparseArray with fill_value=np.nan) * category, category2 """ placement = BlockPlacement(placement) num_items = len(placement) if item_shape is None: item_shape = (N, ) shape = (num_items, ) + item_shape mat = get_numeric_mat(shape) if typestr in ( "float", "f8", "f4", "f2", "int", "i8", "i4", "i2", "i1", "uint", "u8", "u4", "u2", "u1", ): values = mat.astype(typestr) + num_offset elif typestr in ("complex", "c16", "c8"): values = 1.0j * (mat.astype(typestr) + num_offset) elif typestr in ("object", "string", "O"): values = np.reshape([f"A{i:d}" for i in mat.ravel() + num_offset], shape) elif typestr in ("b", "bool"): values = np.ones(shape, dtype=np.bool_) elif typestr in ("datetime", "dt", "M8[ns]"): values = (mat * 1e9).astype("M8[ns]") elif typestr.startswith("M8[ns"): # datetime with tz m = re.search(r"M8\[ns,\s*(\w+\/?\w*)\]", typestr) assert m is not None, f"incompatible typestr -> {typestr}" tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) elif typestr in ("timedelta", "td", "m8[ns]"): values = (mat * 1).astype("m8[ns]") elif typestr in ("category", ): values = Categorical([1, 1, 2, 2, 3, 3, 3, 3, 4, 4]) elif typestr in ("category2", ): values = Categorical( ["a", "a", "a", "a", "b", "b", "c", "c", "c", "d"]) elif typestr in ("sparse", "sparse_na"): # FIXME: doesn't support num_rows != 10 assert shape[-1] == 10 assert all(s == 1 for s in shape[:-1]) if typestr.endswith("_na"): fill_value = np.nan else: fill_value = 0.0 values = SparseArray( [fill_value, fill_value, 1, 2, 3, fill_value, 4, 5, fill_value, 6], fill_value=fill_value, ) arr = values.sp_values.view() arr += num_offset - 1 else: raise ValueError(f'Unsupported typestr: "{typestr}"') return make_block(values, placement=placement, ndim=len(shape))
def test_qcut_index(self): result = qcut([0, 2], 2) intervals = [Interval(-0.001, 1), Interval(1, 2)] expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected)
def test_repeat(self): # GH10183 cat = Categorical(["a", "b"], categories=["a", "b"]) exp = Categorical(["a", "a", "b", "b"], categories=["a", "b"]) res = cat.repeat(2) tm.assert_categorical_equal(res, exp)
class TestSeriesFillNA: def test_fillna_nat(self): series = Series([0, 1, 2, NaT.value], dtype="M8[ns]") filled = series.fillna(method="pad") filled2 = series.fillna(value=series.values[2]) expected = series.copy() expected.values[3] = expected.values[2] tm.assert_series_equal(filled, expected) tm.assert_series_equal(filled2, expected) df = DataFrame({"A": series}) filled = df.fillna(method="pad") filled2 = df.fillna(value=series.values[2]) expected = DataFrame({"A": expected}) tm.assert_frame_equal(filled, expected) tm.assert_frame_equal(filled2, expected) series = Series([NaT.value, 0, 1, 2], dtype="M8[ns]") filled = series.fillna(method="bfill") filled2 = series.fillna(value=series[1]) expected = series.copy() expected[0] = expected[1] tm.assert_series_equal(filled, expected) tm.assert_series_equal(filled2, expected) df = DataFrame({"A": series}) filled = df.fillna(method="bfill") filled2 = df.fillna(value=series[1]) expected = DataFrame({"A": expected}) tm.assert_frame_equal(filled, expected) tm.assert_frame_equal(filled2, expected) def test_fillna(self, datetime_series): ts = Series([0.0, 1.0, 2.0, 3.0, 4.0], index=tm.makeDateIndex(5)) tm.assert_series_equal(ts, ts.fillna(method="ffill")) ts[2] = np.NaN exp = Series([0.0, 1.0, 1.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(method="ffill"), exp) exp = Series([0.0, 1.0, 3.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(method="backfill"), exp) exp = Series([0.0, 1.0, 5.0, 3.0, 4.0], index=ts.index) tm.assert_series_equal(ts.fillna(value=5), exp) msg = "Must specify a fill 'value' or 'method'" with pytest.raises(ValueError, match=msg): ts.fillna() msg = "Cannot specify both 'value' and 'method'" with pytest.raises(ValueError, match=msg): datetime_series.fillna(value=0, method="ffill") # GH#5703 s1 = Series([np.nan]) s2 = Series([1]) result = s1.fillna(s2) expected = Series([1.0]) tm.assert_series_equal(result, expected) result = s1.fillna({}) tm.assert_series_equal(result, s1) result = s1.fillna(Series((), dtype=object)) tm.assert_series_equal(result, s1) result = s2.fillna(s1) tm.assert_series_equal(result, s2) result = s1.fillna({0: 1}) tm.assert_series_equal(result, expected) result = s1.fillna({1: 1}) tm.assert_series_equal(result, Series([np.nan])) result = s1.fillna({0: 1, 1: 1}) tm.assert_series_equal(result, expected) result = s1.fillna(Series({0: 1, 1: 1})) tm.assert_series_equal(result, expected) result = s1.fillna(Series({0: 1, 1: 1}, index=[4, 5])) tm.assert_series_equal(result, s1) s1 = Series([0, 1, 2], list("abc")) s2 = Series([0, np.nan, 2], list("bac")) result = s2.fillna(s1) expected = Series([0, 0, 2.0], list("bac")) tm.assert_series_equal(result, expected) # limit ser = Series(np.nan, index=[0, 1, 2]) result = ser.fillna(999, limit=1) expected = Series([999, np.nan, np.nan], index=[0, 1, 2]) tm.assert_series_equal(result, expected) result = ser.fillna(999, limit=2) expected = Series([999, 999, np.nan], index=[0, 1, 2]) tm.assert_series_equal(result, expected) # GH#9043 # make sure a string representation of int/float values can be filled # correctly without raising errors or being converted vals = ["0", "1.5", "-0.3"] for val in vals: ser = Series([0, 1, np.nan, np.nan, 4], dtype="float64") result = ser.fillna(val) expected = Series([0, 1, val, val, 4], dtype="object") tm.assert_series_equal(result, expected) def test_fillna_consistency(self): # GH#16402 # fillna with a tz aware to a tz-naive, should result in object ser = Series([Timestamp("20130101"), NaT]) result = ser.fillna(Timestamp("20130101", tz="US/Eastern")) expected = Series( [Timestamp("20130101"), Timestamp("2013-01-01", tz="US/Eastern")], dtype="object", ) tm.assert_series_equal(result, expected) # where (we ignore the errors=) result = ser.where([True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore") tm.assert_series_equal(result, expected) result = ser.where([True, False], Timestamp("20130101", tz="US/Eastern"), errors="ignore") tm.assert_series_equal(result, expected) # with a non-datetime result = ser.fillna("foo") expected = Series([Timestamp("20130101"), "foo"]) tm.assert_series_equal(result, expected) # assignment ser2 = ser.copy() ser2[1] = "foo" tm.assert_series_equal(ser2, expected) def test_fillna_downcast(self): # GH#15277 # infer int64 from float64 ser = Series([1.0, np.nan]) result = ser.fillna(0, downcast="infer") expected = Series([1, 0]) tm.assert_series_equal(result, expected) # infer int64 from float64 when fillna value is a dict ser = Series([1.0, np.nan]) result = ser.fillna({1: 0}, downcast="infer") expected = Series([1, 0]) tm.assert_series_equal(result, expected) def test_timedelta_fillna(self, frame_or_series): # GH#3371 ser = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), ]) td = ser.diff() obj = frame_or_series(td) # reg fillna result = obj.fillna(Timedelta(seconds=0)) expected = Series([ timedelta(0), timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ]) expected = frame_or_series(expected) tm.assert_equal(result, expected) # interpreted as seconds, deprecated with pytest.raises(TypeError, match="Passing integers to fillna"): obj.fillna(1) result = obj.fillna(Timedelta(seconds=1)) expected = Series([ timedelta(seconds=1), timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ]) expected = frame_or_series(expected) tm.assert_equal(result, expected) result = obj.fillna(timedelta(days=1, seconds=1)) expected = Series([ timedelta(days=1, seconds=1), timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ]) expected = frame_or_series(expected) tm.assert_equal(result, expected) result = obj.fillna(np.timedelta64(10**9)) expected = Series([ timedelta(seconds=1), timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ]) expected = frame_or_series(expected) tm.assert_equal(result, expected) result = obj.fillna(NaT) expected = Series( [ NaT, timedelta(0), timedelta(1), timedelta(days=1, seconds=9 * 3600 + 60 + 1), ], dtype="m8[ns]", ) expected = frame_or_series(expected) tm.assert_equal(result, expected) # ffill td[2] = np.nan obj = frame_or_series(td) result = obj.ffill() expected = td.fillna(Timedelta(seconds=0)) expected[0] = np.nan expected = frame_or_series(expected) tm.assert_equal(result, expected) # bfill td[2] = np.nan obj = frame_or_series(td) result = obj.bfill() expected = td.fillna(Timedelta(seconds=0)) expected[2] = timedelta(days=1, seconds=9 * 3600 + 60 + 1) expected = frame_or_series(expected) tm.assert_equal(result, expected) def test_datetime64_fillna(self): ser = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), ]) ser[2] = np.nan # ffill result = ser.ffill() expected = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130103 9:01:01"), ]) tm.assert_series_equal(result, expected) # bfill result = ser.bfill() expected = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130103 9:01:01"), Timestamp("20130103 9:01:01"), ]) tm.assert_series_equal(result, expected) # GH#6587 # make sure that we are treating as integer when filling # this also tests inference of a datetime-like with NaT's ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) expected = Series( [ "2013-08-05 15:30:00.000001", "2013-08-05 15:30:00.000001", "2013-08-05 15:30:00.000001", ], dtype="M8[ns]", ) result = ser.fillna(method="backfill") tm.assert_series_equal(result, expected) @pytest.mark.parametrize("tz", ["US/Eastern", "Asia/Tokyo"]) def test_datetime64_tz_fillna(self, tz): # DatetimeBlock ser = Series([ Timestamp("2011-01-01 10:00"), NaT, Timestamp("2011-01-03 10:00"), NaT, ]) null_loc = Series([False, True, False, True]) result = ser.fillna(Timestamp("2011-01-02 10:00")) expected = Series([ Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00"), ]) tm.assert_series_equal(expected, result) # check s is not changed tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) expected = Series([ Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna("AAA") expected = Series( [ Timestamp("2011-01-01 10:00"), "AAA", Timestamp("2011-01-03 10:00"), "AAA", ], dtype=object, ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna({ 1: Timestamp("2011-01-02 10:00", tz=tz), 3: Timestamp("2011-01-04 10:00"), }) expected = Series([ Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-04 10:00"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna({ 1: Timestamp("2011-01-02 10:00"), 3: Timestamp("2011-01-04 10:00") }) expected = Series([ Timestamp("2011-01-01 10:00"), Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-04 10:00"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) # DatetimeBlockTZ idx = DatetimeIndex(["2011-01-01 10:00", NaT, "2011-01-03 10:00", NaT], tz=tz) ser = Series(idx) assert ser.dtype == f"datetime64[ns, {tz}]" tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00")) expected = Series([ Timestamp("2011-01-01 10:00", tz=tz), Timestamp("2011-01-02 10:00"), Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-02 10:00"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("2011-01-02 10:00", tz=tz)) idx = DatetimeIndex( [ "2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00", ], tz=tz, ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna( Timestamp("2011-01-02 10:00", tz=tz).to_pydatetime()) idx = DatetimeIndex( [ "2011-01-01 10:00", "2011-01-02 10:00", "2011-01-03 10:00", "2011-01-02 10:00", ], tz=tz, ) expected = Series(idx) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna("AAA") expected = Series( [ Timestamp("2011-01-01 10:00", tz=tz), "AAA", Timestamp("2011-01-03 10:00", tz=tz), "AAA", ], dtype=object, ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna({ 1: Timestamp("2011-01-02 10:00", tz=tz), 3: Timestamp("2011-01-04 10:00"), }) expected = Series([ Timestamp("2011-01-01 10:00", tz=tz), Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-04 10:00"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna({ 1: Timestamp("2011-01-02 10:00", tz=tz), 3: Timestamp("2011-01-04 10:00", tz=tz), }) expected = Series([ Timestamp("2011-01-01 10:00", tz=tz), Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2011-01-04 10:00", tz=tz), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) # filling with a naive/other zone, coerce to object result = ser.fillna(Timestamp("20130101")) expected = Series([ Timestamp("2011-01-01 10:00", tz=tz), Timestamp("2013-01-01"), Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2013-01-01"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) result = ser.fillna(Timestamp("20130101", tz="US/Pacific")) expected = Series([ Timestamp("2011-01-01 10:00", tz=tz), Timestamp("2013-01-01", tz="US/Pacific"), Timestamp("2011-01-03 10:00", tz=tz), Timestamp("2013-01-01", tz="US/Pacific"), ]) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) def test_fillna_dt64tz_with_method(self): # with timezone # GH#15855 ser = Series([Timestamp("2012-11-11 00:00:00+01:00"), NaT]) exp = Series([ Timestamp("2012-11-11 00:00:00+01:00"), Timestamp("2012-11-11 00:00:00+01:00"), ]) tm.assert_series_equal(ser.fillna(method="pad"), exp) ser = Series([NaT, Timestamp("2012-11-11 00:00:00+01:00")]) exp = Series([ Timestamp("2012-11-11 00:00:00+01:00"), Timestamp("2012-11-11 00:00:00+01:00"), ]) tm.assert_series_equal(ser.fillna(method="bfill"), exp) def test_fillna_pytimedelta(self): # GH#8209 ser = Series([np.nan, Timedelta("1 days")], index=["A", "B"]) result = ser.fillna(timedelta(1)) expected = Series(Timedelta("1 days"), index=["A", "B"]) tm.assert_series_equal(result, expected) def test_fillna_period(self): # GH#13737 ser = Series([Period("2011-01", freq="M"), Period("NaT", freq="M")]) res = ser.fillna(Period("2012-01", freq="M")) exp = Series( [Period("2011-01", freq="M"), Period("2012-01", freq="M")]) tm.assert_series_equal(res, exp) assert res.dtype == "Period[M]" def test_fillna_dt64_timestamp(self, frame_or_series): ser = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130102"), Timestamp("20130103 9:01:01"), ]) ser[2] = np.nan obj = frame_or_series(ser) # reg fillna result = obj.fillna(Timestamp("20130104")) expected = Series([ Timestamp("20130101"), Timestamp("20130101"), Timestamp("20130104"), Timestamp("20130103 9:01:01"), ]) expected = frame_or_series(expected) tm.assert_equal(result, expected) result = obj.fillna(NaT) expected = obj tm.assert_equal(result, expected) def test_fillna_dt64_non_nao(self): # GH#27419 ser = Series([Timestamp("2010-01-01"), NaT, Timestamp("2000-01-01")]) val = np.datetime64("1975-04-05", "ms") result = ser.fillna(val) expected = Series([ Timestamp("2010-01-01"), Timestamp("1975-04-05"), Timestamp("2000-01-01") ]) tm.assert_series_equal(result, expected) def test_fillna_numeric_inplace(self): x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) y = x.copy() return_value = y.fillna(value=0, inplace=True) assert return_value is None expected = x.fillna(value=0) tm.assert_series_equal(y, expected) # --------------------------------------------------------------- # CategoricalDtype @pytest.mark.parametrize( "fill_value, expected_output", [ ("a", ["a", "a", "b", "a", "a"]), ({ 1: "a", 3: "b", 4: "b" }, ["a", "a", "b", "b", "b"]), ({ 1: "a" }, ["a", "a", "b", np.nan, np.nan]), ({ 1: "a", 3: "b" }, ["a", "a", "b", "b", np.nan]), (Series("a"), ["a", np.nan, "b", np.nan, np.nan]), (Series("a", index=[1]), ["a", "a", "b", np.nan, np.nan]), (Series({ 1: "a", 3: "b" }), ["a", "a", "b", "b", np.nan]), (Series(["a", "b"], index=[3, 4]), ["a", np.nan, "b", "a", "b"]), ], ) def test_fillna_categorical(self, fill_value, expected_output): # GH#17033 # Test fillna for a Categorical series data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) exp = Series(Categorical(expected_output, categories=["a", "b"])) result = ser.fillna(fill_value) tm.assert_series_equal(result, exp) @pytest.mark.parametrize( "fill_value, expected_output", [ (Series(["a", "b", "c", "d", "e"]), ["a", "b", "b", "d", "e"]), (Series(["b", "d", "a", "d", "a"]), ["a", "d", "b", "d", "a"]), ( Series( Categorical(["b", "d", "a", "d", "a"], categories=["b", "c", "d", "e", "a"])), ["a", "d", "b", "d", "a"], ), ], ) def test_fillna_categorical_with_new_categories(self, fill_value, expected_output): # GH#26215 data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b", "c", "d", "e"])) exp = Series( Categorical(expected_output, categories=["a", "b", "c", "d", "e"])) result = ser.fillna(fill_value) tm.assert_series_equal(result, exp) def test_fillna_categorical_raises(self): data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) msg = "Cannot setitem on a Categorical with a new category" with pytest.raises(ValueError, match=msg): ser.fillna("d") with pytest.raises(ValueError, match=msg): ser.fillna(Series("d")) with pytest.raises(ValueError, match=msg): ser.fillna({1: "d", 3: "a"}) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): ser.fillna(["a", "b"]) msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): ser.fillna(("a", "b")) msg = ('"value" parameter must be a scalar, dict ' 'or Series, but you passed a "DataFrame"') with pytest.raises(TypeError, match=msg): ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) # --------------------------------------------------------------- # Invalid Usages def test_fillna_invalid_method(self, datetime_series): try: datetime_series.fillna(method="ffil") except ValueError as inst: assert "ffil" in str(inst) def test_fillna_listlike_invalid(self): ser = Series(np.random.randint(-100, 100, 50)) msg = '"value" parameter must be a scalar or dict, but you passed a "list"' with pytest.raises(TypeError, match=msg): ser.fillna([1, 2]) msg = '"value" parameter must be a scalar or dict, but you passed a "tuple"' with pytest.raises(TypeError, match=msg): ser.fillna((1, 2)) def test_fillna_method_and_limit_invalid(self): # related GH#9217, make sure limit is an int and greater than 0 ser = Series([1, 2, 3, None]) msg = (r"Cannot specify both 'value' and 'method'\.|" r"Limit must be greater than 0|" "Limit must be an integer") for limit in [-1, 0, 1.0, 2.0]: for method in ["backfill", "bfill", "pad", "ffill", None]: with pytest.raises(ValueError, match=msg): ser.fillna(1, limit=limit, method=method) def test_fillna_datetime64_with_timezone_tzinfo(self): # https://github.com/pandas-dev/pandas/issues/38851 s = Series(date_range("2020", periods=3, tz="UTC")) expected = s.astype(object) s[1] = NaT result = s.fillna(datetime(2020, 1, 2, tzinfo=timezone.utc)) tm.assert_series_equal(result, expected)
def test_observed(observed): # multiple groupers, don't re-expand the output space # of the grouper # gh-14942 (implement) # gh-10132 (back-compat) # gh-8138 (back-compat) # gh-8869 cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) df['C'] = ['foo', 'bar'] * 2 # multiple groupers with a non-cat gb = df.groupby(['A', 'B', 'C'], observed=observed) exp_index = pd.MultiIndex.from_arrays( [cat1, cat2, ['foo', 'bar'] * 2], names=['A', 'B', 'C']) expected = DataFrame({'values': Series( [1, 2, 3, 4], index=exp_index)}).sort_index() result = gb.sum() if not observed: expected = cartesian_product_for_groupers( expected, [cat1, cat2, ['foo', 'bar']], list('ABC')) tm.assert_frame_equal(result, expected) gb = df.groupby(['A', 'B'], observed=observed) exp_index = pd.MultiIndex.from_arrays( [cat1, cat2], names=['A', 'B']) expected = DataFrame({'values': [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: expected = cartesian_product_for_groupers( expected, [cat1, cat2], list('AB')) tm.assert_frame_equal(result, expected) # https://github.com/pandas-dev/pandas/issues/8138 d = {'cat': pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True), 'ints': [1, 1, 2, 2], 'val': [10, 20, 30, 40]} df = pd.DataFrame(d) # Grouping on a single column groups_single_key = df.groupby("cat", observed=observed) result = groups_single_key.mean() exp_index = pd.CategoricalIndex(list('ab'), name="cat", categories=list('abc'), ordered=True) expected = DataFrame({"ints": [1.5, 1.5], "val": [20., 30]}, index=exp_index) if not observed: index = pd.CategoricalIndex(list('abc'), name="cat", categories=list('abc'), ordered=True) expected = expected.reindex(index) tm.assert_frame_equal(result, expected) # Grouping on two columns groups_double_key = df.groupby(["cat", "ints"], observed=observed) result = groups_double_key.agg('mean') expected = DataFrame( {"val": [10, 30, 20, 40], "cat": pd.Categorical(['a', 'a', 'b', 'b'], categories=['a', 'b', 'c'], ordered=True), "ints": [1, 2, 1, 2]}).set_index(["cat", "ints"]) if not observed: expected = cartesian_product_for_groupers( expected, [df.cat.values, [1, 2]], ['cat', 'ints']) tm.assert_frame_equal(result, expected) # GH 10132 for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: c, i = key result = groups_double_key.get_group(key) expected = df[(df.cat == c) & (df.ints == i)] assert_frame_equal(result, expected) # gh-8869 # with as_index d = {'foo': [10, 8, 4, 8, 4, 1, 1], 'bar': [10, 20, 30, 40, 50, 60, 70], 'baz': ['d', 'c', 'e', 'a', 'a', 'd', 'c']} df = pd.DataFrame(d) cat = pd.cut(df['foo'], np.linspace(0, 10, 3)) df['range'] = cat groups = df.groupby(['range', 'baz'], as_index=False, observed=observed) result = groups.agg('mean') groups2 = df.groupby(['range', 'baz'], as_index=True, observed=observed) expected = groups2.agg('mean').reset_index() tm.assert_frame_equal(result, expected)
def test_constructor_string_and_tuples(self): # GH 21416 c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) expected_index = Index([("a", "b"), ("b", "a"), "c"]) assert c.categories.equals(expected_index)
def test_constructor_datetime64_non_nano(self): categories = np.arange(10).view("M8[D]") values = categories[::2].copy() cat = Categorical(values, categories=categories) assert (cat == values).all()
def test_construction_with_ordered(self, ordered): # GH 9347, 9190 cat = Categorical([0, 1, 2], ordered=ordered) assert cat.ordered == bool(ordered)
def test_constructor_imaginary(self): values = [1, 2, 3 + 1j] c1 = Categorical(values) tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values))
def test_from_codes_with_categorical_categories(self, klass): # GH17884 expected = Categorical(["a", "b"], categories=["a", "b", "c"]) result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"])) tm.assert_categorical_equal(result, expected)
def test_from_codes_empty(self): cat = ["a", "b", "c"] result = Categorical.from_codes([], categories=cat) expected = Categorical([], categories=cat) tm.assert_categorical_equal(result, expected)
def test_constructor_np_strs(self): # GH#31499 Hastable.map_locations needs to work on np.str_ objects cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")]) assert all(isinstance(x, np.str_) for x in cat.categories)
def test_from_categorical_dtype_ordered(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override ordered result = CategoricalDtype._from_categorical_dtype(c1, ordered=False) assert result == CategoricalDtype([1, 2, 3], ordered=False)
def test_constructor_empty_boolean(self): # see gh-22702 cat = Categorical([], categories=[True, False]) categories = sorted(cat.categories.tolist()) assert categories == [False, True]
def test_categorical_categories(self): # GH17884 c1 = CategoricalDtype(Categorical(["a", "b"])) tm.assert_index_equal(c1.categories, pd.Index(["a", "b"])) c1 = CategoricalDtype(CategoricalIndex(["a", "b"])) tm.assert_index_equal(c1.categories, pd.Index(["a", "b"]))
def test_constructor_tuples(self): values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object) result = Categorical(values) expected = Index([(1,), (1, 2)], tupleize_cols=False) tm.assert_index_equal(result.categories, expected) assert result.ordered is False
def union_categoricals(to_union, sort_categories: bool = False, ignore_order: bool = False): """ Combine list-like of Categorical-like, unioning categories. All categories must have the same dtype. Parameters ---------- to_union : list-like Categorical, CategoricalIndex, or Series with dtype='category'. sort_categories : bool, default False If true, resulting categories will be lexsorted, otherwise they will be ordered as they appear in the data. ignore_order : bool, default False If true, the ordered attribute of the Categoricals will be ignored. Results in an unordered categorical. Returns ------- Categorical Raises ------ TypeError - all inputs do not have the same dtype - all inputs do not have the same ordered property - all inputs are ordered and their categories are not identical - sort_categories=True and Categoricals are ordered ValueError Empty list of categoricals passed Notes ----- To learn more about categories, see `link <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__ Examples -------- >>> from pandas.api.types import union_categoricals If you want to combine categoricals that do not necessarily have the same categories, `union_categoricals` will combine a list-like of categoricals. The new categories will be the union of the categories being combined. >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) ['b', 'c', 'a', 'b'] Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what you could also `append` for). >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) ['a', 'b', 'a', 'b', 'a'] Categories (2, object): ['a' < 'b'] Raises `TypeError` because the categories are ordered and not identical. >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "c"], ordered=True) >>> union_categoricals([a, b]) Traceback (most recent call last): ... TypeError: to union ordered Categoricals, all categories must be the same New in version 0.20.0 Ordered categoricals with different categories or orderings can be combined by using the `ignore_ordered=True` argument. >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) ['a', 'b', 'c', 'c', 'b', 'a'] Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will always be a plain `Categorical` >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) ['b', 'c', 'a', 'b'] Categories (3, object): ['b', 'c', 'a'] """ from pandas import Categorical from pandas.core.arrays.categorical import recode_for_categories if len(to_union) == 0: raise ValueError("No Categoricals to union") def _maybe_unwrap(x): if isinstance(x, (ABCCategoricalIndex, ABCSeries)): return x._values elif isinstance(x, Categorical): return x else: raise TypeError("all components to combine must be Categorical") to_union = [_maybe_unwrap(x) for x in to_union] first = to_union[0] if not all( is_dtype_equal(other.categories.dtype, first.categories.dtype) for other in to_union[1:]): raise TypeError("dtype of categories must be the same") ordered = False if all( first._categories_match_up_to_permutation(other) for other in to_union[1:]): # identical categories - fastpath categories = first.categories ordered = first.ordered all_codes = [ first._encode_with_my_categories(x)._codes for x in to_union ] new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError( "Cannot use sort_categories=True with ordered Categoricals") if sort_categories and not categories.is_monotonic_increasing: categories = categories.sort_values() indexer = categories.get_indexer(first.categories) from pandas.core.algorithms import take_nd new_codes = take_nd(indexer, new_codes, fill_value=-1) elif ignore_order or all(not c.ordered for c in to_union): # different categories - union and recode cats = first.categories.append([c.categories for c in to_union[1:]]) categories = cats.unique() if sort_categories: categories = categories.sort_values() new_codes = [ recode_for_categories(c.codes, c.categories, categories) for c in to_union ] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message if all(c.ordered for c in to_union): msg = "to union ordered Categoricals, all categories must be the same" raise TypeError(msg) else: raise TypeError("Categorical.ordered must be the same") if ignore_order: ordered = False return Categorical(new_codes, categories=categories, ordered=ordered, fastpath=True)
def test_is_boolean(self, categories, expected): cat = Categorical(categories) assert cat.dtype._is_boolean is expected assert is_bool_dtype(cat) is expected assert is_bool_dtype(cat.dtype) is expected
def test_basic(): cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) exp_index = CategoricalIndex(list('abcd'), name='b', ordered=True) expected = DataFrame({'a': [1, 2, 4, np.nan]}, index=exp_index) result = data.groupby("b", observed=False).mean() tm.assert_frame_equal(result, expected) cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True) cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True) df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]}) # single grouper gb = df.groupby("A", observed=False) exp_idx = CategoricalIndex(['a', 'b', 'z'], name='A', ordered=True) expected = DataFrame({'values': Series([3, 7, 0], index=exp_idx)}) result = gb.sum() tm.assert_frame_equal(result, expected) # GH 8623 x = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'], [1, 'John P. Doe']], columns=['person_id', 'person_name']) x['person_name'] = Categorical(x.person_name) g = x.groupby(['person_id'], observed=False) result = g.transform(lambda x: x) tm.assert_frame_equal(result, x[['person_name']]) result = x.drop_duplicates('person_name') expected = x.iloc[[0, 1]] tm.assert_frame_equal(result, expected) def f(x): return x.drop_duplicates('person_name').iloc[0] result = g.apply(f) expected = x.iloc[[0, 1]].copy() expected.index = Index([1, 2], name='person_id') expected['person_name'] = expected['person_name'].astype('object') tm.assert_frame_equal(result, expected) # GH 9921 # Monotonic df = DataFrame({"a": [5, 15, 25]}) c = pd.cut(df.a, bins=[0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.max(xs)), df[['a']]) # Filter tm.assert_series_equal( df.a.groupby(c, observed=False).filter(np.all), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).filter(np.all), df) # Non-monotonic df = DataFrame({"a": [5, 15, 25, -5]}) c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40]) result = df.a.groupby(c, observed=False).transform(sum) tm.assert_series_equal(result, df['a']) tm.assert_series_equal( df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df['a']) tm.assert_frame_equal( df.groupby(c, observed=False).transform(sum), df[['a']]) tm.assert_frame_equal( df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[['a']]) # GH 9603 df = DataFrame({'a': [1, 0, 0, 0]}) c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list('abcd'))) result = df.groupby(c, observed=False).apply(len) exp_index = CategoricalIndex( c.values.categories, ordered=c.values.ordered) expected = Series([1, 0, 0, 0], index=exp_index) expected.index.name = 'a' tm.assert_series_equal(result, expected) # more basic levels = ['foo', 'bar', 'baz', 'qux'] codes = np.random.randint(0, 4, size=100) cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) grouped = data.groupby(cats, observed=False) desc_result = grouped.describe() idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) exp_cats = Categorical(ord_labels, ordered=True, categories=['foo', 'bar', 'baz', 'qux']) expected = ord_data.groupby( exp_cats, sort=False, observed=False).describe() assert_frame_equal(desc_result, expected) # GH 10460 expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index .get_level_values(0)), exp) exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'] * 4) tm.assert_index_equal((desc_result.stack().index .get_level_values(1)), exp)
def test_dtype_specific_categorical_dtype(self): expected = "datetime64[ns]" result = str(Categorical(DatetimeIndex([])).categories.dtype) assert result == expected
def test_nbytes(self): cat = Categorical([1, 2, 3]) exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories assert cat.nbytes == exp
class TestCategoricalDtype(Base): @pytest.fixture def dtype(self): """ Class level fixture of dtype for TestCategoricalDtype """ return CategoricalDtype() def test_hash_vs_equality(self, dtype): dtype2 = CategoricalDtype() assert dtype == dtype2 assert dtype2 == dtype assert hash(dtype) == hash(dtype2) def test_equality(self, dtype): assert is_dtype_equal(dtype, "category") assert is_dtype_equal(dtype, CategoricalDtype()) assert not is_dtype_equal(dtype, "foo") def test_construction_from_string(self, dtype): result = CategoricalDtype.construct_from_string("category") assert is_dtype_equal(dtype, result) msg = "Cannot construct a 'CategoricalDtype' from 'foo'" with pytest.raises(TypeError, match=msg): CategoricalDtype.construct_from_string("foo") def test_constructor_invalid(self): msg = "Parameter 'categories' must be list-like" with pytest.raises(TypeError, match=msg): CategoricalDtype("category") dtype1 = CategoricalDtype(["a", "b"], ordered=True) dtype2 = CategoricalDtype(["x", "y"], ordered=False) c = Categorical([0, 1], dtype=dtype1, fastpath=True) @pytest.mark.parametrize( "values, categories, ordered, dtype, expected", [ [None, None, None, None, CategoricalDtype()], [None, ["a", "b"], True, None, dtype1], [c, None, None, dtype2, dtype2], [c, ["x", "y"], False, None, dtype2], ], ) def test_from_values_or_dtype(self, values, categories, ordered, dtype, expected): result = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype) assert result == expected @pytest.mark.parametrize( "values, categories, ordered, dtype", [ [None, ["a", "b"], True, dtype2], [None, ["a", "b"], None, dtype2], [None, None, True, dtype2], ], ) def test_from_values_or_dtype_raises(self, values, categories, ordered, dtype): msg = "Cannot specify `categories` or `ordered` together with `dtype`." with pytest.raises(ValueError, match=msg): CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype) def test_from_values_or_dtype_invalid_dtype(self): msg = "Cannot not construct CategoricalDtype from <class 'object'>" with pytest.raises(ValueError, match=msg): CategoricalDtype._from_values_or_dtype(None, None, None, object) def test_is_dtype(self, dtype): assert CategoricalDtype.is_dtype(dtype) assert CategoricalDtype.is_dtype("category") assert CategoricalDtype.is_dtype(CategoricalDtype()) assert not CategoricalDtype.is_dtype("foo") assert not CategoricalDtype.is_dtype(np.float64) def test_basic(self, dtype): assert is_categorical_dtype(dtype) factor = Categorical(["a", "b", "b", "a", "a", "c", "c", "c"]) s = Series(factor, name="A") # dtypes assert is_categorical_dtype(s.dtype) assert is_categorical_dtype(s) assert not is_categorical_dtype(np.dtype("float64")) with tm.assert_produces_warning(FutureWarning): # GH#33385 deprecated assert is_categorical(s.dtype) assert is_categorical(s) assert not is_categorical(np.dtype("float64")) assert not is_categorical(1.0) def test_tuple_categories(self): categories = [(1, "a"), (2, "b"), (3, "c")] result = CategoricalDtype(categories) assert all(result.categories == categories) @pytest.mark.parametrize( "categories, expected", [ ([True, False], True), ([True, False, None], True), ([True, False, "a", "b'"], False), ([0, 1], False), ], ) def test_is_boolean(self, categories, expected): cat = Categorical(categories) assert cat.dtype._is_boolean is expected assert is_bool_dtype(cat) is expected assert is_bool_dtype(cat.dtype) is expected def test_dtype_specific_categorical_dtype(self): expected = "datetime64[ns]" result = str(Categorical(DatetimeIndex([])).categories.dtype) assert result == expected
def test_isna(self): exp = np.array([False, False, True]) c = Categorical(["a", "b", np.nan]) res = c.isna() tm.assert_numpy_array_equal(res, exp)
def test_from_categorical_dtype_identity(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # Identity test for no changes c2 = CategoricalDtype._from_categorical_dtype(c1) assert c2 is c1
def process_type(t, df, plot_args, benchmark_order): columns = [] # type specific processing if t == "perf": df = prepare.calculate_overhead(df) prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotOverhead() plot_args.update({ "ylabel": "Normalized runtime\n(w.r.t. native)", "logy": True, }) elif t == "mem": df = prepare.calculate_overhead(df, column="maxsize") prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotOverhead() plot_args.update({ "ylabel": "Memory overhead\n(w.r.t. native)", "logy": True, }) elif t == "multi": df = prepare.calculate_multithreading_overhead(df, over=2) prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) df["threads"] = Categorical(df["threads"], [8]) df.sort_values(["threads"], inplace=True) plot = draw.BarplotMultithreaded() plot_args.update({ "ylabel": "Speedup of 8 threads \nw.r.t. 2 threads", "ylim": (0.9, 4.5), "logy": True, }) elif t == "cache": # values over 1000 instructions columns = ["l1_dcache_loads", "l1_dcache_load_misses", "l1_dcache_stores", "l1_dcache_store_misses", "llc_loads", "llc_load_misses", "llc_stores", "llc_store_misses"] df = prepare.calculate_ratio(df, columns, "instructions") for c in columns: df[c] *= 100 # differences df["L1 load hits"] = df["l1_dcache_loads"] - df["l1_dcache_load_misses"] df["LLC load hits"] = df["llc_loads"] - df["llc_load_misses"] df["L2 load hits"] = df["l1_dcache_load_misses"] - df["LLC load hits"] df["LLC load misses"] = df["llc_load_misses"] # df["l1_dcache_store_hits"] = df["l1_dcache_stores"] - df["l1_dcache_store_misses"] df["L1 store hits"] = df["l1_dcache_stores"] - df["llc_store_misses"] df["LLC store misses"] = df["llc_store_misses"] # ordering prepare.reorder_compilers(df, t) df = df.dropna(subset=['compilertype', 'name']) labels = df["compilertype"].unique() columns = ["L1 load hits", "L1 store hits", "L2 load hits", "LLC load hits", "LLC load misses", "LLC store misses"] plot = draw.BarplotClusteredStacked() plot_args.update({ "ylabel": "Cache hits and misses\n(w.r.t. all instructions, %)", "xlabels": labels, "ylim": (0, 100), "yticks": range(0, 150, 20), "df_callback": prepare.reorder_and_rename_benchmarks, "df_callback_args": (benchmark_order,) }) elif t == "instr": df = prepare.calculate_overhead(df, column="instructions") prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotOverhead() plot_args.update({ "ylabel": "Instruction overhead\n(w.r.t. native)", "logy": True, }) elif t == "misc_stat": # values over 1000 instructions columns = ["dtlb_stores", "dtlb_store_misses", "dtlb_load_misses", "dtlb_loads", "branch_misses", "branch_instructions"] df = prepare.calculate_ratio(df, columns, "instructions") for c in columns: df[c] *= 100 # differences df["dtlb_stores"] -= df["dtlb_store_misses"] df["dtlb_loads"] -= df["dtlb_load_misses"] df["branch_instructions"] -= df["branch_misses"] prepare.reorder_compilers(df, t) df = df.dropna(subset=['compilertype', 'name']) labels = df["compilertype"].unique() plot = draw.BarplotClusteredStacked() plot_args.update({ "ylabel": "Other statistics\n(w.r.t. all instructions, %)", "xlabels": labels, "ylim": (0, 100), "yticks": range(0, 150, 20), "df_callback": prepare.reorder_and_rename_benchmarks, "df_callback_args": (benchmark_order,) }) elif t == "ku_instr": df = prepare.calculate_overhead(df, column="instructions:k") prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotOverhead() plot_args.update({ "ylabel": "Kernel instruction overhead\n(w.r.t. native)", "logy": True, }) elif t == "native_mem_access": # values over 1000 instructions columns = ["l1_dcache_loads", "l1_dcache_stores"] df = prepare.calculate_ratio(df, columns, "instructions") df["overhead"] = df["l1_dcache_loads"] + df["l1_dcache_stores"] df["overhead"] *= 100 prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotOverhead() plot_args.update({ "ylabel": "Memory accesses\n(w.r.t. all instructions, %)", "ylim": (0, 115), "yticks": range(0, 101, 20), }) elif t == "ipc": # IPC (instructions/cycle) is saved in "instructions" column df = prepare.calculate_ratio(df, ["instructions"], "cycles") df["overhead"] = df["instructions"] prepare.reorder_and_rename_benchmarks(df, benchmark_order) prepare.reorder_compilers(df, t) plot = draw.BarplotWithNative() plot_args.update({ "ylabel": "Processor IPC\n(instructions/cycle)", "ylim": (0, 5.4), "yticks": range(0, 10, 1), "ncol": 6, }) else: logging.error("Unknown plot type") exit(-1) # no need to return plot_args, dict is mutable and is passed by reference return plot, columns
def test_from_categorical_dtype_categories(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override categories result = CategoricalDtype._from_categorical_dtype(c1, categories=[2, 3]) assert result == CategoricalDtype([2, 3], ordered=True)
def test_categorical_delegations(self): # invalid accessor pytest.raises(AttributeError, lambda: Series([1, 2, 3]).cat) tm.assert_raises_regex( AttributeError, r"Can only use .cat accessor with a 'category' dtype", lambda: Series([1, 2, 3]).cat) pytest.raises(AttributeError, lambda: Series(['a', 'b', 'c']).cat) pytest.raises(AttributeError, lambda: Series(np.arange(5.)).cat) pytest.raises(AttributeError, lambda: Series([Timestamp('20130101')]).cat) # Series should delegate calls to '.categories', '.codes', '.ordered' # and the methods '.set_categories()' 'drop_unused_categories()' to the # categorical# -*- coding: utf-8 -*- s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) exp_categories = Index(["a", "b", "c"]) tm.assert_index_equal(s.cat.categories, exp_categories) s.cat.categories = [1, 2, 3] exp_categories = Index([1, 2, 3]) tm.assert_index_equal(s.cat.categories, exp_categories) exp_codes = Series([0, 1, 2, 0], dtype='int8') tm.assert_series_equal(s.cat.codes, exp_codes) assert s.cat.ordered s = s.cat.as_unordered() assert not s.cat.ordered s.cat.as_ordered(inplace=True) assert s.cat.ordered # reorder s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) exp_categories = Index(["c", "b", "a"]) exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_) s = s.cat.set_categories(["c", "b", "a"]) tm.assert_index_equal(s.cat.categories, exp_categories) tm.assert_numpy_array_equal(s.values.__array__(), exp_values) tm.assert_numpy_array_equal(s.__array__(), exp_values) # remove unused categories s = Series(Categorical(["a", "b", "b", "a"], categories=["a", "b", "c" ])) exp_categories = Index(["a", "b"]) exp_values = np.array(["a", "b", "b", "a"], dtype=np.object_) s = s.cat.remove_unused_categories() tm.assert_index_equal(s.cat.categories, exp_categories) tm.assert_numpy_array_equal(s.values.__array__(), exp_values) tm.assert_numpy_array_equal(s.__array__(), exp_values) # This method is likely to be confused, so test that it raises an error # on wrong inputs: def f(): s.set_categories([4, 3, 2, 1]) pytest.raises(Exception, f) # right: s.cat.set_categories([4,3,2,1]) # GH18862 (let Series.cat.rename_categories take callables) s = Series(Categorical(["a", "b", "c", "a"], ordered=True)) result = s.cat.rename_categories(lambda x: x.upper()) expected = Series(Categorical(["A", "B", "C", "A"], categories=["A", "B", "C"], ordered=True)) tm.assert_series_equal(result, expected)
def test_constructor_str_unknown(self): with pytest.raises(ValueError, match="Unknown dtype"): Categorical([1, 2], dtype="foo")