示例#1
0
    @pytest.mark.parametrize('bad_dtype', [
        'foo', object, np.int64, PeriodDtype('Q')])
    def test_update_dtype_errors(self, bad_dtype):
        """``update_dtype`` must raise ValueError for the given ``bad_dtype``.

        ``bad_dtype`` is supplied by the parametrization on this test.
        """
        dtype = CategoricalDtype(list('abc'), False)
        msg = 'a CategoricalDtype must be passed to perform an update, '
        # pytest.raises(match=...) replaces the deprecated
        # tm.assert_raises_regex helper; the message is still used as a
        # regular expression against the raised error.
        with pytest.raises(ValueError, match=msg):
            dtype.update_dtype(bad_dtype)


@pytest.mark.parametrize(
    'dtype',
    [
        DatetimeTZDtype,
        CategoricalDtype,
        PeriodDtype,
        IntervalDtype,
    ],
)
def test_registry(dtype):
    """Each parametrized dtype class must be listed in ``registry.dtypes``."""
    registered_dtypes = registry.dtypes
    assert dtype in registered_dtypes


@pytest.mark.parametrize(
    'dtype, expected',
    [
        ('int64', None),
        ('interval', IntervalDtype()),
        ('interval[int64]', IntervalDtype()),
        ('interval[datetime64[ns]]', IntervalDtype('datetime64[ns]')),
        ('category', CategoricalDtype()),
        ('period[D]', PeriodDtype('D')),
        ('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern')),
    ],
)
def test_registry_find(dtype, expected):
    """``registry.find(dtype)`` must compare equal to ``expected``."""
    found = registry.find(dtype)
    assert found == expected
示例#2
0
 def test_not_string(self):
     # GH30568: though IntervalDtype has object kind, it cannot be string
     assert not is_string_dtype(IntervalDtype())
示例#3
0
    def test_subclass(self):
        a = IntervalDtype('interval[int64]')
        b = IntervalDtype('interval[int64]')

        assert issubclass(type(a), type(a))
        assert issubclass(type(a), type(b))
示例#4
0
 def test_construction_from_string(self, dtype):
     result = IntervalDtype("interval[int64]")
     assert is_dtype_equal(dtype, result)
     result = IntervalDtype.construct_from_string("interval[int64]")
     assert is_dtype_equal(dtype, result)
示例#5
0
 def test_name_repr(self, subtype):
     # GH 18980
     dtype = IntervalDtype(subtype)
     expected = f"interval[{subtype}]"
     assert str(dtype) == expected
     assert dtype.name == "interval"
示例#6
0
 def test_construction(self, subtype):
     """IntervalDtype(subtype) must expose an int64 subtype."""
     interval_dtype = IntervalDtype(subtype)
     expected_subtype = np.dtype("int64")
     assert interval_dtype.subtype == expected_subtype
     assert is_interval_dtype(interval_dtype)
示例#7
0
 def test_construction_not_supported(self, subtype):
     """Constructing with the given ``subtype`` must raise TypeError."""
     # GH 19016
     expected_message = (
         "category, object, and string subtypes are not supported "
         "for IntervalDtype"
     )
     with pytest.raises(TypeError, match=expected_message):
         IntervalDtype(subtype)
示例#8
0
 def test_name_repr(self, subtype):
     # GH 18980
     dtype = IntervalDtype(subtype)
     expected = 'interval[{subtype}]'.format(subtype=subtype)
     assert str(dtype) == expected
     assert dtype.name == 'interval'
示例#9
0
 def test_subtype_datetimelike(self, index, subtype):
     """``index.astype`` to the built interval dtype must raise TypeError."""
     target_dtype = IntervalDtype(subtype, "right")
     pattern = "Cannot convert .* to .*; subtypes are incompatible"
     with pytest.raises(TypeError, match=pattern):
         index.astype(target_dtype)
示例#10
0
 def test_construction_from_string(self):
     result = IntervalDtype('interval[int64]')
     assert is_dtype_equal(self.dtype, result)
     result = IntervalDtype.construct_from_string('interval[int64]')
     assert is_dtype_equal(self.dtype, result)
示例#11
0
 def test_coerce_to_dtype(self):
     """``_coerce_to_dtype`` on the string must equal the IntervalDtype."""
     coerced = _coerce_to_dtype('interval[int64]')
     expected = IntervalDtype('interval[int64]')
     assert coerced == expected
示例#12
0
 def test_construction_errors(self):
     """``IntervalDtype('xx')`` must raise TypeError with the known message."""
     pattern = 'could not construct IntervalDtype'
     with tm.assert_raises_regex(TypeError, pattern):
         IntervalDtype('xx')
示例#13
0
 def test_construction_not_supported(self, subtype):
     """Constructing with the given ``subtype`` must raise TypeError."""
     # GH 19016
     expected_message = (
         'category, object, and string subtypes are not supported '
         'for IntervalDtype'
     )
     with tm.assert_raises_regex(TypeError, expected_message):
         IntervalDtype(subtype)
示例#14
0
 def create(self):
     """Return a new ``IntervalDtype('int64')``."""
     interval_dtype = IntervalDtype('int64')
     return interval_dtype
示例#15
0
        msg = "a CategoricalDtype must be passed to perform an update, "
        with pytest.raises(ValueError, match=msg):
            dtype.update_dtype(bad_dtype)


@pytest.mark.parametrize(
    "dtype",
    [
        CategoricalDtype,
        IntervalDtype,
        DatetimeTZDtype,
        PeriodDtype,
    ],
)
def test_registry(dtype):
    """Each parametrized dtype class must be listed in ``registry.dtypes``."""
    known_dtypes = registry.dtypes
    assert dtype in known_dtypes


@pytest.mark.parametrize(
    "dtype, expected",
    [
        ("int64", None),
        ("interval", IntervalDtype()),
        ("interval[int64]", IntervalDtype()),
        ("interval[datetime64[ns]]", IntervalDtype("datetime64[ns]")),
        ("period[D]", PeriodDtype("D")),
        ("category", CategoricalDtype()),
        ("datetime64[ns, US/Eastern]", DatetimeTZDtype("ns", "US/Eastern")),
    ],
)
def test_registry_find(dtype, expected):
    """``registry.find(dtype)`` must compare equal to ``expected``."""
    found = registry.find(dtype)
    assert found == expected


@pytest.mark.parametrize(
    "dtype, expected",
    [
        (str, False),
示例#16
0
 def test_subtype_float(self, index):
     """``index.astype`` to a float64 interval dtype must raise TypeError."""
     target_dtype = IntervalDtype("float64", "right")
     pattern = "Cannot convert .* to .*; subtypes are incompatible"
     with pytest.raises(TypeError, match=pattern):
         index.astype(target_dtype)
示例#17
0
 def dtype(self):
     """
     Provide the ``IntervalDtype("int64")`` used as the class-level dtype
     fixture for TestIntervalDtype
     """
     int64_interval = IntervalDtype("int64")
     return int64_interval
示例#18
0
        ('u4', 0),
        ('u8', 0),
        ('i1', 0),
        ('i2', 0),
        ('i4', 0),
        ('i8', 0),
        # Bool
        ('bool', False),
        # Float
        ('f2', np.nan),
        ('f4', np.nan),
        ('f8', np.nan),
        # Object
        ('O', np.nan),
        # Interval
        (IntervalDtype(), np.nan),
    ])
def test_na_value_for_dtype(dtype, na_value):
    result = na_value_for_dtype(dtype)
    assert result is na_value


class TestNAObj(object):

    _1d_methods = ['isnaobj', 'isnaobj_old']
    _2d_methods = ['isnaobj2d', 'isnaobj2d_old']

    def _check_behavior(self, arr, expected):
        for method in TestNAObj._1d_methods:
            result = getattr(libmissing, method)(arr)
            tm.assert_numpy_array_equal(result, expected)
示例#19
0
 def test_construction_generic(self, subtype):
     """A generic IntervalDtype has no subtype but is an interval dtype."""
     # generic
     generic_dtype = IntervalDtype(subtype)
     assert generic_dtype.subtype is None
     assert is_interval_dtype(generic_dtype)
示例#20
0
class TestDataFrameSetItem:
    @pytest.mark.parametrize("dtype", ["int32", "int64", "float32", "float64"])
    def test_setitem_dtype(self, dtype, float_frame):
        """A column set from a typed ndarray must report that dtype's name."""
        raw_values = np.random.randn(len(float_frame))
        typed_values = np.array(raw_values, dtype=dtype)

        float_frame[dtype] = typed_values
        column_dtype = float_frame[dtype].dtype
        assert column_dtype.name == dtype

    def test_setitem_list_not_dataframe(self, float_frame):
        """A 2-D ndarray can be assigned to a list of two column labels."""
        labels = ["A", "B"]
        values = np.random.randn(len(float_frame), 2)
        float_frame[labels] = values
        tm.assert_almost_equal(float_frame[labels].values, values)

    def test_setitem_error_msmgs(self):
        """Failing column assignments must raise with the expected messages."""
        # GH 7432
        frame = DataFrame(
            {"bar": [1, 2, 3], "baz": ["d", "e", "f"]},
            index=Index(["a", "b", "c"], name="foo"),
        )
        dup_indexed = Series(
            ["g", "h", "i", "j"],
            index=Index(["a", "b", "c", "a"], name="foo"),
            name="fiz",
        )
        reindex_msg = "cannot reindex from a duplicate axis"
        with pytest.raises(ValueError, match=reindex_msg):
            frame["newcol"] = dup_indexed

        # GH 4107, more descriptive error message
        frame = DataFrame(
            np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"]
        )
        index_msg = "incompatible index of inserted column with frame index"
        with pytest.raises(TypeError, match=index_msg):
            frame["gr"] = frame.groupby(["b", "c"]).count()

    def test_setitem_benchmark(self):
        """Inserting one array under several labels builds the tiled frame."""
        # from the vb_suite/frame_methods/frame_insert_columns
        n_rows, n_cols = 10, 5
        frame = DataFrame(index=range(n_rows))
        column = np.random.randn(n_rows)
        for label in range(n_cols):
            frame[label] = column
        tiled = np.repeat(column, n_cols).reshape(n_rows, n_cols)
        expected = DataFrame(tiled, index=range(n_rows))
        tm.assert_frame_equal(frame, expected)

    def test_setitem_different_dtype(self):
        """Setting columns of another dtype must be reflected in ``dtypes``."""
        frame = DataFrame(
            np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
        )
        frame.insert(0, "foo", frame["a"])
        frame.insert(2, "bar", frame["c"])

        f64 = np.dtype("float64")
        f32 = np.dtype("float32")
        labels = ["foo", "c", "bar", "b", "a", "x"]

        # diff dtype: brand-new item
        frame["x"] = frame["a"].astype("float32")
        observed = frame.dtypes
        expected = Series([f64] * 5 + [f32], index=labels)
        tm.assert_series_equal(observed, expected)

        # replacing current (in different block)
        frame["a"] = frame["a"].astype("float32")
        observed = frame.dtypes
        expected = Series([f64] * 4 + [f32] * 2, index=labels)
        tm.assert_series_equal(observed, expected)

        frame["y"] = frame["a"].astype("int32")
        observed = frame.dtypes
        expected = Series(
            [f64] * 4 + [f32] * 2 + [np.dtype("int32")],
            index=labels + ["y"],
        )
        tm.assert_series_equal(observed, expected)

    def test_setitem_empty_columns(self):
        """A column set on a column-less frame can then be overwritten."""
        # GH 13522
        frame = DataFrame(index=["A", "B", "C"])
        frame["X"] = frame.index
        frame["X"] = ["x", "y", "z"]
        expected = DataFrame(
            data={"X": ["x", "y", "z"]},
            index=["A", "B", "C"],
        )
        tm.assert_frame_equal(frame, expected)

    def test_setitem_dt64_index_empty_columns(self):
        """A date_range set as a column must have dtype ``M8[ns]``."""
        stamps = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
        frame = DataFrame(index=np.arange(len(stamps)))

        frame["A"] = stamps
        expected_dtype = np.dtype("M8[ns]")
        assert frame["A"].dtype == expected_dtype

    def test_setitem_timestamp_empty_columns(self):
        """A tz-aware Timestamp scalar is broadcast into a new column."""
        # GH#19843
        frame = DataFrame(index=range(3))
        frame["now"] = Timestamp("20130101", tz="UTC")

        rows = [[Timestamp("20130101", tz="UTC")]] * 3
        expected = DataFrame(rows, index=[0, 1, 2], columns=["now"])
        tm.assert_frame_equal(frame, expected)

    def test_setitem_wrong_length_categorical_dtype_raises(self):
        """Setting a Categorical of the wrong length must raise ValueError."""
        # GH#29523
        categorical = Categorical.from_codes(
            [0, 1, 1, 0, 1, 2], ["a", "b", "c"]
        )
        frame = DataFrame(range(10), columns=["bar"])

        pattern = (
            rf"Length of values \({len(categorical)}\) "
            rf"does not match length of index \({len(frame)}\)"
        )
        with pytest.raises(ValueError, match=pattern):
            frame["foo"] = categorical

    def test_setitem_with_sparse_value(self):
        """A SparseArray set as a column reads back as an equal Series."""
        # GH#8131
        frame = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        sparse_values = SparseArray([0, 0, 1])
        frame["new_column"] = sparse_values

        expected = Series(sparse_values, name="new_column")
        tm.assert_series_equal(frame["new_column"], expected)

    def test_setitem_with_unaligned_sparse_value(self):
        """A sparse Series with a reversed index is aligned on assignment."""
        frame = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
        reversed_sparse = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])

        frame["new_column"] = reversed_sparse
        expected = Series(SparseArray([1, 0, 0]), name="new_column")
        tm.assert_series_equal(frame["new_column"], expected)

    def test_setitem_dict_preserves_dtypes(self):
        """Appending dict rows via ``.loc`` must keep each column's dtype."""
        # https://github.com/pandas-dev/pandas/issues/34573
        expected = DataFrame(
            {
                "a": Series([0, 1, 2], dtype="int64"),
                "b": Series([1, 2, 3], dtype=float),
                "c": Series([1, 2, 3], dtype=float),
            }
        )
        frame = DataFrame(
            {
                "a": Series([], dtype="int64"),
                "b": Series([], dtype=float),
                "c": Series([], dtype=float),
            }
        )
        for position, value in enumerate([1, 2, 3]):
            new_row = {
                "a": int(position),
                "b": float(value),
                "c": float(value),
            }
            # the next free integer label equals the current row count
            frame.loc[frame.shape[0]] = new_row
        tm.assert_frame_equal(frame, expected)

    @pytest.mark.parametrize(
        "obj,dtype",
        [
            (Period("2020-01"), PeriodDtype("M")),
            (Interval(left=0, right=5), IntervalDtype("int64", "right")),
            (
                Timestamp("2011-01-01", tz="US/Eastern"),
                DatetimeTZDtype(tz="US/Eastern"),
            ),
        ],
    )
    def test_setitem_extension_types(self, obj, dtype):
        """A scalar ``obj`` set as a column must yield a column of ``dtype``."""
        # GH: 34832
        typed_column = Series([obj] * 3, dtype=dtype)
        expected = DataFrame({"idx": [1, 2, 3], "obj": typed_column})

        frame = DataFrame({"idx": [1, 2, 3]})
        frame["obj"] = obj

        tm.assert_frame_equal(frame, expected)

    @pytest.mark.parametrize(
        "ea_name",
        [
            dtype.name for dtype in ea_registry.dtypes
            # property would require instantiation
            if not isinstance(dtype.name, property)
        ]
        # mypy doesn't allow adding lists of different types
        # https://github.com/python/mypy/issues/5492
        + ["datetime64[ns, UTC]", "period[D]"],  # type: ignore[list-item]
    )
    def test_setitem_with_ea_name(self, ea_name):
        """``ea_name`` must be usable as a plain column label."""
        # GH 38386
        frame = DataFrame([0])
        frame[ea_name] = [1]
        expected = DataFrame({0: [0], ea_name: [1]})
        tm.assert_frame_equal(frame, expected)

    def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
        """datetime64 arrays holding NaT can be set in ns and in s units."""
        # GH#7492
        ns_values = np.array([1, "nat"], dtype="datetime64[ns]")
        frame = Series(ns_values).to_frame()
        frame["new"] = ns_values
        expected = DataFrame(
            {0: [1, None], "new": [1, None]},
            dtype="datetime64[ns]",
        )
        tm.assert_frame_equal(frame, expected)

        # OutOfBoundsDatetime error shouldn't occur
        s_values = np.array([1, "nat"], dtype="datetime64[s]")
        frame["new"] = s_values
        expected = DataFrame(
            {0: [1, None], "new": [1e9, None]},
            dtype="datetime64[ns]",
        )
        tm.assert_frame_equal(frame, expected)

    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
    def test_frame_setitem_datetime64_col_other_units(self, unit):
        """Non-nano dt64 values set into a new column become ``M8[ns]``."""
        # Check that non-nano dt64 values get cast to dt64 on setitem
        #  into a not-yet-existing column
        size = 100

        unit_dtype = np.dtype(f"M8[{unit}]")
        unit_values = np.arange(size, dtype=np.int64).view(unit_dtype)
        nano_values = unit_values.astype("datetime64[ns]")

        frame = DataFrame({"ints": np.arange(size)}, index=np.arange(size))
        frame[unit] = unit_values

        assert frame[unit].dtype == np.dtype("M8[ns]")
        assert (frame[unit].values == nano_values).all()

    @pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
    def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
        """Non-nano dt64 values overwriting a dt64 column match the ns cast."""
        # Check that non-nano dt64 values get cast to dt64 on setitem
        #  into an already-existing dt64 column
        size = 100

        unit_dtype = np.dtype(f"M8[{unit}]")
        unit_values = np.arange(size, dtype=np.int64).view(unit_dtype)
        nano_values = unit_values.astype("datetime64[ns]")

        frame = DataFrame({"ints": np.arange(size)}, index=np.arange(size))
        frame["dates"] = np.arange(size, dtype=np.int64).view("M8[ns]")

        # We overwrite existing dt64 column with new, non-nano dt64 vals
        frame["dates"] = unit_values
        assert (frame["dates"].values == nano_values).all()

    def test_setitem_dt64tz(self, timezone_frame):
        """Exercise setitem with the "B" column of ``timezone_frame``.

        Covers assigning the renamed column to a new label, overwriting a
        string column with it, checking that two stored arrays do not share
        a base, and checking that writing NaT leaves the dtypes unchanged.
        """

        df = timezone_frame
        idx = df["B"].rename("foo")

        # setitem
        df["C"] = idx
        tm.assert_series_equal(df["C"], Series(idx, name="C"))

        # first create "D" as a string column, then overwrite it
        df["D"] = "foo"
        df["D"] = idx
        tm.assert_series_equal(df["D"], Series(idx, name="D"))
        del df["D"]

        # assert that the arrays at manager positions 1 and 2 are not
        # sharing the same base (e.g. they are copies)
        # NOTE(review): presumably these positions hold columns "B" and "C";
        # confirm against the timezone_frame fixture.
        v1 = df._mgr.arrays[1]
        v2 = df._mgr.arrays[2]
        tm.assert_extension_array_equal(v1, v2)
        v1base = v1._data.base
        v2base = v2._data.base
        assert v1base is None or (id(v1base) != id(v2base))

        # with nan
        df2 = df.copy()
        df2.iloc[1, 1] = NaT
        df2.iloc[1, 2] = NaT
        result = df2["B"]
        tm.assert_series_equal(notna(result),
                               Series([True, False, True], name="B"))
        # writing NaT must not have changed any column dtype
        tm.assert_series_equal(df2.dtypes, df.dtypes)

    def test_setitem_periodindex(self):
        """A PeriodIndex set as a column round-trips back to an equal index."""
        periods = period_range("1/1/2000", periods=5, name="index")
        frame = DataFrame(np.random.randn(5, 3), index=periods)

        frame["Index"] = periods
        column_as_index = Index(frame["Index"])
        tm.assert_index_equal(column_as_index, periods, check_names=False)
        assert column_as_index.name == "Index"
        assert periods.name == "index"

        reindexed = frame.reset_index().set_index("index")
        assert isinstance(reindexed.index, PeriodIndex)
        tm.assert_index_equal(reindexed.index, periods)

    def test_setitem_complete_column_with_array(self):
        """A 2-D array set to two new labels keeps the array's dtype."""
        # GH#37954
        frame = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]})
        values = np.array([[1, 1], [3, 1], [5, 1]])
        frame[["c", "d"]] = values
        expected = DataFrame(
            {
                "a": ["one", "two", "three"],
                "b": [1, 2, 3],
                "c": [1, 3, 5],
                "d": [1, 1, 1],
            }
        )
        new_labels = ("c", "d")
        for label in new_labels:
            expected[label] = expected[label].astype(values.dtype)
        for label in new_labels:
            assert expected[label].dtype == values.dtype
        tm.assert_frame_equal(frame, expected)

    @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
    def test_setitem_bool_with_numeric_index(self, dtype):
        """Adding the label ``False`` to numeric columns gives object columns."""
        # GH#36319
        numeric_cols = Index([1, 2, 3], dtype=dtype)
        frame = DataFrame(np.random.randn(3, 3), columns=numeric_cols)

        frame[False] = ["a", "b", "c"]

        # float columns keep float labels in the resulting object Index
        if dtype == "f8":
            labels = [1.0, 2.0, 3.0, False]
        else:
            labels = [1, 2, 3, False]
        expected_cols = Index(labels, dtype=object)

        tm.assert_index_equal(frame.columns, expected_cols)

    @pytest.mark.parametrize("indexer", ["B", ["B"]])
    def test_setitem_frame_length_0_str_key(self, indexer):
        """Setting a frame into an empty frame's column must add its rows."""
        # GH#38831
        frame = DataFrame(columns=["A", "B"])
        replacement = DataFrame({"B": [1, 2]})
        frame[indexer] = replacement
        expected = DataFrame({"A": [np.nan] * 2, "B": [1, 2]})
        expected["A"] = expected["A"].astype("object")
        tm.assert_frame_equal(frame, expected)

    def test_setitem_frame_duplicate_columns(self, using_array_manager):
        """Set values under duplicated column labels via loc and setitem.

        The expected frame is built with unique integer columns and only
        relabelled to the duplicated names afterwards (block-manager branch)
        or before the dtype casts (array-manager branch).
        """
        # GH#15695
        cols = ["A", "B", "C"] * 2
        df = DataFrame(index=range(3), columns=cols)
        # each label matches two columns, so each 2-tuple supplies one value
        # per matching column
        df.loc[0, "A"] = (0, 3)
        df.loc[:, "B"] = (1, 4)
        df["C"] = (2, 5)
        expected = DataFrame(
            [
                [0, 1, 2, 3, 4, 5],
                [np.nan, 1, 2, np.nan, 4, 5],
                [np.nan, 1, 2, np.nan, 4, 5],
            ],
            dtype="object",
        )

        if using_array_manager:
            # setitem replaces column so changes dtype

            expected.columns = cols
            expected["C"] = expected["C"].astype("int64")
            # TODO(ArrayManager) .loc still overwrites
            expected["B"] = expected["B"].astype("int64")
        else:
            # set these with unique columns to be extra-unambiguous
            expected[2] = expected[2].astype(np.int64)
            expected[5] = expected[5].astype(np.int64)
            expected.columns = cols

        tm.assert_frame_equal(df, expected)

    def test_setitem_frame_duplicate_columns_size_mismatch(self):
        """A 3-tuple set to the one-label key ``["A"]`` must raise ValueError."""
        # GH#39510
        labels = ["A", "B", "C"] * 2
        frame = DataFrame(index=range(3), columns=labels)
        expected_msg = "Columns must be same length as key"
        with pytest.raises(ValueError, match=expected_msg):
            frame[["A"]] = (0, 3, 5)

        unique_frame = frame.iloc[:, :3]  # unique columns
        with pytest.raises(ValueError, match=expected_msg):
            unique_frame[["A"]] = (0, 3, 5)

    @pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]])
    def test_setitem_df_wrong_column_number(self, cols):
        """Setting a two-column frame to the key "a" must raise ValueError."""
        # GH#38604
        frame = DataFrame([[1, 2, 3]], columns=cols)
        two_col_rhs = DataFrame([[10, 11]], columns=["d", "e"])
        expected_msg = "Columns must be same length as key"
        with pytest.raises(ValueError, match=expected_msg):
            frame["a"] = two_col_rhs

    def test_setitem_listlike_indexer_duplicate_columns(self):
        """List keys may address duplicated labels and add a new label."""
        # GH#38604
        dup_labels = ["a", "b", "b"]
        frame = DataFrame([[1, 2, 3]], columns=dup_labels)
        replacement = DataFrame([[10, 11, 12]], columns=dup_labels)
        frame[["a", "b"]] = replacement
        expected = DataFrame([[10, 11, 12]], columns=["a", "b", "b"])
        tm.assert_frame_equal(frame, expected)

        # "c" does not exist yet, so it is appended as a new column
        frame[["c", "b"]] = replacement
        expected = DataFrame([[10, 11, 12, 10]], columns=["a", "b", "b", "c"])
        tm.assert_frame_equal(frame, expected)

    def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self):
        """A two-column frame set on a key matching three must raise."""
        # GH#39403
        frame = DataFrame([[1, 2, 3]], columns=["a", "b", "b"])
        short_rhs = DataFrame([[10, 11]], columns=["a", "b"])
        expected_msg = "Columns must be same length as key"
        with pytest.raises(ValueError, match=expected_msg):
            frame[["a", "b"]] = short_rhs

    def test_setitem_intervals(self):
        """Set the result of ``cut`` into a frame in five different forms.

        The forms are the Series itself, ``np.array`` of it, its ``.values``,
        ``np.array`` of those values, and an object-cast copy. The test then
        checks which dtype each resulting column reports.
        """

        df = DataFrame({"A": range(10)})
        ser = cut(df["A"], 5)
        assert isinstance(ser.cat.categories, IntervalIndex)

        # B & D end up as Categoricals
        # the remainder are converted to in-line objects
        # containing an IntervalIndex.values
        df["B"] = ser
        df["C"] = np.array(ser)
        df["D"] = ser.values
        df["E"] = np.array(ser.values)
        df["F"] = ser.astype(object)

        assert is_categorical_dtype(df["B"].dtype)
        assert is_interval_dtype(df["B"].cat.categories)
        assert is_categorical_dtype(df["D"].dtype)
        assert is_interval_dtype(df["D"].cat.categories)

        # These go through the Series constructor and so get inferred back
        #  to IntervalDtype
        assert is_interval_dtype(df["C"])
        assert is_interval_dtype(df["E"])

        # But the Series constructor doesn't do inference on Series objects,
        #  so setting df["F"] doesn't get cast back to IntervalDtype
        assert is_object_dtype(df["F"])

        # they compare equal as Index
        # when converted to numpy objects
        c = lambda x: Index(np.array(x))
        tm.assert_index_equal(c(df.B), c(df.B))
        tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
        tm.assert_index_equal(c(df.C), c(df.D), check_names=False)

        # B & D are the same Series
        tm.assert_series_equal(df["B"], df["B"])
        tm.assert_series_equal(df["B"], df["D"], check_names=False)

        # C & E are the same Series
        tm.assert_series_equal(df["C"], df["C"])
        tm.assert_series_equal(df["C"], df["E"], check_names=False)

    def test_setitem_categorical(self):
        """Reordered categories assigned via attribute access are kept."""
        # GH#35369
        new_order = ["n", "m"]
        frame = DataFrame({"h": Series(list("mn")).astype("category")})
        frame.h = frame.h.cat.reorder_categories(["n", "m"])
        reordered = Categorical(["m", "n"]).reorder_categories(new_order)
        expected = DataFrame({"h": reordered})
        tm.assert_frame_equal(frame, expected)

    def test_setitem_with_empty_listlike(self):
        """Setting ``[]`` on an empty frame must leave its index equal."""
        # GH#17101
        named_index = Index([], name="idx")
        frame = DataFrame(columns=["A"], index=named_index)
        frame["A"] = []
        expected = DataFrame(columns=["A"], index=named_index)
        # only the index is compared here, not the whole frame
        tm.assert_index_equal(frame.index, expected.index)

    @pytest.mark.parametrize(
        "cols, values, expected",
        [
            (["C", "D", "D", "a"], [1, 2, 3, 4], 4),  # with duplicates
            (["D", "C", "D", "a"], [1, 2, 3, 4], 4),  # mixed order
            (["C", "B", "B", "a"], [1, 2, 3, 4], 4),  # other duplicate cols
            (["C", "B", "a"], [1, 2, 3], 3),  # no duplicates
            (["B", "C", "a"], [3, 2, 1], 1),  # alphabetical order
            (["C", "a", "B"], [3, 2, 1], 2),  # in the middle
        ],
    )
    def test_setitem_same_column(self, cols, values, expected):
        """Re-assigning column "a" to itself must not change its value."""
        # GH#23239
        frame = DataFrame([values], columns=cols)
        frame["a"] = frame["a"]
        first_value = frame["a"].values[0]
        assert first_value == expected

    def test_setitem_multi_index(self):
        """Assign shuffled sub-frames under MultiIndex column keys."""
        # GH#7655, test that assigning to a sub-frame of a frame
        # with multi-index columns aligns both rows and columns
        it = ["jim", "joe", "jolie"], ["first",
                                       "last"], ["left", "center", "right"]

        cols = MultiIndex.from_product(it)
        index = date_range("20141006", periods=20)
        vals = np.random.randint(1, 1000, (len(index), len(cols)))
        df = DataFrame(vals, columns=cols, index=index)

        # copies, so the in-place shuffles below do not touch df.index or `it`
        i, j = df.index.values.copy(), it[-1][:]

        # rows shuffled, columns reversed: assignment must realign both
        np.random.shuffle(i)
        df["jim"] = df["jolie"].loc[i, ::-1]
        tm.assert_frame_equal(df["jim"], df["jolie"])

        np.random.shuffle(j)
        df[("joe", "first")] = df[("jolie", "last")].loc[i, j]
        tm.assert_frame_equal(df[("joe", "first")], df[("jolie", "last")])

        np.random.shuffle(j)
        df[("joe", "last")] = df[("jolie", "first")].loc[i, j]
        tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")])

    @pytest.mark.parametrize(
        "columns,box,expected",
        [
            (
                ["A", "B", "C", "D"],
                7,
                DataFrame(
                    [[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                ["C", "D"],
                [7, 8],
                DataFrame(
                    [[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                ["A", "B", "C"],
                np.array([7, 8, 9], dtype=np.int64),
                DataFrame([[7, 8, 9], [7, 8, 9], [7, 8, 9]],
                          columns=["A", "B", "C"]),
            ),
            (
                ["B", "C", "D"],
                [[7, 8, 9], [10, 11, 12], [13, 14, 15]],
                DataFrame(
                    [[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                ["C", "A", "D"],
                np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]],
                         dtype=np.int64),
                DataFrame(
                    [[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]],
                    columns=["A", "B", "C", "D"],
                ),
            ),
            (
                ["A", "C"],
                DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]),
                DataFrame([[7, 2, 8], [9, 4, 10], [11, 6, 12]],
                          columns=["A", "B", "C"]),
            ),
        ],
    )
    def test_setitem_list_missing_columns(self, columns, box, expected):
        """Setting ``box`` on ``columns`` of an A/B frame gives ``expected``."""
        # GH#29334
        frame = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"])
        frame[columns] = box
        tm.assert_frame_equal(frame, expected)

    def test_setitem_list_of_tuples(self, float_frame):
        """A list of tuples set as a column is stored as one tuple per row."""
        pairs = list(zip(float_frame["A"], float_frame["B"]))
        float_frame["tuples"] = pairs

        stored = float_frame["tuples"]
        expected = Series(pairs, index=float_frame.index, name="tuples")
        tm.assert_series_equal(stored, expected)

    def test_setitem_iloc_generator(self):
        """``iloc`` setitem must accept a generator of row positions."""
        # GH#39614
        frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        row_positions = (pos for pos in [1, 2])
        frame.iloc[row_positions] = 1
        expected = DataFrame({"a": [1, 1, 1], "b": [4, 1, 1]})
        tm.assert_frame_equal(frame, expected)

    def test_setitem_iloc_two_dimensional_generator(self):
        """``iloc`` setitem must accept a generator as the row indexer."""
        frame = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        row_positions = (pos for pos in [1, 2])
        # column position 1 is "b"; only its rows 1 and 2 are overwritten
        frame.iloc[row_positions, 1] = 1
        expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]})
        tm.assert_frame_equal(frame, expected)
示例#21
0
 def test_construction_errors(self, subtype):
     """Constructing with the given ``subtype`` must raise TypeError."""
     pattern = "could not construct IntervalDtype"
     with pytest.raises(TypeError, match=pattern):
         IntervalDtype(subtype)
示例#22
0
 def create(self):
     """Return a new ``IntervalDtype("int64")``."""
     interval_dtype = IntervalDtype("int64")
     return interval_dtype
示例#23
0
 def test_equality_generic(self, subtype):
     # GH 18980
     dtype = IntervalDtype(subtype)
     assert is_dtype_equal(dtype, "interval")
     assert is_dtype_equal(dtype, IntervalDtype())
示例#24
0
def dtype():
    """Return a default-constructed ``IntervalDtype``."""
    default_interval = IntervalDtype()
    return default_interval
示例#25
0
 def test_name_repr_generic(self, subtype):
     # GH 18980
     dtype = IntervalDtype(subtype)
     assert str(dtype) == "interval"
     assert dtype.name == "interval"
示例#26
0
文件: interval.py 项目: zyazxr/pandas
class IntervalArray(IntervalMixin, ExtensionArray):
    dtype = IntervalDtype()
    ndim = 1
    can_hold_na = True
    _na_value = _fill_value = np.nan

    def __new__(cls,
                data,
                closed=None,
                dtype=None,
                copy=False,
                verify_integrity=True):
        """
        Build an instance from interval-like data.

        Parameters
        ----------
        data : collection
            Interval-backed container (this class, an interval index, or an
            interval-dtype Series) or any other non-scalar collection that
            ``intervals_to_interval_bounds`` can split into bounds.
        closed : str, optional
            Closed side(s). When falsy, the value carried by / inferred from
            ``data`` is used.
        dtype, copy, verify_integrity
            Forwarded unchanged to ``_simple_new``.

        Raises
        ------
        TypeError
            If ``data`` is a scalar.
        """
        # an interval-dtype Series is unwrapped to its underlying values
        if isinstance(data, ABCSeries) and is_interval_dtype(data):
            data = data.values

        if isinstance(data, (cls, ABCIntervalIndex)):
            # already interval-backed: reuse its bounds; explicit `closed` wins
            left, right = data.left, data.right
            closed = closed or data.closed
        elif is_scalar(data):
            # don't allow scalars
            template = ("{}(...) must be called with a collection of some kind,"
                        " {} was passed")
            raise TypeError(template.format(cls.__name__, data))
        else:
            # might need to convert empty or purely na data
            data = maybe_convert_platform_interval(data)
            left, right, inferred_closed = intervals_to_interval_bounds(
                data, validate_closed=closed is None)
            closed = closed or inferred_closed

        return cls._simple_new(left,
                               right,
                               closed,
                               copy=copy,
                               dtype=dtype,
                               verify_integrity=verify_integrity)

    @classmethod
    def _simple_new(cls,
                    left,
                    right,
                    closed=None,
                    copy=False,
                    dtype=None,
                    verify_integrity=True):
        """
        Assemble an instance directly from left and right endpoint data.

        Parameters
        ----------
        left, right : array-like
            Endpoint values; each is passed through ``ensure_index``.
        closed : str, optional
            Closed side(s); a falsy value falls back to ``"right"``.
        copy : bool, default False
            Forwarded to ``ensure_index``.
        dtype : dtype, optional
            Must resolve to an interval dtype. When it carries a subtype,
            both endpoint indexes are cast to that subtype.
        verify_integrity : bool, default True
            Whether to run ``_validate`` on the result before returning it.

        Raises
        ------
        TypeError
            If ``dtype`` is not an interval dtype, or the endpoints satisfy
            ``is_categorical_dtype`` / ``is_string_dtype``.
        ValueError
            If the endpoint indexes differ in type, are period indexes, or
            are datetime indexes whose time zones differ.
        """
        result = IntervalMixin.__new__(cls)

        closed = closed or "right"
        left = ensure_index(left, copy=copy)
        right = ensure_index(right, copy=copy)

        if dtype is not None:
            # GH 19262: dtype must be an IntervalDtype to override inferred
            dtype = pandas_dtype(dtype)
            if not is_interval_dtype(dtype):
                raise TypeError(
                    "dtype must be an IntervalDtype, got {dtype}".format(
                        dtype=dtype))
            subtype = dtype.subtype
            if subtype is not None:
                left = left.astype(subtype)
                right = right.astype(subtype)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        elif is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        # Every check below raises, so they are written as successive guards;
        # the order is significant and matches the error precedence.
        if type(left) != type(right):
            template = ("must not have differing left [{ltype}] and right "
                        "[{rtype}] types")
            raise ValueError(
                template.format(ltype=type(left).__name__,
                                rtype=type(right).__name__))
        if is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
            # GH 19016
            raise TypeError("category, object, and string subtypes are not "
                            "supported for IntervalArray")
        if isinstance(left, ABCPeriodIndex):
            raise ValueError(
                "Period dtypes are not supported, use a PeriodIndex instead")
        if isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(
                right.tz):
            template = ("left and right must have the same time zone, got "
                        "'{left_tz}' and '{right_tz}'")
            raise ValueError(
                template.format(left_tz=left.tz, right_tz=right.tz))

        result._left = left
        result._right = right
        result._closed = closed
        if verify_integrity:
            result._validate()
        return result

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """
        Construct an instance from a sequence of scalars.

        Thin wrapper that forwards ``scalars``, ``dtype`` and ``copy`` to the
        class constructor.
        """
        return cls(scalars, dtype=dtype, copy=copy)

    @classmethod
    def _from_factorized(cls, values, original):
        """
        Rebuild an array from factorized ``values``.

        Parameters
        ----------
        values : array
            Values to wrap; when empty they are first cast to the subtype of
            ``original``.
        original : IntervalArray
            Source of the subtype (for empty input) and of ``closed``.
        """
        is_empty = len(values) == 0
        if is_empty:
            # An empty array arrives here with object dtype, which cannot be
            # turned into a new array directly, so give it the right dtype.
            values = values.astype(original.dtype.subtype)
        return cls(values, closed=original.closed)

    _interval_shared_docs["from_breaks"] = """
    Construct an %(klass)s from an array of splits.

    Parameters
    ----------
    breaks : array-like (1-dimensional)
        Left and right bounds for each interval.
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Whether the intervals are closed on the left-side, right-side, both
        or neither.
    copy : boolean, default False
        copy the data
    dtype : dtype or None, default None
        If None, dtype will be inferred

        .. versionadded:: 0.23.0

    Returns
    -------
    %(klass)s

    See Also
    --------
    interval_range : Function to create a fixed frequency IntervalIndex.
    %(klass)s.from_arrays : Construct from a left and right array.
    %(klass)s.from_tuples : Construct from a sequence of tuples.

    Examples
    --------
    >>> pd.%(qualname)s.from_breaks([0, 1, 2, 3])
    %(klass)s([(0, 1], (1, 2], (2, 3]],
                  closed='right',
                  dtype='interval[int64]')
    """

    @classmethod
    @Appender(_interval_shared_docs["from_breaks"] % _shared_docs_kwargs)
    def from_breaks(cls, breaks, closed="right", copy=False, dtype=None):
        # Consecutive break points pair up as (left, right) bounds: every
        # element but the last is a left bound, every element but the first
        # a right bound.
        breaks = maybe_convert_platform_interval(breaks)
        lower, upper = breaks[:-1], breaks[1:]
        return cls.from_arrays(lower, upper, closed, copy=copy, dtype=dtype)

    _interval_shared_docs["from_arrays"] = """
        Construct from two arrays defining the left and right bounds.

        Parameters
        ----------
        left : array-like (1-dimensional)
            Left bounds for each interval.
        right : array-like (1-dimensional)
            Right bounds for each interval.
        closed : {'left', 'right', 'both', 'neither'}, default 'right'
            Whether the intervals are closed on the left-side, right-side, both
            or neither.
        copy : boolean, default False
            Copy the data.
        dtype : dtype, optional
            If None, dtype will be inferred.

            .. versionadded:: 0.23.0

        Returns
        -------
        %(klass)s

        Raises
        ------
        ValueError
            When a value is missing in only one of `left` or `right`.
            When a value in `left` is greater than the corresponding value
            in `right`.

        See Also
        --------
        interval_range : Function to create a fixed frequency IntervalIndex.
        %(klass)s.from_breaks : Construct an %(klass)s from an array of
            splits.
        %(klass)s.from_tuples : Construct an %(klass)s from an
            array-like of tuples.

        Notes
        -----
        Each element of `left` must be less than or equal to the `right`
        element at the same position. If an element is missing, it must be
        missing in both `left` and `right`. A TypeError is raised when
        using an unsupported type for `left` or `right`. At the moment,
        'category', 'object', and 'string' subtypes are not supported.

        Examples
        --------
        >>> %(klass)s.from_arrays([0, 1, 2], [1, 2, 3])
        %(klass)s([(0, 1], (1, 2], (2, 3]],
                     closed='right',
                     dtype='interval[int64]')
        """

    @classmethod
    @Appender(_interval_shared_docs["from_arrays"] % _shared_docs_kwargs)
    def from_arrays(cls, left, right, closed="right", copy=False, dtype=None):
        # Normalise both sides, then build with integrity checks always on.
        converted_left = maybe_convert_platform_interval(left)
        converted_right = maybe_convert_platform_interval(right)

        return cls._simple_new(converted_left,
                               converted_right,
                               closed,
                               copy=copy,
                               dtype=dtype,
                               verify_integrity=True)

    # Shared docstring template for ``from_intervals``; ``%(klass)s`` and
    # ``%(qualname)s`` are filled in by %-formatting at the point of use.
    _interval_shared_docs["from_intervals"] = """
    Construct an %(klass)s from a 1d array of Interval objects

    .. deprecated:: 0.23.0

    Parameters
    ----------
    data : array-like (1-dimensional)
        Array of Interval objects. All intervals must be closed on the same
        sides.
    copy : boolean, default False
        by-default copy the data, this is compat only and ignored
    dtype : dtype or None, default None
        If None, dtype will be inferred

        .. versionadded:: 0.23.0

    See Also
    --------
    interval_range : Function to create a fixed frequency IntervalIndex.
    %(klass)s.from_arrays : Construct an %(klass)s from a left and
                                right array.
    %(klass)s.from_breaks : Construct an %(klass)s from an array of
                                splits.
    %(klass)s.from_tuples : Construct an %(klass)s from an
                                array-like of tuples.

    Examples
    --------
    >>> pd.%(qualname)s.from_intervals([pd.Interval(0, 1),
    ...                                  pd.Interval(1, 2)])
    %(klass)s([(0, 1], (1, 2]],
                  closed='right', dtype='interval[int64]')

    The generic Index constructor works identically when it infers an array
    of all intervals:

    >>> pd.Index([pd.Interval(0, 1), pd.Interval(1, 2)])
    %(klass)s([(0, 1], (1, 2]],
                  closed='right', dtype='interval[int64]')
    """

    _interval_shared_docs["from_tuples"] = """
    Construct an %(klass)s from an array-like of tuples

    Parameters
    ----------
    data : array-like (1-dimensional)
        Array of tuples
    closed : {'left', 'right', 'both', 'neither'}, default 'right'
        Whether the intervals are closed on the left-side, right-side, both
        or neither.
    copy : boolean, default False
        by-default copy the data, this is compat only and ignored
    dtype : dtype or None, default None
        If None, dtype will be inferred

        .. versionadded:: 0.23.0

    Returns
    -------
    %(klass)s

    See Also
    --------
    interval_range : Function to create a fixed frequency IntervalIndex.
    %(klass)s.from_arrays : Construct an %(klass)s from a left and
                                right array.
    %(klass)s.from_breaks : Construct an %(klass)s from an array of
                                splits.

    Examples
    --------
    >>> pd.%(qualname)s.from_tuples([(0, 1), (1, 2)])
    %(klass)s([(0, 1], (1, 2]],
                closed='right', dtype='interval[int64]')
    """

    @classmethod
    @Appender(_interval_shared_docs["from_tuples"] % _shared_docs_kwargs)
    def from_tuples(cls, data, closed="right", copy=False, dtype=None):
        if len(data):
            left, right = [], []
        else:
            # ensure that empty data keeps input dtype
            left = right = data

        # The class name only feeds the error messages; resolve it once
        # rather than on every element.
        name = cls.__name__

        for d in data:
            if isna(d):
                # a missing entry is missing on both sides
                lhs = rhs = np.nan
            else:
                try:
                    # need list of length 2 tuples, e.g. [(0, 1), (1, 2), ...]
                    lhs, rhs = d
                except ValueError:
                    # unpackable, but not into exactly two values
                    msg = ("{name}.from_tuples requires tuples of "
                           "length 2, got {tpl}").format(name=name, tpl=d)
                    raise ValueError(msg)
                except TypeError:
                    # element cannot be unpacked at all
                    msg = ("{name}.from_tuples received an invalid "
                           "item, {tpl}").format(name=name, tpl=d)
                    raise TypeError(msg)
            left.append(lhs)
            right.append(rhs)

        # `copy` is accepted for signature compatibility only; copy=False is
        # always passed on.
        return cls.from_arrays(left, right, closed, copy=False, dtype=dtype)

    def _validate(self):
        """
        Check the internal consistency of the array.

        Raises
        ------
        ValueError
            If ``closed`` is not a recognised option, the two sides differ in
            length, a value is missing on one side only, or any non-missing
            left endpoint exceeds its right endpoint.
        """
        closed = self.closed
        if closed not in _VALID_CLOSED:
            raise ValueError(
                "invalid option for 'closed': {closed}".format(closed=closed))

        if len(self.left) != len(self.right):
            raise ValueError("left and right must have the same length")

        left_present = notna(self.left)
        right_present = notna(self.right)
        same_missing = (left_present == right_present).all()
        if not same_missing:
            raise ValueError("missing values must be missing in the same "
                             "location both left and right sides")

        # only compare positions where an endpoint is actually present
        ordered = (self.left[left_present] <= self.right[left_present]).all()
        if not ordered:
            raise ValueError("left side of interval must be <= right side")

    # ---------
    # Interface
    # ---------
    def __iter__(self):
        """Iterate over the elements of ``np.asarray(self)``."""
        return iter(np.asarray(self))

    def __len__(self):
        """Return the number of intervals, taken from the left endpoints."""
        return len(self.left)

    def __getitem__(self, value):
        """
        Select by ``value``, applied identically to both endpoint indexes.

        Returns a new array when the selection is itself an ``Index``;
        otherwise a single ``Interval``, or the fill value when the selected
        left endpoint is missing.
        """
        left = self.left[value]
        right = self.right[value]

        if isinstance(left, Index):
            return self._shallow_copy(left, right)

        # scalar
        if isna(left):
            return self._fill_value
        return Interval(left, right, self.closed)

    def __setitem__(self, key, value):
        """
        Set the positions selected by ``key`` to ``value``.

        Parameters
        ----------
        key : indexer
            Applied directly to the underlying endpoint arrays.
        value : scalar NA, Interval, interval-dtype object or list-like
            A scalar NA sets both endpoints to missing; an interval supplies
            its own ``left``/``right``; anything else is converted through
            ``IntervalArray(value)``.

        Raises
        ------
        TypeError
            If ``value`` cannot be converted to intervals.
        """
        # na value: need special casing to set directly on numpy arrays
        needs_float_conversion = False
        if is_scalar(value) and isna(value):
            if is_integer_dtype(self.dtype.subtype):
                # can't set NaN on a numpy integer array
                needs_float_conversion = True
            elif is_datetime64_any_dtype(self.dtype.subtype):
                # need proper NaT to set directly on the numpy array
                value = np.datetime64("NaT")
            elif is_timedelta64_dtype(self.dtype.subtype):
                # need proper NaT to set directly on the numpy array
                value = np.timedelta64("NaT")
            value_left, value_right = value, value

        # scalar interval
        elif is_interval_dtype(value) or isinstance(value, ABCInterval):
            self._check_closed_matches(value, name="value")
            value_left, value_right = value.left, value.right

        else:
            # list-like of intervals
            try:
                array = IntervalArray(value)
            except TypeError:
                # wrong type: not interval or NA
                msg = "'value' should be an interval type, got {} instead."
                raise TypeError(msg.format(type(value)))
            value_left, value_right = array.left, array.right

        # Left and right must be updated atomically: write into private
        # copies of BOTH sides first and only swap them in once both writes
        # have succeeded, so a failure on either side leaves the array
        # untouched instead of half-updated.
        left = self.left.copy(deep=True)
        right = self.right.copy(deep=True)
        if needs_float_conversion:
            left = left.astype("float")
            right = right.astype("float")
        left.values[key] = value_left
        right.values[key] = value_right

        self._left = left
        self._right = right

    def fillna(self, value=None, method=None, limit=None):
        """
        Fill missing intervals with a single Interval.

        Parameters
        ----------
        value : pandas.Interval
            Scalar interval whose ``left`` and ``right`` replace the missing
            endpoints. Any other kind of value (including the default
            ``None``) is rejected with a ``TypeError``. Its ``closed`` is
            checked against this array via ``_check_closed_matches``.
        method : None
            Not supported for IntervalArray; must be left as None.
        limit : None
            Not supported for IntervalArray; must be left as None.

        Returns
        -------
        filled : IntervalArray with NA/NaN filled

        Raises
        ------
        TypeError
            If ``method`` or ``limit`` is given, or if ``value`` is not a
            scalar Interval.
        """
        if method is not None:
            raise TypeError("Filling by method is not supported for "
                            "IntervalArray.")
        if limit is not None:
            raise TypeError("limit is not supported for IntervalArray.")

        if not isinstance(value, ABCInterval):
            msg = ("'IntervalArray.fillna' only supports filling with a "
                   "scalar 'pandas.Interval'. Got a '{}' instead.".format(
                       type(value).__name__))
            raise TypeError(msg)

        # NOTE(review): value has already passed the Interval check above, so
        # this unwrapping of a `_values` attribute looks like a no-op - confirm.
        value = getattr(value, "_values", value)
        self._check_closed_matches(value, name="value")

        # fill each side independently with the matching endpoint
        left = self.left.fillna(value=value.left)
        right = self.right.fillna(value=value.right)
        return self._shallow_copy(left, right)

    @property
    def dtype(self):
        """Return an IntervalDtype built from the dtype of the left endpoints."""
        return IntervalDtype(self.left.dtype)

    def astype(self, dtype, copy=True):
        """
        Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.

        copy : bool, default True
            Whether to copy the data, even if not necessary. If False,
            a copy is made only if the old dtype does not match the
            new dtype.

        Returns
        -------
        array : ExtensionArray or ndarray
            ExtensionArray or NumPy ndarray with 'dtype' for its dtype.

        Raises
        ------
        TypeError
            If the endpoints cannot be cast to the subtype of a requested
            interval dtype, or the array cannot be cast to ``dtype`` at all.
        """
        dtype = pandas_dtype(dtype)
        if is_interval_dtype(dtype):
            if dtype == self.dtype:
                return self.copy() if copy else self

            # need to cast to different subtype
            try:
                new_left = self.left.astype(dtype.subtype)
                new_right = self.right.astype(dtype.subtype)
            except TypeError:
                msg = ("Cannot convert {dtype} to {new_dtype}; subtypes are "
                       "incompatible")
                raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
            return self._shallow_copy(new_left, new_right)
        elif is_categorical_dtype(dtype):
            # Pass the requested dtype through so that explicitly supplied
            # categories / orderedness are honoured rather than re-inferred.
            return Categorical(np.asarray(self), dtype=dtype)
        # TODO: This try/except will be repeated.
        try:
            return np.asarray(self).astype(dtype, copy=copy)
        except (TypeError, ValueError):
            msg = "Cannot cast {name} to dtype {dtype}"
            raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Join several interval arrays end to end.

        Parameters
        ----------
        to_concat : sequence of IntervalArray

        Returns
        -------
        IntervalArray

        Raises
        ------
        ValueError
            If the inputs do not all share one ``closed`` value.
        """
        distinct_closed = {arr.closed for arr in to_concat}
        if len(distinct_closed) != 1:
            raise ValueError("Intervals must all be closed on the same side.")
        closed = distinct_closed.pop()

        lefts = [arr.left for arr in to_concat]
        left = np.concatenate(lefts)
        rights = [arr.right for arr in to_concat]
        right = np.concatenate(rights)
        return cls._simple_new(left, right, closed=closed, copy=False)

    def _shallow_copy(self, left=None, right=None, closed=None):
        """
        Build a new array, replacing only the attributes that are given.

        Parameters
        ----------
        left : array-like, optional
            Left endpoints. If None, this array's own left and right
            endpoints are reused. If given without ``right``, it is treated
            as interval data supplying both sides.

        right : array-like, optional
            Right endpoints. If None while ``left`` is given, both sides are
            taken from ``left`` (wrapped in this class first unless it is
            already an instance of it or an interval index).

        closed : {'left', 'right', 'both', 'neither'}, optional
            Closed side(s). If None, the existing value is kept.

        Notes
        -----
        The result is created with ``verify_integrity=False``.
        """
        if left is None:
            # no values passed
            left, right = self.left, self.right
        elif right is None:
            # single value passed: an IntervalArray-like or array of Intervals
            if isinstance(left, (type(self), ABCIntervalIndex)):
                source = left
            else:
                source = type(self)(left)
            left, right = source.left, source.right
        # otherwise both sides were supplied and are used as they are

        return self._simple_new(left,
                                right,
                                closed=closed or self.closed,
                                verify_integrity=False)

    def copy(self):
        """
        Return a new array built from deep copies of both endpoint indexes.

        Returns
        -------
        IntervalArray
        """
        left_copy = self.left.copy(deep=True)
        right_copy = self.right.copy(deep=True)
        # TODO: Could skip verify_integrity here.
        return type(self).from_arrays(left_copy,
                                      right_copy,
                                      closed=self.closed)

    def isna(self):
        """Return the missing-value mask, computed from the left endpoints."""
        return isna(self.left)

    @property
    def nbytes(self):
        """Return the combined ``nbytes`` of the left and right endpoints."""
        return self.left.nbytes + self.right.nbytes

    @property
    def size(self):
        """Return the number of elements, as reported by the left endpoints."""
        # Avoid materializing self.values
        return self.left.size

    @property
    def shape(self):
        """Return the shape of the array, as reported by the left endpoints."""
        return self.left.shape

    def take(self,
             indices,
             allow_fill=False,
             fill_value=None,
             axis=None,
             **kwargs):
        """
        Take elements from the IntervalArray.

        Parameters
        ----------
        indices : sequence of integers
            Indices to be taken.

        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any
              other negative values raise a ``ValueError``.

        fill_value : Interval or NA, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case the default NA value for
            the type, ``self.dtype.na_value``, is used.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        axis : any, default None
            Present for compat with IntervalIndex; does nothing.

        Returns
        -------
        IntervalArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.
            When `allow_fill` is True and `fill_value` is a non-scalar,
            non-NA object that is not an Interval.
        """
        from pandas.core.algorithms import take

        nv.validate_take(tuple(), kwargs)

        fill_left = fill_right = fill_value
        if allow_fill:
            if fill_value is None:
                # default to the NA value of the endpoint index
                fill_left = fill_right = self.left._na_value
            elif is_interval(fill_value):
                self._check_closed_matches(fill_value, name="fill_value")
                fill_left, fill_right = fill_value.left, fill_value.right
            elif not is_scalar(fill_value) and notna(fill_value):
                # The message names this method (it previously pointed at
                # `fillna`, which is misleading for callers of `take`).
                msg = ("'IntervalArray.take' only supports filling with a "
                       "'scalar pandas.Interval or NA'. Got a '{}' instead.".
                       format(type(fill_value).__name__))
                raise ValueError(msg)

        # take each side separately with its own fill value
        left_take = take(self.left,
                         indices,
                         allow_fill=allow_fill,
                         fill_value=fill_left)
        right_take = take(self.right,
                          indices,
                          allow_fill=allow_fill,
                          fill_value=fill_right)

        return self._shallow_copy(left_take, right_take)

    def value_counts(self, dropna=True):
        """
        Count the occurrences of each interval.

        Parameters
        ----------
        dropna : boolean, default True
            Don't include counts of NaN.

        Returns
        -------
        counts : Series

        See Also
        --------
        Series.value_counts
        """
        # TODO: implement this is a non-naive way!
        from pandas.core.algorithms import value_counts

        as_objects = np.asarray(self)
        return value_counts(as_objects, dropna=dropna)

    # Formatting

    def _format_data(self):
        """
        Return the bracketed, comma-separated listing of the intervals.

        Arrays of length 0, 1 and 2 are rendered in full. Longer arrays are
        rendered in full unless their length exceeds a limit derived from the
        ``display.max_seq_items`` option, in which case only a head and a
        tail separated by ``...`` are shown.
        """

        # TODO: integrate with categorical and make generic
        # NOTE(review): an earlier comment here mentioned an unused `name`
        # argument; this method takes no such argument.
        n = len(self)
        # limit is one tenth of the option (or of n when the option is
        # unset/falsy), capped at 10
        max_seq_items = min((get_option("display.max_seq_items") or n) // 10,
                            10)

        formatter = str

        if n == 0:
            summary = "[]"
        elif n == 1:
            first = formatter(self[0])
            summary = "[{first}]".format(first=first)
        elif n == 2:
            first = formatter(self[0])
            last = formatter(self[-1])
            summary = "[{first}, {last}]".format(first=first, last=last)
        else:

            if n > max_seq_items:
                # n is reused as the number of items shown on each side
                # NOTE(review): if max_seq_items // 2 is 0, self[-0:] selects
                # the whole array rather than nothing - confirm intended.
                n = min(max_seq_items // 2, 10)
                head = [formatter(x) for x in self[:n]]
                tail = [formatter(x) for x in self[-n:]]
                summary = "[{head} ... {tail}]".format(head=", ".join(head),
                                                       tail=", ".join(tail))
            else:
                tail = [formatter(x) for x in self]
                summary = "[{tail}]".format(tail=", ".join(tail))

        return summary

    def __repr__(self):
        """
        Return a three-line representation: data, ``closed`` and ``dtype``,
        with the continuation lines aligned under the opening parenthesis.
        """
        cls_name = self.__class__.__name__
        template = textwrap.dedent("""\
        {cls}({data},
        {lead}closed='{closed}',
        {lead}dtype='{dtype}')""")
        return template.format(
            cls=cls_name,
            data=self._format_data(),
            # one space per character of the class name, plus one for "("
            lead=" " * (len(cls_name) + 1),
            closed=self.closed,
            dtype=self.dtype,
        )

    def _format_space(self):
        """
        Return a newline followed by padding one character wider than the
        class name.
        """
        width = len(self.__class__.__name__) + 1
        return "\n{space}".format(space=" " * width)

    @property
    def left(self):
        """
        Return the left endpoints of each Interval in the IntervalArray as
        an Index.

        This is the stored ``_left`` object itself, not a copy.
        """
        return self._left

    @property
    def right(self):
        """
        Return the right endpoints of each Interval in the IntervalArray as
        an Index.

        This is the stored ``_right`` object itself, not a copy.
        """
        return self._right

    @property
    def closed(self):
        """
        Whether the intervals are closed on the left-side, right-side, both or
        neither.

        Returns the stored ``_closed`` value.
        """
        return self._closed

    _interval_shared_docs["set_closed"] = """
        Return an %(klass)s identical to the current one, but closed on the
        specified side

        .. versionadded:: 0.24.0

        Parameters
        ----------
        closed : {'left', 'right', 'both', 'neither'}
            Whether the intervals are closed on the left-side, right-side, both
            or neither.

        Returns
        -------
        new_index : %(klass)s

        Examples
        --------
        >>> index = pd.interval_range(0, 3)
        >>> index
        IntervalIndex([(0, 1], (1, 2], (2, 3]],
              closed='right',
              dtype='interval[int64]')
        >>> index.set_closed('both')
        IntervalIndex([[0, 1], [1, 2], [2, 3]],
              closed='both',
              dtype='interval[int64]')
        """

    @Appender(_interval_shared_docs["set_closed"] % _shared_docs_kwargs)
    def set_closed(self, closed):
        # Valid option: same endpoints, new closed side. Anything else is
        # rejected before a new object is built.
        if closed in _VALID_CLOSED:
            return self._shallow_copy(closed=closed)

        raise ValueError(
            "invalid option for 'closed': {closed}".format(closed=closed))

    @property
    def length(self):
        """
        Return an Index holding ``right - left`` for each Interval.

        Raises
        ------
        TypeError
            If subtraction is not defined for the endpoint type.
        """
        try:
            span = self.right - self.left
        except TypeError:
            # length not defined for some types, e.g. string
            raise TypeError(
                "IntervalArray contains Intervals without defined length, "
                "e.g. Intervals with string endpoints")
        return span

    @property
    def mid(self):
        """
        Return an Index holding the midpoint of each Interval.

        Computed as half the sum of the endpoints; if that raises TypeError,
        as the left endpoint plus half the length instead.
        """
        try:
            midpoints = 0.5 * (self.left + self.right)
        except TypeError:
            # datetime safe version
            midpoints = self.left + 0.5 * self.length
        return midpoints

    _interval_shared_docs["is_non_overlapping_monotonic"] = """
        Return True if the %(klass)s is non-overlapping (no Intervals share
        points) and is either monotonic increasing or monotonic decreasing,
        else False
        """
    # https://github.com/python/mypy/issues/1362
    # Mypy does not support decorated properties
    @property  # type: ignore
    @Appender(_interval_shared_docs["is_non_overlapping_monotonic"] %
              _shared_docs_kwargs)
    def is_non_overlapping_monotonic(self):
        # Each interval is compared with its successor: the array qualifies
        # when every pair is ordered increasingly
        # (e.g., [0, 1), [1, 2), [2, 3), ... ) or every pair is ordered
        # decreasingly (e.g., [-1, 0), [-2, -1), [-3, -2), ...).
        # left <= right is already required of every interval.
        left, right = self.left, self.right

        if self.closed == "both":
            # both endpoints belong to the interval, so touching endpoints
            # already overlap at a point: the comparison must be strict
            return bool((right[:-1] < left[1:]).all()
                        or (left[:-1] > right[1:]).all())

        # at least one endpoint is excluded, so touching endpoints do not
        # overlap: equality is allowed
        return bool((right[:-1] <= left[1:]).all()
                    or (left[:-1] >= right[1:]).all())

    # Conversion
    def __array__(self, dtype=None):
        """
        Materialise the array as an object-dtype numpy array.

        Each position holds an ``Interval`` built from the matching
        endpoints, or ``np.nan`` where the array is missing. The ``dtype``
        argument is accepted but not used.
        """
        left = self.left
        right = self.right
        mask = self.isna()
        closed = self._closed

        count = len(left)
        result = np.empty(count, dtype=object)
        for pos in range(count):
            result[pos] = (np.nan if mask[pos] else Interval(
                left[pos], right[pos], closed))
        return result

    _interval_shared_docs["to_tuples"] = """
        Return an %(return_type)s of tuples of the form (left, right)

        Parameters
        ----------
        na_tuple : boolean, default True
            Returns NA as a tuple if True, ``(nan, nan)``, or just as the NA
            value itself if False, ``nan``.

            .. versionadded:: 0.23.0

        Returns
        -------
        tuples: %(return_type)s
        %(examples)s\
        """

    @Appender(_interval_shared_docs["to_tuples"] %
              dict(return_type="ndarray", examples=""))
    def to_tuples(self, na_tuple=True):
        pairs = com.asarray_tuplesafe(zip(self.left, self.right))
        if na_tuple:
            return pairs
        # GH 18756: replace the tuple at each missing position with nan
        return np.where(~self.isna(), pairs, np.nan)

    @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs)
    def repeat(self, repeats, axis=None):
        # ``axis`` is only passed through ``nv.validate_repeat``; it plays no
        # further part in the computation.
        nv.validate_repeat((), {"axis": axis})

        # Both endpoint containers are repeated with the same ``repeats`` so
        # that left and right stay aligned position by position.
        repeated = {
            "left": self.left.repeat(repeats),
            "right": self.right.repeat(repeats),
        }
        return self._shallow_copy(**repeated)

    # Shared docstring template for ``contains``.  The ``%(klass)s`` and
    # ``%(qualname)s`` placeholders are substituted from
    # ``_shared_docs_kwargs`` when the template is handed to ``Appender``.
    _interval_shared_docs["contains"] = """
        Check elementwise if the Intervals contain the value.

        Return a boolean mask whether the value is contained in the Intervals
        of the %(klass)s.

        .. versionadded:: 0.25.0

        Parameters
        ----------
        other : scalar
            The value to check whether it is contained in the Intervals.

        Returns
        -------
        boolean array

        See Also
        --------
        Interval.contains : Check whether Interval object contains value.
        %(klass)s.overlaps : Check if an Interval overlaps the values in the
            %(klass)s.

        Examples
        --------
        >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)])
        >>> intervals
        %(klass)s([(0, 1], (1, 3], (2, 4]],
              closed='right',
              dtype='interval[int64]')
        >>> intervals.contains(0.5)
        array([ True, False, False])
    """

    @Appender(_interval_shared_docs["contains"] % _shared_docs_kwargs)
    def contains(self, other):
        # Interval-vs-interval containment is not supported here.
        if isinstance(other, Interval):
            raise NotImplementedError(
                "contains not implemented for two intervals")

        # An open side excludes its endpoint (strict comparison); a closed
        # side includes it (non-strict comparison).
        if self.open_left:
            above_lower = self.left < other
        else:
            above_lower = self.left <= other

        if self.open_right:
            below_upper = other < self.right
        else:
            below_upper = other <= self.right

        return above_lower & below_upper

    # Shared docstring template for ``overlaps``.  The ``%(klass)s`` and
    # ``%(qualname)s`` placeholders are substituted from
    # ``_shared_docs_kwargs`` when the template is handed to ``Appender``.
    # Example outputs follow NumPy's boolean array repr, which pads every
    # ``True`` to the width of ``False``.
    _interval_shared_docs["overlaps"] = """
        Check elementwise if an Interval overlaps the values in the %(klass)s.

        Two intervals overlap if they share a common point, including closed
        endpoints. Intervals that only have an open endpoint in common do not
        overlap.

        .. versionadded:: 0.24.0

        Parameters
        ----------
        other : Interval
            Interval to check against for an overlap.

        Returns
        -------
        ndarray
            Boolean array positionally indicating where an overlap occurs.

        See Also
        --------
        Interval.overlaps : Check whether two Interval objects overlap.

        Examples
        --------
        >>> intervals = pd.%(qualname)s.from_tuples([(0, 1), (1, 3), (2, 4)])
        >>> intervals
        %(klass)s([(0, 1], (1, 3], (2, 4]],
              closed='right',
              dtype='interval[int64]')
        >>> intervals.overlaps(pd.Interval(0.5, 1.5))
        array([ True,  True, False])

        Intervals that share closed endpoints overlap:

        >>> intervals.overlaps(pd.Interval(1, 3, closed='left'))
        array([ True,  True,  True])

        Intervals that only have an open endpoint in common do not overlap:

        >>> intervals.overlaps(pd.Interval(1, 2, closed='right'))
        array([False,  True, False])
    """

    @Appender(_interval_shared_docs["overlaps"] % _shared_docs_kwargs)
    def overlaps(self, other):
        # Only a scalar Interval is accepted: interval containers are
        # rejected as unimplemented, anything else as a type error.
        if isinstance(other, (IntervalArray, ABCIntervalIndex)):
            raise NotImplementedError
        if not isinstance(other, Interval):
            template = "`other` must be Interval-like, got {other}"
            raise TypeError(template.format(other=type(other).__name__))

        # Where two endpoints meet, equality counts as an overlap only when
        # both of the meeting endpoints are closed; otherwise the comparison
        # has to be strict.
        if self.closed_left and other.closed_right:
            starts_by = le
        else:
            starts_by = lt

        if other.closed_left and self.closed_right:
            ends_by = le
        else:
            ends_by = lt

        # Overlap is the negation of disjointness, where
        #   disjoint = (A.left > B.right) or (B.left > A.right).
        # Writing the negation out directly needs fewer operations.
        begins_in_time = starts_by(self.left, other.right)
        finishes_in_time = ends_by(other.left, self.right)
        return begins_in_time & finishes_in_time
示例#27
0
 def setUp(self):
     """Store an ``IntervalDtype`` built from ``'int64'`` on ``self.dtype``."""
     subtype = 'int64'
     self.dtype = IntervalDtype(subtype)
示例#28
0
文件: interval.py 项目: zyazxr/pandas
 def dtype(self):
     """Return an ``IntervalDtype`` whose subtype is ``self.left.dtype``."""
     subtype = self.left.dtype
     return IntervalDtype(subtype)
示例#29
0
 def test_identity(self):
     self.assertEqual(IntervalDtype('interval[int64]'),
                      IntervalDtype('interval[int64]'))
示例#30
0
 def test_coerce_to_dtype(self):
     # Coercing the string form must give the same dtype as constructing
     # IntervalDtype from that string directly.
     coerced = _coerce_to_dtype('interval[int64]')
     wanted = IntervalDtype('interval[int64]')
     self.assertEqual(coerced, wanted)