def test_qcut_include_lowest():
    """Check the categories produced by a four-way ``qcut`` of ``0..9``.

    The first expected interval starts at -0.001, i.e. just below the
    smallest input value 0.
    """
    data = np.arange(10)
    binned = qcut(data, 4)

    # Consecutive edges of the four expected intervals.
    edges = [-0.001, 2.25, 4.5, 6.75, 9]
    expected_categories = IntervalIndex(
        [Interval(lo, hi) for lo, hi in zip(edges[:-1], edges[1:])]
    )
    tm.assert_index_equal(binned.categories, expected_categories)
def test_where(self, closed, klass):
    """Check ``where`` with an all-True mask and a first-element-False mask.

    ``klass`` is the container type the boolean mask is wrapped in before
    it is passed to ``where``.
    """
    idx = self.create_index(closed=closed)

    # An all-True condition must give back an equal index.
    keep_all = [True] * len(idx)
    tm.assert_index_equal(idx.where(klass(keep_all)), idx)

    # Dropping only the first position is expected to put NaN there.
    tail = idx[1:]
    drop_first = [False] + [True] * len(tail)
    expected = IntervalIndex([np.nan] + tail.tolist())
    result = idx.where(klass(drop_first))
    tm.assert_index_equal(result, expected)
def test_putmask_td64(self): # GH#37968 dti = date_range("2016-01-01", periods=9) tdi = dti - dti[0] idx = IntervalIndex.from_breaks(tdi) mask = np.zeros(idx.shape, dtype=bool) mask[0:3] = True result = idx.putmask(mask, idx[-1]) expected = IntervalIndex([idx[-1]] * 3 + list(idx[3:])) tm.assert_index_equal(result, expected)
def test_dtype_closed_mismatch():
    """Both interval constructors must reject a conflicting ``closed``.

    The dtype says ``"left"`` while the keyword says ``"neither"``; a
    ``ValueError`` with the message below is expected from each constructor.
    """
    # GH#38394 closed specified in both dtype and IntervalIndex constructor
    left_closed = IntervalDtype(np.int64, "left")
    msg = "closed keyword does not match dtype.closed"

    for constructor in (IntervalIndex, IntervalArray):
        with pytest.raises(ValueError, match=msg):
            constructor([], dtype=left_closed, closed="neither")
def astype(self, dtype, copy=True):
    """Cast this index to ``dtype``.

    * Interval dtype: an ``IntervalIndex`` is built from ``np.array(self)``
      (``copy`` is not consulted on this path).
    * Categorical dtype: the requested dtype is first combined with the
      current one through ``update_dtype``; when the outcome equals the
      current dtype, the index itself is returned, or a copy of it when
      ``copy`` is true.
    * Anything else, including a categorical dtype that differs from the
      current one, is delegated to the parent class.
    """
    if is_interval_dtype(dtype):
        # Function-scope import, as in the original implementation.
        from pandas import IntervalIndex

        return IntervalIndex(np.array(self))

    if is_categorical_dtype(dtype):
        # GH 18630
        dtype = self.dtype.update_dtype(dtype)
        if dtype == self.dtype:
            if copy:
                return self.copy()
            return self

    return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy)
def test_qcut_duplicates_bin(kwargs, msg): # see gh-7751 values = [0, 0, 0, 0, 1, 2, 3] if msg is not None: with pytest.raises(ValueError, match=msg): qcut(values, 3, **kwargs) else: result = qcut(values, 3, **kwargs) expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)]) tm.assert_index_equal(result.categories, expected)
def test_datetime_bin(conv): data = [np.datetime64("2012-12-13"), np.datetime64("2012-12-15")] bin_data = ["2012-12-12", "2012-12-14", "2012-12-16"] expected = Series(IntervalIndex([ Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])).astype( CDT(ordered=True)) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) tm.assert_series_equal(result, expected)
def test_single_quantile(data, start, end, length, labels): # see gh-15431 ser = Series([data] * length) result = qcut(ser, 1, labels=labels) if labels is None: intervals = IntervalIndex([Interval(start, end)] * length, closed="right") expected = Series(intervals).astype(CDT(ordered=True)) else: expected = Series([0] * length) tm.assert_series_equal(result, expected)
def test_ensure_copied_data(self, closed):
    """Exercise the ``copy`` flag of the ``IntervalIndex`` constructor.

    Constructing from an existing index with ``copy=False`` is asserted to
    share the ``left``/``right`` data; constructing from ``_ndarray_values``
    is asserted to produce copies even though ``copy=False`` is passed.
    """
    # exercise the copy flag in the constructor
    index = self.create_index(closed=closed)

    def check_sides(result, check_same):
        # Compare the left side first, then the right side.
        for side in ('left', 'right'):
            tm.assert_numpy_array_equal(
                getattr(index, side).values,
                getattr(result, side).values,
                check_same=check_same,
            )

    # not copying
    check_sides(IntervalIndex(index, copy=False), 'same')

    # by-definition make a copy
    check_sides(IntervalIndex(index._ndarray_values, copy=False), 'copy')
def test_where(self, simple_index, listlike_box): klass = listlike_box idx = simple_index cond = [True] * len(idx) expected = idx result = expected.where(klass(cond)) tm.assert_index_equal(result, expected) cond = [False] + [True] * len(idx[1:]) expected = IntervalIndex([np.nan] + idx[1:].tolist()) result = idx.where(klass(cond)) tm.assert_index_equal(result, expected)
def test_datetime_cut(data): # see gh-14714 # # Testing time data when it comes in various collection types. result, _ = cut(data, 3, retbins=True) expected = Series(IntervalIndex([ Interval(Timestamp("2012-12-31 23:57:07.200000"), Timestamp("2013-01-01 16:00:00")), Interval(Timestamp("2013-01-01 16:00:00"), Timestamp("2013-01-02 08:00:00")), Interval(Timestamp("2013-01-02 08:00:00"), Timestamp("2013-01-03 00:00:00"))])).astype(CDT(ordered=True)) tm.assert_series_equal(Series(result), expected)
def test_qcut_duplicates_bin(self):
    """Check ``qcut`` with duplicate bin edges.

    ``duplicates='drop'`` must yield the two expected intervals; the
    default, ``'raise'`` and an unrecognised value must all raise
    ``ValueError``.
    """
    # GH 7751
    values = [0, 0, 0, 0, 1, 2, 3]
    expected = IntervalIndex([Interval(-0.001, 1), Interval(1, 3)])

    binned = qcut(values, 3, duplicates='drop')
    tm.assert_index_equal(binned.categories, expected)

    with pytest.raises(ValueError):
        qcut(values, 3)

    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='raise')

    # invalid
    with pytest.raises(ValueError):
        qcut(values, 3, duplicates='foo')
def test_single_quantile(self): # issue 15431 expected = Series([0, 0]) s = Series([9., 9.]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(8.999, 9.0), Interval(8.999, 9.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) s = Series([-9., -9.]) expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(-9.001, -9.0), Interval(-9.001, -9.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) s = Series([0., 0.]) expected = Series([0, 0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(-0.001, 0.0), Interval(-0.001, 0.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) s = Series([9]) expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(8.999, 9.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) s = Series([-9]) expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(-9.001, -9.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected) s = Series([0]) expected = Series([0]) result = qcut(s, 1, labels=False) tm.assert_series_equal(result, expected) result = qcut(s, 1) intervals = IntervalIndex([Interval(-0.001, 0.0)], closed='right') expected = Series(intervals).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected)
def test_getitem_interval_with_nans(self, frame_or_series, indexer_sl): # GH#41831 index = IntervalIndex([np.nan, np.nan]) key = index[:-1] obj = frame_or_series(range(2), index=index) if frame_or_series is DataFrame and indexer_sl is tm.setitem: obj = obj.T result = indexer_sl(obj)[key] expected = obj tm.assert_equal(result, expected)
def test_difference(self, closed):
    """Check ``IntervalIndex.difference``, including empty results."""
    index = self.create_index(closed=closed)

    without_first = index.difference(index[:1])
    tm.assert_index_equal(without_first, index[1:])

    # GH 19101: empty result, same dtype
    empty_int = IntervalIndex(np.array([], dtype='int64'), closed=closed)
    tm.assert_index_equal(index.difference(index), empty_int)

    # GH 19101: empty result, different dtypes
    float_left = index.left.astype('float64')
    other = IntervalIndex.from_arrays(float_left, index.right, closed=closed)
    tm.assert_index_equal(index.difference(other), empty_int)
def test_union(self, closed):
    """Check ``IntervalIndex.union`` in both directions and on empty input."""
    index = self.create_index(closed=closed)
    other = IntervalIndex.from_breaks(range(5, 13), closed=closed)
    expected = IntervalIndex.from_breaks(range(13), closed=closed)

    # The union must not depend on which operand is the receiver.
    for left, right in ((index, other), (other, index)):
        tm.assert_index_equal(left.union(right), expected)

    # Union with itself, or with a subset, gives the index back.
    tm.assert_index_equal(index.union(index), index)
    tm.assert_index_equal(index.union(index[:1]), index)

    # GH 19101: empty result, same dtype
    index = IntervalIndex(np.array([], dtype='int64'), closed=closed)
    tm.assert_index_equal(index.union(index), index)

    # GH 19101: empty result, different dtypes
    other = IntervalIndex(np.array([], dtype='float64'), closed=closed)
    tm.assert_index_equal(index.union(other), index)
def test_insert(self, data):
    """Check ``insert`` at several positions and with invalid or NA items.

    ``data`` is the index under test; its first element is the item that
    gets inserted.
    """
    item = data[0]
    single = IntervalIndex([item])

    # start
    expected = single.append(data)
    tm.assert_index_equal(data.insert(0, item), expected)

    # end
    expected = data.append(single)
    tm.assert_index_equal(data.insert(len(data), item), expected)

    # mid
    expected = data[:3].append(single).append(data[3:])
    tm.assert_index_equal(data.insert(3, item), expected)

    # invalid type
    msg = 'can only insert Interval objects and NA into an IntervalIndex'
    with pytest.raises(ValueError, match=msg):
        data.insert(1, 'foo')

    # invalid closed
    msg = 'inserted item must be closed on the same side as the index'
    other_sides = {'left', 'right', 'both', 'neither'} - {item.closed}
    for closed in other_sides:
        with pytest.raises(ValueError, match=msg):
            bad_item = Interval(item.left, item.right, closed=closed)
            data.insert(1, bad_item)

    # GH 18295 (test missing)
    na_idx = IntervalIndex([np.nan], closed=data.closed)
    head, rest = data[:1], data[1:]
    for na in (np.nan, pd.NaT, None):
        expected = head.append(na_idx).append(rest)
        tm.assert_index_equal(data.insert(1, na), expected)
def test_intersection(self, closed, sort):
    """Check ``IntervalIndex.intersection`` on several index shapes.

    Covers reversed monotonic indexes, nested and overlapping intervals,
    and duplicate NaN entries.
    """
    index = monotonic_index(0, 11, closed=closed)
    other = monotonic_index(5, 13, closed=closed)
    expected = monotonic_index(5, 11, closed=closed)

    def check(result):
        # Exact equality is only asserted when ``sort`` is None; the
        # contents must match in every case.
        if sort is None:
            tm.assert_index_equal(result, expected)
        assert tm.equalContents(result, expected)

    check(index[::-1].intersection(other, sort=sort))
    check(other[::-1].intersection(index, sort=sort))

    tm.assert_index_equal(index.intersection(index, sort=sort), index)

    # GH 26225: nested intervals
    index = IntervalIndex.from_tuples([(1, 2), (1, 3), (1, 4), (0, 2)])
    other = IntervalIndex.from_tuples([(1, 2), (1, 3)])
    nested_expected = IntervalIndex.from_tuples([(1, 2), (1, 3)])
    tm.assert_index_equal(index.intersection(other), nested_expected)

    # GH 26225
    index = IntervalIndex.from_tuples([(0, 3), (0, 2)])
    other = IntervalIndex.from_tuples([(0, 2), (1, 3)])
    overlap_expected = IntervalIndex.from_tuples([(0, 2)])
    tm.assert_index_equal(index.intersection(other), overlap_expected)

    # GH 26225: duplicate nan element
    index = IntervalIndex([np.nan, np.nan])
    other = IntervalIndex([np.nan])
    nan_expected = IntervalIndex([np.nan])
    tm.assert_index_equal(index.intersection(other), nan_expected)
def test_datetimetz_qcut(self, bins): # GH 19872 tz = 'US/Eastern' s = Series(date_range('20130101', periods=3, tz=tz)) result = qcut(s, bins) expected = ( Series(IntervalIndex([ Interval(Timestamp('2012-12-31 23:59:59.999999999', tz=tz), Timestamp('2013-01-01 16:00:00', tz=tz)), Interval(Timestamp('2013-01-01 16:00:00', tz=tz), Timestamp('2013-01-02 08:00:00', tz=tz)), Interval(Timestamp('2013-01-02 08:00:00', tz=tz), Timestamp('2013-01-03 00:00:00', tz=tz))])) .astype(CDT(ordered=True))) tm.assert_series_equal(result, expected)
def test_repr_floats(self): # GH 32553 markers = Series( ["foo", "bar"], index=IntervalIndex([ Interval(left, right) for left, right in zip( Float64Index([329.973, 345.137], dtype="float64"), Float64Index([345.137, 360.191], dtype="float64"), ) ]), ) result = str(markers) expected = "(329.973, 345.137] foo\n(345.137, 360.191] bar\ndtype: object" assert result == expected
def test_datetime_tz_qcut(bins): # see gh-19872 tz = "US/Eastern" ser = Series(date_range("20130101", periods=3, tz=tz)) result = qcut(ser, bins) expected = Series( IntervalIndex([ Interval(Timestamp("2012-12-31 23:59:59.999999999", tz=tz), Timestamp("2013-01-01 16:00:00", tz=tz)), Interval(Timestamp("2013-01-01 16:00:00", tz=tz), Timestamp("2013-01-02 08:00:00", tz=tz)), Interval(Timestamp("2013-01-02 08:00:00", tz=tz), Timestamp("2013-01-03 00:00:00", tz=tz)) ])).astype(CDT(ordered=True)) tm.assert_series_equal(result, expected)
def test_get_indexer_with_nans(self):
    """Check ``get_loc`` with NA-like keys on an index holding NaN entries.

    ``None``, ``np.nan`` and ``NA`` must be found at both NaN positions;
    the datetime-like NaT values must raise ``KeyError``.
    """
    # GH#41831
    index = IntervalIndex([np.nan, Interval(1, 2), np.nan])
    nan_positions = np.array([True, False, True])

    for matching_key in (None, np.nan, NA):
        assert matching_key in index
        located = index.get_loc(matching_key)
        tm.assert_numpy_array_equal(located, nan_positions)

    mismatched_keys = (
        NaT,
        np.timedelta64("NaT", "ns"),
        np.datetime64("NaT", "ns"),
    )
    for bad_key in mismatched_keys:
        with pytest.raises(KeyError, match=str(bad_key)):
            index.get_loc(bad_key)
def test_datetimetz_cut(self, bins, box): # GH 19872 tz = 'US/Eastern' s = Series(date_range('20130101', periods=3, tz=tz)) if not isinstance(bins, int): bins = box(bins) result = cut(s, bins) expected = ( Series(IntervalIndex([ Interval(Timestamp('2012-12-31 23:57:07.200000', tz=tz), Timestamp('2013-01-01 16:00:00', tz=tz)), Interval(Timestamp('2013-01-01 16:00:00', tz=tz), Timestamp('2013-01-02 08:00:00', tz=tz)), Interval(Timestamp('2013-01-02 08:00:00', tz=tz), Timestamp('2013-01-03 00:00:00', tz=tz))])) .astype(CDT(ordered=True))) tm.assert_series_equal(result, expected)
def test_get_indexer_categorical_with_nans(self): # GH#41934 nans in both index and in target ii = IntervalIndex.from_breaks(range(5), inclusive="right") ii2 = ii.append(IntervalIndex([np.nan])) ci2 = CategoricalIndex(ii2) result = ii2.get_indexer(ci2) expected = np.arange(5, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) # not-all-matches result = ii2[1:].get_indexer(ci2[::-1]) expected = np.array([3, 2, 1, 0, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) # non-unique target, non-unique nans result = ii2.get_indexer(ci2.append(ci2)) expected = np.array([0, 1, 2, 3, 4, 0, 1, 2, 3, 4], dtype=np.intp) tm.assert_numpy_array_equal(result, expected)
def test_interval(self): idx = pd.interval_range(0, 10, periods=10) cat = Categorical(idx, categories=idx) expected_codes = np.arange(10, dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) # infer categories cat = Categorical(idx) tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) # list values cat = Categorical(list(idx)) tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) # list values, categories cat = Categorical(list(idx), categories=list(idx)) tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) # shuffled values = idx.take([1, 2, 0]) cat = Categorical(values, categories=idx) tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8")) tm.assert_index_equal(cat.categories, idx) # extra values = pd.interval_range(8, 11, periods=3) cat = Categorical(values, categories=idx) expected_codes = np.array([8, 9, -1], dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx) # overlapping idx = IntervalIndex([Interval(0, 2), Interval(0, 1)]) cat = Categorical(idx, categories=idx) expected_codes = np.array([0, 1], dtype="int8") tm.assert_numpy_array_equal(cat.codes, expected_codes) tm.assert_index_equal(cat.categories, idx)
def test_difference(self, closed, sort): index = IntervalIndex.from_arrays([1, 0, 3, 2], [1, 2, 3, 4], closed=closed) result = index.difference(index[:1], sort) expected = index[1:] if sort: expected = expected.sort_values() tm.assert_index_equal(result, expected) # GH 19101: empty result, same dtype result = index.difference(index, sort) expected = IntervalIndex(np.array([], dtype='int64'), closed=closed) tm.assert_index_equal(result, expected) # GH 19101: empty result, different dtypes other = IntervalIndex.from_arrays(index.left.astype('float64'), index.right, closed=closed) result = index.difference(other, sort) tm.assert_index_equal(result, expected)
def test_datetime_bin(self): data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')] bin_data = ['2012-12-12', '2012-12-14', '2012-12-16'] expected = ( Series(IntervalIndex([ Interval(Timestamp(bin_data[0]), Timestamp(bin_data[1])), Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2]))])) .astype(CDT(ordered=True))) for conv in [Timestamp, Timestamp, np.datetime64]: bins = [conv(v) for v in bin_data] result = cut(data, bins=bins) tm.assert_series_equal(Series(result), expected) bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data] result = cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected) bins = to_datetime(bin_data) result = cut(data, bins=bin_pydatetime) tm.assert_series_equal(Series(result), expected)
def test_symmetric_difference(self, closed, sort):
    """Check ``IntervalIndex.symmetric_difference``, including empty results."""
    index = monotonic_index(0, 11, closed=closed)

    def check(result, expected):
        # Exact equality is only asserted when ``sort`` is None; the
        # contents must match in every case.
        if sort is None:
            tm.assert_index_equal(result, expected)
        assert tm.equalContents(result, expected)

    # Dropping one end from each operand leaves exactly the two end items.
    ends_only = index[1:].symmetric_difference(index[:-1], sort=sort)
    check(ends_only, IntervalIndex([index[0], index[-1]]))

    # GH 19101: empty result, same dtype
    with_self = index.symmetric_difference(index, sort=sort)
    empty_int = empty_index(dtype="int64", closed=closed)
    check(with_self, empty_int)

    # GH 19101: empty result, different dtypes
    float_left = index.left.astype("float64")
    other = IntervalIndex.from_arrays(float_left, index.right, closed=closed)
    with_float = index.symmetric_difference(other, sort=sort)
    tm.assert_index_equal(with_float, empty_int)
def test_datetime_cut(self): # GH 14714 # testing for time data to be present as series data = to_datetime(Series(['2013-01-01', '2013-01-02', '2013-01-03'])) result, bins = cut(data, 3, retbins=True) expected = (Series( IntervalIndex([ Interval(Timestamp('2012-12-31 23:57:07.200000'), Timestamp('2013-01-01 16:00:00')), Interval(Timestamp('2013-01-01 16:00:00'), Timestamp('2013-01-02 08:00:00')), Interval(Timestamp('2013-01-02 08:00:00'), Timestamp('2013-01-03 00:00:00')) ])).astype(CDT(ordered=True))) tm.assert_series_equal(result, expected) # testing for time data to be present as list data = [ np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03') ] result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as ndarray data = np.array([ np.datetime64('2013-01-01'), np.datetime64('2013-01-02'), np.datetime64('2013-01-03') ]) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected) # testing for time data to be present as datetime index data = DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03']) result, bins = cut(data, 3, retbins=True) tm.assert_series_equal(Series(result), expected)
def _isna_ndarraylike(obj):
    """Compute a missing-value mask for an array-like ``obj``.

    The dtype-based checks run in a fixed order: extension arrays use their
    own ``isna``; interval data goes through ``IntervalIndex``; string
    dtypes are all-False or checked element-wise; data needing i8
    conversion is compared with ``iNaT``; everything else uses ``np.isnan``.
    When ``obj`` is a Series, the mask is wrapped in a Series carrying the
    same index and name.
    """
    values = getattr(obj, 'values', obj)
    dtype = values.dtype

    if is_extension_array_dtype(obj):
        # Index/Series wrap their array in ``_values``; anything else is
        # used as it is.
        is_boxed = isinstance(obj, (ABCIndexClass, ABCSeries))
        values = obj._values if is_boxed else obj
        mask = values.isna()
    elif is_interval_dtype(values):
        # TODO(IntervalArray): remove this if block
        from pandas import IntervalIndex

        mask = IntervalIndex(obj).isna()
    elif is_string_dtype(dtype):
        # Working around NumPy ticket 1542
        if is_string_like_dtype(dtype):
            # object array of strings
            mask = np.zeros(values.shape, dtype=bool)
        else:
            # object array of non-strings
            shape = values.shape
            mask = np.empty(shape, dtype=bool)
            flat_mask = libmissing.isnaobj(values.ravel())
            mask[...] = flat_mask.reshape(shape)
    elif needs_i8_conversion(obj):
        # this is the NaT pattern
        mask = values.view('i8') == iNaT
    else:
        mask = np.isnan(values)

    if not isinstance(obj, ABCSeries):
        return mask

    # box
    from pandas import Series

    return Series(mask, index=obj.index, name=obj.name, copy=False)