def test_get_indexer_non_unique_nas(self, nulls_fixture): # even though this isn't non-unique, this should still work index = Index(["a", "b", nulls_fixture]) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([2], dtype=np.intp) expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique index = Index(["a", nulls_fixture, "b", nulls_fixture]) indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # matching-but-not-identical nans if is_matching_na(nulls_fixture, float("NaN")): index = Index(["a", float("NaN"), "b", float("NaN")]) match_but_not_identical = True elif is_matching_na(nulls_fixture, Decimal("NaN")): index = Index(["a", Decimal("NaN"), "b", Decimal("NaN")]) match_but_not_identical = True else: match_but_not_identical = False if match_but_not_identical: indexer, missing = index.get_indexer_non_unique([nulls_fixture]) expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing)
def test_get_indexer_non_unique(self): np.random.seed(123456789) ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False) oidx = Index(np.array(ci)) for n in [1, 2, 5, len(ci)]: finder = oidx[np.random.randint(0, len(ci), size=n)] expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual) # see gh-17323 # # Even when indexer is equal to the # members in the index, we should # respect duplicates instead of taking # the fast-track path. for finder in [list("aabbca"), list("aababca")]: expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) tm.assert_numpy_array_equal(expected, actual)
def test_get_indexer_non_unique(self, idx_values, key_values, key_class): # GH 21448 key = key_class(key_values, categories=range(1, 5)) # Test for flat index and CategoricalIndex with same/different cats: for dtype in [None, "category", key.dtype]: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss)
def test_get_indexer_non_unique(self, idx_values, key_values, key_class): # GH 21448 key = key_class(key_values, categories=range(1, 5)) # Test for flat index and CategoricalIndex with same/different cats: for dtype in None, 'category', key.dtype: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss)
def test_get_indexer_non_unique_nans_in_object_dtype_target(nulls_fixture): idx = Index([1.0, 2.0]) target = Index([1, nulls_fixture], dtype="object") result_idx, result_missing = idx.get_indexer_non_unique(target) tm.assert_numpy_array_equal(result_idx, np.array([0, -1], dtype=np.intp)) tm.assert_numpy_array_equal(result_missing, np.array([1], dtype=np.intp))
def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype): # GH 21448 key = key_class(key_values, categories=range(1, 5)) if dtype == "key": dtype = key.dtype # Test for flat index and CategoricalIndex with same/different cats: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) tm.assert_numpy_array_equal(expected, result) tm.assert_numpy_array_equal(exp_miss, res_miss) exp_unique = idx.unique().get_indexer(key_values) res_unique = idx.unique().get_indexer(key) tm.assert_numpy_array_equal(res_unique, exp_unique)
def test_get_indexer_non_unique_np_nats(self, np_nat_fixture, np_nat_fixture2): expected_missing = np.array([], dtype=np.intp) # matching-but-not-identical nats if is_matching_na(np_nat_fixture, np_nat_fixture2): # ensure nats are different objects index = Index( np.array( [ "2021-10-02", np_nat_fixture.copy(), np_nat_fixture2.copy() ], dtype=object, ), dtype=object, ) # pass as index to prevent target from being casted to DatetimeIndex indexer, missing = index.get_indexer_non_unique( Index([np_nat_fixture], dtype=object)) expected_indexer = np.array([1, 2], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # dt64nat vs td64nat else: index = Index( np.array( [ "2021-10-02", np_nat_fixture, np_nat_fixture2, np_nat_fixture, np_nat_fixture2, ], dtype=object, ), dtype=object, ) # pass as index to prevent target from being casted to DatetimeIndex indexer, missing = index.get_indexer_non_unique( Index([np_nat_fixture], dtype=object)) expected_indexer = np.array([1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing)
def test_get_indexer_numeric_vs_bool(self): left = Index([1, 2, 3]) right = Index([True, False]) res = left.get_indexer(right) expected = -1 * np.ones(len(right), dtype=np.intp) tm.assert_numpy_array_equal(res, expected) res = right.get_indexer(left) expected = -1 * np.ones(len(left), dtype=np.intp) tm.assert_numpy_array_equal(res, expected) res = left.get_indexer_non_unique(right)[0] expected = -1 * np.ones(len(right), dtype=np.intp) tm.assert_numpy_array_equal(res, expected) res = right.get_indexer_non_unique(left)[0] expected = -1 * np.ones(len(left), dtype=np.intp) tm.assert_numpy_array_equal(res, expected)
def _make_indexer(self, self_indexer: Index, other_indexer: Index): if self.aggregation_required: group_ints, group_order = other_indexer.factorize() self.other_grouper = group_ints self.flat_indexer, self.missing_indices = group_order.get_indexer_non_unique( self_indexer) else: # Performance-tuned fast paths for constructing indexers if self_indexer.equals(other_indexer): # Indexers are identical self.flat_indexer = np.arange(len(other_indexer)) self.missing_indices = np.array([], dtype=int) elif len(self_indexer.difference( other_indexer)) == 0: # No missing values # Taking the difference is faster than `all(.isin())` self.missing_indices = np.array([], dtype=int) self.flat_indexer = other_indexer.get_indexer(self_indexer) else: # All other cases self.flat_indexer, self.missing_indices = other_indexer.get_indexer_non_unique( self_indexer)