def _test_dtype(dtype, can_hold_na, writeable=True):
    data = np.random.randint(0, 2, (5, 3)).astype(dtype)
    data.flags.writeable = writeable

    indexer = [2, 1, 0, 1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    algos.take_nd(data, indexer, out=out0, axis=0)
    algos.take_nd(data, indexer, out=out1, axis=1)

    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)

    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    if can_hold_na:
        algos.take_nd(data, indexer, out=out0, axis=0)
        algos.take_nd(data, indexer, out=out1, axis=1)

        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected0[3, :] = np.nan
        expected1[:, 3] = np.nan

        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
    else:
        for i, out in enumerate([out0, out1]):
            with tm.assertRaisesRegexp(TypeError, self.fill_error):
                algos.take_nd(data, indexer, out=out, axis=i)

            # No exception otherwise.
            data.take(indexer, out=out, axis=i)
def test_2d_fill_nonna(self, dtype_fill_out_dtype):
    dtype, fill_value, out_dtype = dtype_fill_out_dtype
    data = np.random.randint(0, 2, (5, 3)).astype(dtype)
    indexer = [2, 1, 0, -1]

    result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert (result[[0, 1, 2], :] == data[[2, 1, 0], :]).all()
    assert (result[3, :] == fill_value).all()
    assert result.dtype == out_dtype

    result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert (result[:, [0, 1, 2]] == data[:, [2, 1, 0]]).all()
    assert (result[:, 3] == fill_value).all()
    assert result.dtype == out_dtype

    indexer = [2, 1, 0, 1]
    result = algos.take_nd(data, indexer, axis=0, fill_value=fill_value)
    assert (result[[0, 1, 2, 3], :] == data[indexer, :]).all()
    assert result.dtype == dtype

    result = algos.take_nd(data, indexer, axis=1, fill_value=fill_value)
    assert (result[:, [0, 1, 2, 3]] == data[:, indexer]).all()
    assert result.dtype == dtype
def _bins_to_cuts(x, bins, right=True, labels=None, precision=3,
                  include_lowest=False, dtype=None, duplicates='raise'):
    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = _ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        # Numpy 1.9 support: ensure this mask is a Numpy array
        ids[np.asarray(x == bins[0])] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
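# A minimal numpy-only sketch (my illustration, not pandas source) of the
# searchsorted-based binning performed by _bins_to_cuts above: each value is
# mapped to the id of the bin edge it falls under, with 0 and len(bins)
# acting as "out of range" sentinels that feed the na_mask.
import numpy as np

def demo_searchsorted_binning():
    bins = np.array([0.0, 1.0, 2.0, 3.0])
    x = np.array([-0.5, 0.5, 1.0, 2.5, 3.5])

    # side='left' corresponds to right-closed intervals (right=True above):
    # a value equal to an edge is assigned to the bin ending at that edge.
    ids = bins.searchsorted(x, side='left')

    # ids of 0 or len(bins) mean the value fell outside all bins
    out_of_range = (ids == 0) | (ids == len(bins))
    print(ids)           # [0 1 1 3 4]
    print(out_of_range)  # [ True False False False  True]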
def test_2d_bool(self):
    arr = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 1]], dtype=bool)

    result = algos.take_nd(arr, [0, 2, 2, 1])
    expected = arr.take([0, 2, 2, 1], axis=0)
    tm.assert_numpy_array_equal(result, expected)

    result = algos.take_nd(arr, [0, 2, 2, 1], axis=1)
    expected = arr.take([0, 2, 2, 1], axis=1)
    tm.assert_numpy_array_equal(result, expected)

    result = algos.take_nd(arr, [0, 2, -1])
    assert result.dtype == np.object_
def test_2d_float32(self):
    arr = np.random.randn(4, 3).astype(np.float32)
    indexer = [0, 2, -1, 1, -1]

    # axis=0
    result = algos.take_nd(arr, indexer, axis=0)
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, axis=0, out=result2)
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=0)
    expected[[2, 4], :] = np.nan
    tm.assert_almost_equal(result, expected)

    # this now accepts a float32 out buffer
    out = np.empty((len(indexer), arr.shape[1]), dtype='float32')
    algos.take_nd(arr, indexer, out=out)  # it works!

    # axis=1
    result = algos.take_nd(arr, indexer, axis=1)
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, axis=1, out=result2)
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=1)
    expected[:, [2, 4]] = np.nan
    tm.assert_almost_equal(result, expected)
def get_result(self):
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        col_inds, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
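# A numpy-only sketch (my illustration, not pandas source) of the
# "filter out missing levels" step above: columns whose mask never fires
# were not observed, and the surviving column indices drive the gather.
import numpy as np

def demo_filter_unobserved_columns():
    values = np.array([[1.0, 2.0, 3.0],
                       [4.0, 5.0, 6.0]])
    # True where a cell was actually populated by the unstack
    value_mask = np.array([[True, False, True],
                           [True, False, False]])

    # columns observed at least once
    inds = (value_mask.sum(0) > 0).nonzero()[0]
    print(inds)                    # [0 2]
    print(values.take(inds, axis=1))
    # [[1. 3.]
    #  [4. 6.]]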
def test_2d_other_dtypes(self):
    arr = np.random.randn(10, 5).astype(np.float32)

    indexer = [1, 2, 3, -1]

    # axis=0
    result = algos.take_nd(arr, indexer, axis=0)
    expected = arr.take(indexer, axis=0)
    expected[-1] = np.nan
    tm.assert_almost_equal(result, expected)

    # axis=1
    result = algos.take_nd(arr, indexer, axis=1)
    expected = arr.take(indexer, axis=1)
    expected[:, -1] = np.nan
    tm.assert_almost_equal(result, expected)
def _reorder_by_uniques(uniques, labels):
    # sorter is index where elements ought to go
    sorter = uniques.argsort()

    # reverse_indexer is where elements came from
    reverse_indexer = np.empty(len(sorter), dtype=np.int64)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    mask = labels < 0

    # move labels to right locations (ie, unsort ascending labels)
    labels = algos.take_nd(reverse_indexer, labels, allow_fill=False)
    np.putmask(labels, mask, -1)

    # sort observed ids
    uniques = algos.take_nd(uniques, sorter, allow_fill=False)

    return uniques, labels
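# A minimal numpy-only sketch (assumptions and names are mine, not pandas
# code) of the reverse-indexer trick above: after sorting the uniques, every
# old label i must be rewritten to the new position of uniques[i], while -1
# (missing) labels are preserved.
import numpy as np

def demo_reorder_by_uniques():
    uniques = np.array([30, 10, 20])
    labels = np.array([0, 2, 1, 0, -1])  # -1 marks a missing value

    sorter = uniques.argsort()                   # [1, 2, 0]
    reverse = np.empty(len(sorter), dtype=np.intp)
    reverse.put(sorter, np.arange(len(sorter)))  # reverse[old] -> new

    mask = labels < 0
    new_labels = reverse.take(np.where(mask, 0, labels))
    new_labels[mask] = -1

    print(uniques[sorter])  # [10 20 30]
    print(new_labels)       # [ 2  1  0  2 -1]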
def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
    mat = mgr.as_matrix()
    reindexed_mat = algos.take_nd(mat, indexer, axis,
                                  fill_value=fill_value)
    reindexed = mgr.reindex_indexer(new_labels, indexer, axis,
                                    fill_value=fill_value)
    tm.assert_numpy_array_equal(reindexed_mat, reindexed.as_matrix())
    tm.assert_index_equal(reindexed.axes[axis], new_labels)
def get_reindexed_values(self, empty_dtype, upcasted_na):
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            if getattr(self.block, 'is_object', False):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order='K')
                if len(values) and values[0] is None:
                    fill_value = None

            if getattr(self.block, 'is_datetimetz', False) or \
                    is_datetimetz(empty_dtype):
                pass
            elif getattr(self.block, 'is_categorical', False):
                pass
            elif getattr(self.block, 'is_sparse', False):
                pass
            else:
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if not self.indexers:
        if not self.block._can_consolidate:
            # preserve these for validation in _concat_compat
            return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.get_values()

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax,
                                   fill_value=fill_value)

    return values
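# A small numpy-only sketch (my illustration, not pandas source) of the
# `values.view()` idiom above: returning a view lets the caller detect, via
# `retval.base is not None`, that the array aliases existing data and must
# be copied before any mutation.
import numpy as np

def demo_view_signaling():
    original = np.arange(6)

    returned = original.view()
    if returned.base is not None:
        # the returned array shares memory with `original`
        returned = returned.copy()

    returned[0] = 99
    print(original[0])  # 0 -- the copy protected the original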
def assert_reindex_axis_is_ok(mgr, axis, new_labels, fill_value):
    mat = mgr.as_matrix()
    indexer = mgr.axes[axis].get_indexer_for(new_labels)

    reindexed = mgr.reindex_axis(new_labels, axis,
                                 fill_value=fill_value)
    tm.assert_numpy_array_equal(algos.take_nd(mat, indexer, axis,
                                              fill_value=fill_value),
                                reindexed.as_matrix(),
                                check_dtype=False)
    tm.assert_index_equal(reindexed.axes[axis], new_labels)
def test_2d_with_out(self, dtype_can_hold_na, writeable):
    dtype, can_hold_na = dtype_can_hold_na

    data = np.random.randint(0, 2, (5, 3)).astype(dtype)
    data.flags.writeable = writeable

    indexer = [2, 1, 0, 1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)
    algos.take_nd(data, indexer, out=out0, axis=0)
    algos.take_nd(data, indexer, out=out1, axis=1)

    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)

    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 3), dtype=dtype)
    out1 = np.empty((5, 4), dtype=dtype)

    if can_hold_na:
        algos.take_nd(data, indexer, out=out0, axis=0)
        algos.take_nd(data, indexer, out=out1, axis=1)

        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected0[3, :] = np.nan
        expected1[:, 3] = np.nan

        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
    else:
        for i, out in enumerate([out0, out1]):
            with pytest.raises(TypeError, match=self.fill_error):
                algos.take_nd(data, indexer, out=out, axis=i)

            # No Exception otherwise.
            data.take(indexer, out=out, axis=i)
def _make_sorted_values_labels(self):
    v = self.level

    labs = list(self.index.labels)
    levs = list(self.index.levels)
    to_sort = labs[:v] + labs[v + 1:] + [labs[v]]
    sizes = [len(x) for x in levs[:v] + levs[v + 1:] + [levs[v]]]

    comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
    ngroups = len(obs_ids)

    indexer = _algos.groupsort_indexer(comp_index, ngroups)[0]
    indexer = _ensure_platform_int(indexer)

    self.sorted_values = algos.take_nd(self.values, indexer, axis=0)
    self.sorted_labels = [l.take(indexer) for l in to_sort]
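# A rough numpy-only equivalent (my sketch, not the Cython implementation)
# of what groupsort_indexer computes above: a stable permutation that groups
# rows by their group id, so take_nd can then gather rows group-by-group.
import numpy as np

def demo_groupsort_indexer():
    group_ids = np.array([2, 0, 1, 0, 2, 1])

    # kind='stable' preserves the original order within each group,
    # matching the counting-sort behavior of groupsort_indexer
    indexer = np.argsort(group_ids, kind='stable')

    print(indexer)             # [1 3 2 5 0 4]
    print(group_ids[indexer])  # [0 0 1 1 2 2]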
def _aggregate_series_fast(self, obj, func):
    func = self._is_builtin_func(func)

    if obj.index._has_complex_internals:
        raise TypeError('Incompatible index for Cython grouper')

    group_index, _, ngroups = self.group_info

    # avoids object / Series creation overhead
    dummy = obj._get_values(slice(None, 0)).to_dense()
    indexer = get_group_index_sorter(group_index, ngroups)
    obj = obj._take(indexer).to_dense()
    group_index = algorithms.take_nd(group_index, indexer,
                                     allow_fill=False)
    grouper = reduction.SeriesGrouper(obj, func, group_index, ngroups,
                                      dummy)
    result, counts = grouper.get_result()
    return result, counts
def _map_values(self, mapper, na_action=None):
    """
    An internal function that maps values using the input
    correspondence (which can be a dict, Series, or function).

    Parameters
    ----------
    mapper : function, dict, or Series
        The input correspondence object
    na_action : {None, 'ignore'}
        If 'ignore', propagate NA values, without passing them to the
        mapping function

    Returns
    -------
    Union[Index, MultiIndex], inferred
        The output of the mapping function applied to the index.
        If the function returns a tuple with more than one element
        a MultiIndex will be returned.
    """
    # we can fastpath dict/Series to an efficient map
    # as we know that we are not going to have to yield
    # python types
    if is_dict_like(mapper):
        if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
            # If a dictionary subclass defines a default value method,
            # convert mapper to a lookup function (GH #15999).
            dict_with_default = mapper
            mapper = lambda x: dict_with_default[x]
        else:
            # Dictionary does not have a default. Thus it's safe to
            # convert to a Series for efficiency.
            # we specify the keys here to handle the
            # possibility that they are tuples

            # The return value of mapping with an empty mapper is
            # expected to be pd.Series(np.nan, ...). As np.nan is
            # of dtype float64 the return value of this method should
            # be float64 as well
            mapper = create_series_with_explicit_dtype(
                mapper, dtype_if_empty=np.float64
            )

    if isinstance(mapper, ABCSeries):
        # Since values were input this means we came from either
        # a dict or a series and mapper should be an index
        if is_categorical_dtype(self.dtype):
            # use the built in categorical series mapper which saves
            # time by mapping the categories instead of all values
            cat = cast("Categorical", self._values)
            return cat.map(mapper)

        values = self._values

        indexer = mapper.index.get_indexer(values)
        new_values = algorithms.take_nd(mapper._values, indexer)

        return new_values

    # we must convert to python types
    if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
        # GH#23179 some EAs do not have `map`
        values = self._values
        if na_action is not None:
            raise NotImplementedError
        map_f = lambda values, f: values.map(f)
    else:
        values = self._values.astype(object)
        if na_action == "ignore":
            map_f = lambda values, f: lib.map_infer_mask(
                values, f, isna(values).view(np.uint8)
            )
        elif na_action is None:
            map_f = lib.map_infer
        else:
            msg = ("na_action must either be 'ignore' or None, "
                   f"{na_action} was passed")
            raise ValueError(msg)

    # mapper is a function
    new_values = map_f(values, mapper)

    return new_values
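# A minimal sketch (my illustration using public pandas API, not the
# internals verbatim) of the Series fastpath above: align the values against
# the mapper's index, then gather, letting -1 indexer slots become NaN.
import numpy as np
import pandas as pd

def demo_series_map_fastpath():
    values = np.array(['a', 'b', 'c', 'b'], dtype=object)
    mapper = pd.Series([1.0, 2.0], index=['a', 'b'])

    indexer = mapper.index.get_indexer(values)  # [ 0  1 -1  1]
    new_values = pd.api.extensions.take(
        mapper.to_numpy(), indexer, allow_fill=True, fill_value=np.nan
    )
    print(new_values)  # [1. 2. nan 2.]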
def _test_dtype(dtype, can_hold_na):
    data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)

    indexer = [2, 1, 0, 1]
    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)
    algos.take_nd(data, indexer, out=out0, axis=0)
    algos.take_nd(data, indexer, out=out1, axis=1)
    algos.take_nd(data, indexer, out=out2, axis=2)

    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    expected2 = data.take(indexer, axis=2)
    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)
    tm.assert_almost_equal(out2, expected2)

    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)
    if can_hold_na:
        algos.take_nd(data, indexer, out=out0, axis=0)
        algos.take_nd(data, indexer, out=out1, axis=1)
        algos.take_nd(data, indexer, out=out2, axis=2)

        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected2 = data.take(indexer, axis=2)
        expected0[3, :, :] = np.nan
        expected1[:, 3, :] = np.nan
        expected2[:, :, 3] = np.nan

        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
        tm.assert_almost_equal(out2, expected2)
    else:
        for i, out in enumerate([out0, out1, out2]):
            with tm.assertRaisesRegexp(TypeError, self.fill_error):
                algos.take_nd(data, indexer, out=out, axis=i)

            # No exception otherwise.
            data.take(indexer, out=out, axis=i)
def test_2d_datetime64(self):
    # 2005/01/01 - 2006/01/01
    arr = np.random.randint(long(11045376), long(11360736),
                            (5, 3)) * 100000000000
    arr = arr.view(dtype='datetime64[ns]')
    indexer = [0, 2, -1, 1, -1]

    # axis=0
    result = algos.take_nd(arr, indexer, axis=0)
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, axis=0, out=result2)
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=0)
    expected.view(np.int64)[[2, 4], :] = iNaT
    tm.assert_almost_equal(result, expected)

    result = algos.take_nd(arr, indexer, axis=0,
                           fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, out=result2, axis=0,
                  fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=0)
    expected[[2, 4], :] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)

    # axis=1
    result = algos.take_nd(arr, indexer, axis=1)
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, axis=1, out=result2)
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=1)
    expected.view(np.int64)[:, [2, 4]] = iNaT
    tm.assert_almost_equal(result, expected)

    result = algos.take_nd(arr, indexer, axis=1,
                           fill_value=datetime(2007, 1, 1))
    result2 = np.empty_like(result)
    algos.take_nd(arr, indexer, out=result2, axis=1,
                  fill_value=datetime(2007, 1, 1))
    tm.assert_almost_equal(result, result2)

    expected = arr.take(indexer, axis=1)
    expected[:, [2, 4]] = datetime(2007, 1, 1)
    tm.assert_almost_equal(result, expected)
def test_3d_with_out(self, dtype_can_hold_na):
    dtype, can_hold_na = dtype_can_hold_na

    data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)
    indexer = [2, 1, 0, 1]

    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)

    algos.take_nd(data, indexer, out=out0, axis=0)
    algos.take_nd(data, indexer, out=out1, axis=1)
    algos.take_nd(data, indexer, out=out2, axis=2)

    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    expected2 = data.take(indexer, axis=2)

    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)
    tm.assert_almost_equal(out2, expected2)

    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)

    if can_hold_na:
        algos.take_nd(data, indexer, out=out0, axis=0)
        algos.take_nd(data, indexer, out=out1, axis=1)
        algos.take_nd(data, indexer, out=out2, axis=2)

        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected2 = data.take(indexer, axis=2)

        expected0[3, :, :] = np.nan
        expected1[:, 3, :] = np.nan
        expected2[:, :, 3] = np.nan

        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
        tm.assert_almost_equal(out2, expected2)
    else:
        for i, out in enumerate([out0, out1, out2]):
            with tm.assert_raises_regex(TypeError, self.fill_error):
                algos.take_nd(data, indexer, out=out, axis=i)

            # No Exception otherwise.
            data.take(indexer, out=out, axis=i)
def parallel_take1d():
    take_nd(df["col"].values, indexer)
def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: Dict[int, np.ndarray]):
    """
    Construct concatenation plan for given block manager and indexers.

    Parameters
    ----------
    mgr : BlockManager
    indexers : dict of {axis: indexer}

    Returns
    -------
    plan : list of (BlockPlacement, JoinUnit) tuples
    """
    # Calculate post-reindex shape, save for item axis which will be separate
    # for each block anyway.
    mgr_shape_list = list(mgr.shape)
    for ax, indexer in indexers.items():
        mgr_shape_list[ax] = len(indexer)
    mgr_shape = tuple(mgr_shape_list)

    if 0 in indexers:
        ax0_indexer = indexers.pop(0)
        blknos = algos.take_nd(mgr.blknos, ax0_indexer, fill_value=-1)
        blklocs = algos.take_nd(mgr.blklocs, ax0_indexer, fill_value=-1)
    else:
        if mgr.is_single_block:
            blk = mgr.blocks[0]
            return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))]

        ax0_indexer = None
        blknos = mgr.blknos
        blklocs = mgr.blklocs

    plan = []
    for blkno, placements in libinternals.get_blkno_placements(blknos, group=False):

        assert placements.is_slice_like

        join_unit_indexers = indexers.copy()

        shape_list = list(mgr_shape)
        shape_list[0] = len(placements)
        shape = tuple(shape_list)

        if blkno == -1:
            unit = JoinUnit(None, shape)
        else:
            blk = mgr.blocks[blkno]
            ax0_blk_indexer = blklocs[placements.indexer]

            unit_no_ax0_reindexing = (
                len(placements) == len(blk.mgr_locs)
                and
                # Fastpath detection of join unit not needing to reindex its
                # block: no ax0 reindexing took place and block placement was
                # sequential before.
                (
                    (
                        ax0_indexer is None
                        and blk.mgr_locs.is_slice_like
                        and blk.mgr_locs.as_slice.step == 1
                    )
                    or
                    # Slow-ish detection: all indexer locs are sequential
                    # (and length match is checked above).
                    (np.diff(ax0_blk_indexer) == 1).all()
                )
            )

            # Omit indexer if no item reindexing is required.
            if unit_no_ax0_reindexing:
                join_unit_indexers.pop(0, None)
            else:
                join_unit_indexers[0] = ax0_blk_indexer

            unit = JoinUnit(blk, shape, join_unit_indexers)

        plan.append((placements, unit))

    return plan
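# A tiny numpy sketch (my illustration, not pandas source) of the "slow-ish
# detection" above: an indexer is sequential exactly when its consecutive
# differences are all 1, which means the take is a no-op reindexing that can
# be omitted from the join unit.
import numpy as np

def is_sequential(indexer: np.ndarray) -> bool:
    # an empty or length-1 indexer is trivially sequential
    return bool((np.diff(indexer) == 1).all())

print(is_sequential(np.array([3, 4, 5, 6])))  # True
print(is_sequential(np.array([3, 5, 4, 6])))  # False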
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            if getattr(self.block, "is_object", False):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype(
                empty_dtype
            ):
                if self.block is None:
                    # TODO(EA2D): special case unneeded with 2D EAs
                    return DatetimeArray(
                        np.full(self.shape[1], fill_value.value), dtype=empty_dtype
                    )
            elif getattr(self.block, "is_categorical", False):
                pass
            elif getattr(self.block, "is_extension", False):
                pass
            elif is_extension_array_dtype(empty_dtype):
                missing_arr = empty_dtype.construct_array_type()._from_sequence(
                    [], dtype=empty_dtype
                )
                ncols, nrows = self.shape
                assert ncols == 1, ncols
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value)

    return values
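# A short sketch (my example, public ExtensionArray API only) of the
# all-missing array construction used above: take with an all -1 indexer and
# allow_fill=True materializes n fill values from an empty array.
import pandas as pd

def demo_all_missing_ea(n: int = 4):
    empty = pd.array([], dtype="Int64")  # empty ExtensionArray
    indexer = [-1] * n                   # every position becomes a fill slot
    missing = empty.take(indexer, allow_fill=True, fill_value=pd.NA)
    print(missing)  # [<NA>, <NA>, <NA>, <NA>] with dtype Int64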
def _bins_to_cuts(
    x,
    bins,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
    ordered: bool = True,
):
    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_platform_int(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )
        elif labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest,
                dtype=dtype
            )
        elif ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False "
                "for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(
                labels,
                categories=labels if len(set(labels)) == len(labels) else None,
                ordered=ordered,
            )
        # TODO: handle mismatch between categorical label order and pandas.cut order.
        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_valid_na_for(empty_dtype):
            blk_dtype = getattr(self.block, "dtype", None)

            if blk_dtype == np.dtype("object"):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if is_datetime64tz_dtype(empty_dtype):
                i8values = np.full(self.shape, fill_value.value)
                return DatetimeArray(i8values, dtype=empty_dtype)

            elif is_extension_array_dtype(blk_dtype):
                pass

            elif is_1d_only_ea_dtype(empty_dtype):
                empty_dtype = cast(ExtensionDtype, empty_dtype)
                cls = empty_dtype.construct_array_type()

                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                assert ncols == 1, ncols
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                empty_dtype = cast(np.dtype, empty_dtype)

                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values
def _get_ilevel_values(index, level):
    # accept level number only
    unique = index.levels[level]
    level_codes = index.codes[level]
    filled = take_nd(unique._values, level_codes,
                     fill_value=unique._na_value)
    return unique._shallow_copy(filled, name=index.names[level])
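# A minimal sketch (my example, public pandas API only) of the
# codes-to-values expansion above: level codes index into the unique level
# values, and a code of -1 is filled with the level's NA value.
import numpy as np
import pandas as pd

def demo_codes_to_values():
    unique = np.array(['x', 'y', 'z'], dtype=object)
    codes = np.array([0, 2, -1, 1], dtype=np.intp)

    filled = pd.api.extensions.take(
        unique, codes, allow_fill=True, fill_value=np.nan
    )
    print(filled)  # ['x' 'z' nan 'y']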
def union_categoricals(
    to_union, sort_categories: bool = False, ignore_order: bool = False
):
    """
    Combine list-like of Categorical-like, unioning categories.

    All categories must have the same dtype.

    Parameters
    ----------
    to_union : list-like
        Categorical, CategoricalIndex, or Series with dtype='category'.
    sort_categories : bool, default False
        If true, resulting categories will be lexsorted, otherwise
        they will be ordered as they appear in the data.
    ignore_order : bool, default False
        If true, the ordered attribute of the Categoricals will be ignored.
        Results in an unordered categorical.

    Returns
    -------
    Categorical

    Raises
    ------
    TypeError
        - all inputs do not have the same dtype
        - all inputs do not have the same ordered property
        - all inputs are ordered and their categories are not identical
        - sort_categories=True and Categoricals are ordered
    ValueError
        Empty list of categoricals passed

    Notes
    -----
    To learn more about categories, see `link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#unioning>`__

    Examples
    --------
    >>> from pandas.api.types import union_categoricals

    If you want to combine categoricals that do not necessarily have
    the same categories, `union_categoricals` will combine a list-like
    of categoricals. The new categories will be the union of the
    categories being combined.

    >>> a = pd.Categorical(["b", "c"])
    >>> b = pd.Categorical(["a", "b"])
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']

    By default, the resulting categories will be ordered as they appear
    in the `categories` of the data. If you want the categories to be
    lexsorted, use `sort_categories=True` argument.

    >>> union_categoricals([a, b], sort_categories=True)
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with the case of combining two
    categoricals of the same categories and order information (e.g. what
    you could also `append` for).

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "a"], ordered=True)
    >>> union_categoricals([a, b])
    ['a', 'b', 'a', 'b', 'a']
    Categories (2, object): ['a' < 'b']

    Raises `TypeError` because the categories are ordered and not identical.

    >>> a = pd.Categorical(["a", "b"], ordered=True)
    >>> b = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> union_categoricals([a, b])
    Traceback (most recent call last):
        ...
    TypeError: to union ordered Categoricals, all categories must be the same

    New in version 0.20.0

    Ordered categoricals with different categories or orderings can be
    combined by using the `ignore_order=True` argument.
    >>> a = pd.Categorical(["a", "b", "c"], ordered=True)
    >>> b = pd.Categorical(["c", "b", "a"], ordered=True)
    >>> union_categoricals([a, b], ignore_order=True)
    ['a', 'b', 'c', 'c', 'b', 'a']
    Categories (3, object): ['a', 'b', 'c']

    `union_categoricals` also works with a `CategoricalIndex`, or `Series`
    containing categorical data, but note that the resulting array will
    always be a plain `Categorical`

    >>> a = pd.Series(["b", "c"], dtype='category')
    >>> b = pd.Series(["a", "b"], dtype='category')
    >>> union_categoricals([a, b])
    ['b', 'c', 'a', 'b']
    Categories (3, object): ['b', 'c', 'a']
    """
    from pandas import Categorical
    from pandas.core.arrays.categorical import recode_for_categories

    if len(to_union) == 0:
        raise ValueError("No Categoricals to union")

    def _maybe_unwrap(x):
        if isinstance(x, (ABCCategoricalIndex, ABCSeries)):
            return x._values
        elif isinstance(x, Categorical):
            return x
        else:
            raise TypeError("all components to combine must be Categorical")

    to_union = [_maybe_unwrap(x) for x in to_union]
    first = to_union[0]

    if not all(
        is_dtype_equal(other.categories.dtype, first.categories.dtype)
        for other in to_union[1:]
    ):
        raise TypeError("dtype of categories must be the same")

    ordered = False
    if all(first._categories_match_up_to_permutation(other) for other in to_union[1:]):
        # identical categories - fastpath
        categories = first.categories
        ordered = first.ordered

        all_codes = [first._encode_with_my_categories(x)._codes for x in to_union]
        new_codes = np.concatenate(all_codes)

        if sort_categories and not ignore_order and ordered:
            raise TypeError(
                "Cannot use sort_categories=True with ordered Categoricals"
            )

        if sort_categories and not categories.is_monotonic_increasing:
            categories = categories.sort_values()
            indexer = categories.get_indexer(first.categories)

            from pandas.core.algorithms import take_nd

            new_codes = take_nd(indexer, new_codes, fill_value=-1)
    elif ignore_order or all(not c.ordered for c in to_union):
        # different categories - union and recode
        cats = first.categories.append([c.categories for c in to_union[1:]])
        categories = cats.unique()
        if sort_categories:
            categories = categories.sort_values()

        new_codes = [
            recode_for_categories(c.codes, c.categories, categories)
            for c in to_union
        ]
        new_codes = np.concatenate(new_codes)
    else:
        # ordered - to show a proper error message
        if all(c.ordered for c in to_union):
            msg = "to union ordered Categoricals, all categories must be the same"
            raise TypeError(msg)
        else:
            raise TypeError("Categorical.ordered must be the same")

    if ignore_order:
        ordered = False

    return Categorical(
        new_codes, categories=categories, ordered=ordered, fastpath=True
    )
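# A small sketch (my illustration, public API only) of the recoding step in
# the "different categories" branch above: each Categorical's codes are
# remapped by locating its categories inside the union's categories. Note
# that real recode_for_categories also handles -1 (NaN) codes, which would
# need masking here since plain ndarray.take wraps -1 to the last position.
import pandas as pd

def demo_recode_to_union():
    a = pd.Categorical(["b", "c"])          # categories ['b', 'c']
    union_cats = pd.Index(["a", "b", "c"])  # union of all categories

    # old code -> new code, via the position of each old category in the union
    old_to_new = union_cats.get_indexer(a.categories)  # [1, 2]
    new_codes = old_to_new.take(a.codes)
    print(new_codes)  # [1 2]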
def slabels(self):
    # Sorted labels
    return algorithms.take_nd(self.labels, self.sort_idx,
                              allow_fill=False)
def get_reindexed_values(self, empty_dtype, upcasted_na):
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            if getattr(self.block, 'is_object', False):
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order='K')
                if len(values) and values[0] is None:
                    fill_value = None

            if (getattr(self.block, 'is_datetimetz', False) or
                    is_datetime64tz_dtype(empty_dtype)):
                if self.block is None:
                    array = empty_dtype.construct_array_type()
                    missing_arr = array([fill_value], dtype=empty_dtype)
                    return missing_arr.repeat(self.shape[1])
            elif getattr(self.block, 'is_categorical', False):
                pass
            elif getattr(self.block, 'is_sparse', False):
                pass
            else:
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if not self.indexers:
        if not self.block._can_consolidate:
            # preserve these for validation in _concat_compat
            return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.get_values()

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax,
                                   fill_value=fill_value)

    return values
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    if upcasted_na is None:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_valid_na_for(empty_dtype):
            blk_dtype = getattr(self.block, "dtype", None)

            # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
            # "object"
            if blk_dtype == np.dtype(object):  # type: ignore[type-var]
                # we want to avoid filling with np.nan if we are
                # using None; we already know that we are all
                # nulls
                values = self.block.values.ravel(order="K")
                if len(values) and values[0] is None:
                    fill_value = None

            if is_datetime64tz_dtype(empty_dtype):
                # TODO(EA2D): special case unneeded with 2D EAs
                i8values = np.full(self.shape[1], fill_value.value)
                # error: Incompatible return value type (got "DatetimeArray",
                # expected "ndarray")
                return DatetimeArray(  # type: ignore[return-value]
                    i8values, dtype=empty_dtype
                )
            elif is_extension_array_dtype(blk_dtype):
                pass
            elif is_extension_array_dtype(empty_dtype):
                # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]"
                # has no attribute "construct_array_type"
                cls = empty_dtype.construct_array_type()  # type: ignore[union-attr]

                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                assert ncols == 1, ncols
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish

                # error: Argument "dtype" to "empty" has incompatible type
                # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any],
                # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
                # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any,
                # Any]]]"
                missing_arr = np.empty(
                    self.shape, dtype=empty_dtype  # type: ignore[arg-type]
                )
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool and not self.block.is_categorical:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    elif self.block.is_extension:
        values = self.block.values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values
def test_3d_with_out(self, dtype_can_hold_na):
    dtype, can_hold_na = dtype_can_hold_na

    data = np.random.randint(0, 2, (5, 4, 3)).astype(dtype)
    indexer = [2, 1, 0, 1]

    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)

    algos.take_nd(data, indexer, out=out0, axis=0)
    algos.take_nd(data, indexer, out=out1, axis=1)
    algos.take_nd(data, indexer, out=out2, axis=2)

    expected0 = data.take(indexer, axis=0)
    expected1 = data.take(indexer, axis=1)
    expected2 = data.take(indexer, axis=2)

    tm.assert_almost_equal(out0, expected0)
    tm.assert_almost_equal(out1, expected1)
    tm.assert_almost_equal(out2, expected2)

    indexer = [2, 1, 0, -1]
    out0 = np.empty((4, 4, 3), dtype=dtype)
    out1 = np.empty((5, 4, 3), dtype=dtype)
    out2 = np.empty((5, 4, 4), dtype=dtype)

    if can_hold_na:
        algos.take_nd(data, indexer, out=out0, axis=0)
        algos.take_nd(data, indexer, out=out1, axis=1)
        algos.take_nd(data, indexer, out=out2, axis=2)

        expected0 = data.take(indexer, axis=0)
        expected1 = data.take(indexer, axis=1)
        expected2 = data.take(indexer, axis=2)

        expected0[3, :, :] = np.nan
        expected1[:, 3, :] = np.nan
        expected2[:, :, 3] = np.nan

        tm.assert_almost_equal(out0, expected0)
        tm.assert_almost_equal(out1, expected1)
        tm.assert_almost_equal(out2, expected2)
    else:
        for i, out in enumerate([out0, out1, out2]):
            with pytest.raises(TypeError, match=self.fill_error):
                algos.take_nd(data, indexer, out=out, axis=i)

            # No Exception otherwise.
            data.take(indexer, out=out, axis=i)
def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
    indexer, _ = self._indexer_and_to_sort

    sorted_values = algos.take_nd(values, indexer, axis=0)

    return sorted_values
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike:
    values: ArrayLike

    if upcasted_na is None and not self.is_na:
        # No upcasting is necessary
        fill_value = self.block.fill_value
        values = self.block.get_values()
    else:
        fill_value = upcasted_na

        if self.is_na:
            if is_datetime64tz_dtype(empty_dtype):
                i8values = np.full(self.shape, fill_value.value)
                return DatetimeArray(i8values, dtype=empty_dtype)

            elif is_1d_only_ea_dtype(empty_dtype):
                empty_dtype = cast(ExtensionDtype, empty_dtype)
                cls = empty_dtype.construct_array_type()

                missing_arr = cls._from_sequence([], dtype=empty_dtype)
                ncols, nrows = self.shape
                assert ncols == 1, ncols
                empty_arr = -1 * np.ones((nrows,), dtype=np.intp)
                return missing_arr.take(
                    empty_arr, allow_fill=True, fill_value=fill_value
                )
            elif isinstance(empty_dtype, ExtensionDtype):
                # TODO: no tests get here, a handful would if we disabled
                #  the dt64tz special-case above (which is faster)
                cls = empty_dtype.construct_array_type()
                missing_arr = cls._empty(shape=self.shape, dtype=empty_dtype)
                missing_arr[:] = fill_value
                return missing_arr
            else:
                # NB: we should never get here with empty_dtype integer or bool;
                # if we did, the missing_arr.fill would cast to gibberish
                missing_arr = np.empty(self.shape, dtype=empty_dtype)
                missing_arr.fill(fill_value)
                return missing_arr

    if (not self.indexers) and (not self.block._can_consolidate):
        # preserve these for validation in concat_compat
        return self.block.values

    if self.block.is_bool:
        # External code requested filling/upcasting, bool values must
        # be upcasted to object to avoid being upcasted to numeric.
        values = self.block.astype(np.object_).values
    else:
        # No dtype upcasting is done here, it will be performed during
        # concatenation itself.
        values = self.block.values

    if not self.indexers:
        # If there's no indexing to be done, we want to signal outside
        # code that this array must be copied explicitly. This is done
        # by returning a view and checking `retval.base`.
        values = values.view()
    else:
        for ax, indexer in self.indexers.items():
            values = algos.take_nd(values, indexer, axis=ax)

    return values
def _bins_to_cuts(
    x,
    bins,
    right=True,
    labels=None,
    precision=3,
    include_lowest=False,
    dtype=None,
    duplicates="raise",
):
    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, "
            "valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                "Bin edges must be unique: {bins!r}.\nYou "
                "can drop duplicate edges by setting "
                "the 'duplicates' kwarg".format(bins=bins)
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest,
                dtype=dtype
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)
    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
def slabels(self) -> np.ndarray:
    # np.ndarray[np.intp]
    # Sorted labels
    return algorithms.take_nd(self.labels, self._sort_idx, allow_fill=False)