Exemplo n.º 1
0
    def take(self,
             indices: Sequence[int],
             allow_fill: bool = False,
             fill_value: Any = None) -> "TensorArray":
        """
        Take elements along the first axis of the wrapped tensor.

        See docstring in :class:`ExtensionArray` class in ``pandas/core/arrays/base.py``
        for the full contract of this method.

        Parameters
        ----------
        indices : sequence of int
            Positions along the first axis of ``self._tensor`` to take.
        allow_fill : bool, default False
            When True, negative entries of `indices` mark missing elements
            that are populated with `fill_value`. When False, `indices` is
            handed to ``ndarray.take`` unchanged.
        fill_value : any, optional
            Value used for missing elements; ``np.nan`` is used when None.

        Returns
        -------
        TensorArray
            A new array with one element per entry of `indices`.
        """
        if allow_fill:
            # From API docs: "[If allow_fill == True, then] negative values in
            # `indices` indicate missing values and are set to `fill_value`".
            indices = np.asarray(indices, dtype=np.intp)
            validate_indices(indices, len(self._tensor))

            # Only build a filled array when something is actually missing;
            # otherwise the plain NumPy take below is sufficient.
            is_missing = indices < 0
            if np.any(is_missing):
                if fill_value is None:
                    fill_value = np.nan
                # Start from an array populated with the fill value ...
                values = np.full((len(indices), ) + self._tensor.shape[1:],
                                 fill_value)

                # ... and copy every non-missing element in one vectorized
                # assignment instead of a Python-level loop over the indices.
                is_present = ~is_missing
                values[is_present] = self._tensor[indices[is_present]]
                return TensorArray(values)

        # Delegate take to numpy array
        values = self._tensor.take(indices, axis=0)

        return TensorArray(values)
Exemplo n.º 2
0
    def take(self,
             indices: Sequence[int],
             allow_fill: bool = False,
             fill_value: Any = None) -> "TensorArray":
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of int
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.

            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.

            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any
              other negative values raise a ``ValueError``.

        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            This may be ``None``, in which case this implementation fills
            with ``np.nan``.

            For many ExtensionArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        TensorArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        See Also
        --------
        numpy.take : Take elements from an array along an axis.
        api.extensions.take : Take elements from an array.

        Notes
        -----
        ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.

        Examples
        --------
        Here's an example implementation, which relies on casting the
        extension array to object dtype. This uses the helper method
        :func:`pandas.api.extensions.take`.

        .. code-block:: python

           def take(self, indices, allow_fill=False, fill_value=None):
               from pandas.core.algorithms import take

               # If the ExtensionArray is backed by an ndarray, then
               # just pass that here instead of coercing to object.
               data = self.astype(object)

               if allow_fill and fill_value is None:
                   fill_value = self.dtype.na_value

               # fill value should always be translated from the scalar
               # type for the array, to the physical storage type for
               # the data, before passing to take.

               result = take(data, indices, fill_value=fill_value,
                             allow_fill=allow_fill)
               return self._from_sequence(result, dtype=self.dtype)
        """
        if allow_fill:
            # With allow_fill being True, negative values in `indices` indicate
            # missing values and should be set to `fill_value`.
            indices = np.asarray(indices, dtype=np.intp)
            validate_indices(indices, len(self._tensor))

            # Check if there are missing indices to fill, otherwise we can
            # delegate to NumPy ndarray .take().
            has_missing = np.any(indices < 0)
            if has_missing:
                if fill_value is None:
                    fill_value = np.nan

                # Create an array populated with fill value.
                values = np.full((len(indices), ) + self._tensor.shape[1:],
                                 fill_value)

                # Copy the tensors at the non-negative indices into place.
                # Boolean-mask assignment works row-wise along the first
                # axis; np.put must not be used here because it addresses
                # the *flattened* target and would scatter the data into
                # the wrong positions for tensors with more than one
                # dimension.
                is_nonneg = indices >= 0
                values[is_nonneg] = self._tensor[indices[is_nonneg]]

                return TensorArray(values)

        # Delegate take to NumPy array.
        values = self._tensor.take(indices, axis=0)

        return TensorArray(values)
Exemplo n.º 3
0
    def _reindex_indexer(
        self: T,
        new_axis,
        indexer,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool = True,
        use_na_proxy: bool = False,
    ) -> T:
        """
        Return a manager reindexed along ``axis`` according to ``indexer``.

        Parameters
        ----------
        new_axis : Index
            Axis object stored at position ``axis`` of the result.
        indexer : ndarray of int64 or None
            pandas-indexer with -1's only: ``-1`` marks a missing position.
            ``None`` means only the axis object is swapped.
        axis : int
        fill_value : object, default None
            Forwarded to ``_make_na_array`` (axis 1) or ``take_1d``
            (other axes).
        allow_dups : bool, default False
            When False, ``_validate_can_reindex`` is called on the current
            axis before reindexing.
        copy : bool, default True
            Only consulted when ``indexer`` is None.
        use_na_proxy : bool, default False
            Forwarded to ``_make_na_array`` for missing positions on axis 1.

        Raises
        ------
        IndexError
            If ``axis`` is not smaller than ``self.ndim``.
        """
        if indexer is None:
            # Nothing to take: either hand back self or a copy carrying the
            # new axis object.
            if new_axis is self._axes[axis] and not copy:
                return self

            swapped = self.copy(deep=copy)
            swapped._axes = list(self._axes)
            swapped._axes[axis] = new_axis
            return swapped

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self._axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 1:
            # Pick whole arrays; -1 requests a freshly made NA array.
            new_arrays = [
                self._make_na_array(fill_value=fill_value,
                                    use_na_proxy=use_na_proxy)
                if position == -1 else self.arrays[position]
                for position in indexer
            ]
        else:
            validate_indices(indexer, len(self._axes[0]))
            indexer = ensure_platform_int(indexer)
            # Filling is only requested when at least one -1 is present.
            needs_fill = bool((indexer == -1).any())
            new_arrays = [
                take_1d(
                    values,
                    indexer,
                    allow_fill=needs_fill,
                    fill_value=fill_value,
                ) for values in self.arrays
            ]

        result_axes = list(self._axes)
        result_axes[axis] = new_axis

        return type(self)(new_arrays, result_axes, verify_integrity=False)
Exemplo n.º 4
0
    def take(self,
             indices: Sequence[int],
             allow_fill: bool = False,
             fill_value: Any = None):
        """
        Take elements from the array at the given positions.

        Parameters
        ----------
        indices : sequence of int
            Positions to take.
        allow_fill : bool, default False
            How negative values in `indices` are interpreted.

            * False: negative values count from the right, as in
              :func:`numpy.take`; they are shifted by the array length
              before the underlying take.
            * True: negative values mark missing entries. They are masked
              out of the underlying take and afterwards overwritten with
              `fill_value` unless `fill_value` is NA according to ``isna``.

        fill_value : any, optional
            Value written to the missing positions when `allow_fill` is
            True. When it is NA (per ``isna``), the result of the masked
            take is returned without further assignment.

        Returns
        -------
        An instance of ``type(self)`` wrapping the taken data.

        Raises
        ------
        IndexError
            When a non-negative index is requested from an empty array, or
            when the largest index is not smaller than the array length.

        See Also
        --------
        numpy.take
        api.extensions.take
        """
        # TODO: Remove once we got rid of the (indices < 0) check
        if is_array_like(indices):
            # error: Incompatible types in assignment (expression has type
            # "Sequence[int]", variable has type "ndarray")
            indices_array = indices  # type: ignore[assignment]
        else:
            indices_array = np.asanyarray(indices)

        n_values = len(self._data)
        if n_values == 0 and (indices_array >= 0).any():
            raise IndexError("cannot do a non-empty take")
        if indices_array.size > 0 and indices_array.max() >= n_values:
            raise IndexError("out of bounds value in 'indices'.")

        if not allow_fill:
            # TODO(ARROW-9432): Treat negative indices as indices from the right.
            from_right = indices_array < 0
            if from_right.any():
                # Don't modify in-place
                indices_array = np.copy(indices_array)
                indices_array[from_right] += n_values
            return type(self)(self._data.take(indices_array))

        missing = indices_array < 0
        if not missing.any():
            # Nothing to fill
            return type(self)(self._data.take(indices))

        validate_indices(indices_array, n_values)
        # TODO(ARROW-9433): Treat negative indices as NULL
        masked_indices = pa.array(indices_array, mask=missing)
        taken = self._data.take(masked_indices)
        if isna(fill_value):
            return type(self)(taken)

        # TODO: ArrowNotImplementedError: Function fill_null has no
        # kernel matching input types (array[string], scalar[string]),
        # so the fill value is assigned through the wrapper instead.
        filled = type(self)(taken)
        filled[missing] = fill_value
        return filled
Exemplo n.º 5
0
def test_validate_indices_empty():
    """Non-negative indices checked against a length of 0 raise IndexError."""
    indices = np.array([0, 1])
    expectation = pytest.raises(IndexError, match="indices are out")
    with expectation:
        validate_indices(indices, 0)
Exemplo n.º 6
0
def test_validate_indices_high():
    """An index equal to the checked length raises IndexError."""
    too_high = np.asarray([0, 1, 2])
    expectation = pytest.raises(IndexError, match="indices are out")
    with expectation:
        validate_indices(too_high, 2)
Exemplo n.º 7
0
def test_validate_indices_low():
    """A negative index below -1 raises ValueError."""
    too_low = np.asarray([0, -2])
    expectation = pytest.raises(ValueError, match="'indices' contains")
    with expectation:
        validate_indices(too_low, 2)
Exemplo n.º 8
0
def test_validate_indices_ok():
    """In-bounds, empty and all ``-1`` indexers pass validation silently."""
    in_bounds = np.asarray([0, 1])
    accepted = (
        (in_bounds, 2),
        (in_bounds[:0], 0),
        (np.array([-1, -1]), 0),
    )
    for candidate, length in accepted:
        validate_indices(candidate, length)
Exemplo n.º 9
0
    def _reindex_indexer(
        self: T,
        new_axis,
        indexer,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool = True,
    ) -> T:
        """
        Return a manager reindexed along ``axis`` according to ``indexer``.

        Parameters
        ----------
        new_axis : Index
            Axis object stored at position ``axis`` of the result.
        indexer : ndarray of int64 or None
            pandas-indexer with -1's only: ``-1`` marks a missing position.
            ``None`` means only the axis object is swapped.
        axis : int
        fill_value : object, default None
            Forwarded to ``_make_na_array`` (axis 1) or ``take_1d``
            (other axes).
        allow_dups : bool, default False
            When False, ``_validate_can_reindex`` is called on the current
            axis before reindexing.
        copy : bool, default True
            Only consulted when ``indexer`` is None.

        Raises
        ------
        IndexError
            If ``axis`` is not smaller than ``self.ndim``.
        """
        if indexer is None:
            # Nothing to take: either hand back self or a copy carrying the
            # new axis object.
            if new_axis is self._axes[axis] and not copy:
                return self

            swapped = self.copy(deep=copy)
            swapped._axes = list(self._axes)
            swapped._axes[axis] = new_axis
            return swapped

        # some axes don't allow reindexing with dups
        if not allow_dups:
            self._axes[axis]._validate_can_reindex(indexer)

        if axis >= self.ndim:
            raise IndexError("Requested axis not found in manager")

        if axis == 1:
            # Pick whole arrays; -1 requests a freshly made NA array.
            new_arrays = [
                self._make_na_array(fill_value=fill_value)
                if position == -1 else self.arrays[position]
                for position in indexer
            ]
        else:
            validate_indices(indexer, len(self._axes[0]))
            # Every stored array is taken with the same indexer, always with
            # filling enabled.
            new_arrays = [
                # error: Value of type variable "ArrayLike" of "take_1d" cannot be
                # "Union[ndarray, ExtensionArray]"  [type-var]
                take_1d(  # type: ignore[type-var]
                    values,
                    indexer,
                    allow_fill=True,
                    fill_value=fill_value,
                ) for values in self.arrays
            ]

        result_axes = list(self._axes)
        result_axes[axis] = new_axis

        return type(self)(new_arrays, result_axes, verify_integrity=False)
Exemplo n.º 10
0
    def take(self, indices, allow_fill=False, fill_value=None):
        # type: (Sequence[int] , bool, Optional[Any]) -> FletcherArray
        """
        Take elements from an array.

        Parameters
        ----------
        indices : sequence of integers
            Indices to be taken.
        allow_fill : bool, default False
            How to handle negative values in `indices`.
            * False: negative values in `indices` indicate positional indices
              from the right (the default). This is similar to
              :func:`numpy.take`.
            * True: negative values in `indices` indicate
              missing values. These values are set to `fill_value`. Any
              other negative values raise a ``ValueError``.
        fill_value : any, optional
            Fill value to use for NA-indices when `allow_fill` is True.
            A null `fill_value` (per ``pd.isnull``) is replaced by ``None``
            before it is appended to the data.
            For many FletcherArrays, there will be two representations of
            `fill_value`: a user-facing "boxed" scalar, and a low-level
            physical NA value. `fill_value` should be the user-facing version,
            and the implementation should handle translating that to the
            physical version for processing the take if necessary.

        Returns
        -------
        FletcherArray

        Raises
        ------
        IndexError
            When the indices are out of bounds for the array.
        ValueError
            When `indices` contains negative values other than ``-1``
            and `allow_fill` is True.

        Notes
        -----
        FletcherArray.take is called by ``Series.__getitem__``, ``.loc``,
        ``iloc``, when `indices` is a sequence of values. Additionally,
        it's called by :meth:`Series.reindex`, or any other method
        that causes realignment, with a `fill_value`.

        See Also
        --------
        numpy.take
        pandas.api.extensions.take
        """
        # This is the threshold to decide whether or not to concat everything
        # first. Benchmarks were made on string, int32, int64, float32,
        # float64 and it turns out that 0.3 is the value where it is best to
        # switch to concatenating everything first, both time-wise and
        # memory-wise.
        threshold_ratio = 0.3

        length = len(self)
        indices = np.asarray(indices, dtype=self._indices_dtype)
        has_negative_indices = np.any(indices < 0)  # type: ignore
        # Filling is only needed when there is actually something to fill.
        allow_fill &= has_negative_indices
        if allow_fill:
            validate_indices(indices, length)
        if (has_negative_indices
                and not allow_fill) or np.any(indices >= length  # type: ignore
                                              ):
            # this will raise IndexError expected by pandas in all needed cases
            indices = np.arange(length,
                                dtype=self._indices_dtype).take(indices)
        # here we guarantee that indices is numpy array of ints
        # and we have checked that all indices are between -1/0 and len(self)

        if not allow_fill:

            if self._has_single_chunk:
                if (self.dtype.is_list
                        and self.data.chunk(0).flatten().null_count == 0
                        and self.data.chunk(0).null_count == 0
                        and self.flatten().dtype._is_numeric):
                    return FletcherArray(
                        take_indices_on_pyarrow_list(self.data.chunk(0),
                                                     indices))
                else:
                    return FletcherArray(
                        self.data.chunk(0).take(pa.array(indices)))

            # ``np.int`` was only an alias of the builtin ``int`` and has
            # been removed from NumPy; use the builtin directly.
            lengths = np.fromiter(map(len, self.data.iterchunks()),
                                  dtype=int)
            cum_lengths = lengths.cumsum()

            bins = self._get_chunk_indexer(indices)

            # Turn the inclusive cumulative sum into per-chunk start offsets.
            cum_lengths -= lengths
            limits_idx = np.concatenate(
                [[0],
                 np.bincount(bins, minlength=self.data.num_chunks).cumsum()])

            # ``Series.is_monotonic`` was removed from pandas;
            # ``is_monotonic_increasing`` is the equivalent spelling.
            if pd.Series(bins).is_monotonic_increasing:
                del bins
                return self._take_on_chunks(indices,
                                            limits_idx=limits_idx,
                                            cum_lengths=cum_lengths)
            elif len(indices) / len(self) > threshold_ratio:
                # check which method is going to take less memory
                return self._take_on_concatenated_chunks(indices)
            else:
                sort_idx = get_group_index_sorter(bins, self.data.num_chunks)
                del bins
                indices = indices.take(sort_idx, out=indices)  # type: ignore
                sort_idx = np.argsort(sort_idx,
                                      kind="merge")  # inverse sort indices
                return self._take_on_chunks(
                    indices,
                    sort_idx=sort_idx,
                    limits_idx=limits_idx,
                    cum_lengths=cum_lengths,
                )

        else:
            if pd.isnull(fill_value):
                fill_value = None
            # Append the fill value as one extra trailing element and take
            # without filling: every -1 index then resolves, counted from
            # the right, to that appended element.
            return self._concat_same_type(
                [self, FletcherArray([fill_value],
                                     dtype=self.data.type)]).take(indices)