Пример #1
0
    def na_op(x, y):
        """
        Apply ``op`` to ``x`` and ``y``, falling back to slower paths when
        the direct call raises ``TypeError``.

        ``op`` is taken from the enclosing scope (not visible in this block).

        Fallbacks, in order:

        * a ``list`` ``y`` is converted to a 1d object array;
        * array-like ``y`` (ndarray / Series): retry ``op`` directly when both
          sides are bool dtype, otherwise cast both sides to object and use
          ``lib.vec_binop``;
        * any other ``y`` is treated as a scalar: non-null values are coerced
          with ``bool`` and ``lib.scalar_binop`` is used.

        Raises
        ------
        TypeError
            If the scalar fallback fails for any ordinary exception.
        """
        try:
            result = op(x, y)
        except TypeError:
            if isinstance(y, list):
                y = construct_1d_object_array_from_listlike(y)

            if isinstance(y, (np.ndarray, ABCSeries)):
                if (is_bool_dtype(x.dtype) and is_bool_dtype(y.dtype)):
                    result = op(x, y)  # when would this be hit?
                else:
                    x = _ensure_object(x)
                    y = _ensure_object(y)
                    result = lib.vec_binop(x, y, op)
            else:
                # let null fall thru
                if not isna(y):
                    y = bool(y)
                try:
                    result = lib.scalar_binop(x, y, op)
                except Exception:
                    # Only ordinary errors are translated; a bare ``except``
                    # would also turn KeyboardInterrupt / SystemExit into
                    # a TypeError.
                    msg = ("cannot compare a dtyped [{dtype}] array "
                           "with a scalar of type [{type}]"
                           ).format(dtype=x.dtype, type=type(y).__name__)
                    raise TypeError(msg)

        return result
Пример #2
0
def test_is_bool_dtype():
    """Check ``com.is_bool_dtype`` on types, arrays, Series and Index."""
    # Non-boolean inputs must be rejected.
    assert not com.is_bool_dtype(int)
    assert not com.is_bool_dtype(str)
    assert not com.is_bool_dtype(pd.Series([1, 2]))
    assert not com.is_bool_dtype(np.array(['a', 'b']))
    assert not com.is_bool_dtype(pd.Index(['a', 'b']))

    # Boolean inputs must be accepted.
    assert com.is_bool_dtype(bool)
    # ``np.bool`` was a deprecated alias that is missing from NumPy 1.24-1.26;
    # ``np.bool_`` is the NumPy boolean scalar type on every version.
    assert com.is_bool_dtype(np.bool_)
    assert com.is_bool_dtype(np.array([True, False]))
    assert com.is_bool_dtype(pd.Index([True, False]))
Пример #3
0
    def _evaluate_compare(self, other, op):
        """
        Compare ``self`` with ``other`` through their ``asi8`` values.

        ``other`` is first coerced to ``type(self)`` when it is not already
        an instance of it. Positions where either side is flagged by
        ``_isnan`` are overwritten in the result: with ``False`` for a
        boolean result, with ``iNaT`` otherwise.
        """
        # Called by comparison methods when comparing datetimelike
        # with datetimelike
        own_type = type(self)

        if not isinstance(other, own_type):
            # wrap scalars so the constructor receives a list-like
            if not is_list_like(other):
                other = [other]
            elif lib.is_scalar(lib.item_from_zerodim(other)):
                # zero-dimensional ndarray holding a scalar
                other = [other.item()]
            other = own_type(other)

        result = op(self.asi8, other.asi8)

        # technically we could support bool dtyped Index
        # for now just return the indexing array directly
        null_positions = (self._isnan) | (other._isnan)

        replacement = False if is_bool_dtype(result) else iNaT
        result[null_positions] = replacement
        return result
Пример #4
0
def _isfinite(values):
    """
    Return a boolean mask of the entries of ``values`` that are NOT finite.

    Datetime/timedelta input is delegated to ``isna``. Complex, float,
    integer and bool input is passed to ``np.isfinite`` as-is; anything
    else is cast to float64 first. Note that the ``np.isfinite`` result is
    inverted before it is returned.
    """
    if is_datetime_or_timedelta_dtype(values):
        return isna(values)

    numeric_as_is = (is_complex_dtype(values) or is_float_dtype(values) or
                     is_integer_dtype(values) or is_bool_dtype(values))
    candidate = values if numeric_as_is else values.astype('float64')
    return ~np.isfinite(candidate)
Пример #5
0
    def __array_wrap__(self, result, context=None):
        """
        Wrap the output of a NumPy ufunc applied to this index.

        * ``np.add`` gets no special treatment.
        * ``np.subtract`` returns a plain ``Index`` when both operands are
          ``PeriodIndex`` (name kept only if both names match) or when either
          operand is a ``Period``.
        * Any other ufunc whose ``types`` lacks ``'M->M'`` is rejected with
          ``ValueError``.

        Boolean results are returned unchanged; everything else is rebuilt
        as ``type(self)`` with this index's ``freq`` and ``name``.

        Replace this to __numpy_ufunc__ in future version
        """
        if isinstance(context, tuple) and len(context) > 0:
            ufunc = context[0]
            if ufunc is np.subtract:
                name = self.name
                lhs = context[1][0]
                rhs = context[1][1]
                if (isinstance(lhs, PeriodIndex) and
                        isinstance(rhs, PeriodIndex)):
                    if lhs.name != rhs.name:
                        name = None
                    else:
                        name = lhs.name
                    return Index(result, name=name)
                if isinstance(lhs, Period) or isinstance(rhs, Period):
                    return Index(result, name=name)
            elif ufunc is not np.add and isinstance(ufunc, np.ufunc):
                if 'M->M' not in ufunc.types:
                    # This should be TypeError, but TypeError cannot be raised
                    # from here because numpy catches.
                    template = "ufunc '{0}' not supported for the PeriodIndex"
                    raise ValueError(template.format(ufunc.__name__))

        if is_bool_dtype(result):
            return result
        # the result is object dtype array of Period
        # cannot pass _simple_new as it is
        return type(self)(result, freq=self.freq, name=self.name)
Пример #6
0
def nanall(values, axis=None, skipna=True, mask=None):
    """
    Report whether every element along ``axis`` is truthy.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        Precomputed nan-mask, if one is available.

    Returns
    -------
    result : bool

    Examples
    --------
    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> import pandas.core.nanops as nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    # A boolean array is assumed to hold no NaNs, so with no explicit mask
    # there is nothing to skip.
    if is_bool_dtype(values.dtype) and mask is None:
        skipna = False

    prepared, _, _, _, _ = _get_values(
        values, skipna, True, copy=skipna, mask=mask, compute_mask=False)
    return prepared.all(axis)
Пример #7
0
    def wrapper(self, other):
        """
        Compare this TimedeltaIndex with ``other`` using the parent-class
        comparison named by ``opname`` (from the enclosing scope).

        Scalars convertible to a timedelta (and ``NaT``) are converted with
        ``_to_m8``; list-likes are converted through ``TimedeltaIndex``.
        Positions that are NaT on either side are set to ``nat_result``
        (from the enclosing scope).

        Returns the raw result when it is boolean, otherwise an ``Index``.

        Raises
        ------
        TypeError
            If ``other`` cannot be parsed as a timedelta or is neither
            convertible nor list-like.
        """
        msg = "cannot compare a TimedeltaIndex with type {0}"
        func = getattr(super(TimedeltaIndex, self), opname)
        if _is_convertible_to_td(other) or other is NaT:
            try:
                other = _to_m8(other)
            except ValueError:
                # failed to parse as timedelta
                raise TypeError(msg.format(type(other)))
            result = func(other)
            if isna(other):
                result.fill(nat_result)
        else:
            if not is_list_like(other):
                raise TypeError(msg.format(type(other)))

            other = TimedeltaIndex(other).values
            result = func(other)
            result = _values_from_object(result)

            # ``other`` was just rebound to the ``.values`` of a
            # TimedeltaIndex, so the former ``isinstance(other, Index)``
            # branch could never be taken; view the values directly.
            o_mask = other.view('i8') == iNaT

            if o_mask.any():
                result[o_mask] = nat_result

        if self.hasnans:
            result[self._isnan] = nat_result

        # support of bool dtype indexers
        if is_bool_dtype(result):
            return result
        return Index(result)
Пример #8
0
    def wrapper(self, other):
        """
        Compare this TimedeltaIndex with ``other`` via the parent-class
        method named by ``opname`` (from the enclosing scope).

        Timedelta-convertible scalars and ``NaT`` go through ``_to_m8``;
        list-likes go through ``TimedeltaIndex``. Null positions on either
        side are set to ``nat_result`` (from the enclosing scope). A boolean
        result is returned as-is, anything else is wrapped in ``Index``.

        Raises
        ------
        TypeError
            If ``other`` fails to parse as a timedelta, or is neither
            convertible nor list-like.
        """
        msg = "cannot compare a {cls} with type {typ}"
        compare = getattr(super(TimedeltaIndex, self), opname)
        scalar_like = _is_convertible_to_td(other) or other is NaT

        if not scalar_like and not is_list_like(other):
            raise TypeError(msg.format(cls=type(self).__name__,
                                       typ=type(other).__name__))

        if scalar_like:
            try:
                other = _to_m8(other)
            except ValueError:
                # failed to parse as timedelta
                raise TypeError(msg.format(cls=type(self).__name__,
                                           typ=type(other).__name__))
            result = compare(other)
            if isna(other):
                result.fill(nat_result)
        else:
            other = TimedeltaIndex(other).values
            result = com._values_from_object(compare(other))

            other_nulls = np.array(isna(other))
            if other_nulls.any():
                result[other_nulls] = nat_result

        if self.hasnans:
            result[self._isnan] = nat_result

        # support of bool dtype indexers
        return result if is_bool_dtype(result) else Index(result)
Пример #9
0
 def _evaluate_compare(self, other, op):
     """
     Delegate the comparison to ``self._eadata`` and wrap the outcome.

     Boolean results are returned untouched. Other results are wrapped in
     ``Index`` when possible; if ``Index`` raises ``TypeError`` the raw
     result is returned instead.
     """
     compared = self._eadata._evaluate_compare(other, op)
     if not is_bool_dtype(compared):
         try:
             compared = Index(compared)
         except TypeError:
             # keep the unwrapped result
             pass
     return compared
Пример #10
0
 def _evaluate_compare(self, other, op):
     """
     Run ``DatetimeLikeArrayMixin._evaluate_compare`` and wrap the outcome.

     Boolean results are returned untouched. Other results are wrapped in
     ``Index`` when possible; if ``Index`` raises ``TypeError`` the raw
     result is returned instead.
     """
     compared = DatetimeLikeArrayMixin._evaluate_compare(self, other, op)
     if not is_bool_dtype(compared):
         try:
             compared = Index(compared)
         except TypeError:
             # keep the unwrapped result
             pass
     return compared
Пример #11
0
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None, compute_mask=True):
    """
    Prepare ``values`` for a nan-aware reduction.

    Returns the (possibly copied and filled) values, the mask, the original
    dtype, a platform-independent dtype, and the fill value actually used.
    A mask is computed when none is given and either ``compute_mask`` or
    ``skipna`` is truthy. ``copy=True`` forces a copy of the values.
    """
    # skipping NaNs always needs a mask
    need_mask = True if skipna else compute_mask

    if is_datetime64tz_dtype(values):
        # com.values_from_object returns M8[ns] dtype instead of tz-aware,
        #  so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = com.values_from_object(values)
        dtype = values.dtype

    if mask is None and need_mask:
        mask = _isfinite(values) if isfinite else isna(values)

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        #  finding `mask` above
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # the fill value may need a different dtype than the values
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if copy:
        values = values.copy()

    if skipna:
        if dtype_ok:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
Пример #12
0
    def _convert_for_op(self, value):
        """ Convert value to be insertable to ndarray """

        if is_bool(value) or is_bool_dtype(value):
            # force conversion to object
            # so we don't lose the bools
            raise TypeError

        return value
Пример #13
0
def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
    """ wrap op result to have correct dtype """
    if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
        dtype = np.bool

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(data, sparse_index=sparse_index,
                       fill_value=fill_value, dtype=dtype)
Пример #14
0
 def astype(self, dtype=None, copy=True):
     """
     Cast the sparse values and the fill value to ``dtype``.

     Returns a new object built with ``_simple_new`` from the converted
     sparse values, the existing ``sp_index`` and the converted fill value.

     Raises
     ------
     ValueError
         If the current fill value cannot be coerced to ``dtype``.
     """
     dtype = np.dtype(dtype)
     converted = astype_nansafe(self.sp_values, dtype, copy=copy)
     try:
         # plain ``bool`` avoids ending up with an np.bool_ fill value
         caster = bool if is_bool_dtype(dtype) else dtype.type
         new_fill = caster(self.fill_value)
     except ValueError:
         template = 'unable to coerce current fill_value {0} to {1} dtype'
         raise ValueError(template.format(self.fill_value, dtype))
     return self._simple_new(converted, self.sp_index,
                             fill_value=new_fill)
Пример #15
0
    def f(x, y):
        """
        Apply ``op`` (from the enclosing scope) and put NaN wherever either
        operand is null.

        NumPy floating-point warnings are suppressed during ``op``. A boolean
        result is cast to object before NaN is written into it.
        """
        null_either = isna(x) | isna(y)

        with np.errstate(all='ignore'):
            result = op(x, y)

        if not null_either.any():
            return result

        if is_bool_dtype(result):
            # object dtype is needed to store NaN in a boolean result
            result = result.astype('O')
        np.putmask(result, null_either, np.nan)
        return result
Пример #16
0
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    """
    Round-trip a scipy sparse matrix through ``SparseDataFrame``.

    Checks the frame contents against a densely constructed expectation,
    the matrix returned by ``to_coo``, and the resulting dtypes.
    """
    # GH 4343
    tm.skip_if_no_package('scipy')

    # One dense ndarray and the sparse matrix derived from it; both feed
    # frame construction and result comparison.
    dense = np.eye(3, dtype=dtype)
    # GH 16179
    dense[0, 1] = dtype(2)
    try:
        spm = spmatrix(dense)
        assert spm.dtype == dense.dtype
    except (TypeError, AssertionError):
        # If conversion to sparse fails for this spmatrix type and arr.dtype,
        # then the combination is not currently supported in NumPy, so we
        # can just skip testing it thoroughly
        return

    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
                             default_fill_value=fill_value)

    # Building the expectation for every dtype/fill_value pair is awkward;
    # cast to object here and compare object-cast frames below.
    as_object = dense.astype(object)
    as_object[dense == 0] = np.nan
    expected = pd.SparseDataFrame(
        as_object, index=index, columns=columns
    ).fillna(fill_value if fill_value is not None else np.nan)

    # Assert frame is as expected
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # Ensure dtype is preserved if possible
    was_upcast = ((fill_value is None or is_float(fill_value)) and
                  not is_object_dtype(dtype) and
                  not is_float_dtype(dtype))
    if is_bool_dtype(dtype):
        res_dtype = bool
    elif was_upcast:
        res_dtype = float
    else:
        res_dtype = dtype
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype

    # However, adding a str column results in an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    assert sdf.to_coo().dtype == np.object_
Пример #17
0
def is_bool_indexer(key):
    # type: (Any) -> bool
    """
    Decide whether `key` can be used as a boolean indexer.

    Parameters
    ----------
    key : Any
        Only list-likes can qualify; every other type yields ``False``.
        Among array-likes, boolean ndarrays and boolean ExtensionArrays
        qualify, as do object-dtype arrays that hold only booleans.

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        When the array is an object-dtype ndarray or ExtensionArray
        and contains missing values.
    """
    na_msg = 'cannot index with vector containing NA / NaN values'

    array_like = (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
                  (is_array_like(key) and
                   is_extension_array_dtype(key.dtype)))

    if array_like:
        if key.dtype == np.object_:
            key = np.asarray(values_from_object(key))
            if lib.is_bool_array(key):
                return True
            if isna(key).any():
                raise ValueError(na_msg)
            return False

        if is_bool_dtype(key.dtype):
            # an ndarray with bool-dtype by definition has no missing values.
            # So we only need to check for NAs in ExtensionArrays
            if is_extension_array_dtype(key.dtype):
                if np.any(key.isna()):
                    raise ValueError(na_msg)
            return True

        return False

    if isinstance(key, list):
        try:
            as_array = np.asarray(key)
            return as_array.dtype == np.bool_ and len(as_array) == len(key)
        except TypeError:  # pragma: no cover
            return False

    return False
Пример #18
0
    def __init__(self, values, mask, copy=False):
        """
        Store ``values`` and ``mask`` after validating their types.

        Parameters
        ----------
        values : numpy.ndarray
            Must have an integer dtype.
        mask : numpy.ndarray
            Must have a boolean dtype.
        copy : bool, default False
            If True, both arrays are copied before being stored.

        Raises
        ------
        TypeError
            If either array fails the checks above.
        """
        values_ok = (isinstance(values, np.ndarray)
                     and is_integer_dtype(values.dtype))
        if not values_ok:
            raise TypeError("values should be integer numpy array. Use "
                            "the 'integer_array' function instead")

        mask_ok = isinstance(mask, np.ndarray) and is_bool_dtype(mask.dtype)
        if not mask_ok:
            raise TypeError("mask should be boolean numpy array. Use "
                            "the 'integer_array' function instead")

        if copy:
            values, mask = values.copy(), mask.copy()

        self._data = values
        self._mask = mask
Пример #19
0
def as_json_table_type(x):
    """
    Map a NumPy / pandas type to the matching Table Schema type name.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    The checks run in the order listed; the first match wins and anything
    unmatched maps to ``'any'``.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    bool             boolean
    float64          number
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    categorical      any
    object           string
    ===============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    if is_numeric_dtype(x):
        return 'number'

    datetime_like = (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
                     is_period_dtype(x))
    if datetime_like:
        return 'datetime'
    if is_timedelta64_dtype(x):
        return 'duration'
    if is_categorical_dtype(x):
        return 'any'
    if is_string_dtype(x):
        return 'string'
    return 'any'
Пример #20
0
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None):
    """
    Prepare ``values`` for a nan-aware reduction.

    Returns the (possibly copied and filled) values, the mask, the original
    dtype and a platform-independent dtype. A mask is computed when none is
    supplied. ``copy=True`` forces a copy of the values.
    """
    values = com.values_from_object(values)

    if mask is None:
        mask = _isfinite(values) if isfinite else isna(values)

    dtype = values.dtype
    dtype_ok = _na_ok_dtype(dtype)

    # the fill value may need a different dtype than the values
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if copy:
        values = values.copy()

    if skipna:
        if dtype_ok:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    values = _view_if_needed(values)

    # return a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max
Пример #21
0
def _wrap_result(name, data, sparse_index, fill_value, dtype=None):
    """ wrap op result to have correct dtype """
    if name.startswith('__'):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'):
        dtype = np.bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(data,
                       sparse_index=sparse_index,
                       fill_value=fill_value,
                       dtype=dtype)
Пример #22
0
def get_dtype_kinds(l):
    """
    Collect the distinct "kinds" found in a list of arrays.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    def _kind_of(arr):
        # Classify one array; the order of the checks matters.
        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            return 'category'
        if is_sparse(arr):
            return 'sparse'
        if isinstance(arr, ABCRangeIndex):
            return 'range'
        if is_datetimetz(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            return str(arr.dtype)
        if is_datetime64_dtype(dtype):
            return 'datetime'
        if is_timedelta64_dtype(dtype):
            return 'timedelta'
        if is_object_dtype(dtype):
            return 'object'
        if is_bool_dtype(dtype):
            return 'bool'
        if is_period_dtype(dtype):
            return str(arr.dtype)
        if is_interval_dtype(dtype):
            return str(arr.dtype)
        return dtype.kind

    return {_kind_of(arr) for arr in l}
Пример #23
0
def _ensure_numeric(x):
    if isinstance(x, np.ndarray):
        if is_integer_dtype(x) or is_bool_dtype(x):
            x = x.astype(np.float64)
        elif is_object_dtype(x):
            try:
                x = x.astype(np.complex128)
            except:
                x = x.astype(np.float64)
            else:
                if not np.any(x.imag):
                    x = x.real
    elif not (is_float(x) or is_integer(x) or is_complex(x)):
        try:
            x = float(x)
        except Exception:
            try:
                x = complex(x)
            except Exception:
                raise TypeError('Could not convert %s to numeric' % str(x))
    return x
Пример #24
0
    def __getitem__(self, key):
        """
        Select from the array by integer, slice, boolean mask or list-like.

        A tuple key may hold at most one element and is unwrapped first.
        Integers return a single value, a full slice returns a copy, other
        slices and list-likes are routed through ``take``.

        Raises
        ------
        IndexError
            If a tuple key has more than one element.
        ValueError
            If the key is none of the supported kinds.
        """
        if isinstance(key, tuple):
            if len(key) > 1:
                raise IndexError("too many indices for array.")
            key = key[0]

        if is_integer(key):
            return self._get_val_at(key)

        if isinstance(key, tuple):
            data_slice = self.values[key]
            return type(self)(data_slice, kind=self.kind)

        if isinstance(key, slice):
            # special case to preserve dtypes
            if key == slice(None):
                return self.copy()
            # TODO: this logic is surely elsewhere
            # TODO: this could be more efficient
            positions = np.arange(len(self), dtype=np.int32)[key]
            return self.take(positions)

        # TODO: I think we can avoid densifying when masking a
        # boolean SparseArray with another. Need to look at the
        # key's fill_value for True / False, and then do an intersection
        # on the indicies of the sp_values.
        if isinstance(key, SparseArray):
            key = key.to_dense() if is_bool_dtype(key) else np.asarray(key)

        if com.is_bool_indexer(key) and len(self) == len(key):
            # TODO(numpy 1.11): Remove this asarray.
            # Old NumPy didn't treat array-like as boolean masks.
            key = np.asarray(key)
            return self.take(np.arange(len(key), dtype=np.int32)[key])

        if hasattr(key, '__len__'):
            return self.take(key)

        raise ValueError("Cannot slice with '{}'".format(key))
Пример #25
0
    def __getitem__(self, key):
        """
        Select from the array by integer, tuple or array-like key.

        Integers return a single value via ``_get_val_at``. A sized key whose
        length differs from ``len(self)`` is routed through ``take``. Every
        other key indexes ``self.values`` and the outcome is wrapped with
        ``self._constructor``.
        """
        if is_integer(key):
            return self._get_val_at(key)

        if not isinstance(key, tuple):
            if isinstance(key, SparseArray):
                # boolean sparse keys are densified, others become ndarrays
                key = key.to_dense() if is_bool_dtype(key) else np.asarray(key)

            if hasattr(key, '__len__') and len(self) != len(key):
                return self.take(key)

        return self._constructor(self.values[key])
Пример #26
0
def coerce_to_array(values,
                    dtype=None,
                    mask=None,
                    copy: bool = False) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : float dtype, optional
        When omitted, the dtype of a floating ``values`` is preserved and
        float64 is used otherwise.
    mask : bool 1D array, optional
        Must have the same length as ``values``; computed when omitted.
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` is not a recognised floating dtype.
    TypeError
        If ``values`` cannot be converted to a floating dtype, or if
        ``values`` or ``mask`` is not one-dimensional.
    """
    # if values is floating numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_float_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith("Float"):
            # Avoid DeprecationWarning from NumPy about np.dtype("Float64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), FloatingDtype):
            try:
                dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, FloatingArray):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # ``np.array(..., copy=False)`` raises under NumPy >= 2.0 whenever a copy
    # cannot be avoided (e.g. list input). ``np.asarray`` copies only when
    # needed, which is what ``copy=False`` meant on NumPy 1.x.
    values = np.array(values, copy=True) if copy else np.asarray(values)
    if is_object_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            pass
        elif inferred_type == "boolean":
            raise TypeError(
                f"{values.dtype} cannot be converted to a FloatingDtype")

    elif is_bool_dtype(values) and is_float_dtype(dtype):
        # bool -> float always allocates a new array, so this conversion
        # does not depend on ``copy``
        values = values.astype(float)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(
            f"{values.dtype} cannot be converted to a FloatingDtype")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        mask = libmissing.is_numeric_na(values)

    else:
        assert len(mask) == len(values)

    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("float64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    # TODO should this be a safe cast?
    if mask.any():
        values = values.copy()
        values[mask] = np.nan
    values = values.astype(dtype, copy=False)  # , casting="safe")

    return values, mask
Пример #27
0
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Prepare values, mask, dtype, dtype_max and fill_value for a reduction.

    The values are only replaced by a filled version when ``skipna`` is
    True, a mask is available, a fill value could be determined, and the
    mask actually flags at least one element.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : np.dtype
        dtype for values
    dtype_max : np.dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # _get_values is only called from within nanops, and in all cases
    #  with scalar fill_value.  This guarantee is important for the
    #  np.where call below
    assert is_scalar(fill_value)
    values = extract_array(values, extract_numpy=True)

    mask = _maybe_get_mask(values, skipna, mask)

    dtype = values.dtype

    datetimelike = needs_i8_conversion(values.dtype)
    if datetimelike:
        # changing timedelta64/datetime64 to int64 needs to happen after
        #  finding `mask` above
        values = np.asarray(values.view("i8"))

    dtype_ok = _na_ok_dtype(dtype)

    # the fill value may need a different dtype than the values
    fill_value = _get_fill_value(dtype,
                                 fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    can_fill = skipna and (mask is not None) and (fill_value is not None)
    if can_fill and mask.any():
        if dtype_ok or datetimelike:
            values = values.copy()
            np.putmask(values, mask, fill_value)
        else:
            # np.where will promote if needed
            values = np.where(~mask, values, fill_value)

    # return a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.dtype(np.int64)
    elif is_float_dtype(dtype):
        dtype_max = np.dtype(np.float64)
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
Пример #28
0
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).

        Parameters
        ----------
        kind : str
            Either "transform" or "aggregate" (asserted).
        values : array-like
            Input with at most 2 dimensions; for 2D input ``axis`` must be 1.
        how : str
            Name of the operation; used to look up the cython function, the
            arity and the result names.
        axis : int
        min_count : int, default -1
            Passed to ``self._aggregate`` for aggregations only.
        **kwargs
            Passed to ``self._transform`` for transforms only.

        Raises
        ------
        NotImplementedError
            For more than 2 dimensions, for categorical or sparse dtypes,
            for datetime64 with add/prod/cumsum/cumprod, for timedelta64
            with prod/cumprod, and for 2D input with an arity above 1.
        """
        assert kind in ["transform", "aggregate"]
        # kept so the original dtype can be restored at the end
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values.dtype):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(values.dtype):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations")

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?
            # TODO(EA2D):kludge can be avoided when 2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        # Normalise the input dtype before the cython function is looked up.
        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        # arity defaults to 1 when ``how`` has no entry in _cython_arity
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            # 1D input is processed as a single column
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                # dtype string built from kind and item size, e.g. "f8"
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        # For non-datetimelike integer results, iNaT entries are replaced
        # by NaN, which requires a float64 result.
        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        # drop rows whose count is zero when empty groups are filtered
        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        # undo the column added for 1D input
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
                orig_values.dtype):
            # We need to use the constructors directly for these dtypes
            # since numpy won't recognize them
            # https://github.com/pandas-dev/pandas/issues/31471
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        if is_extension_array_dtype(orig_values.dtype):
            result = maybe_cast_result(result=result, obj=orig_values, how=how)

        return result, names
Пример #29
0
def test_is_bool_dtype_sparse():
    """A Series wrapping a boolean SparseArray is reported as bool dtype."""
    sparse_bools = Series(SparseArray([True, False]))
    assert is_bool_dtype(sparse_bools) is True
Пример #30
0
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Prepare an array for a nan-aware reduction.

    Resolves the mask, the original dtype, a wide accumulator dtype and the
    fill value, and (when required) returns a filled copy of the values.

    The values are copied only when ``skipna`` is True and both a mask and a
    resolved fill value are available; in every other case the (possibly
    unwrapped / int64-viewed) input is returned without copying.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : dtype
        dtype for values
    dtype_max : dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # _get_values is only called from within nanops, always with a scalar
    #  fill_value; the maybe_upcast_putmask call below relies on that.
    assert is_scalar(fill_value)

    mask = _maybe_get_mask(values, skipna, mask)

    if not is_datetime64tz_dtype(values):
        values = lib.values_from_object(values)
        dtype = values.dtype
    else:
        # lib.values_from_object would hand back M8[ns] rather than the
        #  tz-aware dtype, so record the dtype first and unwrap manually
        dtype = values.dtype
        values = getattr(values, "_values", values)

    if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values):
        # the switch to an int64 representation has to come after `mask`
        #  was determined above
        values = getattr(values, "asi8", values).view(np.int64)

    dtype_ok = _na_ok_dtype(dtype)

    # resolve the fill value (it may need a dtype-specific alternative)
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    if skipna and mask is not None and fill_value is not None:
        values = values.copy()
        if not dtype_ok:
            # dtype cannot hold the fill value: promote while filling
            values, _ = maybe_upcast_putmask(values, mask, fill_value)
        else:
            np.putmask(values, mask, fill_value)

    # platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
Пример #31
0
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> np.ndarray:
        """
        Returns the values of a cython operation.

        Parameters
        ----------
        kind : str
            Either "transform" or "aggregate" (asserted below).
        values : array-like
            Data to operate on; at most 2-dimensional.
        how : str
            Name of the operation; used to look up the arity and the
            cython function.
        axis : int
            Must be 1 when ``values`` is 2-dimensional.
        min_count : int, default -1
            Forwarded to the aggregation path (and to the extension-array
            wrapper).
        **kwargs
            Forwarded to the transform path (and to the extension-array
            wrapper).

        Returns
        -------
        np.ndarray
            Result of the operation.  For extension-array dtypes this is
            whatever ``_ea_wrap_cython_operation`` returns.

        Raises
        ------
        NotImplementedError
            If ``values`` has more than 2 dimensions, or if ``values`` is
            2-dimensional and the arity of ``how`` is greater than 1.
        """
        # keep the untouched input: its dtype drives the final cast
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(values, how)

        # extension arrays are delegated wholesale to a dedicated wrapper
        if is_extension_array_dtype(values.dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        # coerce the input to a representation the cython functions accept
        if is_datetimelike:
            # datetimelike data is processed through its int64 view
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(ensure_float(values))
        else:
            values = values.astype(object)

        # number of output columns per group; defaults to 1 when `how` is
        # not registered in _cython_arity
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            # promote to 2D (one column); undone after the computation
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        # the lookup may hand back converted values alongside the function
        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                # dtype string built from kind + itemsize, e.g. "f8" / "i8"
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = maybe_fill(np.empty_like(values, dtype=out_dtype),
                                fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        # integer results holding the iNaT sentinel are converted to float64
        # so the sentinel positions can be replaced by NaN
        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        # `counts` only exists on the aggregate path; the leading `kind`
        # check short-circuits before it is touched for transforms
        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        # undo the 1D -> 2D promotion done above
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if swapped:
            result = result.swapaxes(0, axis)

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            result = maybe_downcast_to_dtype(result, dtype)

        return result
Пример #32
0
 def is_bool(self) -> bool:
     """Report whether ``self.pd_dtype`` is a boolean dtype."""
     dtype = self.pd_dtype
     return is_bool_dtype(dtype)
Пример #33
0
    def _str_map(self,
                 f,
                 na_value=None,
                 dtype: Dtype | None = None,
                 convert: bool = True):
        """
        Apply ``f`` element-wise, skipping missing entries.

        The return container depends on ``dtype`` (which defaults to
        ``self.dtype``): an IntegerArray / BooleanArray for integer / bool
        dtypes, ``type(self)`` for non-object string dtypes, and otherwise
        the raw output of ``lib.map_infer_mask``.

        ``na_value`` defaults to ``self.dtype.na_value``.  ``convert`` is not
        consulted anywhere in this body.
        """
        # TODO: de-duplicate with StringArray method. This method is more or
        # less copy and paste.

        from pandas.arrays import (
            BooleanArray,
            IntegerArray,
        )

        dtype = self.dtype if dtype is None else dtype
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)
        # uint8 view of the boolean mask, shared by every mapping call below
        mask_bytes = mask.view("uint8")

        wants_integer = is_integer_dtype(dtype)
        if wants_integer or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray] = (
                IntegerArray if wants_integer else BooleanArray
            )

            fill_is_missing = isna(na_value)
            if fill_is_missing:
                # placeholder stored under the mask; those slots stay masked
                na_value = 1
            mapped = lib.map_infer_mask(
                arr,
                f,
                mask_bytes,
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not fill_is_missing:
                # a concrete na_value was written, so nothing is missing now
                mask[:] = False

            return constructor(mapped, mask)

        if is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            mapped = lib.map_infer_mask(
                arr, f, mask_bytes, convert=False, na_value=na_value
            )
            as_arrow = pa.array(
                mapped, mask=mask, type=pa.string(), from_pandas=True
            )
            return type(self)(as_arrow)

        # Object result.  We get here when either
        # -> the result type is truly object (e.g. .encode returns bytes
        #    or .findall returns a list), or
        # -> the result type is unknown. E.g. `.get` can return anything.
        return lib.map_infer_mask(arr, f, mask_bytes)
Пример #34
0
def test_is_bool_dtype_sparse():
    """A Series backed by a boolean SparseArray is reported as bool dtype."""
    # pd.SparseSeries was removed in pandas 1.0; a regular Series holding a
    # SparseArray is its replacement.
    sparse_bools = pd.Series(pd.arrays.SparseArray([True, False]))
    result = is_bool_dtype(sparse_bools)
    assert result is True
Пример #35
0
def test_is_bool_dtype(dtype, expected):
    """``is_bool_dtype(dtype)`` must be the very ``expected`` object."""
    assert is_bool_dtype(dtype) is expected
Пример #36
0
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        """
        Run the cython group operation ``how`` on ``values``.

        Parameters
        ----------
        kind : str
            Either "transform" or "aggregate" (asserted below).
        values : array-like
            Data to operate on.
        how : str
            Name of the operation; used to look up the arity, the cython
            function and the optional name function.
        axis : int
            For non-1D input, a positive value must be 1 and causes the
            values to be transposed before the operation.
        min_count : int, default -1
            Forwarded to the aggregation path only.
        **kwargs
            Forwarded to the transform path only.

        Returns
        -------
        tuple
            ``(result, names)`` where ``names`` is the return value of
            ``self._name_functions[how]()`` or None when ``how`` has no
            entry there.

        Raises
        ------
        NotImplementedError
            For categorical / sparse input, for the datetime64 and
            timedelta64 operations listed below, and for non-1D input with
            an arity greater than 1.
        """
        assert kind in ["transform", "aggregate"]
        # keep the untouched input: its dtype drives the final cast back
        orig_values = values

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError("{} dtype not supported".format(
                values.dtype))
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {} operations".format(
                        how))
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {} operations".format(
                        how))

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        # coerce the input to a representation the cython functions accept
        if is_datetimelike:
            # datetimelike data is processed through its int64 view
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        # number of output columns per group; defaults to 1 when `how` is
        # not registered in _cython_arity
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            # promote to 2D (one column); undone after the computation
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            # no function for the current dtype: retry numeric data as
            # float64, anything else is a genuine failure
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                # dtype string built from kind + itemsize, e.g. "f8" / "i8"
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = "object"

        labels, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result,
                counts,
                values,
                labels,
                func,
                is_numeric,
                is_datetimelike,
                min_count,
            )
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        # integer results holding the iNaT sentinel are converted to float64
        # so the sentinel positions can be replaced by NaN
        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        # `counts` only exists on the aggregate path; the leading `kind`
        # check short-circuits before it is touched for transforms
        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        # undo the 1D -> 2D promotion done above
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            # rebuild the tz-aware container from the int64 result, undoing
            # the naive cast made near the top
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
Пример #37
0
Файл: ops.py Проект: rth/pandas
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        result_mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        """
        Coerce ``values``, invoke the cython function for ``self.kind`` /
        ``self.how`` and post-process the result.

        Parameters
        ----------
        values : np.ndarray
            2-dimensional input (per the signature annotation); it is
            transposed before the cython call and the result is transposed
            back afterwards.
        min_count : int
            Passed to the aggregation functions; also determines which
            groups count as empty for integer results.
        ngroups : int
            Number of groups; sizes the ``counts`` array and is passed to
            the transform functions.
        comp_ids : np.ndarray
            Passed straight through to the cython function.
        mask, result_mask : np.ndarray or None
            Optional masks; transposed alongside ``values`` and passed only
            to the functions that accept them.
        **kwargs
            Forwarded to the transform functions only.

        Returns
        -------
        np.ndarray
            The (possibly downcast) result.
        """
        # keep the untouched input: its dtype drives the final cast
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        # coerce the input to a representation the cython functions accept
        if is_datetimelike:
            # datetimelike data is processed through its int64 view
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype_str = dtype.kind + "8"
            values = values.astype(dtype_str, copy=False)
        elif is_numeric:
            # complex values are left as they are
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T
        if mask is not None:
            mask = mask.T
            # result_mask is only transposed when a mask was supplied
            if result_mask is not None:
                result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        # the lookup may hand back converted values alongside the function
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        # the cython functions write into `result` (and `counts`) in place;
        # their return values are not used
        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max"]:
                # only min/max are called with the mask arguments here
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                # a group needs at least one observation, even if
                # min_count is zero or negative
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        # undo the transpose applied before the cython call
        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]
Пример #38
0
def coerce_to_array(
    values,
    dtype,
    mask=None,
    copy: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)

    Raises
    ------
    ValueError
        If ``dtype`` cannot be mapped to one of the known integer dtypes.
    TypeError
        If ``values`` cannot be converted to an integer dtype, or if
        ``values`` / ``mask`` are not 1-dimensional.
    """
    # if values is an integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), _IntegerDtype):
            try:
                dtype = _dtypes[str(np.dtype(dtype))]
            except KeyError as err:
                # chain the lookup failure so the root cause stays visible
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, IntegerArray):
        # already split into data + mask; only cast / copy as requested
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    # NumPy >= 2.0 raises for np.array(..., copy=False) whenever a copy is
    # unavoidable (e.g. list input); np.asarray copies only when needed on
    # every NumPy version.
    values = np.array(values) if copy else np.asarray(values)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            # nothing but missing values: represent them as float NaN
            values = np.full(len(values), np.nan)
        elif inferred_type not in [
                "floating",
                "integer",
                "mixed-integer",
                "integer-na",
                "mixed-integer-float",
        ]:
            raise TypeError(
                f"{values.dtype} cannot be converted to an IntegerDtype")

    elif is_bool_dtype(values) and is_integer_dtype(dtype):
        # bool -> int always needs a fresh array, so `copy` is irrelevant
        # here (and copy=False would raise on NumPy >= 2.0)
        values = np.array(values, dtype=int)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(
            f"{values.dtype} cannot be converted to an IntegerDtype")

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")
    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("int64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    if mask.any():
        # masked slots get a placeholder value (on a copy, so the caller's
        # data is untouched) before the cast
        values = values.copy()
        values[mask] = 1
    values = safe_cast(values, dtype, copy=False)

    return values, mask
Пример #39
0
 def _is_boolean(self):
     return is_bool_dtype(self.subtype)
Пример #40
0
def test_is_bool_dtype(dtype, expected):
    """Check ``is_bool_dtype`` against the parametrized expectation."""
    assert is_bool_dtype(dtype) is expected
Пример #41
0
 def test_is_boolean(self, categories, expected):
     """The dtype flag and ``is_bool_dtype`` must agree with ``expected``."""
     categorical = Categorical(categories)
     cat_dtype = categorical.dtype
     assert cat_dtype._is_boolean is expected
     assert is_bool_dtype(categorical) is expected
     assert is_bool_dtype(cat_dtype) is expected
     assert is_bool_dtype(cat.dtype) is expected
Пример #42
0
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        """
        Run the cython group operation ``how`` on ``values``.

        Parameters
        ----------
        kind : str
            Either 'transform' or 'aggregate' (asserted below).
        values : array-like
            Data to operate on.
        how : str
            Name of the operation; used to look up the arity, the cython
            function and the optional name function.
        axis : int
            For non-1D input, a positive value causes axis 0 and ``axis``
            to be swapped before the operation and swapped back afterwards.
        min_count : int, default -1
            Forwarded to the aggregation path only.
        **kwargs
            Forwarded to the transform path only.

        Returns
        -------
        tuple
            ``(result, names)`` where ``names`` is the return value of
            ``self._name_functions[how]()`` or None when ``how`` has no
            entry there.

        Raises
        ------
        NotImplementedError
            For categorical input, for the datetime64 and timedelta64
            operations listed below, and for non-1D input with an arity
            greater than 1.
        """
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not support in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        # number of output columns per group; defaults to 1 when `how` is
        # not registered in _cython_arity
        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            # promote to 2D (one column); undone after the computation
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        # coerce the input to a representation the cython functions accept
        if is_datetimelike:
            # datetimelike data is processed through its int64 view
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            # no function for the current dtype: retry numeric data as
            # float64, anything else is a genuine failure
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                # dtype string built from kind + itemsize, e.g. "f8" / "i8"
                out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, labels, func,
                                     is_numeric, is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        # integer results holding the iNaT sentinel are converted to float64
        # so the sentinel positions can be replaced by NaN
        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        # `counts` only exists on the aggregate path; the leading `kind`
        # check short-circuits before it is touched for transforms
        if (kind == 'aggregate' and self._filter_empty_groups
                and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(result,
                                                 (counts > 0).view(np.uint8))
                except ValueError:
                    # retry through the object-dtype variant of the helper
                    result = lib.row_bool_subset_object(
                        ensure_object(result), (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        # undo the 1D -> 2D promotion done above
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
Пример #43
0
    def _convert_to_ndarrays(self,
                             dct,
                             na_values,
                             na_fvalues,
                             verbose=False,
                             converters=None,
                             dtypes=None):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na)
            else:
                col_na_values, col_na_fvalues = set(), set()

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        ("Both a converter and dtype were specified "
                         f"for column {c} - only the converter will be used"),
                        ParserWarning,
                        stacklevel=7,
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values,
                                           list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(values,
                                                    set(col_na_values)
                                                    | col_na_fvalues,
                                                    try_num_bool=False)
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues, try_num_bool)

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type)
                                  or is_extension_array_dtype(cast_type)):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}")
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
Пример #44
0
    def get_new_values(self, values, fill_value=None):
        """
        Arrange ``values`` on the grid described by ``self.full_shape``.

        Returns a ``(new_values, new_mask)`` pair of 2D arrays of shape
        ``(length, width * stride)``, where ``stride`` is the number of
        columns of the (2D-promoted) input.  When ``self.mask`` is entirely
        True and there is data, the result is a pure reshape and the mask is
        all True; otherwise the output is populated by ``libreshape.unstack``.
        """
        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # geometry of the output
        length, width = self.full_shape
        stride = values.shape[1]
        result_shape = (length, width * stride)
        mask = self.mask
        fully_populated = mask.all()

        # without holes the answer is just a reshape
        if fully_populated and len(values):
            reshaped = sorted_values.reshape(length, width, stride)
            reshaped = reshaped.swapaxes(1, 2).reshape(result_shape)
            return reshaped, np.ones(result_shape, dtype=bool)

        # with every slot populated the existing dtype is kept; otherwise
        # the dtype is promoted so that it can hold the fill value
        if fully_populated:
            dtype = values.dtype
        else:
            dtype, fill_value = maybe_promote(values.dtype, fill_value)
        new_values = np.empty(result_shape, dtype=dtype)
        if not fully_populated:
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name

        # move to a basic dtype, coercing the input to the output dtype
        # where needed (e.g. ints -> floats)
        i8_backed = needs_i8_conversion(values)
        if i8_backed:
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # populate values and mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # restore the original dtype after the i8 round trip
        if i8_backed:
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
Пример #45
0
 def wrapper(self, other):
     # Dispatch to the TimedeltaArrayMixin method named by ``opname``.
     method = getattr(TimedeltaArrayMixin, opname)
     outcome = method(self, other)
     # boolean results are returned unwrapped so they can serve as
     # bool dtype indexers; everything else is wrapped in an Index
     return outcome if is_bool_dtype(outcome) else Index(outcome)
Пример #46
0
    def get_new_values(self, values, fill_value=None):
        """
        Lay ``values`` out in the wide (unstacked) result shape.

        Parameters
        ----------
        values : array-like
            1-D or 2-D data; a 1-D input is handled as a single column.
        fill_value : scalar, optional
            Used to pre-fill the output when ``self.mask_all`` is false, after
            being passed through ``maybe_promote`` together with the dtype.

        Returns
        -------
        new_values : array-like
            Result of shape ``(length, width * stride)``.
        new_mask : ndarray of bool
            Same shape as ``new_values``. All True on the reshape fast path,
            otherwise zero-initialised and handed to ``libreshape.unstack``.
        """
        if values.ndim == 1:
            # promote a flat input to one column so ``shape[1]`` exists
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        n_rows, n_cols = self.full_shape
        stride = values.shape[1]
        out_shape = (n_rows, n_cols * stride)
        mask = self.mask
        mask_all = self.mask_all

        # fast path: with nothing masked out a plain reshape is sufficient
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values?  When that holds, we can slice instead
            #  of take (in particular for EAs)
            cube = sorted_values.reshape(n_rows, n_cols, stride)
            reshaped = cube.swapaxes(1, 2).reshape(out_shape)
            return reshaped, np.ones(out_shape, dtype=bool)

        if mask_all:
            # mask is all True: the existing dtype can be kept
            dtype = values.dtype
        else:
            dtype, fill_value = maybe_promote(values.dtype, fill_value)

        if not mask_all and isinstance(dtype, ExtensionDtype):
            # GH#41875
            array_cls = dtype.construct_array_type()
            new_values = array_cls._empty(out_shape, dtype=dtype)
            new_values[:] = fill_value
            name = dtype.name
        else:
            new_values = np.empty(out_shape, dtype=dtype)
            if not mask_all:
                new_values.fill(fill_value)
            name = np.dtype(dtype).name

        new_mask = np.zeros(out_shape, dtype=bool)

        # convert to a basic dtype, possibly coercing the input to the
        # output dtype (e.g. ints -> floats)
        datetimelike = needs_i8_conversion(values.dtype)
        if datetimelike:
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        elif is_bool_dtype(values.dtype):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in the values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            n_rows,
            n_cols,
            new_values,
            new_mask.view("u1"),
        )

        if datetimelike:
            # view as datetime64 so the result can be wrapped in a
            #  DatetimeArray, whose view method then restores the dtype
            wrapped = ensure_wrapped_if_datetimelike(new_values.view("M8[ns]"))
            new_values = wrapped.view(values.dtype)

        return new_values, new_mask
Пример #47
0
def test_is_bool_dtype_numpy_error():
    """A string that is not a dtype specification is reported as not boolean."""
    # GH39010
    not_a_dtype = "0 - Name"
    assert not com.is_bool_dtype(not_a_dtype)
Пример #48
0
def test_is_bool_dtype():
    """Check ``is_bool_dtype`` against non-boolean and boolean inputs."""
    non_boolean_inputs = (
        int,
        str,
        pd.Series([1, 2]),
        np.array(["a", "b"]),
        pd.Index(["a", "b"]),
        "Int64",
    )
    for candidate in non_boolean_inputs:
        assert not com.is_bool_dtype(candidate)

    boolean_inputs = (
        # plain / numpy booleans
        bool,
        np.bool_,
        np.array([True, False]),
        pd.Index([True, False]),
        # nullable boolean extension type
        pd.BooleanDtype(),
        pd.array([True, False, None], dtype="boolean"),
        "boolean",
    )
    for candidate in boolean_inputs:
        assert com.is_bool_dtype(candidate)
Пример #49
0
    def get_new_values(self):
        """
        Lay ``self.values`` out in the wide (unstacked) result shape.

        Returns
        -------
        new_values : ndarray
            Array of shape ``(length, width * stride)``.
        new_mask : ndarray of bool
            Same shape as ``new_values``. All True on the reshape fast path,
            otherwise zero-initialised and handed to the ``unstack_<name>``
            routine looked up on ``_reshape``.
        """
        values = self.values

        n_rows, n_cols = self.full_shape
        stride = values.shape[1]
        out_shape = (n_rows, n_cols * stride)
        mask = self.mask
        fully_populated = mask.all()

        # fast path: with nothing masked out a plain reshape is sufficient
        if fully_populated and len(values):
            cube = self.sorted_values.reshape(n_rows, n_cols, stride)
            reshaped = cube.swapaxes(1, 2).reshape(out_shape)
            return reshaped, np.ones(out_shape, dtype=bool)

        if fully_populated:
            # mask is all True: the existing dtype can be kept
            dtype = values.dtype
        else:
            dtype, fill_value = maybe_promote(values.dtype, self.fill_value)

        new_values = np.empty(out_shape, dtype=dtype)
        if not fully_populated:
            new_values.fill(fill_value)

        new_mask = np.zeros(out_shape, dtype=bool)

        name = np.dtype(dtype).name
        sorted_values = self.sorted_values

        # convert to a basic dtype, possibly coercing the input to the
        # output dtype (e.g. ints -> floats); ``name`` selects the routine
        datetimelike = needs_i8_conversion(values)
        if datetimelike:
            sorted_values = sorted_values.view('i8')
            new_values = new_values.view('i8')
            name = 'int64'
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype('object')
            new_values = new_values.astype('object')
            name = 'object'
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in the values & mask with the dtype-specific routine
        unstack_func = getattr(_reshape, "unstack_{}".format(name))
        unstack_func(
            sorted_values,
            mask.view('u1'),
            stride,
            n_rows,
            n_cols,
            new_values,
            new_mask.view('u1'),
        )

        if datetimelike:
            # restore the original datetime-like dtype
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
Пример #50
0
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
    """
    Check if `indexer` is a valid array indexer for `array`.

    For a boolean mask, `array` and `indexer` are checked to have the same
    length. The dtype is validated, and if it is an integer or boolean
    ExtensionArray, it is checked if there are missing values present, and
    it is converted to the appropriate numpy array. Other dtypes will raise
    an error.

    Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
    through as is.

    .. versionadded:: 1.0.0

    Parameters
    ----------
    array : array-like
        The array that is being indexed (only used for the length).
    indexer : array-like or list-like
        The array-like that's used to index. List-like input that is not yet
        a numpy array or an ExtensionArray is converted to one. Other input
        types are passed through as is.

    Returns
    -------
    numpy.ndarray
        The validated indexer as a numpy array that can be used to index.
        Non-list-like indexers and tuples are returned unchanged.

    Raises
    ------
    IndexError
        When the lengths don't match, or when the indexer is an array that
        is neither of integer nor of boolean dtype.
    ValueError
        When `indexer` cannot be converted to a numpy ndarray to index
        (e.g. presence of missing values). The underlying conversion error
        is chained as the cause.

    See Also
    --------
    api.types.is_bool_dtype : Check if `key` is of boolean dtype.

    Examples
    --------
    When checking a boolean mask, a boolean ndarray is returned when the
    arguments are all valid.

    >>> mask = pd.array([True, False])
    >>> arr = pd.array([1, 2])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    An IndexError is raised when the lengths don't match.

    >>> mask = pd.array([True, False, True])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    IndexError: Boolean index has wrong length: 3 instead of 2.

    NA values in a boolean array are treated as False.

    >>> mask = pd.array([True, pd.NA])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    A numpy boolean mask will get passed through (if the length is correct):

    >>> mask = np.array([True, False])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    Similarly for integer indexers, an integer ndarray is returned when it is
    a valid indexer, otherwise an error is raised (for integer indexers, a
    matching length is not required):

    >>> indexer = pd.array([0, 2], dtype="Int64")
    >>> arr = pd.array([1, 2, 3])
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    array([0, 2])

    >>> indexer = pd.array([0, pd.NA], dtype="Int64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    ValueError: Cannot index with an integer indexer containing NA values

    For non-integer/boolean dtypes, an appropriate error is raised:

    >>> indexer = np.array([0., 2.], dtype="float64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    IndexError: arrays used as indices must be of integer or boolean type
    """
    from pandas.core.construction import array as pd_array

    # whatever is not an array-like is returned as-is (possible valid array
    # indexers that are not array-like: integer, slice, Ellipsis, None)
    # In this context, tuples are not considered as array-like, as they have
    # a specific meaning in indexing (multi-dimensional indexing)
    if not is_list_like(indexer) or isinstance(indexer, tuple):
        return indexer

    # convert list-likes to array
    if not is_array_like(indexer):
        indexer = pd_array(indexer)
        if len(indexer) == 0:
            # empty list is converted to float array by pd.array
            indexer = np.array([], dtype=np.intp)

    dtype = indexer.dtype
    if is_bool_dtype(dtype):
        if is_extension_array_dtype(dtype):
            # missing values in a nullable mask select nothing
            indexer = indexer.to_numpy(dtype=bool, na_value=False)
        else:
            indexer = np.asarray(indexer, dtype=bool)

        # GH26658
        if len(indexer) != len(array):
            raise IndexError(f"Boolean index has wrong length: "
                             f"{len(indexer)} instead of {len(array)}")
    elif is_integer_dtype(dtype):
        try:
            indexer = np.asarray(indexer, dtype=np.intp)
        except ValueError as err:
            # keep the conversion failure attached as the explicit cause
            raise ValueError(
                "Cannot index with an integer indexer containing NA values"
            ) from err
    else:
        raise IndexError(
            "arrays used as indices must be of integer or boolean type")

    return indexer
Пример #51
0
 def f(self):
     # Evaluate the wrapped getter ``fget`` on this object.
     outcome = fget(self)
     # return numpy array b/c there is no BoolIndex; any other result is
     # wrapped in an Index carrying this object's name
     return outcome if is_bool_dtype(outcome) else Index(outcome, name=self.name)
Пример #52
0
    def _cython_operation(self, kind, values, how, axis, min_count=-1,
                          **kwargs):
        """
        Run the grouped operation ``how`` on ``values`` via a cython function.

        Parameters
        ----------
        kind : {'transform', 'aggregate'}
            Which family of cython function to use; anything else fails the
            assertion below.
        values : array-like
            Data to operate on. 1-D input is expanded to a single column.
        how : str
            Name of the operation; used to look up the arity, the cython
            function and the optional result names.
        axis : int
            For input with more than one dimension and ``axis > 0``, axis 0
            is swapped with this axis before the operation and swapped back
            afterwards.
        min_count : int, default -1
            Forwarded to ``self._aggregate``; not used on the transform path
            (see the TODO there).
        **kwargs
            Forwarded to ``self._transform``.

        Returns
        -------
        result : ndarray
            Output of the cython function after post-processing.
        names : object or None
            ``self._name_functions[how]()`` when ``how`` is registered there,
            otherwise None.

        Raises
        ------
        NotImplementedError
            For categorical values, for datetime64 values with
            add/prod/cumsum/cumprod, for timedelta64 values with
            prod/cumprod, for an arity above 1 on input that is not 1-D, or
            when no cython function is available for non-numeric values.
        """
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not support in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        # number of output columns per input column; 1 unless registered
        arity = self._cython_arity.get(how, 1)

        # remember the input dimensionality so the result can be squeezed
        # back at the end
        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups,) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        # coerce the input to a dtype the cython functions are called with
        if is_datetimelike:
            # operate on the int64 view and treat it as numeric
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(
                kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                # retry the lookup once with float64 values
                values = ensure_float64(values)
                func = self._get_cython_function(
                    kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                # same kind and item size as the (coerced) input
                out_dtype = '{kind}{itemsize}'.format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            # ``counts`` only exists on the aggregate path; it is read again
            # below when filtering empty groups
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, labels, func, is_numeric,
                is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(
                result, values, labels, func, is_numeric, is_datetimelike,
                **kwargs)

        # integer results equal to iNaT are turned into NaN, which requires
        # a float64 result
        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        # drop the rows of groups with a zero count
        if (kind == 'aggregate' and
                self._filter_empty_groups and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    # fall back to the object-dtype variant
                    result = lib.row_bool_subset_object(
                        ensure_object(result),
                        (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        # undo the column expansion applied to 1-D input
        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            # restore the caller's axis order
            result = result.swapaxes(0, axis)

        return result, names
Пример #53
0
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        """
        Convert every column in ``dct`` and return the results by column key.

        Parameters
        ----------
        dct : Mapping
            Column key -> raw column values.
        na_values, na_fvalues
            Passed to ``_get_na_values`` to obtain the per-column NA markers
            when ``self.na_filter`` is set.
        verbose : bool, default False
            If True, print the number of NA values counted for each column
            that has any.
        converters : mapping, optional
            Column key -> callable applied element-wise before inference.
        dtypes : dict or single dtype, optional
            A dict is looked up per column; any other value is used as the
            dtype for every column.

        Returns
        -------
        dict
            Column key -> converted values.

        Raises
        ------
        ValueError
            When a column is cast to a non-extension bool dtype although NA
            values were counted in it.
        """
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na)
            else:
                # NA filtering disabled: no value is treated as missing
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values,
                                       set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        ("Both a converter and dtype were specified "
                         f"for column {c} - only the converter will be used."),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # retry with a mask built from ``na_values`` passed to
                    # ``lib.map_infer_mask`` alongside the converter
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values,
                        list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(values,
                                                    set(col_na_values)
                                                    | col_na_fvalues,
                                                    try_num_bool=False)
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues, try_num_bool)

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type)
                                  or is_extension_array_dtype(cast_type)):
                    if not is_ea and na_count > 0:
                        # the ValueError raised inside this try is not one of
                        # the caught types, so it propagates to the caller
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}")
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
Пример #54
0
    def get_new_values(self):
        """
        Lay ``self.values`` out in the wide (unstacked) result shape.

        Returns
        -------
        new_values : ndarray
            Array of shape ``(length, width * stride)``.
        new_mask : ndarray of bool
            Same shape as ``new_values``. All True on the reshape fast path,
            otherwise zero-initialised and handed to the ``unstack_<name>``
            routine looked up on ``_reshape``.
        """
        values = self.values

        n_rows, n_cols = self.full_shape
        stride = values.shape[1]
        out_shape = (n_rows, n_cols * stride)
        mask = self.mask
        fully_populated = mask.all()

        # fast path: with nothing masked out a plain reshape is sufficient
        if fully_populated and len(values):
            cube = self.sorted_values.reshape(n_rows, n_cols, stride)
            reshaped = cube.swapaxes(1, 2).reshape(out_shape)
            return reshaped, np.ones(out_shape, dtype=bool)

        if fully_populated:
            # mask is all True: the existing dtype can be kept
            dtype = values.dtype
        else:
            dtype, fill_value = maybe_promote(values.dtype, self.fill_value)

        new_values = np.empty(out_shape, dtype=dtype)
        if not fully_populated:
            new_values.fill(fill_value)

        new_mask = np.zeros(out_shape, dtype=bool)

        name = np.dtype(dtype).name
        sorted_values = self.sorted_values

        # convert to a basic dtype, possibly coercing the input to the
        # output dtype (e.g. ints -> floats); ``name`` selects the routine
        datetimelike = needs_i8_conversion(values)
        if datetimelike:
            sorted_values = sorted_values.view('i8')
            new_values = new_values.view('i8')
            name = 'int64'
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype('object')
            new_values = new_values.astype('object')
            name = 'object'
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in the values & mask with the dtype-specific routine
        unstack_func = getattr(_reshape, "unstack_{name}".format(name=name))
        unstack_func(
            sorted_values,
            mask.view('u1'),
            stride,
            n_rows,
            n_cols,
            new_values,
            new_mask.view('u1'),
        )

        if datetimelike:
            # restore the original datetime-like dtype
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
Пример #55
0
    def _cast_types(self, values, cast_type, column):
        """
        Cast ``values`` to ``cast_type``.

        Parameters
        ----------
        values : ndarray
            Data to convert.
        cast_type : str, np.dtype or ExtensionDtype
            Target dtype.
        column : str
            Column name, used only in the error message.

        Returns
        -------
        ndarray, Categorical or ExtensionArray
            The converted values.

        Raises
        ------
        NotImplementedError
            When the extension array type does not implement
            ``_from_sequence_of_strings``.
        ValueError
            When the conversion to a non-extension dtype fails.
        """
        if is_categorical_dtype(cast_type):
            has_known_categories = (
                isinstance(cast_type, CategoricalDtype)
                and cast_type.categories is not None
            )

            if not (is_object_dtype(values) or has_known_categories):
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = astype_nansafe(values, np.dtype(str))

            categories = Index(values).unique().dropna()
            codes = categories.get_indexer(values)
            return Categorical._from_inferred_categories(
                categories, codes, cast_type, true_values=self.true_values
            )

        if is_extension_array_dtype(cast_type):
            # use the EA's implementation of casting; make sure cast_type
            # is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                bool_markers = {}
                if is_bool_dtype(cast_type):
                    # only boolean EAs receive the true/false markers
                    bool_markers = {
                        "true_values": self.true_values,
                        "false_values": self.false_values,
                    }
                return array_type._from_sequence_of_strings(
                    values, dtype=cast_type, **bool_markers
                )
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        try:
            return astype_nansafe(values, cast_type, copy=True, skipna=True)
        except ValueError as err:
            raise ValueError(
                f"Unable to convert column {column} to type {cast_type}"
            ) from err
Пример #56
0
 def test_is_boolean(self, categories, expected):
     """The categorical and its dtype agree with ``expected`` on boolean-ness."""
     cat = Categorical(categories)
     cat_dtype = cat.dtype
     assert cat_dtype._is_boolean is expected
     for candidate in (cat, cat_dtype):
         assert is_bool_dtype(candidate) is expected
Пример #57
0
    def _is_boolean(self):
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)
Пример #58
0
    def _is_boolean(self):
        from pandas.core.dtypes.common import is_bool_dtype

        return is_bool_dtype(self.categories)
Пример #59
0
    def astype(self, dtype, copy: bool = True) -> ArrayLike:
        """
        Cast to a NumPy array or ExtensionArray with 'dtype'.

        Parameters
        ----------
        dtype : str or dtype
            Typecode or data-type to which the array is cast.
        copy : bool, default True
            Whether to copy the data, even if not necessary. For a
            BooleanDtype target, ``copy=False`` returns this array itself.

        Returns
        -------
        ndarray or ExtensionArray
            BooleanArray for a BooleanDtype target, the string array type
            for a StringDtype target, IntegerArray for an integer extension
            dtype, otherwise a NumPy ndarray.

        Raises
        ------
        ValueError
            If the array has missing values and the target is a bool dtype
            or an integer dtype that is not an extension dtype.
        TypeError
            Not raised directly in this method; presumably surfaced by the
            dtype resolution or the underlying conversion for incompatible
            targets -- TODO confirm.
        """
        from pandas.core.arrays.string_ import StringDtype

        dtype = pandas_dtype(dtype)

        if isinstance(dtype, BooleanDtype):
            values, mask = coerce_to_array(self, copy=copy)
            return BooleanArray(values, mask, copy=False) if copy else self

        if isinstance(dtype, StringDtype):
            string_array_type = dtype.construct_array_type()
            return string_array_type._from_sequence(self, copy=False)

        if is_bool_dtype(dtype):
            # astype_nansafe converts np.nan to True
            if self._hasna:
                raise ValueError("cannot convert float NaN to bool")
            return self._data.astype(dtype, copy=copy)

        if is_extension_array_dtype(dtype) and is_integer_dtype(dtype):
            from pandas.core.arrays import IntegerArray

            int_data = self._data.astype(dtype.numpy_dtype)
            return IntegerArray(int_data, self._mask.copy(), copy=False)

        # for integer, error if there are missing values
        if is_integer_dtype(dtype) and self._hasna:
            raise ValueError("cannot convert NA to integer")

        # for float dtype, ensure we use np.nan before casting (numpy cannot
        # deal with pd.NA)
        default_na = self._na_value
        na_value = np.nan if is_float_dtype(dtype) else default_na

        # coerce
        return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)