Example #1
def test_is_sparse(check_scipy):
    assert com.is_sparse(pd.SparseArray([1, 2, 3]))
    assert com.is_sparse(pd.SparseSeries([1, 2, 3]))

    assert not com.is_sparse(np.array([1, 2, 3]))

    if check_scipy:
        import scipy.sparse
        assert not com.is_sparse(scipy.sparse.bsr_matrix([1, 2, 3]))
Example #2
def test_is_sparse():
    assert com.is_sparse(pd.SparseArray([1, 2, 3]))
    assert com.is_sparse(pd.SparseSeries([1, 2, 3]))

    assert not com.is_sparse(np.array([1, 2, 3]))

    # This test will only skip if the previous assertions
    # pass AND scipy is not installed.
    sparse = pytest.importorskip("scipy.sparse")
    assert not com.is_sparse(sparse.bsr_matrix([1, 2, 3]))
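Note: a minimal usage sketch of the same predicate through the public API (assumes pandas >= 1.0, where SparseArray lives under pandas.arrays and SparseSeries has been removed):

import numpy as np
import pandas as pd
from pandas.api.types import is_sparse

assert is_sparse(pd.arrays.SparseArray([1, 2, 3]))  # sparse extension array
assert not is_sparse(np.array([1, 2, 3]))           # plain ndarray
assert not is_sparse(pd.Series([1, 2, 3]))          # dense Series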
Example #3
    def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
        # See GH16874, GH18914 and #18686 for why this should be a DataFrame
        from pandas.core.dtypes.common import is_sparse

        frames = [self.dense1, self.dense3]

        sparse_frame = [frames[dense_idx],
                        frames[sparse_idx].to_sparse(fill_value=fill_value)]
        dense_frame = [frames[dense_idx], frames[sparse_idx]]

        # This will try both directions: sparse + dense and dense + sparse
        for _ in range(2):
            res = pd.concat(sparse_frame, axis=1)
            exp = pd.concat(dense_frame, axis=1)
            cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]

            for col in cols:
                exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")

            for column in frames[dense_idx].columns:
                if dense_idx == sparse_idx:
                    tm.assert_frame_equal(res[column], exp[column])
                else:
                    tm.assert_series_equal(res[column], exp[column])

            tm.assert_frame_equal(res, exp)

            sparse_frame = sparse_frame[::-1]
            dense_frame = dense_frame[::-1]
Example #4
    def is_na(self):
        if self.block is None:
            return True

        if not self.block._can_hold_na:
            return False

        # Usually it's enough to check only a small fraction of values to see
        # if a block is NOT null; chunks should help in such cases. The value
        # 1000 was chosen rather arbitrarily.
        values = self.block.values
        if self.block.is_categorical:
            values_flat = values.categories
        elif is_sparse(self.block.values.dtype):
            return False
        elif self.block.is_extension:
            values_flat = values
        else:
            values_flat = values.ravel(order='K')
        total_len = values_flat.shape[0]
        chunk_len = max(total_len // 40, 1000)
        for i in range(0, total_len, chunk_len):
            if not isna(values_flat[i:i + chunk_len]).all():
                return False

        return True
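The chunked early-exit scan described in the comment can be sketched in plain numpy; all_nan_chunked here is a hypothetical helper for illustration, not the pandas internals:

import numpy as np

def all_nan_chunked(values_flat, n_chunks=40, min_chunk=1000):
    # Scan in chunks; bail out on the first chunk holding a non-NaN value.
    total_len = values_flat.shape[0]
    chunk_len = max(total_len // n_chunks, min_chunk)
    for i in range(0, total_len, chunk_len):
        if not np.isnan(values_flat[i:i + chunk_len]).all():
            return False
    return True

assert all_nan_chunked(np.full(5000, np.nan))
assert not all_nan_chunked(np.r_[np.full(5000, np.nan), 1.0])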
Example #5
File: ops.py Project: frreiss/pandas-fred
    def _disallow_invalid_ops(self, values: ArrayLike, how: str):
        """
        Check if we can do this operation with our cython functions.

        Raises
        ------
        NotImplementedError
            This is either not a valid function for this dtype, or
            valid but not implemented in cython.
        """
        dtype = values.dtype

        if is_categorical_dtype(dtype) or is_sparse(dtype):
            # categoricals are only 1d, so we
            #  are not set up for dim transforming
            raise NotImplementedError(f"{dtype} dtype not supported")
        elif is_datetime64_any_dtype(dtype):
            # we raise NotImplementedError if this is an invalid operation
            #  entirely, e.g. adding datetimes
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(dtype):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations")
Example #6
    def _disallow_invalid_ops(
        self, dtype: DtypeObj, how: str, is_numeric: bool = False
    ):
        """
        Check if we can do this operation with our cython functions.

        Raises
        ------
        NotImplementedError
            This is either not a valid function for this dtype, or
            valid but not implemented in cython.
        """
        if is_numeric:
            # never an invalid op for those dtypes, so return early as fastpath
            return

        if is_categorical_dtype(dtype) or is_sparse(dtype):
            # categoricals are only 1d, so we
            #  are not set up for dim transforming
            raise NotImplementedError(f"{dtype} dtype not supported")
        elif is_datetime64_any_dtype(dtype):
            # we raise NotImplementedError if this is an invalid operation
            #  entirely, e.g. adding datetimes
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations"
                )
        elif is_timedelta64_dtype(dtype):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations"
                )
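The datetime/timedelta branches boil down to a dtype-kind check; a standalone sketch with plain numpy dtypes (disallow_invalid_ops here is a hypothetical re-implementation, not the pandas method):

import numpy as np

def disallow_invalid_ops(dtype: np.dtype, how: str) -> None:
    # kind "M" is datetime64, kind "m" is timedelta64
    if dtype.kind == "M" and how in ["add", "prod", "cumsum", "cumprod"]:
        raise NotImplementedError(f"datetime64 type does not support {how} operations")
    if dtype.kind == "m" and how in ["prod", "cumprod"]:
        raise NotImplementedError(f"timedelta64 type does not support {how} operations")

disallow_invalid_ops(np.dtype("m8[ns]"), "cumsum")  # allowed, returns None
# disallow_invalid_ops(np.dtype("M8[ns]"), "prod")  # would raise NotImplementedError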
Example #7
    def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
        # See GH16874, GH18914 and #18686 for why this should be a DataFrame
        from pandas.core.dtypes.common import is_sparse

        frames = [self.dense1, self.dense3]

        sparse_frame = [
            frames[dense_idx],
            frames[sparse_idx].to_sparse(fill_value=fill_value)
        ]
        dense_frame = [frames[dense_idx], frames[sparse_idx]]

        # This will try both directions: sparse + dense and dense + sparse
        for _ in range(2):
            res = pd.concat(sparse_frame, axis=1)
            exp = pd.concat(dense_frame, axis=1)
            cols = [i for (i, x) in enumerate(res.dtypes) if is_sparse(x)]

            for col in cols:
                exp.iloc[:, col] = exp.iloc[:, col].astype("Sparse")

            for column in frames[dense_idx].columns:
                if dense_idx == sparse_idx:
                    tm.assert_frame_equal(res[column], exp[column])
                else:
                    tm.assert_series_equal(res[column], exp[column])

            tm.assert_frame_equal(res, exp)

            sparse_frame = sparse_frame[::-1]
            dense_frame = dense_frame[::-1]
Example #8
def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool,
              raise_cast_failure: bool):
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray, list, tuple, iterator (catchall)
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.
    """
    # perf shortcut as this is the most common case
    if isinstance(arr, np.ndarray):
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M"
                                              or is_sparse(dtype)):
        # create an extension array from its dtype
        # DatetimeTZ case needs to go through maybe_cast_to_datetime but
        # SparseDtype does not
        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    if is_object_dtype(dtype) and not isinstance(arr, np.ndarray):
        subarr = construct_1d_object_array_from_listlike(arr)
        return subarr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats
            maybe_cast_to_integer_array(arr, dtype)
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)

        if not isinstance(subarr, (ABCExtensionArray, ABCIndex)):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        if dtype is not None and raise_cast_failure:
            raise
        elif "Cannot cast" in str(err):
            # via _disallow_mismatched_datetimelike
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
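The extension-array construction path above uses the public ExtensionDtype machinery; a minimal sketch (assumes pandas >= 1.0; _from_sequence is the EA constructor hook used internally):

import pandas as pd

dtype = pd.SparseDtype("float64")
array_type = dtype.construct_array_type()  # the SparseArray class
subarr = array_type._from_sequence([1.0, 0.0, 2.0], dtype=dtype, copy=False)
print(subarr.dtype)  # Sparse[float64, nan]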
Example #9
    def is_na(self):
        if self.block is None:
            return True

        if not self.block._can_hold_na:
            return False

        # Usually it's enough to check only a small fraction of values to see
        # if a block is NOT null; chunks should help in such cases. The value
        # 1000 was chosen rather arbitrarily.
        values = self.block.values
        if self.block.is_categorical:
            values_flat = values.categories
        elif is_sparse(self.block.values.dtype):
            return False
        elif self.block.is_extension:
            values_flat = values
        else:
            values_flat = values.ravel(order='K')
        total_len = values_flat.shape[0]
        chunk_len = max(total_len // 40, 1000)
        for i in range(0, total_len, chunk_len):
            if not isna(values_flat[i:i + chunk_len]).all():
                return False

        return True
Example #10
File: quantile.py Project: yeung-r/pandas
def quantile_ea_compat(values: ExtensionArray, qs, interpolation: str,
                       axis: int) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for quantile_with_mask.

    We pretend that an ExtensionArray with shape (N,) is actually (1, N)
    for compatibility with non-EA code.

    Parameters
    ----------
    values : ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation: str
    axis : int

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs
    orig = values

    # asarray needed for Sparse, see GH#24600
    mask = np.asarray(values.isna())
    mask = np.atleast_2d(mask)

    # error: Incompatible types in assignment (expression has type "ndarray", variable
    # has type "ExtensionArray")
    values, fill_value = values._values_for_factorize()  # type: ignore[assignment]
    # error: No overload variant of "atleast_2d" matches argument type "ExtensionArray"
    values = np.atleast_2d(values)  # type: ignore[call-overload]

    # error: Argument 1 to "quantile_with_mask" has incompatible type "ExtensionArray";
    # expected "ndarray"
    result = quantile_with_mask(
        values,
        mask,
        fill_value,
        qs,
        interpolation,
        axis  # type: ignore[arg-type]
    )

    if not is_sparse(orig.dtype):
        # shape[0] should be 1 as long as EAs are 1D

        if result.ndim == 1:
            # i.e. qs was originally a scalar
            assert result.shape == (1, ), result.shape
            result = type(orig)._from_factorized(result, orig)

        else:
            assert result.shape == (1, len(qs)), result.shape
            result = type(orig)._from_factorized(result[0], orig)

    # error: Incompatible return value type (got "ndarray", expected "ExtensionArray")
    return result  # type: ignore[return-value]
Example #11
    def __init__(self,
                 values,
                 index,
                 level=-1,
                 value_columns=None,
                 fill_value=None,
                 constructor=None):

        self.is_categorical = None
        self.is_sparse = is_sparse(values)
        if values.ndim == 1:
            if isinstance(values, Categorical):
                self.is_categorical = values
                values = np.array(values)
            elif self.is_sparse:
                # XXX: Makes SparseArray *dense*, but it's supposedly
                # a single column at a time, so it's "doable"
                values = values.values
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns
        self.fill_value = fill_value

        if constructor is None:
            if self.is_sparse:
                self.constructor = SparseDataFrame
            else:
                self.constructor = DataFrame
        else:
            self.constructor = constructor

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index.remove_unused_levels()

        if isinstance(self.index, MultiIndex):
            if index._reference_duplicate_name(level):
                msg = ("Ambiguous reference to {level}. The index "
                       "names are not unique.".format(level=level))
                raise ValueError(msg)

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.labels[self.level] else 0

        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        self._make_sorted_values_labels()
        self._make_selectors()
Example #12
File: concat.py Project: Aathi410/Pro123
def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if is_dtype_equal(arr.dtype, dtype):
        return arr
    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    ):
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            return arr.astype(dtype, copy=False)
        except ValueError:
            return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        arr = cast(SparseArray, arr)
        return arr.to_dense().astype(dtype, copy=False)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in ["m", "M"]
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
        # this can happen when concat_compat is called directly on arrays (when arrays
        # are not coming from Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if isinstance(dtype, ExtensionDtype):
        if isinstance(arr, np.ndarray):
            # numpy's astype cannot handle ExtensionDtypes
            return pd_array(arr, dtype=dtype, copy=False)
        return arr.astype(dtype, copy=False)

    return arr.astype(dtype, copy=False)
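The sparse "problem case" the comment describes is easy to reproduce; a hedged illustration (behavior as of pandas 1.x):

import numpy as np
import pandas as pd

arr = pd.arrays.SparseArray([0, 0, 1])
print(arr.astype(np.float64).dtype)             # Sparse[float64, 0], not float64
print(arr.to_dense().astype(np.float64).dtype)  # float64, as requested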
Example #13
File: concat.py Project: yeung-r/pandas
def _cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
    """
    Helper function for `arr.astype(common_dtype)` but handling all special
    cases.
    """
    if (
        is_categorical_dtype(arr.dtype)
        and isinstance(dtype, np.dtype)
        and np.issubdtype(dtype, np.integer)
    ):
        # problem case: categorical of int -> gives int as result dtype,
        # but categorical can contain NAs -> fall back to object dtype
        try:
            return arr.astype(dtype, copy=False)
        except ValueError:
            return arr.astype(object, copy=False)

    if is_sparse(arr) and not is_sparse(dtype):
        # problem case: SparseArray.astype(dtype) doesn't follow the specified
        # dtype exactly, but converts this to Sparse[dtype] -> first manually
        # convert to dense array
        arr = cast(SparseArray, arr)
        return arr.to_dense().astype(dtype, copy=False)

    if (
        isinstance(arr, np.ndarray)
        and arr.dtype.kind in ["m", "M"]
        and dtype is np.dtype("object")
    ):
        # wrap datetime-likes in EA to ensure astype(object) gives Timestamp/Timedelta
        # this can happen when concat_compat is called directly on arrays (when arrays
        # are not coming from Index/Series._values), eg in BlockManager.quantile
        arr = ensure_wrapped_if_datetimelike(arr)

    if is_extension_array_dtype(dtype) and isinstance(arr, np.ndarray):
        # numpy's astype cannot handle ExtensionDtypes
        return pd_array(arr, dtype=dtype, copy=False)
    # error: Argument 1 to "astype" of "_ArrayOrScalarCommon" has incompatible type
    # "Union[dtype[Any], ExtensionDtype]"; expected "Union[dtype[Any], None, type,
    # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]],
    # List[Any], _DTypeDict, Tuple[Any, Any]]]"
    return arr.astype(dtype, copy=False)  # type: ignore[arg-type]
Example #14
File: concat.py Project: Axik/pandas
def _get_series_result_type(result):
    """
    Return the appropriate class for a Series concat;
    the input is either a dict or array-like.
    """
    if isinstance(result, dict):
        # concat Series with axis 1
        if all(is_sparse(c) for c in compat.itervalues(result)):
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame
        else:
            from pandas.core.frame import DataFrame
            return DataFrame

    elif is_sparse(result):
        # concat Series with axis 1
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    else:
        from pandas.core.series import Series
        return Series
Example #15
File: concat.py Project: zhlijia/pandas
def _get_series_result_type(result):
    """
    Return the appropriate class for a Series concat;
    the input is either a dict or array-like.
    """
    if isinstance(result, dict):
        # concat Series with axis 1
        if all(is_sparse(c) for c in compat.itervalues(result)):
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame
        else:
            from pandas.core.frame import DataFrame
            return DataFrame

    elif is_sparse(result):
        # concat Series with axis 1
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    else:
        from pandas.core.series import Series
        return Series
Example #16
File: reshape.py Project: chris-b1/pandas
    def __init__(self, values, index, level=-1, value_columns=None,
                 fill_value=None, constructor=None):

        self.is_categorical = None
        self.is_sparse = is_sparse(values)
        if values.ndim == 1:
            if isinstance(values, Categorical):
                self.is_categorical = values
                values = np.array(values)
            elif self.is_sparse:
                # XXX: Makes SparseArray *dense*, but it's supposedly
                # a single column at a time, so it's "doable"
                values = values.values
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns
        self.fill_value = fill_value

        if constructor is None:
            if self.is_sparse:
                self.constructor = SparseDataFrame
            else:
                self.constructor = DataFrame
        else:
            self.constructor = constructor

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index.remove_unused_levels()

        if isinstance(self.index, MultiIndex):
            if index._reference_duplicate_name(level):
                msg = ("Ambiguous reference to {level}. The index "
                       "names are not unique.".format(level=level))
                raise ValueError(msg)

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.labels[self.level] else 0

        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        self._make_sorted_values_labels()
        self._make_selectors()
Example #17
File: concat.py Project: christlc/pandas
def _get_frame_result_type(result, objs):
    """
    Return the appropriate class for a DataFrame-like concat:
    if all blocks are sparse, return SparseDataFrame;
    otherwise, return the first non-sparse obj.
    """

    if (result.blocks and (
            all(is_sparse(b) for b in result.blocks) or
            all(isinstance(obj, ABCSparseDataFrame) for obj in objs))):
        from pandas.core.sparse.api import SparseDataFrame
        return SparseDataFrame
    else:
        return next(obj for obj in objs if not isinstance(obj,
                                                          ABCSparseDataFrame))
Example #18
def _get_frame_result_type(result, objs):
    """
    Return the appropriate class for a DataFrame-like concat:
    if all blocks are sparse, return SparseDataFrame;
    otherwise, return the first non-sparse obj.
    """

    if (result.blocks
            and (all(is_sparse(b) for b in result.blocks)
                 or all(isinstance(obj, ABCSparseDataFrame) for obj in objs))):
        from pandas.core.sparse.api import SparseDataFrame
        return SparseDataFrame
    else:
        return next(obj for obj in objs
                    if not isinstance(obj, ABCSparseDataFrame))
Example #19
File: quantile.py Project: zacqed/pandas
def quantile_ea_compat(values: ExtensionArray, qs, interpolation: str,
                       axis: int) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for quantile_with_mask.

    We pretend that an ExtensionArray with shape (N,) is actually (1, N)
    for compatibility with non-EA code.

    Parameters
    ----------
    values : ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation: str
    axis : int

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs
    orig = values

    # asarray needed for Sparse, see GH#24600
    mask = np.asarray(values.isna())
    mask = np.atleast_2d(mask)

    values, fill_value = values._values_for_factorize()
    values = np.atleast_2d(values)

    result = quantile_with_mask(values, mask, fill_value, qs, interpolation,
                                axis)

    if not is_sparse(orig.dtype):
        # shape[0] should be 1 as long as EAs are 1D

        if result.ndim == 1:
            # i.e. qs was originally a scalar
            assert result.shape == (1, ), result.shape
            result = type(orig)._from_factorized(result, orig)

        else:
            assert result.shape == (1, len(qs)), result.shape
            result = type(orig)._from_factorized(result[0], orig)

    return result
Example #20
def _quantile_ea_compat(
    values: ExtensionArray, qs: np.ndarray, interpolation: str
) -> ExtensionArray:
    """
    ExtensionArray compatibility layer for _quantile_with_mask.

    We pretend that an ExtensionArray with shape (N,) is actually (1, N)
    for compatibility with non-EA code.

    Parameters
    ----------
    values : ExtensionArray
    qs : np.ndarray[float64]
    interpolation: str

    Returns
    -------
    ExtensionArray
    """
    # TODO(EA2D): make-believe not needed with 2D EAs
    orig = values

    # asarray needed for Sparse, see GH#24600
    mask = np.asarray(values.isna())
    mask = np.atleast_2d(mask)

    arr, fill_value = values._values_for_factorize()
    arr = np.atleast_2d(arr)

    result = _quantile_with_mask(arr, mask, fill_value, qs, interpolation)

    if not is_sparse(orig.dtype):
        # shape[0] should be 1 as long as EAs are 1D

        if orig.ndim == 2:
            # i.e. DatetimeArray
            result = type(orig)._from_factorized(result, orig)

        else:
            assert result.shape == (1, len(qs)), result.shape
            result = type(orig)._from_factorized(result[0], orig)

    # error: Incompatible return value type (got "ndarray", expected "ExtensionArray")
    return result  # type: ignore[return-value]
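Both versions lean on the factorize round-trip from the ExtensionArray author interface; a minimal sketch (assumes pandas >= 1.0):

import pandas as pd

ea = pd.array([1, 2, None], dtype="Int64")
arr, fill_value = ea._values_for_factorize()   # ndarray plus an NA sentinel
restored = type(ea)._from_factorized(arr, ea)  # rebuild an EA of the same type
print(restored.dtype)  # Int64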
Example #21
File: ops.py Project: selasley/pandas
    def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
        """
        Check if we can do this operation with our cython functions.

        Raises
        ------
        NotImplementedError
            This is either not a valid function for this dtype, or
            valid but not implemented in cython.
        """
        how = self.how

        if is_numeric:
            # never an invalid op for those dtypes, so return early as fastpath
            return

        if isinstance(dtype, CategoricalDtype):
            # NotImplementedError for methods that can fall back to a
            #  non-cython implementation.
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise TypeError(
                    f"{dtype} type does not support {how} operations")
            elif how not in ["rank"]:
                # only "rank" is implemented in cython
                raise NotImplementedError(f"{dtype} dtype not supported")
            elif not dtype.ordered:
                # TODO: TypeError?
                raise NotImplementedError(f"{dtype} dtype not supported")

        elif is_sparse(dtype):
            # categoricals are only 1d, so we
            #  are not set up for dim transforming
            raise NotImplementedError(f"{dtype} dtype not supported")
        elif is_datetime64_any_dtype(dtype):
            # TODO: same for period_dtype?  no for these methods with Period
            # we raise if this is an invalid operation
            #  entirely, e.g. adding datetimes
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise TypeError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(dtype):
            if how in ["prod", "cumprod"]:
                raise TypeError(
                    f"timedelta64 type does not support {how} operations")
Example #22
File: concat.py Project: Moujunpeng/pandas
def _get_sliced_frame_result_type(data, obj):
    """
    Return the appropriate Series class. When data is sparse
    it will return SparseSeries; otherwise it will return the
    frame's default Series constructor.

    Parameters
    ----------
    data : array-like
    obj : DataFrame

    Returns
    -------
    Series or SparseSeries
    """
    if is_sparse(data):
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    return obj._constructor_sliced
Example #23
File: concat.py Project: ziggi0703/pandas
def _get_sliced_frame_result_type(data, obj):
    """
    Return the appropriate Series class. When data is sparse
    it will return SparseSeries; otherwise it will return the
    frame's default Series constructor.

    Parameters
    ----------
    data : array-like
    obj : DataFrame

    Returns
    -------
    Series or SparseSeries
    """
    if is_sparse(data):
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    return obj._constructor_sliced
Example #24
    def is_na(self) -> bool:
        if self.block is None:
            return True

        if not self.block._can_hold_na:
            return False

        # Usually it's enough to check only a small fraction of values to see
        # if a block is NOT null; chunks should help in such cases. The value
        # 1000 was chosen rather arbitrarily.
        values = self.block.values
        if is_sparse(self.block.values.dtype):
            return False
        elif self.block.is_extension:
            # TODO(EA2D): no need for special case with 2D EAs
            values_flat = values
        else:
            values_flat = values.ravel(order="K")

        return isna_all(values_flat)
Example #25
File: concat.py Project: christlc/pandas
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            typ = 'category'
        elif is_sparse(arr):
            typ = 'sparse'
        elif isinstance(arr, ABCRangeIndex):
            typ = 'range'
        elif is_datetimetz(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            typ = str(arr.dtype)
        elif is_datetime64_dtype(dtype):
            typ = 'datetime'
        elif is_timedelta64_dtype(dtype):
            typ = 'timedelta'
        elif is_object_dtype(dtype):
            typ = 'object'
        elif is_bool_dtype(dtype):
            typ = 'bool'
        elif is_period_dtype(dtype):
            typ = str(arr.dtype)
        elif is_interval_dtype(dtype):
            typ = str(arr.dtype)
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
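A simplified stand-in showing the kind of set this helper builds (dtype_kinds is a hypothetical trimmed version, keeping only the branches relevant here):

import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype, is_sparse

def dtype_kinds(arrays):
    # Map each array to a coarse kind label, as the helper above does.
    typs = set()
    for arr in arrays:
        if is_categorical_dtype(arr.dtype):
            typs.add("category")
        elif is_sparse(arr):
            typs.add("sparse")
        else:
            typs.add(arr.dtype.kind)
    return typs

print(dtype_kinds([np.array([1, 2]), np.array([1.5]), pd.arrays.SparseArray([1])]))
# {'i', 'f', 'sparse'}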
Example #26
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            typ = 'category'
        elif is_sparse(arr):
            typ = 'sparse'
        elif isinstance(arr, ABCRangeIndex):
            typ = 'range'
        elif is_datetimetz(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            typ = str(arr.dtype)
        elif is_datetime64_dtype(dtype):
            typ = 'datetime'
        elif is_timedelta64_dtype(dtype):
            typ = 'timedelta'
        elif is_object_dtype(dtype):
            typ = 'object'
        elif is_bool_dtype(dtype):
            typ = 'bool'
        elif is_period_dtype(dtype):
            typ = str(arr.dtype)
        elif is_interval_dtype(dtype):
            typ = str(arr.dtype)
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
Example #27
def _get_series_result_type(result, objs=None):
    """
    Return the appropriate class for a Series concat;
    the input is either a dict or array-like.
    """
    if isinstance(result, dict):
        # concat Series with axis 1
        if all(is_sparse(c) for c in compat.itervalues(result)):
            from pandas.core.sparse.api import SparseDataFrame
            return SparseDataFrame
        else:
            from pandas.core.frame import DataFrame
            return DataFrame

    # otherwise it is a SingleBlockManager (axis = 0)
    if result._block.is_sparse:
        from pandas.core.sparse.api import SparseSeries
        return SparseSeries
    else:
        return objs[0]._constructor
Example #28
File: reshape.py Project: zhabzhang/pandas
    def __init__(self,
                 values,
                 index,
                 level=-1,
                 value_columns=None,
                 fill_value=None):

        self.is_categorical = None
        self.is_sparse = is_sparse(values)
        if values.ndim == 1:
            if isinstance(values, Categorical):
                self.is_categorical = values
                values = np.array(values)
            elif self.is_sparse:
                # XXX: Makes SparseArray *dense*, but it's supposedly
                # a single column at a time, so it's "doable"
                values = values.values
            values = values[:, np.newaxis]
        self.values = values
        self.value_columns = value_columns
        self.fill_value = fill_value

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError('must pass column labels for multi-column data')

        self.index = index

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.labels[self.level] else 0

        self.new_index_levels = list(index.levels)
        self.new_index_names = list(index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)

        self._make_sorted_values_labels()
        self._make_selectors()
Example #29
File: concat.py Project: zeyu-gong/pandas
def get_dtype_kinds(l):
    """
    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    typs = set()
    for arr in l:

        dtype = arr.dtype
        if is_categorical_dtype(dtype):
            typ = "category"
        elif is_sparse(arr):
            typ = "sparse"
        elif isinstance(arr, ABCRangeIndex):
            typ = "range"
        elif is_datetime64tz_dtype(arr):
            # if to_concat contains different tz,
            # the result must be object dtype
            typ = str(arr.dtype)
        elif is_datetime64_dtype(dtype):
            typ = "datetime"
        elif is_timedelta64_dtype(dtype):
            typ = "timedelta"
        elif is_object_dtype(dtype):
            typ = "object"
        elif is_bool_dtype(dtype):
            typ = "bool"
        elif is_extension_array_dtype(dtype):
            typ = str(arr.dtype)
        else:
            typ = dtype.kind
        typs.add(typ)
    return typs
Example #30
def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str:
    """Select upcast class name based on dtype."""
    if is_categorical_dtype(dtype):
        return "category"
    elif is_datetime64tz_dtype(dtype):
        return "datetimetz"
    elif is_extension_array_dtype(dtype):
        return "extension"
    elif issubclass(dtype.type, np.bool_):
        return "bool"
    elif issubclass(dtype.type, np.object_):
        return "object"
    elif is_datetime64_dtype(dtype):
        return "datetime"
    elif is_timedelta64_dtype(dtype):
        return "timedelta"
    elif is_sparse(dtype):
        dtype = cast("SparseDtype", dtype)
        return dtype.subtype.name
    elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
        return dtype.name
    else:
        return "float"
Example #31
File: ops.py Project: 09acp/Dash-Examples
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ["transform", "aggregate"]

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not set up for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{} is not supported in cython ops".format(values.dtype))
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = "object"

        labels, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result,
                counts,
                values,
                labels,
                func,
                is_numeric,
                is_datetimelike,
                min_count,
            )
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(result,
                                                 (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        ensure_object(result), (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
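The integer pre-check guards against iNaT, the int64 sentinel pandas uses for missing datetimes; a plain-numpy sketch (iNaT equals np.iinfo(np.int64).min):

import numpy as np

iNaT = np.iinfo(np.int64).min  # pandas' int64 missing-value sentinel

values = np.array([1, 2, iNaT], dtype="int64")
if (values == iNaT).any():
    # pre-convert so the sentinel can later be represented as NaN
    values = values.astype("float64")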
Example #32
File: concat.py Project: yppdgr/pandas
def _get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if _is_uniform_reindex(join_units):
        # FIXME: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = "category"
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = "datetimetz"
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = "bool"
        elif issubclass(dtype.type, np.object_):
            upcast_cls = "object"
        elif is_datetime64_dtype(dtype):
            upcast_cls = "datetime"
        elif is_timedelta64_dtype(dtype):
            upcast_cls = "timedelta"
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_extension_array_dtype(dtype):
            upcast_cls = "object"
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = "float"

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # TODO: de-duplicate with maybe_promote?
    # create the result
    if "object" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "bool" in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif "category" in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif "datetimetz" in upcast_classes:
        # GH-25014. We use NaT instead of iNaT, since this eventually
        # ends up in DatetimeArray.take, which does not allow iNaT.
        dtype = upcast_classes["datetimetz"]
        return dtype[0], tslibs.NaT
    elif "datetime" in upcast_classes:
        return np.dtype("M8[ns]"), np.datetime64("NaT", "ns")
    elif "timedelta" in upcast_classes:
        return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns")
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
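The numeric fallback relies on numpy's common-type promotion; a quick sketch (np.find_common_type was later deprecated in numpy in favor of np.result_type):

import numpy as np

print(np.find_common_type([np.dtype("int64"), np.dtype("float32")], []))  # float64
print(np.find_common_type([np.dtype("int32"), np.dtype("int64")], []))    # int64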
Example #33
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetimetz(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Example #34
    def _cython_operation(
        self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs
    ) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """
        assert kind in ["transform", "aggregate"]
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError("number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not set up for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations"
                )
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations"
                )

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups,) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(
                np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
            )
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func, min_count)
        elif kind == "transform":
            result = _maybe_fill(
                np.empty_like(values, dtype=out_dtype), fill_value=np.nan
            )

            # TODO: min_count
            result = self._transform(
                result, values, codes, func, is_datetimelike, **kwargs
            )

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan
        elif (
            how == "add"
            and is_integer_dtype(orig_values.dtype)
            and is_extension_array_dtype(orig_values.dtype)
        ):
            # We need this to ensure that Series[Int64Dtype].resample().sum()
            # remains int64 dtype.
            # Two options for avoiding this special case
            # 1. mask-aware ops and avoid casting to float with NaN above
            # 2. specify the result dtype when calling this method
            result = result.astype("int64")

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
            orig_values.dtype
        ):
            # We need to use the constructors directly for these dtypes
            # since numpy won't recognize them
            # https://github.com/pandas-dev/pandas/issues/31471
            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
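The datetimelike handling uses an int64 view round-trip, reinterpreting rather than converting the values; a plain-numpy sketch:

import numpy as np

stamps = np.array(["2021-01-01", "2021-01-02"], dtype="M8[ns]")
as_i8 = stamps.view("int64")  # nanoseconds since the epoch
back = as_i8.view("M8[ns]")   # reinterpret back, no copy
assert (back == stamps).all()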
Example #35
def get_empty_dtype_and_na(join_units):
    """
    Return dtype and N/A values to use when concatenating specified units.

    Returned N/A value may be None which means there was no casting involved.

    Returns
    -------
    dtype
    na
    """
    if len(join_units) == 1:
        blk = join_units[0].block
        if blk is None:
            return np.float64, np.nan

    if is_uniform_reindex(join_units):
        # XXX: integrate property
        empty_dtype = join_units[0].block.dtype
        upcasted_na = join_units[0].block.fill_value
        return empty_dtype, upcasted_na

    has_none_blocks = False
    dtypes = [None] * len(join_units)
    for i, unit in enumerate(join_units):
        if unit.block is None:
            has_none_blocks = True
        else:
            dtypes[i] = unit.dtype

    upcast_classes = defaultdict(list)
    null_upcast_classes = defaultdict(list)
    for dtype, unit in zip(dtypes, join_units):
        if dtype is None:
            continue

        if is_categorical_dtype(dtype):
            upcast_cls = 'category'
        elif is_datetime64tz_dtype(dtype):
            upcast_cls = 'datetimetz'
        elif issubclass(dtype.type, np.bool_):
            upcast_cls = 'bool'
        elif issubclass(dtype.type, np.object_):
            upcast_cls = 'object'
        elif is_datetime64_dtype(dtype):
            upcast_cls = 'datetime'
        elif is_timedelta64_dtype(dtype):
            upcast_cls = 'timedelta'
        elif is_sparse(dtype):
            upcast_cls = dtype.subtype.name
        elif is_extension_array_dtype(dtype):
            upcast_cls = 'object'
        elif is_float_dtype(dtype) or is_numeric_dtype(dtype):
            upcast_cls = dtype.name
        else:
            upcast_cls = 'float'

        # Null blocks should not influence upcast class selection, unless there
        # are only null blocks, when same upcasting rules must be applied to
        # null upcast classes.
        if unit.is_na:
            null_upcast_classes[upcast_cls].append(dtype)
        else:
            upcast_classes[upcast_cls].append(dtype)

    if not upcast_classes:
        upcast_classes = null_upcast_classes

    # create the result
    if 'object' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'bool' in upcast_classes:
        if has_none_blocks:
            return np.dtype(np.object_), np.nan
        else:
            return np.dtype(np.bool_), None
    elif 'category' in upcast_classes:
        return np.dtype(np.object_), np.nan
    elif 'datetimetz' in upcast_classes:
        dtype = upcast_classes['datetimetz']
        return dtype[0], tslibs.iNaT
    elif 'datetime' in upcast_classes:
        return np.dtype('M8[ns]'), tslibs.iNaT
    elif 'timedelta' in upcast_classes:
        return np.dtype('m8[ns]'), tslibs.iNaT
    else:  # pragma
        try:
            g = np.find_common_type(upcast_classes, [])
        except TypeError:
            # At least one is an ExtensionArray
            return np.dtype(np.object_), np.nan
        else:
            if is_float_dtype(g):
                return g, g.type(np.nan)
            elif is_numeric_dtype(g):
                if has_none_blocks:
                    return np.float64, np.nan
                else:
                    return g, None

    msg = "invalid dtype determination in get_concat_dtype"
    raise AssertionError(msg)
Example #36
def _concat_sparse(to_concat, axis=0, typs=None):
    """
    provide concatenation of a sparse/dense array of arrays, each of which is
    a single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation
    typs : set of to_concat dtypes

    Returns
    -------
    a single array, preserving the combined dtypes
    """

    from pandas.core.sparse.array import SparseArray, _make_index

    def convert_sparse(x, axis):
        # coerce to native type
        if isinstance(x, SparseArray):
            x = x.get_values()
        x = x.ravel()
        if axis > 0:
            x = np.atleast_2d(x)
        return x

    if typs is None:
        typs = get_dtype_kinds(to_concat)

    if len(typs) == 1:
        # concatenate the inputs as-is if all are sparse
        # and share the same fill_value
        fill_values = set(c.fill_value for c in to_concat)
        if len(fill_values) == 1:
            sp_values = [c.sp_values for c in to_concat]
            indexes = [c.sp_index.to_int_index() for c in to_concat]

            indices = []
            loc = 0
            for idx in indexes:
                indices.append(idx.indices + loc)
                loc += idx.length
            sp_values = np.concatenate(sp_values)
            indices = np.concatenate(indices)
            sp_index = _make_index(loc, indices, kind=to_concat[0].sp_index)

            return SparseArray(sp_values, sparse_index=sp_index,
                               fill_value=to_concat[0].fill_value)

    # inputs may be a mix of sparse and dense and may have different
    # fill_values; they must contain at least one sparse array
    sparses = [c for c in to_concat if is_sparse(c)]
    fill_values = [c.fill_value for c in sparses]
    sp_indexes = [c.sp_index for c in sparses]

    # densify and regular concat
    to_concat = [convert_sparse(x, axis) for x in to_concat]
    result = np.concatenate(to_concat, axis=axis)

    if not len(typs - set(['sparse', 'f', 'i'])):
        # sparsify if the inputs are sparse and dense numerics;
        # the first sparse input's fill_value and SparseIndex are used
        result = SparseArray(result.ravel(), fill_value=fill_values[0],
                             kind=sp_indexes[0])
    else:
        # coerce to object if needed
        result = result.astype('object')
    return result
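A minimal sketch of the single-dtype fast path above, assuming the 0.2x-era
pd.SparseArray API this snippet targets: when every input is sparse and shares
one fill_value, only sp_values and the shifted integer indices need
concatenating, so the fill positions are never materialized.

import numpy as np
import pandas as pd

a = pd.SparseArray([0, 1, 0, 2], fill_value=0)
b = pd.SparseArray([3, 0, 0], fill_value=0)
assert len({a.fill_value, b.fill_value}) == 1  # one fill_value -> fast path
# only the stored (non-fill) values are touched, mirroring the code above
print(np.concatenate([a.sp_values, b.sp_values]))  # [1 2 3]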
Example #37
def _try_cast(
    arr: Union[list, np.ndarray],
    dtype: Optional[DtypeObj],
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # perf shortcut as this is the most common case
    if (isinstance(arr, np.ndarray) and maybe_castable(arr.dtype) and not copy
            and dtype is None):
        # error: Incompatible return value type (got "ndarray", expected
        # "ExtensionArray")
        return arr  # type: ignore[return-value]

    if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M"
                                              or is_sparse(dtype)):
        # create an extension array from its dtype
        # DatetimeTZ case needs to go through maybe_cast_to_datetime but
        # SparseDtype does not
        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    if is_object_dtype(dtype) and not isinstance(arr, np.ndarray):
        subarr = construct_1d_object_array_from_listlike(arr)
        return subarr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            # error: Argument 2 to "maybe_cast_to_integer_array" has incompatible type
            # "Union[dtype, ExtensionDtype, None]"; expected "Union[ExtensionDtype, str,
            # dtype, Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]"
            maybe_cast_to_integer_array(arr, dtype)  # type: ignore[arg-type]
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)
            if dtype is not None and dtype.kind == "M":
                return subarr

        if not isinstance(subarr, ABCExtensionArray):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError) as err:
        if dtype is not None and raise_cast_failure:
            raise
        elif "Cannot cast" in str(err):
            # via _disallow_mismatched_datetimelike
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
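A hedged sketch of how these branches surface through the Series constructor,
which in the pandas version this snippet is from reaches the helper (via
sanitize_array) with raise_cast_failure=True: a lossy float-to-int request is
refused by the maybe_cast_to_integer_array branch, while mixed input with no
requested dtype comes out as an object ndarray.

import pandas as pd

print(pd.Series(["a", 1, 2.5]).dtype)  # object

try:
    pd.Series([1.5, 2.5], dtype="int64")  # floats cannot be cast losslessly
except (ValueError, TypeError) as err:
    print("cast refused:", err)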
Example #38
File: ops.py Project: zpion-id/pandas
    def _cython_operation(self,
                          kind: str,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ["transform", "aggregate"]
        orig_values = values

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplemented if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{dtype} dtype not supported".format(dtype=values.dtype))
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {how} operations".format(
                        how=how))
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {how} operations".
                    format(how=how))

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                try:
                    values = ensure_float64(values)
                except TypeError:
                    if lib.infer_dtype(values, skipna=False) == "complex":
                        values = values.astype(complex)
                    else:
                        raise
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = "object"

        labels, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, labels, func,
                                     is_datetimelike, min_count)
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            names = self._name_functions[how]()  # type: Optional[List[str]]
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
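A hedged sketch of the guards above as seen from the public API: cumsum is one
of the operations rejected for datetime64 values. Depending on the pandas
version, the error may surface as NotImplementedError or be re-raised as
TypeError by the fallback path.

import pandas as pd

df = pd.DataFrame({
    "key": ["a", "a", "b"],
    "when": pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-03"]),
})
try:
    df.groupby("key")["when"].cumsum()
except (NotImplementedError, TypeError) as err:
    print("rejected by _cython_operation:", err)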