Example #1
def test_is_string_dtype():
    assert not com.is_string_dtype(int)
    assert not com.is_string_dtype(pd.Series([1, 2]))

    assert com.is_string_dtype(str)
    assert com.is_string_dtype(object)
    assert com.is_string_dtype(np.array(['a', 'b']))
Example #2
File: all.py Project: caskeep/interpret
def _assign_feature_type(feature_type, unique_count=0):
    if is_string_dtype(feature_type) or (
        is_numeric_dtype(feature_type) and unique_count <= 2
    ):
        return "categorical"
    elif is_numeric_dtype(feature_type):
        return "continuous"
    else:
        return "unknown"
Example #3
File: all.py Project: caskeep/interpret
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None):
    """ Generates data schema for a given dataset as JSON representable.

    Args:
        X: Dataframe/ndarray to build schema from.
        ordinal_max_items: If a numeric column's cardinality
            is at most this integer,
            consider it as ordinal instead of continuous.
        feature_names: Feature names
        feature_types: Feature types

    Returns:
        A dictionary - schema that encapsulates column information,
        such as type and domain.
    """
    schema = OrderedDict()
    col_number = 0
    if isinstance(X, np.ndarray):
        log.warning(
            "Passing a numpy array to schema autogen when it should be dataframe."
        )
        if feature_names is None:
            X = pd.DataFrame(
                X, columns=["col_" + str(i) for i in range(X.shape[1])]
            ).infer_objects()
        else:
            X = pd.DataFrame(X, columns=feature_names).infer_objects()
    if isinstance(X, NDFrame):
        for name, col_dtype in zip(X.dtypes.index, X.dtypes):
            schema[name] = {}
            if is_numeric_dtype(col_dtype):
                # schema[name]['type'] = 'continuous'
                # TODO: Fix this once we know it works.
                if len(set(X[name])) > ordinal_max_items:
                    schema[name]["type"] = "continuous"
                else:
                    # TODO: Work with ordinal later.
                    schema[name]["type"] = "categorical"
                    # schema[name]['type'] = 'ordinal'
                    # schema[name]['order'] = list(set(X[name]))
            elif is_string_dtype(col_dtype):
                schema[name]["type"] = "categorical"
            else:
                warnings.warn("Unknown column: " + name, RuntimeWarning)
                schema[name]["type"] = "unknown"
            schema[name]["column_number"] = col_number
            col_number += 1

        # Override if feature_types is passed as arg.
        if feature_types is not None:
            for idx, name in enumerate(X.dtypes.index):
                schema[name]["type"] = feature_types[idx]
    else:
        raise TypeError("GA2M only supports numpy arrays or pandas dataframes.")

    return schema
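
A hedged usage sketch for autogen_schema, assuming its imports (pandas, numpy, OrderedDict, NDFrame, the module logger) are in place; the columns here are invented for illustration:

import pandas as pd

df = pd.DataFrame({
    "age": [23, 45, 31, 23],           # cardinality > 2  -> continuous
    "flag": [0, 1, 0, 1],              # cardinality <= 2 -> categorical
    "city": ["NY", "SF", "NY", "LA"],  # string dtype     -> categorical
})
schema = autogen_schema(df, ordinal_max_items=2)
# schema["age"]  == {"type": "continuous",  "column_number": 0}
# schema["flag"] == {"type": "categorical", "column_number": 1}
# schema["city"] == {"type": "categorical", "column_number": 2}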
Example #4
File: array.py Project: sechilds/pandas
def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # NumPy's element-wise equality check does not take each
            # element's type into account, e.g. 0, 0.0, and False are
            # treated as the same. So check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
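
The object cast guarded by is_string_dtype above can be illustrated with plain NumPy (a standalone sketch, not pandas code): comparing a str-dtype array against a non-str fill value is not reliably element-wise on old NumPy, whereas an object-dtype comparison always is:

import numpy as np

arr = np.array(["a", "b", "c"])
# cast to object first, then compare: guaranteed element-wise
mask = arr.astype(object) != 0
print(mask)  # [True True True]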
Example #5
    def _simple_new(cls, left, right, closed=None,
                    copy=False, dtype=None, verify_integrity=True):
        result = IntervalMixin.__new__(cls)

        closed = closed or 'right'
        left = ensure_index(left, copy=copy)
        right = ensure_index(right, copy=copy)

        if dtype is not None:
            # GH 19262: dtype must be an IntervalDtype to override inferred
            dtype = pandas_dtype(dtype)
            if not is_interval_dtype(dtype):
                msg = 'dtype must be an IntervalDtype, got {dtype}'
                raise TypeError(msg.format(dtype=dtype))
            elif dtype.subtype is not None:
                left = left.astype(dtype.subtype)
                right = right.astype(dtype.subtype)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        elif is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        if type(left) != type(right):
            msg = ('must not have differing left [{ltype}] and right '
                   '[{rtype}] types')
            raise ValueError(msg.format(ltype=type(left).__name__,
                                        rtype=type(right).__name__))
        elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
            # GH 19016
            msg = ('category, object, and string subtypes are not supported '
                   'for IntervalArray')
            raise TypeError(msg)
        elif isinstance(left, ABCPeriodIndex):
            msg = 'Period dtypes are not supported, use a PeriodIndex instead'
            raise ValueError(msg)
        elif (isinstance(left, ABCDatetimeIndex) and
                str(left.tz) != str(right.tz)):
            msg = ("left and right must have the same time zone, got "
                   "'{left_tz}' and '{right_tz}'")
            raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz))

        result._left = left
        result._right = right
        result._closed = closed
        if verify_integrity:
            result._validate()
        return result
Example #6
 def astype(self, dtype, copy=True):
     if is_object_dtype(dtype):
         return self._box_values_as_index()
     elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
         return Index(self.format(), name=self.name, dtype=object)
     elif is_integer_dtype(dtype):
         return Index(self.values.astype('i8', copy=copy), name=self.name,
                      dtype='i8')
     elif (is_datetime_or_timedelta_dtype(dtype) and
           not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
         # disallow conversion between datetime/timedelta,
         # and conversions for any datetimelike to float
         msg = 'Cannot cast {name} to dtype {dtype}'
         raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
     return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
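
Seen through the public API, the is_string_dtype branch above is what makes .astype(str) on a datetime-like index return formatted strings (output as of the pandas versions these snippets come from):

import pandas as pd

idx = pd.date_range("2020-01-01", periods=2)
print(idx.astype(str))
# Index(['2020-01-01', '2020-01-02'], dtype='object')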
Example #7
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data type

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
          is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
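
A short sketch of the mapping, assuming the function above and the dtype checkers it calls are in scope; each expected value follows one branch:

import numpy as np
import pandas as pd

print(as_json_table_type(np.dtype("int64")))           # 'integer'
print(as_json_table_type(np.dtype("bool")))            # 'boolean'
print(as_json_table_type(np.dtype("float64")))         # 'number'
print(as_json_table_type(np.dtype("datetime64[ns]")))  # 'datetime'
print(as_json_table_type(np.dtype("object")))          # 'string'
print(as_json_table_type(pd.CategoricalDtype()))       # 'any'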
Example #8
 def astype(self, dtype, copy=True):
     if is_object_dtype(dtype):
         return self._box_values_as_index()
     elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
         return Index(self.format(), name=self.name, dtype=object)
     elif is_integer_dtype(dtype):
         # TODO(DatetimeArray): use self._values here.
         # Can't use ._values currently, because that returns a
         # DatetimeIndex, which throws us in an infinite loop.
         return Index(self.values.astype('i8', copy=copy), name=self.name,
                      dtype='i8')
     elif (is_datetime_or_timedelta_dtype(dtype) and
           not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
         # disallow conversion between datetime/timedelta,
         # and conversions for any datetimelike to float
         msg = 'Cannot cast {name} to dtype {dtype}'
         raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
     return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
Example #9
    def __new__(cls, subtype=None):
        """
        Parameters
        ----------
        subtype : the dtype of the Interval
        """
        from pandas.core.dtypes.common import (
            is_categorical_dtype, is_string_dtype, pandas_dtype)

        if isinstance(subtype, IntervalDtype):
            return subtype
        elif subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            u = object.__new__(cls)
            u.subtype = None
            return u
        elif (isinstance(subtype, compat.string_types) and
              subtype.lower() == 'interval'):
            subtype = None
        else:
            if isinstance(subtype, compat.string_types):
                m = cls._match.search(subtype)
                if m is not None:
                    subtype = m.group('subtype')

            try:
                subtype = pandas_dtype(subtype)
            except TypeError:
                raise TypeError("could not construct IntervalDtype")

        if is_categorical_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            msg = ('category, object, and string subtypes are not supported '
                   'for IntervalDtype')
            raise TypeError(msg)

        try:
            return cls._cache[str(subtype)]
        except KeyError:
            u = object.__new__(cls)
            u.subtype = subtype
            cls._cache[str(subtype)] = u
            return u
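
The constructor can be exercised through the public IntervalDtype; the TypeError below is the GH 19016 check shown above:

import numpy as np
from pandas import IntervalDtype

print(IntervalDtype("interval[int64]").subtype)  # int64
print(IntervalDtype().subtype)                   # None (empty/pickle path)
try:
    IntervalDtype(np.dtype(object))
except TypeError as err:
    print(err)  # category, object, and string subtypes are not supported ...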
Example #10
def make_sparse(arr, kind='block', fill_value=None):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, operator!= below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object comparison to be safe
        if is_string_dtype(arr):
            arr = arr.astype(object)

        mask = arr != fill_value

    length = len(arr)
    if length != mask.size:
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    return sparsified_values, index, fill_value
Example #11
File: period.py Project: dsm054/pandas
    def astype(self, dtype, copy=True):
        # TODO: Figure out something better here...
        # We have DatetimeLikeArrayMixin ->
        #     super(...), which ends up being... DatetimeIndexOpsMixin?
        # this is complicated.
        # need a pandas_astype(arr, dtype).
        from pandas import Categorical

        dtype = pandas_dtype(dtype)

        if is_object_dtype(dtype):
            return np.asarray(self, dtype=object)
        elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
            return self._format_native_types()
        elif is_integer_dtype(dtype):
            values = self._data

            if values.dtype != dtype:
                # int32 vs. int64
                values = values.astype(dtype)

            elif copy:
                values = values.copy()

            return values
        elif (is_datetime_or_timedelta_dtype(dtype) and
              not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
            # disallow conversion between datetime/timedelta,
            # and conversions for any datetimelike to float
            msg = 'Cannot cast {name} to dtype {dtype}'
            raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
        elif is_categorical_dtype(dtype):
            return Categorical(self, dtype=dtype)
        elif is_period_dtype(dtype):
            return self.asfreq(dtype.freq)
        else:
            return np.asarray(self, dtype=dtype)
Example #12
File: dtype.py Project: sechilds/pandas
    def __init__(self, dtype=np.float64, fill_value=None):
        # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
        from pandas.core.dtypes.missing import na_value_for_dtype
        from pandas.core.dtypes.common import (
            pandas_dtype, is_string_dtype, is_scalar
        )

        if isinstance(dtype, type(self)):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        dtype = pandas_dtype(dtype)
        if is_string_dtype(dtype):
            dtype = np.dtype('object')

        if fill_value is None:
            fill_value = na_value_for_dtype(dtype)

        if not is_scalar(fill_value):
            raise ValueError("fill_value must be a scalar. Got {} "
                             "instead".format(fill_value))
        self._dtype = dtype
        self._fill_value = fill_value
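
A consequence of the is_string_dtype branch above, shown via the public SparseDtype: string-like subtypes are stored as object, and fill_value defaults to the dtype's NA value:

import numpy as np
import pandas as pd

print(pd.SparseDtype(str).subtype)          # object
print(pd.SparseDtype(str).fill_value)       # nan
print(pd.SparseDtype(np.int64).fill_value)  # 0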
Example #13
 def test_not_string(self):
     # though CategoricalDtype has object kind, it cannot be string
     assert not is_string_dtype(CategoricalDtype())
Example #14
    def test_apply_custom_column_types(self):
        data = {
            'to_numerical': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10,
            'to_text': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10,
            'to_categorical':
            [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1] * 10,
            'stay_categorical': [
                '150', '200', '50', '10', '5', '150', '200', '50', '10', '5',
                '1'
            ] * 10
        }
        df1 = df2 = pd.DataFrame.from_dict(data)

        custom_column_types = {
            'to_numerical': ColumnType.numerical,
            'to_text': ColumnType.text,
            'to_categorical': ColumnType.categorical
        }

        store = Store(df1, df2, custom_column_types=custom_column_types)

        with self.subTest("Apply custom_column_types"):
            self.assertEqual(['to_categorical', 'stay_categorical'],
                             store.type_to_columns[ColumnType.categorical])
            self.assertEqual(['to_text'],
                             store.type_to_columns[ColumnType.text])
            self.assertEqual(['to_numerical'],
                             store.type_to_columns[ColumnType.numerical])

        with self.subTest(
                "Apply numerical conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_numeric_dtype(store.df1['to_numerical']))
            self.assertTrue(store.df1['to_numerical'].equals(
                pd.Series([
                    150.0, 200.0, 50.0, 10.0, 5.0, 150.0, 200.0, 50.0, 10.0,
                    5.0, 1.0
                ] * 10)))

        with self.subTest(
                "Apply categorical conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_string_dtype(store.df1['to_categorical']))
            self.assertTrue(store.df1['to_categorical'].equals(
                pd.Series([
                    '150', '200', '50', '10', '5', '150', '200', '50', '10',
                    '5', '1'
                ] * 10)))

        with self.subTest(
                "Apply textual conversion for custom_column_types to dataframes"
        ):
            self.assertTrue(is_string_dtype(store.df1['to_text']))
            self.assertTrue(store.df1['to_text'].equals(
                pd.Series([
                    '150', '200', '50', '10', '5', '150', '200', '50', '10',
                    '5', '1'
                ] * 10)))
Example #15
def array_equivalent(left, right, strict_nan: bool = False) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come to this path for NumPy 1.7.1 compat
    if is_string_dtype(left) or is_string_dtype(right):

        if not strict_nan:
            # isna considers NaN and None to be equivalent.
            return lib.array_equivalent_object(
                ensure_object(left.ravel()), ensure_object(right.ravel())
            )

        for left_value, right_value in zip(left, right):
            if left_value is NaT and right_value is not NaT:
                return False

            elif left_value is libmissing.NA and right_value is not libmissing.NA:
                return False

            elif isinstance(left_value, float) and np.isnan(left_value):
                if not isinstance(right_value, float) or not np.isnan(right_value):
                    return False
            else:
                try:
                    if np.any(np.asarray(left_value != right_value)):
                        return False
                except TypeError as err:
                    if "Cannot compare tz-naive" in str(err):
                        # tzawareness compat failure, see GH#28507
                        return False
                    elif "boolean value of NA is ambiguous" in str(err):
                        return False
                    raise
        return True

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):

        # empty
        if not (np.prod(left.shape) and np.prod(right.shape)):
            return True
        return ((left == right) | (isna(left) & isna(right))).all()

    elif is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if left.dtype.type is np.void or right.dtype.type is np.void:
        if left.dtype != right.dtype:
            return False

    return np.array_equal(left, right)
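
Two quick checks against the object/string path above; note array_equivalent lives in the private module pandas.core.dtypes.missing, so the import is version-dependent:

import numpy as np
from pandas.core.dtypes.missing import array_equivalent

a = np.array(["a", None, "b"], dtype=object)
b = np.array(["a", np.nan, "b"], dtype=object)
print(array_equivalent(a, b))                   # True: isna treats NaN and None alike
print(array_equivalent(a, b, strict_nan=True))  # False: strict mode separates them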
Example #16
def sanitize_array(
    data,
    index: Optional[Index],
    dtype: Optional[DtypeObj] = None,
    copy: bool = False,
    raise_cast_failure: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.
    """

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data,
                    (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0:
        if isinstance(data, set):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError("Set type is unordered")
        data = list(data)

        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    elif lib.is_scalar(data) and index is not None and dtype is not None:
        data = maybe_cast_to_datetime(data, dtype)
        if not lib.is_scalar(data):
            data = data[0]
        subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value,
                                                       pandas_dtype=True)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype)
            or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(
            dtype)
        if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
Example #17
 def test_not_string(self):
     # though PeriodDtype has object kind, it cannot be string
     self.assertFalse(is_string_dtype(PeriodDtype('D')))
Example #18
    def _convert_to_ndarrays(self,
                             dct,
                             na_values,
                             na_fvalues,
                             verbose=False,
                             converters=None,
                             dtypes=None):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na)
            else:
                col_na_values, col_na_fvalues = set(), set()

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        ("Both a converter and dtype were specified "
                         f"for column {c} - only the converter will be used"),
                        ParserWarning,
                        stacklevel=7,
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    mask = algorithms.isin(values,
                                           list(na_values)).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(values,
                                                    set(col_na_values)
                                                    | col_na_fvalues,
                                                    try_num_bool=False)
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues, try_num_bool)

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type)
                                  or is_extension_array_dtype(cast_type)):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}")
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
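
The converter-vs-dtype warning above is reachable from the public API; a small sketch with read_csv:

import io
import pandas as pd

buf = io.StringIO("a\n1\n2\n")
df = pd.read_csv(buf, converters={"a": str}, dtype={"a": "int64"})
# ParserWarning: Both a converter and dtype were specified for column a ...
print(df["a"].tolist())  # ['1', '2'] -- the converter wins, dtype is ignored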
Example #19
def maybe_cast_to_datetime(value, dtype, errors: str = "raise"):
    """ try to cast the array/value to a datetimelike dtype, converting float
    nan to iNaT
    """
    from pandas.core.tools.timedeltas import to_timedelta
    from pandas.core.tools.datetimes import to_datetime

    if dtype is not None:
        if isinstance(dtype, str):
            dtype = np.dtype(dtype)

        is_datetime64 = is_datetime64_dtype(dtype)
        is_datetime64tz = is_datetime64tz_dtype(dtype)
        is_timedelta64 = is_timedelta64_dtype(dtype)

        if is_datetime64 or is_datetime64tz or is_timedelta64:

            # Force the dtype if needed.
            msg = (f"The '{dtype.name}' dtype has no unit. "
                   f"Please pass in '{dtype.name}[ns]' instead.")

            if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE):

                # pandas supports dtype whose granularity is less than [ns]
                # e.g., [ps], [fs], [as]
                if dtype <= np.dtype("M8[ns]"):
                    if dtype.name == "datetime64":
                        raise ValueError(msg)
                    dtype = _NS_DTYPE
                else:
                    raise TypeError(
                        f"cannot convert datetimelike to dtype [{dtype}]")
            elif is_datetime64tz:

                # our NaT doesn't support tz's
                # this will coerce to DatetimeIndex with
                # a matching dtype below
                if is_scalar(value) and isna(value):
                    value = [value]

            elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE):

                # pandas supports dtype whose granularity is less than [ns]
                # e.g., [ps], [fs], [as]
                if dtype <= np.dtype("m8[ns]"):
                    if dtype.name == "timedelta64":
                        raise ValueError(msg)
                    dtype = _TD_DTYPE
                else:
                    raise TypeError(
                        f"cannot convert timedeltalike to dtype [{dtype}]")

            if is_scalar(value):
                if value == iNaT or isna(value):
                    value = iNaT
            else:
                value = np.array(value, copy=False)

                # have a scalar array-like (e.g. NaT)
                if value.ndim == 0:
                    value = iNaT

                # we have an array of datetime or timedeltas & nulls
                elif np.prod(
                        value.shape) or not is_dtype_equal(value.dtype, dtype):
                    try:
                        if is_datetime64:
                            value = to_datetime(value, errors=errors)
                            # GH 25843: Remove tz information since the dtype
                            # didn't specify one
                            if value.tz is not None:
                                value = value.tz_localize(None)
                            value = value._values
                        elif is_datetime64tz:
                            # The string check can be removed once issue #13712
                            # is solved. String data that is passed with a
                            # datetime64tz is assumed to be naive which should
                            # be localized to the timezone.
                            is_dt_string = is_string_dtype(value)
                            value = to_datetime(value, errors=errors).array
                            if is_dt_string:
                                # Strings here are naive, so directly localize
                                value = value.tz_localize(dtype.tz)
                            else:
                                # Numeric values are UTC at this point,
                                # so localize and convert
                                value = value.tz_localize("UTC").tz_convert(
                                    dtype.tz)
                        elif is_timedelta64:
                            value = to_timedelta(value, errors=errors)._values
                    except OutOfBoundsDatetime:
                        raise
                    except (AttributeError, ValueError, TypeError):
                        pass

        # coerce datetimelike to object
        elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype):
            if is_object_dtype(dtype):
                if value.dtype != _NS_DTYPE:
                    value = value.astype(_NS_DTYPE)
                ints = np.asarray(value).view("i8")
                return tslib.ints_to_pydatetime(ints)

            # we have a non-castable dtype that was passed
            raise TypeError(f"Cannot cast datetime64 to {dtype}")

    else:

        is_array = isinstance(value, np.ndarray)

        # catch a datetime/timedelta that is not of ns variety
        # and no coercion specified
        if is_array and value.dtype.kind in ["M", "m"]:
            dtype = value.dtype

            if dtype.kind == "M" and dtype != _NS_DTYPE:
                value = tslibs.conversion.ensure_datetime64ns(value)

            elif dtype.kind == "m" and dtype != _TD_DTYPE:
                value = to_timedelta(value)

        # only do this if we have an array and the dtype of the array is not
        # setup already we are not an integer/object, so don't bother with this
        # conversion
        elif not (is_array and not (issubclass(value.dtype.type, np.integer)
                                    or value.dtype == np.object_)):
            value = maybe_infer_to_datetimelike(value)

    return value
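
The is_dt_string branch above is observable from the public Series constructor (assuming a pandas version in line with these snippets): string data is treated as naive wall time and localized, rather than converted from UTC:

import pandas as pd

s = pd.Series(["2020-01-01 00:00"], dtype="datetime64[ns, US/Eastern]")
print(s[0])  # 2020-01-01 00:00:00-05:00 (localized, not shifted from UTC)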
Example #20
    def _simple_new(cls,
                    left,
                    right,
                    closed=None,
                    copy=False,
                    dtype=None,
                    verify_integrity=True):
        result = IntervalMixin.__new__(cls)

        closed = closed or "right"
        left = ensure_index(left, copy=copy)
        right = ensure_index(right, copy=copy)

        if dtype is not None:
            # GH 19262: dtype must be an IntervalDtype to override inferred
            dtype = pandas_dtype(dtype)
            if not is_interval_dtype(dtype):
                msg = f"dtype must be an IntervalDtype, got {dtype}"
                raise TypeError(msg)
            elif dtype.subtype is not None:
                left = left.astype(dtype.subtype)
                right = right.astype(dtype.subtype)

        # coerce dtypes to match if needed
        if is_float_dtype(left) and is_integer_dtype(right):
            right = right.astype(left.dtype)
        elif is_float_dtype(right) and is_integer_dtype(left):
            left = left.astype(right.dtype)

        if type(left) != type(right):
            msg = (f"must not have differing left [{type(left).__name__}] and "
                   f"right [{type(right).__name__}] types")
            raise ValueError(msg)
        elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
            # GH 19016
            msg = ("category, object, and string subtypes are not supported "
                   "for IntervalArray")
            raise TypeError(msg)
        elif isinstance(left, ABCPeriodIndex):
            msg = "Period dtypes are not supported, use a PeriodIndex instead"
            raise ValueError(msg)
        elif isinstance(left,
                        ABCDatetimeIndex) and str(left.tz) != str(right.tz):
            msg = ("left and right must have the same time zone, got "
                   f"'{left.tz}' and '{right.tz}'")
            raise ValueError(msg)

        # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
        left = ensure_wrapped_if_datetimelike(left)
        left = extract_array(left, extract_numpy=True)
        right = ensure_wrapped_if_datetimelike(right)
        right = extract_array(right, extract_numpy=True)

        lbase = getattr(left, "_ndarray", left).base
        rbase = getattr(right, "_ndarray", right).base
        if lbase is not None and lbase is rbase:
            # If these share data, then setitem could corrupt our IA
            right = right.copy()

        result._left = left
        result._right = right
        result._closed = closed
        if verify_integrity:
            result._validate()
        return result
Example #21
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls,
                             default_dtype):
    if default_dtype.kind == "f":
        checker = is_float_dtype
    else:
        checker = is_integer_dtype

    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    values = np.array(values, copy=copy)
    inferred_type = None
    if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            pass
        elif inferred_type == "boolean":
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif is_bool_dtype(values) and checker(dtype):
        values = np.array(values, dtype=default_dtype, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.type

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type
Example #22
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: Scalar = lib.no_default,
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
             Scalar missing value indicator to use in numpy array. Defaults
             to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to bool numpy array in presence of missing values

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            if (not is_object_dtype(dtype) and not is_string_dtype(dtype)
                    and na_value is libmissing.NA):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype.")
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data
Example #23
def sequence_to_td64ns(data,
                       copy: bool = False,
                       unit=None,
                       errors="raise") -> tuple[np.ndarray, Tick | None]:
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, optional
        The timedelta unit to treat integers as multiples of. For numeric
        data this defaults to ``'ns'``.
        Must be left unspecified if the data contains a str and ``errors=="raise"``.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors="ignore"`` does not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    if unit is not None:
        unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, "dtype"):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCMultiIndex):
        raise TypeError("Cannot create a DatetimeArray from a MultiIndex.")
    else:
        data = extract_array(data, extract_numpy=True)

    if isinstance(data, IntegerArray):
        data = data.to_numpy("int64", na_value=iNaT)
    elif not isinstance(data, (np.ndarray, ExtensionArray)):
        # GH#24539 e.g. xarray, dask object
        data = np.asarray(data)
    elif isinstance(data, ABCCategorical):
        data = data.categories.take(data.codes, fill_value=NaT)._values
        copy = False

    if isinstance(data, TimedeltaArray):
        inferred_freq = data.freq

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        # The next few lines are effectively a vectorized 'cast_from_unit'
        m, p = precision_from_unit(unit or "ns")
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != TD64NS_DTYPE:
            # non-nano unit
            data = ensure_timedelta64ns(data)
            copy = False

    else:
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(
            f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

    data = np.array(data, copy=copy)

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq
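
The object/string branch above backs the public conversion path; a brief sketch via pd.to_timedelta:

import numpy as np
import pandas as pd

print(pd.to_timedelta(np.array(["1 days", "2 days"], dtype=object)))
# TimedeltaIndex(['1 days', '2 days'], dtype='timedelta64[ns]', freq=None)
print(pd.to_timedelta([1.5], unit="s")[0])  # 0 days 00:00:01.500000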
Example #24
def sanitize_array(
    data,
    index: Index | None,
    dtype: DtypeObj | None = None,
    copy: bool = False,
    raise_cast_failure: bool = True,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.

    Parameters
    ----------
    data : Any
    index : Index or None, default None
    dtype : np.dtype, ExtensionDtype, or None, default None
    copy : bool, default False
    raise_cast_failure : bool, default True

    Returns
    -------
    np.ndarray or ExtensionArray

    Notes
    -----
    raise_cast_failure=False is only intended for calls from the DataFrame
    constructor, as the dtype keyword there may be interpreted as only
    applying to a subset of columns, see GH#24435.
    """
    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)
    elif isinstance(data, range):
        # GH#16804
        data = np.arange(data.start, data.stop, data.step, dtype="int64")
        copy = False

    if not is_list_like(data):
        if index is None:
            raise ValueError(
                "index must be specified when data is not list-like")
        data = construct_1d_arraylike_from_scalar(data, len(index), dtype)
        return data

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                subarr = np.array(data, copy=copy)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    else:
        if isinstance(data, (set, frozenset)):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError(f"'{type(data).__name__}' type is unordered")

        # materialize e.g. generators, convert e.g. tuples, abc.ValuesView
        # TODO: non-standard array-likes we can convert to ndarray more efficiently?
        data = list(data)

        if dtype is not None or len(data) == 0:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)
            # error: Incompatible types in assignment (expression has type
            # "Union[ExtensionArray, ndarray, List[Any]]", variable has type
            # "ExtensionArray")
            subarr = maybe_cast_to_datetime(subarr,
                                            dtype)  # type: ignore[assignment]

    subarr = _sanitize_ndim(subarr, data, dtype, index)

    if not (isinstance(subarr.dtype, ExtensionDtype)
            or isinstance(dtype, ExtensionDtype)):
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

        is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(
            dtype)
        if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)
                subarr = extract_array(subarr, extract_numpy=True)

    return subarr
Example #25
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na)
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values,
                                       set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        ("Both a converter and dtype were specified "
                         f"for column {c} - only the converter will be used."),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values,
                        list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(values,
                                                    set(col_na_values)
                                                    | col_na_fvalues,
                                                    try_num_bool=False)
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues, try_num_bool)

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type)
                                  or is_extension_array_dtype(cast_type)):
                    if not is_ea and na_count > 0:
                        try:
                            if is_bool_dtype(cast_type):
                                raise ValueError(
                                    f"Bool column has NA values in column {c}")
                        except (AttributeError, TypeError):
                            # invalid input to is_bool_dtype
                            pass
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
Example #26
def test_is_string_dtype_nullable(nullable_string_dtype):
    assert com.is_string_dtype(
        pd.array(["a", "b"], dtype=nullable_string_dtype))
Example #27
 def test_not_string(self):
     # though PeriodDtype has object kind, it cannot be string
     assert not is_string_dtype(PeriodDtype("D"))
Example #28
def convert_polygon_feature_to_raster(feature,
                                      pixel_size,
                                      value=1,
                                      nodata_val=-9999,
                                      snap_extent_to_pixel=True):
    """ Convert a feature into a raster (block grid)

    The "value" argument can be a numeric, or the name of a feature column. If the feature column
    is a string, then the index value will be used to assign a value to the raster.

    By default, snap_extent_to_pixel is true and is used to snap the bounding coordinates to
    a pixel edge.

    Args:
       feature (pandas.core.series.Series): a polygon feature to convert to a raster
       pixel_size (int, float): the pixel size for the raster
       value (int, float, str): pixel value. A column from the feature can be used
       nodata_val (int, float): nodata value
       snap_extent_to_pixel (bool): round the extent coordinates to be divisible by the pixel size

    Returns:
       burned (numpy.ndarray) : The array representing the raster data
       Dict[str, int]: The rasterio meta kwargs required for writing the array to disk.
    """
    if not isinstance(feature, (pd.Series, geopandas.GeoSeries)):
        raise TypeError("Input feature should be a geopandas series")

    if not isinstance(pixel_size, (int, long, float)):
        raise TypeError('Pixel size must be an integer or floating number.')

    if isinstance(value, str) and value not in list(feature.index):
        raise TypeError('Value string {} is not a column name'.format(value))
    elif not isinstance(value, (str, int, long, float)):
        raise TypeError('Value should be a column name, or a number')

    transform, width, height, bbox = create_raster_transform(
        feature.geometry.bounds, pixel_size, snap_extent_to_pixel)

    gdf_feat = GeoDataFrame(GeoSeries(feature.geometry),
                            geometry=GeoSeries(feature.geometry).geometry)

    # create an affine transformation matrix to associate the array to the coordinates.
    if value in gdf_feat.columns:
        if is_string_dtype(gdf_feat[value]):
            shapes = ((geom, val)
                      for geom, val in zip(gdf_feat.geometry, (gdf_feat.index +
                                                               1)))
        else:
            shapes = ((geom, val)
                      for geom, val in zip(gdf_feat.geometry, gdf_feat[value]))
    else:
        shapes = ((geom, value) for geom in gdf_feat.geometry)

    burned = features.rasterize(shapes=shapes,
                                out_shape=(height, width),
                                fill=nodata_val,
                                transform=transform)
    meta = {}
    meta.update(height=height,
                width=width,
                transform=transform,
                nodata=nodata_val,
                dtype=rasterio.dtypes.get_minimum_dtype(burned))

    return burned, meta
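A hypothetical usage sketch (the shapefile name and attribute layout are assumptions, and create_raster_transform is the module helper referenced above):

import geopandas as gpd
import rasterio

gdf = gpd.read_file("paddocks.shp")  # assumed polygon layer
feat = gdf.iloc[0]                   # a single row -> Series, as required
burned, meta = convert_polygon_feature_to_raster(feat, pixel_size=5.0, value=1)

# meta carries the rasterio kwargs needed to persist the grid
with rasterio.open("out.tif", "w", driver="GTiff", count=1, **meta) as dst:
    dst.write(burned, 1)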
Example #29
 def test_not_string(self):
     # GH30568: though IntervalDtype has object kind, it cannot be string
     assert not is_string_dtype(IntervalDtype())
Example #30
def coerce_to_array(values,
                    dtype,
                    mask=None,
                    copy: bool = False) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    # if values is integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and (dtype.startswith("Int")
                                       or dtype.startswith("UInt")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), _IntegerDtype):
            try:
                dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, IntegerArray):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    inferred_type = None
    if is_object_dtype(values) or is_string_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            pass
        elif inferred_type not in [
                "floating",
                "integer",
                "mixed-integer",
                "integer-na",
                "mixed-integer-float",
                "string",
                "unicode",
        ]:
            raise TypeError(
                f"{values.dtype} cannot be converted to an IntegerDtype")

    elif is_bool_dtype(values) and is_integer_dtype(dtype):
        values = np.array(values, dtype=int, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(
            f"{values.dtype} cannot be converted to an IntegerDtype")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("int64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = 1
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into an int
        values = values.astype(dtype, copy=copy)
    else:
        values = safe_cast(values, dtype, copy=False)

    return values, mask
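A small illustration of the contract (internal pandas helper, so treat this as a sketch): object input with NAs comes back as an int64 ndarray plus a boolean mask, with the masked slot holding the placeholder written in the mask-handling branch above.

values, mask = coerce_to_array([1, 2, None], dtype="Int64")
print(values)  # [1 2 1] -- the masked position holds the placeholder 1
print(mask)    # [False False  True]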
Example #31
def convert_dtypes(
    input_array,
    convert_string: bool = True,
    convert_integer: bool = True,
    convert_boolean: bool = True,
) -> Dtype:
    """
    Convert objects to best possible type, and optionally,
    to types supporting ``pd.NA``.

    Parameters
    ----------
    input_array : ExtensionArray or PandasArray
    convert_string : bool, default True
        Whether object dtypes should be converted to ``StringDtype()``.
    convert_integer : bool, default True
        Whether, if possible, conversion can be done to integer extension types.
    convert_boolean : bool, default True
        Whether object dtypes should be converted to ``BooleanDtype()``.

    Returns
    -------
    dtype
        new dtype
    """
    is_extension = is_extension_array_dtype(input_array.dtype)
    if (convert_string or convert_integer
            or convert_boolean) and not is_extension:
        try:
            inferred_dtype = lib.infer_dtype(input_array)
        except ValueError:
            # Required to catch due to Period.  Can remove once GH 23553 is fixed
            inferred_dtype = input_array.dtype

        if not convert_string and is_string_dtype(inferred_dtype):
            inferred_dtype = input_array.dtype

        if convert_integer:
            target_int_dtype = "Int64"

            if is_integer_dtype(input_array.dtype):
                from pandas.core.arrays.integer import _dtypes

                inferred_dtype = _dtypes.get(input_array.dtype.name,
                                             target_int_dtype)
            if not is_integer_dtype(input_array.dtype) and is_numeric_dtype(
                    input_array.dtype):
                inferred_dtype = target_int_dtype

        else:
            if is_integer_dtype(inferred_dtype):
                inferred_dtype = input_array.dtype

        if convert_boolean:
            if is_bool_dtype(input_array.dtype):
                inferred_dtype = "boolean"
        else:
            if isinstance(inferred_dtype, str) and inferred_dtype == "boolean":
                inferred_dtype = input_array.dtype

    else:
        inferred_dtype = input_array.dtype

    return inferred_dtype
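The public entry point for this logic is Series.convert_dtypes; a quick demonstration of the nullable-dtype upgrades it performs:

import pandas as pd

s = pd.Series([1, 2, None])      # float64, since NaN forces a float cast
print(s.convert_dtypes().dtype)  # Int64 -- nullable integer

t = pd.Series(["a", "b"])        # object
print(t.convert_dtypes().dtype)  # string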
Example #32
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, "dtype"):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        m, p = precision_from_unit(unit)
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    else:
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(
            f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

    data = np.array(data, copy=copy)

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq
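The branches above are reachable through the public API: integers are treated as multiples of unit, while strings take the object-parsing path.

import pandas as pd

print(pd.to_timedelta([1, 2], unit="s"))        # integer branch
print(pd.to_timedelta(["1 days", "12:30:00"]))  # object/string branch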
Example #33
def maybe_downcast_numeric(result, dtype, do_round: bool = False):
    """
    Subset of maybe_downcast_to_dtype restricted to numeric dtypes.

    Parameters
    ----------
    result : ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    do_round : bool

    Returns
    -------
    ndarray or ExtensionArray
    """
    if not isinstance(dtype, np.dtype):
        # e.g. SparseDtype has no itemsize attr
        return result

    if isinstance(result, list):
        # reached via groupby.agg _ohlc; really this should be handled
        #  earlier
        result = np.array(result)

    def trans(x):
        if do_round:
            return x.round()
        return x

    if dtype.kind == result.dtype.kind:
        # don't allow upcasts here (except if empty)
        if result.dtype.itemsize <= dtype.itemsize and result.size:
            return result

    if is_bool_dtype(dtype) or is_integer_dtype(dtype):

        if not result.size:
            # if we don't have any elements, just astype it
            return trans(result).astype(dtype)

        # do a test on the first element, if it fails then we are done
        r = result.ravel()
        arr = np.array([r[0]])

        if isna(arr).any():
            # if we have any nulls, then we are done
            return result

        elif not isinstance(
                r[0], (np.integer, np.floating, np.bool_, int, float, bool)):
            # a comparable, e.g. a Decimal may slip in here
            return result

        if (issubclass(result.dtype.type, (np.object_, np.number))
                and notna(result).all()):
            new_result = trans(result).astype(dtype)
            if new_result.dtype.kind == "O" or result.dtype.kind == "O":
                # np.allclose may raise TypeError on object-dtype
                if (new_result == result).all():
                    return new_result
            else:
                if np.allclose(new_result, result, rtol=0):
                    return new_result

    elif (issubclass(dtype.type, np.floating)
          and not is_bool_dtype(result.dtype)
          and not is_string_dtype(result.dtype)):
        return result.astype(dtype)

    return result
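A sketch of the downcast contract: the cast is kept only when it is lossless; otherwise the input comes back unchanged (internal helper, shown here purely for illustration).

import numpy as np

print(maybe_downcast_numeric(np.array([1.0, 2.0]), np.dtype("int64")))
# array([1, 2])    -- values survive the cast exactly, so it is accepted
print(maybe_downcast_numeric(np.array([1.5, 2.0]), np.dtype("int64")))
# array([1.5, 2.]) -- information would be lost, so the original is returned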
Example #34
File: missing.py  Project: tc423/pandas
def array_equivalent(left,
                     right,
                     strict_nan: bool = False,
                     dtype_equal: bool = False) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.
    dtype_equal : bool, default False
        Whether `left` and `right` are known to have the same dtype
        according to `is_dtype_equal`. Some methods like `BlockManager.equals`
        require that the dtypes match. Setting this to ``True`` can improve
        performance, but will give different results for arrays that are
        equal but different dtypes.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    if dtype_equal:
        # fastpath when we require that the dtypes match (Block.equals)
        if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
            return _array_equivalent_float(left, right)
        elif is_datetimelike_v_numeric(left.dtype, right.dtype):
            return False
        elif needs_i8_conversion(left.dtype):
            return _array_equivalent_datetimelike(left, right)
        elif is_string_dtype(left.dtype):
            # TODO: fastpath for pandas' StringDtype
            return _array_equivalent_object(left, right, strict_nan)
        else:
            return np.array_equal(left, right)

    # Slow path when we allow comparing different dtypes.
    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come to this path for NumPy 1.7.1 compat
    if is_string_dtype(left.dtype) or is_string_dtype(right.dtype):
        return _array_equivalent_object(left, right, strict_nan)

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
        if not (np.prod(left.shape) and np.prod(right.shape)):
            return True
        return ((left == right) | (isna(left) & isna(right))).all()

    elif is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if (left.dtype.type is np.void
            or right.dtype.type is np.void) and left.dtype != right.dtype:
        return False

    return np.array_equal(left, right)
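The strict_nan flag is the one behavior the docstring examples do not cover; it makes None and np.nan distinct inside object arrays:

import numpy as np

left = np.array([1, None], dtype=object)
right = np.array([1, np.nan], dtype=object)
print(array_equivalent(left, right))                   # True
print(array_equivalent(left, right, strict_nan=True))  # False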
Example #35
def sanitize_array(
    data,
    index: Optional[Index],
    dtype: Optional[DtypeObj] = None,
    copy: bool = False,
    raise_cast_failure: bool = False,
) -> ArrayLike:
    """
    Sanitize input data to an ndarray or ExtensionArray, copy if specified,
    coerce to the dtype if specified.
    """

    if isinstance(data, ma.MaskedArray):
        data = sanitize_masked_array(data)

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    if isinstance(data, np.ndarray) and data.ndim == 0:
        if dtype is None:
            dtype = data.dtype
        data = lib.item_from_zerodim(data)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(
                data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                subarr = np.array(data, copy=copy)
        else:
            # we will try to copy by-definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data,
                    (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0:
        # TODO: deque, array.array
        if isinstance(data, set):
            # Raise only for unordered sets, e.g., not for dict_keys
            raise TypeError("Set type is unordered")
        data = list(data)

        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)
            subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)

    elif not is_list_like(data):
        if index is None:
            raise ValueError(
                "index must be specified when data is not list-like")
        subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype)

    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    subarr = _sanitize_ndim(subarr, data, dtype, index)

    if not (is_extension_array_dtype(subarr.dtype)
            or is_extension_array_dtype(dtype)):
        subarr = _sanitize_str_dtypes(subarr, data, dtype, copy)

        is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(
            dtype)
        if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype:
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred in {"interval", "period"}:
                subarr = array(subarr)

    return subarr
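Two of the branches above, exercised directly (internal constructor helper; a sketch rather than supported API):

import pandas as pd

print(sanitize_array(range(3), index=None))         # range fast path -> array([0, 1, 2])
print(sanitize_array(3.0, index=pd.RangeIndex(2)))  # scalar broadcast -> array([3., 3.])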
Example #36
File: timedeltas.py  Project: vesh/pandas
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, 'dtype'):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data) or is_string_dtype(data):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data):
        # treat as multiples of the given unit.  If after converting to nanos,
        #  there are fractional components left, these are truncated
        #  (i.e. NOT rounded)
        mask = np.isnan(data)
        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn(
            "Passing datetime64-dtype data to TimedeltaIndex is "
            "deprecated, will raise a TypeError in a future "
            "version",
            FutureWarning,
            stacklevel=3)
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError(
            "dtype {dtype} cannot be converted to timedelta64[ns]".format(
                dtype=data.dtype))

    data = np.array(data, copy=copy)
    assert data.dtype == 'm8[ns]', data
    return data, inferred_freq
Example #37
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced,
                                               fill_value, box):
    dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture

    if (dtype == bytes and not boxed and fill_value is not None
            and fill_value is not NaT):
        pytest.xfail("does not upcast to object")
    elif dtype == "uint64" and not boxed and fill_value == iNaT:
        pytest.xfail("does not upcast correctly")
    # below: opinionated that iNaT should be interpreted as missing value
    elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype))
          and fill_value == iNaT):
        pytest.xfail("does not cast to missing value marker correctly")
    elif (is_string_dtype(dtype)
          or dtype == bool) and not boxed and fill_value == iNaT:
        pytest.xfail("does not cast to missing value marker correctly")

    if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT:
        # uint64 + negative int casts to object; iNaT is considered as missing
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = np.nan
    elif is_integer_dtype(dtype) and fill_value == iNaT:
        # other integer + iNaT casts to int64
        expected_dtype = np.int64
        exp_val_for_scalar = iNaT
    elif is_integer_dtype(dtype) and fill_value is not NaT:
        # integer + other missing value (np.nan / None) casts to float
        expected_dtype = np.float64
        exp_val_for_scalar = np.nan
    elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT):
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to iNaT
        expected_dtype = dtype
        exp_val_for_scalar = iNaT
    elif fill_value is NaT:
        # NaT upcasts everything that's not datetime/timedelta to object
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        expected_dtype = dtype
        exp_val_for_scalar = np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = np.nan

    # array case has same expected_dtype; but returns corresponding na-marker
    if is_integer_dtype(expected_dtype):
        # integers cannot hold NaNs; maybe_promote_with_array returns None
        exp_val_for_array = None
    elif is_datetime_or_timedelta_dtype(expected_dtype):
        exp_val_for_array = iNaT
    else:  # expected_dtype = float / complex / object
        exp_val_for_array = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        expected_dtype,
        exp_val_for_scalar,
        exp_val_for_array,
    )
Example #38
    def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
        # TODO: de-duplicate with StringArray method. This method is more or
        # less copy-and-paste.

        from pandas.arrays import (
            BooleanArray,
            IntegerArray,
        )

        if dtype is None:
            dtype = self.dtype
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
                # "object"
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(arr,
                                        f,
                                        mask.view("uint8"),
                                        convert=False,
                                        na_value=na_value)
            result = pa.array(result,
                              mask=mask,
                              type=pa.string(),
                              from_pandas=True)
            return type(self)(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
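_str_map is what the .str accessor dispatches to on string arrays; the public behavior enabled by the three branches looks like:

import pandas as pd

s = pd.Series(["a", "bb", None], dtype="string")
print(s.str.len())            # integer branch -> Int64, <NA> preserved
print(s.str.isupper())        # boolean branch -> nullable boolean
print(s.str.encode("utf-8"))  # object branch -> bytes in an object Series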
Example #39
    def __new__(cls, subtype=None, closed: str_type | None = None):
        from pandas.core.dtypes.common import (
            is_string_dtype,
            pandas_dtype,
        )

        if closed is not None and closed not in {
                "right", "left", "both", "neither"
        }:
            raise ValueError(
                "closed must be one of 'right', 'left', 'both', 'neither'")

        if isinstance(subtype, IntervalDtype):
            if closed is not None and closed != subtype.closed:
                raise ValueError(
                    "dtype.closed and 'closed' do not match. "
                    "Try IntervalDtype(dtype.subtype, closed) instead.")
            return subtype
        elif subtype is None:
            # we are called as an empty constructor
            # generally for pickle compat
            u = object.__new__(cls)
            u._subtype = None
            u._closed = closed
            return u
        elif isinstance(subtype, str) and subtype.lower() == "interval":
            subtype = None
        else:
            if isinstance(subtype, str):
                m = cls._match.search(subtype)
                if m is not None:
                    gd = m.groupdict()
                    subtype = gd["subtype"]
                    if gd.get("closed", None) is not None:
                        if closed is not None:
                            if closed != gd["closed"]:
                                raise ValueError(
                                    "'closed' keyword does not match value "
                                    "specified in dtype string")
                        closed = gd["closed"]

            try:
                subtype = pandas_dtype(subtype)
            except TypeError as err:
                raise TypeError("could not construct IntervalDtype") from err

        if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
            # GH 19016
            msg = ("category, object, and string subtypes are not supported "
                   "for IntervalDtype")
            raise TypeError(msg)

        key = str(subtype) + str(closed)
        try:
            return cls._cache_dtypes[key]
        except KeyError:
            u = object.__new__(cls)
            u._subtype = subtype
            u._closed = closed
            cls._cache_dtypes[key] = u
            return u
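Construction round-trips through the parsing and caching above (the closed keyword assumes a recent pandas):

import pandas as pd

dt = pd.IntervalDtype("int64", closed="left")
print(dt)  # interval[int64, left]
# the string form parses back to the same cached instance
print(pd.IntervalDtype("interval[int64, left]") is dt)  # True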
Example #40
    def read_attr_all(self, gages_ids):
        """read data from GAGES-II
        parameters:
            gages_ids:gages sites' ids
        :return
            out:ndarray
        """
        dir_gage_attr = self.all_configs.get("gage_files_dir")
        dir_out = self.all_configs.get("out_dir")
        f_dict = dict()  # factorize dict
        # each key-value pair for atts in a file (list)
        var_dict = dict()
        # all attrs
        var_lst = list()
        out_lst = list()
        # read all attrs
        var_des = pd.read_csv(os.path.join(dir_gage_attr,
                                           'variable_descriptions.txt'),
                              sep=',')
        var_des_map_values = var_des['VARIABLE_TYPE'].tolist()
        for i in range(len(var_des)):
            var_des_map_values[i] = var_des_map_values[i].lower()
        # sort by type
        key_lst = list(set(var_des_map_values))
        key_lst.sort(key=var_des_map_values.index)
        # remove x_region_names
        key_lst.remove('x_region_names')

        for key in key_lst:
            # in "spreadsheets-in-csv-format" directory, the name of "flow_record" file is conterm_flowrec.txt
            if key == 'flow_record':
                key = 'flowrec'
            data_file = os.path.join(dir_gage_attr, 'conterm_' + key + '.txt')
            # remove some unused attrs in bas_classif
            if key == 'bas_classif':
                data_temp = pd.read_csv(data_file,
                                        sep=',',
                                        dtype={'STAID': str},
                                        usecols=range(0, 4))
            else:
                data_temp = pd.read_csv(data_file,
                                        sep=',',
                                        dtype={'STAID': str})
            if key == 'flowrec':
                # remove final column which is nan
                data_temp = data_temp.iloc[:, range(0, data_temp.shape[1] - 1)]
            # all attrs in files
            var_lst_temp = list(data_temp.columns[1:])
            var_dict[key] = var_lst_temp
            var_lst.extend(var_lst_temp)
            k = 0
            n_gage = len(gages_ids)
            out_temp = np.full(
                [n_gage, len(var_lst_temp)],
                np.nan)  # 1d: sites, 2d: attrs in current data_file
            # sites intersection; ind2 is the index of sites in the conterm_ files; set them into out_temp
            range1 = gages_ids
            range2 = data_temp.iloc[:, 0].astype(str).tolist()
            assert (all(x < y for x, y in zip(range2, range2[1:])))
            c, ind1, ind2 = np.intersect1d(range1, range2, return_indices=True)
            for field in var_lst_temp:
                if is_string_dtype(
                        data_temp[field]):  # str vars -> categorical vars
                    value, ref = pd.factorize(data_temp.loc[ind2, field],
                                              sort=True)
                    out_temp[:, k] = value
                    f_dict[field] = ref.tolist()
                elif is_numeric_dtype(data_temp[field]):
                    out_temp[:, k] = data_temp.loc[ind2, field].values
                k = k + 1
            out_lst.append(out_temp)
        out = np.concatenate(out_lst, 1)
        return out, var_lst, var_dict, f_dict
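The heart of the str-to-categorical step in the loop above is pd.factorize with sort=True; a standalone illustration:

import pandas as pd

value, ref = pd.factorize(pd.Series(["warm", "cold", "warm"]), sort=True)
print(value)         # [1 0 1] -- integer codes written into out_temp
print(ref.tolist())  # ['cold', 'warm'] -- saved in f_dict for decoding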
Example #41
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, 'dtype'):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        m, p = precision_from_unit(unit)
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]')
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
                      "deprecated, will raise a TypeError in a future "
                      "version",
                      FutureWarning, stacklevel=4)
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
                        .format(dtype=data.dtype))

    data = np.array(data, copy=copy)
    if data.ndim != 1:
        raise ValueError("Only 1-dimensional input arrays are supported.")

    assert data.dtype == 'm8[ns]', data
    return data, inferred_freq
Example #44
def sequence_to_td64ns(
    data, copy: bool = False, unit=None, errors="raise"
) -> tuple[np.ndarray, Tick | None]:
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, optional
        The timedelta unit to treat integers as multiples of. For numeric
        data this defaults to ``'ns'``.
        Must be un-specified if the data contains a str and ``errors=="raise"``.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    assert unit not in ["Y", "y", "M"]  # caller is responsible for checking

    inferred_freq = None
    if unit is not None:
        unit = parse_timedelta_unit(unit)

    data, copy = dtl.ensure_arraylike_for_datetimelike(
        data, copy, cls_name="TimedeltaArray"
    )

    if isinstance(data, TimedeltaArray):
        inferred_freq = data.freq

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = _objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        # The next few lines are effectively a vectorized 'cast_from_unit'
        m, p = precision_from_unit(unit or "ns")
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != TD64NS_DTYPE:
            # non-nano unit
            data = astype_overflowsafe(data, dtype=TD64NS_DTYPE)
            copy = False

    else:
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

    data = np.array(data, copy=copy)

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq