def test_is_string_dtype(): assert not com.is_string_dtype(int) assert not com.is_string_dtype(pd.Series([1, 2])) assert com.is_string_dtype(str) assert com.is_string_dtype(object) assert com.is_string_dtype(np.array(['a', 'b']))
def _assign_feature_type(feature_type, unique_count=0): if is_string_dtype(feature_type) or ( is_numeric_dtype(feature_type) and unique_count <= 2 ): return "categorical" elif is_numeric_dtype(feature_type): return "continuous" else: return "unknown"
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None): """ Generates data schema for a given dataset as JSON representable. Args: X: Dataframe/ndarray to build schema from. ordinal_max_items: If a numeric column's cardinality is at most this integer, consider it as ordinal instead of continuous. feature_names: Feature names feature_types: Feature types Returns: A dictionary - schema that encapsulates column information, such as type and domain. """ schema = OrderedDict() col_number = 0 if isinstance(X, np.ndarray): log.warning( "Passing a numpy array to schema autogen when it should be dataframe." ) if feature_names is None: X = pd.DataFrame( X, columns=["col_" + str(i) for i in range(X.shape[1])] ).infer_objects() else: X = pd.DataFrame(X, columns=feature_names).infer_objects() if isinstance(X, NDFrame): for name, col_dtype in zip(X.dtypes.index, X.dtypes): schema[name] = {} if is_numeric_dtype(col_dtype): # schema[name]['type'] = 'continuous' # TODO: Fix this once we know it works. if len(set(X[name])) > ordinal_max_items: schema[name]["type"] = "continuous" else: # TODO: Work with ordinal later. schema[name]["type"] = "categorical" # schema[name]['type'] = 'ordinal' # schema[name]['order'] = list(set(X[name])) elif is_string_dtype(col_dtype): schema[name]["type"] = "categorical" else: warnings.warn("Unknown column: " + name, RuntimeWarning) schema[name]["type"] = "unknown" schema[name]["column_number"] = col_number col_number += 1 # Override if feature_types is passed as arg. if feature_types is not None: for idx, name in enumerate(X.dtypes.index): schema[name]["type"] = feature_types[idx] else: raise TypeError("GA2M only supports numpy arrays or pandas dataframes.") return schema
def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False): """ Convert ndarray to sparse format Parameters ---------- arr : ndarray kind : {'block', 'integer'} fill_value : NaN or another value dtype : np.dtype, optional copy : bool, default False Returns ------- (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar) """ arr = _sanitize_values(arr) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") if fill_value is None: fill_value = na_value_for_dtype(arr.dtype) if isna(fill_value): mask = notna(arr) else: # For str arrays in NumPy 1.12.0, operator!= below isn't # element-wise but just returns False if fill_value is not str, # so cast to object comparison to be safe if is_string_dtype(arr): arr = arr.astype(object) if is_object_dtype(arr.dtype): # element-wise equality check method in numpy doesn't treat # each element type, eg. 0, 0.0, and False are treated as # same. So we have to check the both of its type and value. mask = splib.make_mask_object_ndarray(arr, fill_value) else: mask = arr != fill_value length = len(arr) if length != len(mask): # the arr is a SparseArray indices = mask.sp_index.indices else: indices = mask.nonzero()[0].astype(np.int32) index = _make_index(length, indices, kind) sparsified_values = arr[mask] if dtype is not None: sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) # TODO: copy return sparsified_values, index, fill_value
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True): result = IntervalMixin.__new__(cls) closed = closed or 'right' left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = 'dtype must be an IntervalDtype, got {dtype}' raise TypeError(msg.format(dtype=dtype)) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = ('must not have differing left [{ltype}] and right ' '[{rtype}] types') raise ValueError(msg.format(ltype=type(left).__name__, rtype=type(right).__name__)) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ('category, object, and string subtypes are not supported ' 'for IntervalArray') raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = 'Period dtypes are not supported, use a PeriodIndex instead' raise ValueError(msg) elif (isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz)): msg = ("left and right must have the same time zone, got " "'{left_tz}' and '{right_tz}'") raise ValueError(msg.format(left_tz=left.tz, right_tz=right.tz)) result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result
def astype(self, dtype, copy=True): if is_object_dtype(dtype): return self._box_values_as_index() elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') elif (is_datetime_or_timedelta_dtype(dtype) and not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def astype(self, dtype, copy=True): if is_object_dtype(dtype): return self._box_values_as_index() elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) elif is_integer_dtype(dtype): # TODO(DatetimeArray): use self._values here. # Can't use ._values currently, because that returns a # DatetimeIndex, which throws us in an infinite loop. return Index(self.values.astype('i8', copy=copy), name=self.name, dtype='i8') elif (is_datetime_or_timedelta_dtype(dtype) and not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
def __new__(cls, subtype=None): """ Parameters ---------- subtype : the dtype of the Interval """ from pandas.core.dtypes.common import ( is_categorical_dtype, is_string_dtype, pandas_dtype) if isinstance(subtype, IntervalDtype): return subtype elif subtype is None: # we are called as an empty constructor # generally for pickle compat u = object.__new__(cls) u.subtype = None return u elif (isinstance(subtype, compat.string_types) and subtype.lower() == 'interval'): subtype = None else: if isinstance(subtype, compat.string_types): m = cls._match.search(subtype) if m is not None: subtype = m.group('subtype') try: subtype = pandas_dtype(subtype) except TypeError: raise TypeError("could not construct IntervalDtype") if is_categorical_dtype(subtype) or is_string_dtype(subtype): # GH 19016 msg = ('category, object, and string subtypes are not supported ' 'for IntervalDtype') raise TypeError(msg) try: return cls._cache[str(subtype)] except KeyError: u = object.__new__(cls) u.subtype = subtype cls._cache[str(subtype)] = u return u
def make_sparse(arr, kind='block', fill_value=None): """ Convert ndarray to sparse format Parameters ---------- arr : ndarray kind : {'block', 'integer'} fill_value : NaN or another value Returns ------- (sparse_values, index) : (ndarray, SparseIndex) """ arr = _sanitize_values(arr) if arr.ndim > 1: raise TypeError("expected dimension <= 1 data") if fill_value is None: fill_value = na_value_for_dtype(arr.dtype) if isna(fill_value): mask = notna(arr) else: # For str arrays in NumPy 1.12.0, operator!= below isn't # element-wise but just returns False if fill_value is not str, # so cast to object comparison to be safe if is_string_dtype(arr): arr = arr.astype(object) mask = arr != fill_value length = len(arr) if length != mask.size: # the arr is a SparseArray indices = mask.sp_index.indices else: indices = mask.nonzero()[0].astype(np.int32) index = _make_index(length, indices, kind) sparsified_values = arr[mask] return sparsified_values, index, fill_value
def astype(self, dtype, copy=True): # TODO: Figure out something better here... # We have DatetimeLikeArrayMixin -> # super(...), which ends up being... DatetimeIndexOpsMixin? # this is complicated. # need a pandas_astype(arr, dtype). from pandas import Categorical dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return np.asarray(self, dtype=object) elif is_string_dtype(dtype) and not is_categorical_dtype(dtype): return self._format_native_types() elif is_integer_dtype(dtype): values = self._data if values.dtype != dtype: # int32 vs. int64 values = values.astype(dtype) elif copy: values = values.copy() return values elif (is_datetime_or_timedelta_dtype(dtype) and not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype): # disallow conversion between datetime/timedelta, # and conversions for any datetimelike to float msg = 'Cannot cast {name} to dtype {dtype}' raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) elif is_categorical_dtype(dtype): return Categorical(self, dtype=dtype) elif is_period_dtype(dtype): return self.asfreq(dtype.freq) else: return np.asarray(self, dtype=dtype)
def __init__(self, dtype=np.float64, fill_value=None): # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None from pandas.core.dtypes.missing import na_value_for_dtype from pandas.core.dtypes.common import ( pandas_dtype, is_string_dtype, is_scalar ) if isinstance(dtype, type(self)): if fill_value is None: fill_value = dtype.fill_value dtype = dtype.subtype dtype = pandas_dtype(dtype) if is_string_dtype(dtype): dtype = np.dtype('object') if fill_value is None: fill_value = na_value_for_dtype(dtype) if not is_scalar(fill_value): raise ValueError("fill_value must be a scalar. Got {} " "instead".format(fill_value)) self._dtype = dtype self._fill_value = fill_value
def test_not_string(self): # though CategoricalDtype has object kind, it cannot be string assert not is_string_dtype(CategoricalDtype())
def test_apply_custom_column_types(self): data = { 'to_numerical': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10, 'to_text': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10, 'to_categorical': [150, 200, 50, 10, 5, 150, 200, 50, 10, 5, 1] * 10, 'stay_categorical': [ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10 } df1 = df2 = pd.DataFrame.from_dict(data) custom_column_types = { 'to_numerical': ColumnType.numerical, 'to_text': ColumnType.text, 'to_categorical': ColumnType.categorical } store = Store(df1, df2, custom_column_types=custom_column_types) with self.subTest("Apply custom_column_types"): self.assertEqual(['to_categorical', 'stay_categorical'], store.type_to_columns[ColumnType.categorical]) self.assertEqual(['to_text'], store.type_to_columns[ColumnType.text]) self.assertEqual(['to_numerical'], store.type_to_columns[ColumnType.numerical]) with self.subTest( "Apply numerical conversion for custom_column_types to dataframes" ): self.assertTrue(is_numeric_dtype(store.df1['to_numerical'])) self.assertTrue(store.df1['to_numerical'].equals( pd.Series([ 150.0, 200.0, 50.0, 10.0, 5.0, 150.0, 200.0, 50.0, 10.0, 5.0, 1.0 ] * 10))) with self.subTest( "Apply categorical conversion for custom_column_types to dataframes" ): self.assertTrue(is_string_dtype(store.df1['to_categorical'])) self.assertTrue(store.df1['to_categorical'].equals( pd.Series([ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10))) with self.subTest( "Apply textual conversion for custom_column_types to dataframes" ): self.assertTrue(is_string_dtype(store.df1['to_text'])) self.assertTrue(store.df1['to_text'].equals( pd.Series([ '150', '200', '50', '10', '5', '150', '200', '50', '10', '5', '1' ] * 10)))
def array_equivalent(left, right, strict_nan: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. Parameters ---------- left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. Returns ------- b : bool Returns True if the arrays are equivalent. Examples -------- >>> array_equivalent( ... np.array([1, 2, np.nan]), ... np.array([1, 2, np.nan])) True >>> array_equivalent( ... np.array([1, np.nan, 2]), ... np.array([1, 2, np.nan])) False """ left, right = np.asarray(left), np.asarray(right) # shape compat if left.shape != right.shape: return False # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left) or is_string_dtype(right): if not strict_nan: # isna considers NaN and None to be equivalent. return lib.array_equivalent_object( ensure_object(left.ravel()), ensure_object(right.ravel()) ) for left_value, right_value in zip(left, right): if left_value is NaT and right_value is not NaT: return False elif left_value is libmissing.NA and right_value is not libmissing.NA: return False elif isinstance(left_value, float) and np.isnan(left_value): if not isinstance(right_value, float) or not np.isnan(right_value): return False else: try: if np.any(np.asarray(left_value != right_value)): return False except TypeError as err: if "Cannot compare tz-naive" in str(err): # tzawareness compat failure, see GH#28507 return False elif "boolean value of NA is ambiguous" in str(err): return False raise return True # NaNs can occur in float and complex arrays. if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() elif is_datetimelike_v_numeric(left, right): # GH#29553 avoid numpy deprecation warning return False elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False left = left.view("i8") right = right.view("i8") # if we have structured dtypes, compare first if left.dtype.type is np.void or right.dtype.type is np.void: if left.dtype != right.dtype: return False return np.array_equal(left, right)
def sanitize_array( data, index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. """ if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = np.array(data, copy=False) else: # we will try to copy be-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: if isinstance(data, set): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError("Set type is unordered") data = list(data) if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): data = data[0] subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: value = data # figure out the dtype from the value (upcast if necessary) if dtype is None: dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) subarr = construct_1d_arraylike_from_scalar( value, len(index), dtype) else: return subarr.item() # the result that we want elif subarr.ndim == 1: if index is not None: # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result if not lib.is_scalar(data): if not np.all(isna(data)): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype( dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def test_not_string(self): # though PeriodDtype has object kind, it cannot be string self.assertFalse(is_string_dtype(PeriodDtype('D')))
def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, converters=None, dtypes=None): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) if isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: warnings.warn( ("Both a converter and dtype were specified " f"for column {c} - only the converter will be used"), ParserWarning, stacklevel=7, ) try: values = lib.map_infer(values, conv_f) except ValueError: mask = algorithms.isin(values, list(na_values)).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types(values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type)): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): raise ValueError( f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass cast_type = pandas_dtype(cast_type) cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals if verbose and na_count: print(f"Filled {na_count} NA values in column {c!s}") return result
def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime if dtype is not None: if isinstance(dtype, str): dtype = np.dtype(dtype) is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) if is_datetime64 or is_datetime64tz or is_timedelta64: # Force the dtype if needed. msg = (f"The '{dtype.name}' dtype has no unit. " f"Please pass in '{dtype.name}[ns]' instead.") if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("M8[ns]"): if dtype.name == "datetime64": raise ValueError(msg) dtype = _NS_DTYPE else: raise TypeError( f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: # our NaT doesn't support tz's # this will coerce to DatetimeIndex with # a matching dtype below if is_scalar(value) and isna(value): value = [value] elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("m8[ns]"): if dtype.name == "timedelta64": raise ValueError(msg) dtype = _TD_DTYPE else: raise TypeError( f"cannot convert timedeltalike to dtype [{dtype}]") if is_scalar(value): if value == iNaT or isna(value): value = iNaT else: value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) if value.ndim == 0: value = iNaT # we have an array of datetime or timedeltas & nulls elif np.prod( value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: value = to_datetime(value, errors=errors) # GH 25843: Remove tz information since the dtype # didn't specify one if value.tz is not None: value = value.tz_localize(None) value = value._values elif is_datetime64tz: # The string check can be removed once issue #13712 # is solved. String data that is passed with a # datetime64tz is assumed to be naive which should # be localized to the timezone. is_dt_string = is_string_dtype(value) value = to_datetime(value, errors=errors).array if is_dt_string: # Strings here are naive, so directly localize value = value.tz_localize(dtype.tz) else: # Numeric values are UTC at this point, # so localize and convert value = value.tz_localize("UTC").tz_convert( dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values except OutOfBoundsDatetime: raise except (AttributeError, ValueError, TypeError): pass # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) ints = np.asarray(value).view("i8") return tslib.ints_to_pydatetime(ints) # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") else: is_array = isinstance(value, np.ndarray) # catch a datetime/timedelta that is not of ns variety # and no coercion specified if is_array and value.dtype.kind in ["M", "m"]: dtype = value.dtype if dtype.kind == "M" and dtype != _NS_DTYPE: value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == "m" and dtype != _TD_DTYPE: value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion elif not (is_array and not (issubclass(value.dtype.type, np.integer) or value.dtype == np.object_)): value = maybe_infer_to_datetimelike(value) return value
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None, verify_integrity=True): result = IntervalMixin.__new__(cls) closed = closed or "right" left = ensure_index(left, copy=copy) right = ensure_index(right, copy=copy) if dtype is not None: # GH 19262: dtype must be an IntervalDtype to override inferred dtype = pandas_dtype(dtype) if not is_interval_dtype(dtype): msg = f"dtype must be an IntervalDtype, got {dtype}" raise TypeError(msg) elif dtype.subtype is not None: left = left.astype(dtype.subtype) right = right.astype(dtype.subtype) # coerce dtypes to match if needed if is_float_dtype(left) and is_integer_dtype(right): right = right.astype(left.dtype) elif is_float_dtype(right) and is_integer_dtype(left): left = left.astype(right.dtype) if type(left) != type(right): msg = (f"must not have differing left [{type(left).__name__}] and " f"right [{type(right).__name__}] types") raise ValueError(msg) elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype): # GH 19016 msg = ("category, object, and string subtypes are not supported " "for IntervalArray") raise TypeError(msg) elif isinstance(left, ABCPeriodIndex): msg = "Period dtypes are not supported, use a PeriodIndex instead" raise ValueError(msg) elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz): msg = ("left and right must have the same time zone, got " f"'{left.tz}' and '{right.tz}'") raise ValueError(msg) # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray left = ensure_wrapped_if_datetimelike(left) left = extract_array(left, extract_numpy=True) right = ensure_wrapped_if_datetimelike(right) right = extract_array(right, extract_numpy=True) lbase = getattr(left, "_ndarray", left).base rbase = getattr(right, "_ndarray", right).base if lbase is not None and lbase is rbase: # If these share area_data, then setitem could corrupt our IA right = right.copy() result._left = left result._right = right result._closed = closed if verify_integrity: result._validate() return result
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype): if default_dtype.kind == "f": checker = is_float_dtype else: checker = is_integer_dtype inferred_type = None if dtype is None and hasattr(values, "dtype"): if checker(values.dtype): dtype = values.dtype if dtype is not None: dtype = dtype_cls._standardize_dtype(dtype) cls = dtype_cls.construct_array_type() if isinstance(values, cls): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask, dtype, inferred_type values = np.array(values, copy=copy) inferred_type = None if is_object_dtype(values.dtype) or is_string_dtype(values.dtype): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass elif inferred_type == "boolean": name = dtype_cls.__name__.strip("_") raise TypeError(f"{values.dtype} cannot be converted to {name}") elif is_bool_dtype(values) and checker(dtype): values = np.array(values, dtype=default_dtype, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): name = dtype_cls.__name__.strip("_") raise TypeError(f"{values.dtype} cannot be converted to {name}") if values.ndim != 1: raise TypeError("values must be a 1D list-like") if mask is None: mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) if mask.ndim != 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = default_dtype else: dtype = dtype.type # we copy as need to coerce here if mask.any(): values = values.copy() values[mask] = cls._internal_fill_value if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into a float values = values.astype(dtype, copy=copy) else: values = dtype_cls._safe_cast(values, dtype, copy=False) return values, mask, dtype, inferred_type
def to_numpy( self, dtype: npt.DTypeLike | None = None, copy: bool = False, na_value: Scalar = lib.no_default, ) -> np.ndarray: """ Convert to a NumPy Array. By default converts to an object-dtype NumPy array. Specify the `dtype` and `na_value` keywords to customize the conversion. Parameters ---------- dtype : dtype, default object The numpy dtype to convert to. copy : bool, default False Whether to ensure that the returned value is a not a view on the array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. This is typically only possible when no missing values are present and `dtype` is the equivalent numpy dtype. na_value : scalar, optional Scalar missing value indicator to use in numpy array. Defaults to the native missing value indicator of this array (pd.NA). Returns ------- numpy.ndarray Examples -------- An object-dtype is the default result >>> a = pd.array([True, False, pd.NA], dtype="boolean") >>> a.to_numpy() array([True, False, <NA>], dtype=object) When no missing values are present, an equivalent dtype can be used. >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool") array([ True, False]) >>> pd.array([1, 2], dtype="Int64").to_numpy("int64") array([1, 2]) However, requesting such dtype will raise a ValueError if missing values are present and the default missing value :attr:`NA` is used. >>> a = pd.array([True, False, pd.NA], dtype="boolean") >>> a <BooleanArray> [True, False, <NA>] Length: 3, dtype: boolean >>> a.to_numpy(dtype="bool") Traceback (most recent call last): ... ValueError: cannot convert to bool numpy array in presence of missing values Specify a valid `na_value` instead >>> a.to_numpy(dtype="bool", na_value=False) array([ True, False, False]) """ if na_value is lib.no_default: na_value = libmissing.NA if dtype is None: dtype = object if self._hasna: if (not is_object_dtype(dtype) and not is_string_dtype(dtype) and na_value is libmissing.NA): raise ValueError( f"cannot convert to '{dtype}'-dtype NumPy array " "with missing values. Specify an appropriate 'na_value' " "for this dtype.") # don't pass copy to astype -> always need a copy since we are mutating data = self._data.astype(dtype) data[self._mask] = na_value else: data = self._data.astype(dtype, copy=copy) return data
def sequence_to_td64ns(data, copy: bool = False, unit=None, errors="raise") -> tuple[np.ndarray, Tick | None]: """ Parameters ---------- data : list-like copy : bool, default False unit : str, optional The timedelta unit to treat integers as multiples of. For numeric data this defaults to ``'ns'``. Must be un-specified if the data contains a str and ``errors=="raise"``. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None if unit is not None: unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCMultiIndex): raise TypeError("Cannot create a DatetimeArray from a MultiIndex.") else: data = extract_array(data, extract_numpy=True) if isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=iNaT) elif not isinstance(data, (np.ndarray, ExtensionArray)): # GH#24539 e.g. xarray, dask object data = np.asarray(data) elif isinstance(data, ABCCategorical): data = data.categories.take(data.codes, fill_value=NaT)._values copy = False if isinstance(data, TimedeltaArray): inferred_freq = data.freq # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int mask = np.isnan(data) # The next few lines are effectively a vectorized 'cast_from_unit' m, p = precision_from_unit(unit or "ns") base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != TD64NS_DTYPE: # non-nano unit data = ensure_timedelta64ns(data) copy = False else: # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError( f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) assert data.dtype == "m8[ns]", data return data, inferred_freq
def sanitize_array( data, index: Index | None, dtype: DtypeObj | None = None, copy: bool = False, raise_cast_failure: bool = True, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. Parameters ---------- data : Any index : Index or None, default None dtype : np.dtype, ExtensionDtype, or None, default None copy : bool, default False raise_cast_failure : bool, default True Returns ------- np.ndarray or ExtensionArray Notes ----- raise_cast_failure=False is only intended to be True when called from the DataFrame constructor, as the dtype keyword there may be interpreted as only applying to a subset of columns, see GH#24435. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) elif isinstance(data, range): # GH#16804 data = np.arange(data.start, data.stop, data.step, dtype="int64") copy = False if not is_list_like(data): if index is None: raise ValueError( "index must be specified when data is not list-like") data = construct_1d_arraylike_from_scalar(data, len(index), dtype) return data # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr else: if isinstance(data, (set, frozenset)): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError(f"'{type(data).__name__}' type is unordered") # materialize e.g. generators, convert e.g. tuples, abc.ValueView # TODO: non-standard array-likes we can convert to ndarray more efficiently? data = list(data) if dtype is not None or len(data) == 0: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) # error: Incompatible types in assignment (expression has type # "Union[ExtensionArray, ndarray, List[Any]]", variable has type # "ExtensionArray") subarr = maybe_cast_to_datetime(subarr, dtype) # type: ignore[assignment] subarr = _sanitize_ndim(subarr, data, dtype, index) if not (isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype)): subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype( dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) subarr = extract_array(subarr, extract_numpy=True) return subarr
def _convert_to_ndarrays( self, dct: Mapping, na_values, na_fvalues, verbose: bool = False, converters=None, dtypes=None, ): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) if isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() if c in self._parse_date_cols: # GH#26203 Do not convert columns which get converted to dates # but replace nans to ensure to_datetime works mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues) np.putmask(values, mask, np.nan) result[c] = values continue if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: warnings.warn( ("Both a converter and dtype were specified " f"for column {c} - only the converter will be used."), ParserWarning, stacklevel=find_stack_level(), ) try: values = lib.map_infer(values, conv_f) except ValueError: # error: Argument 2 to "isin" has incompatible type "List[Any]"; # expected "Union[Union[ExtensionArray, ndarray], Index, Series]" mask = algorithms.isin( values, list(na_values) # type: ignore[arg-type] ).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types(values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type)): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): raise ValueError( f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass cast_type = pandas_dtype(cast_type) cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals if verbose and na_count: print(f"Filled {na_count} NA values in column {c!s}") return result
def test_is_string_dtype_nullable(nullable_string_dtype): assert com.is_string_dtype( pd.array(["a", "b"], dtype=nullable_string_dtype))
def test_not_string(self): # though PeriodDtype has object kind, it cannot be string assert not is_string_dtype(PeriodDtype("D"))
def convert_polygon_feature_to_raster(feature, pixel_size, value=1, nodata_val=-9999, snap_extent_to_pixel=True): """ Convert a feature into a raster (block grid) The "value" argument can be a numeric, or the name of a feature column. If the feature column is a string, then the index value will be used to assign a value to the raster. By default, snap_extent_to_pixel is true and is used to snap the bounding coordinates to a pixel edge. Args: feature (pandas.core.series.Series): a polygon feature to convert to a raster pixel_size (int, float]: the pixel size for the raster value (int, float, str): pixel value. A column from the feature can be used nodata_val (int, float): nodata value snap_extent_to_pixel (bool): round the extent coordinates to be divisible by the pixel size Returns: burned (numpy.ndarray) : The array representing the raster data Dict[str, int]] : The rasterio meta kwargs required for writing the array to disk. """ if not isinstance(feature, (pd.Series, geopandas.GeoSeries)): raise TypeError("Input feature should be a geopandas series") if not isinstance(pixel_size, (int, long, float)): raise TypeError('Pixel size must be an integer or floating number.') if isinstance(value, str) and 'value' not in list(feature.index): raise TypeError('Value string {} is not a column name') elif not isinstance(value, (int, long, float)): raise TypeError('Value should be a column name, or a number') transform, width, height, bbox = create_raster_transform( feature.geometry.bounds, pixel_size, snap_extent_to_pixel) gdf_feat = GeoDataFrame(GeoSeries(feature.geometry), geometry=GeoSeries(feature.geometry).geometry) # create an affine transformation matrix to associate the array to the coordinates. if value in gdf_feat.columns: if is_string_dtype(gdf_feat[value]): shapes = ((geom, val) for geom, val in zip(gdf_feat.geometry, (gdf_feat.index + 1))) else: shapes = ((geom, val) for geom, val in zip(gdf_feat.geometry, gdf_feat[value])) else: shapes = ((geom, value) for geom in gdf_feat.geometry) burned = features.rasterize(shapes=shapes, out_shape=(height, width), fill=nodata_val, transform=transform) meta = {} meta.update(height=height, width=width, transform=transform, nodata=nodata_val, dtype=rasterio.dtypes.get_minimum_dtype(burned)) return burned, meta
def test_not_string(self): # GH30568: though IntervalDtype has object kind, it cannot be string assert not is_string_dtype(IntervalDtype())
def coerce_to_array(values, dtype, mask=None, copy: bool = False) -> tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask. Parameters ---------- values : 1D list-like dtype : integer dtype mask : bool 1D array, optional copy : bool, default False if True, copy the input Returns ------- tuple of (values, mask) """ # if values is integer numpy array, preserve its dtype if dtype is None and hasattr(values, "dtype"): if is_integer_dtype(values.dtype): dtype = values.dtype if dtype is not None: if isinstance(dtype, str) and (dtype.startswith("Int") or dtype.startswith("UInt")): # Avoid DeprecationWarning from NumPy about np.dtype("Int64") # https://github.com/numpy/numpy/pull/7476 dtype = dtype.lower() if not issubclass(type(dtype), _IntegerDtype): try: dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err if isinstance(values, IntegerArray): values, mask = values._data, values._mask if dtype is not None: values = values.astype(dtype.numpy_dtype, copy=False) if copy: values = values.copy() mask = mask.copy() return values, mask values = np.array(values, copy=copy) inferred_type = None if is_object_dtype(values) or is_string_dtype(values): inferred_type = lib.infer_dtype(values, skipna=True) if inferred_type == "empty": pass elif inferred_type not in [ "floating", "integer", "mixed-integer", "integer-na", "mixed-integer-float", "string", "unicode", ]: raise TypeError( f"{values.dtype} cannot be converted to an IntegerDtype") elif is_bool_dtype(values) and is_integer_dtype(dtype): values = np.array(values, dtype=int, copy=copy) elif not (is_integer_dtype(values) or is_float_dtype(values)): raise TypeError( f"{values.dtype} cannot be converted to an IntegerDtype") if values.ndim != 1: raise TypeError("values must be a 1D list-like") if mask is None: mask = libmissing.is_numeric_na(values) else: assert len(mask) == len(values) if mask.ndim != 1: raise TypeError("mask must be a 1D list-like") # infer dtype if needed if dtype is None: dtype = np.dtype("int64") else: dtype = dtype.type # if we are float, let's make sure that we can # safely cast # we copy as need to coerce here if mask.any(): values = values.copy() values[mask] = 1 if inferred_type in ("string", "unicode"): # casts from str are always safe since they raise # a ValueError if the str cannot be parsed into an int values = values.astype(dtype, copy=copy) else: values = safe_cast(values, dtype, copy=False) return values, mask
def convert_dtypes( input_array, convert_string: bool = True, convert_integer: bool = True, convert_boolean: bool = True, ) -> Dtype: """ Convert objects to best possible type, and optionally, to types supporting ``pd.NA``. Parameters ---------- input_array : ExtensionArray or PandasArray convert_string : bool, default True Whether object dtypes should be converted to ``StringDtype()``. convert_integer : bool, default True Whether, if possible, conversion can be done to integer extension types. convert_boolean : bool, defaults True Whether object dtypes should be converted to ``BooleanDtypes()``. Returns ------- dtype new dtype """ is_extension = is_extension_array_dtype(input_array.dtype) if (convert_string or convert_integer or convert_boolean) and not is_extension: try: inferred_dtype = lib.infer_dtype(input_array) except ValueError: # Required to catch due to Period. Can remove once GH 23553 is fixed inferred_dtype = input_array.dtype if not convert_string and is_string_dtype(inferred_dtype): inferred_dtype = input_array.dtype if convert_integer: target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import _dtypes inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( input_array.dtype): inferred_dtype = target_int_dtype else: if is_integer_dtype(inferred_dtype): inferred_dtype = input_array.dtype if convert_boolean: if is_bool_dtype(input_array.dtype): inferred_dtype = "boolean" else: if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": inferred_dtype = input_array.dtype else: inferred_dtype = input_array.dtype return inferred_dtype
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters ---------- array : list-like copy : bool, default False unit : str, default "ns" The timedelta unit to treat integers as multiples of. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int mask = np.isnan(data) m, p = precision_from_unit(unit) base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != _TD_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(_TD_DTYPE) copy = False else: # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError( f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) assert data.dtype == "m8[ns]", data return data, inferred_freq
def maybe_downcast_numeric(result, dtype, do_round: bool = False): """ Subset of maybe_downcast_to_dtype restricted to numeric dtypes. Parameters ---------- result : ndarray or ExtensionArray dtype : np.dtype or ExtensionDtype do_round : bool Returns ------- ndarray or ExtensionArray """ if not isinstance(dtype, np.dtype): # e.g. SparseDtype has no itemsize attr return result if isinstance(result, list): # reached via groupoby.agg _ohlc; really this should be handled # earlier result = np.array(result) def trans(x): if do_round: return x.round() return x if dtype.kind == result.dtype.kind: # don't allow upcasts here (except if empty) if result.dtype.itemsize <= dtype.itemsize and result.size: return result if is_bool_dtype(dtype) or is_integer_dtype(dtype): if not result.size: # if we don't have any elements, just astype it return trans(result).astype(dtype) # do a test on the first element, if it fails then we are done r = result.ravel() arr = np.array([r[0]]) if isna(arr).any(): # if we have any nulls, then we are done return result elif not isinstance( r[0], (np.integer, np.floating, np.bool, int, float, bool)): # a comparable, e.g. a Decimal may slip in here return result if (issubclass(result.dtype.type, (np.object_, np.number)) and notna(result).all()): new_result = trans(result).astype(dtype) if new_result.dtype.kind == "O" or result.dtype.kind == "O": # np.allclose may raise TypeError on object-dtype if (new_result == result).all(): return new_result else: if np.allclose(new_result, result, rtol=0): return new_result elif (issubclass(dtype.type, np.floating) and not is_bool_dtype(result.dtype) and not is_string_dtype(result.dtype)): return result.astype(dtype) return result
def array_equivalent(left, right, strict_nan: bool = False, dtype_equal: bool = False) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. Parameters ---------- left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. dtype_equal : bool, default False Whether `left` and `right` are known to have the same dtype according to `is_dtype_equal`. Some methods like `BlockManager.equals`. require that the dtypes match. Setting this to ``True`` can improve performance, but will give different results for arrays that are equal but different dtypes. Returns ------- b : bool Returns True if the arrays are equivalent. Examples -------- >>> array_equivalent( ... np.array([1, 2, np.nan]), ... np.array([1, 2, np.nan])) True >>> array_equivalent( ... np.array([1, np.nan, 2]), ... np.array([1, 2, np.nan])) False """ left, right = np.asarray(left), np.asarray(right) # shape compat if left.shape != right.shape: return False if dtype_equal: # fastpath when we require that the dtypes match (Block.equals) if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): return _array_equivalent_float(left, right) elif is_datetimelike_v_numeric(left.dtype, right.dtype): return False elif needs_i8_conversion(left.dtype): return _array_equivalent_datetimelike(left, right) elif is_string_dtype(left.dtype): # TODO: fastpath for pandas' StringDtype return _array_equivalent_object(left, right, strict_nan) else: return np.array_equal(left, right) # Slow path when we allow comparing different dtypes. # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left.dtype) or is_string_dtype(right.dtype): return _array_equivalent_object(left, right, strict_nan) # NaNs can occur in float and complex arrays. if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() elif is_datetimelike_v_numeric(left, right): # GH#29553 avoid numpy deprecation warning return False elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # datetime64, timedelta64, Period if not is_dtype_equal(left.dtype, right.dtype): return False left = left.view("i8") right = right.view("i8") # if we have structured dtypes, compare first if (left.dtype.type is np.void or right.dtype.type is np.void) and left.dtype != right.dtype: return False return np.array_equal(left, right)
def sanitize_array( data, index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: # TODO: deque, array.array if isinstance(data, set): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError("Set type is unordered") data = list(data) if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif not is_list_like(data): if index is None: raise ValueError( "index must be specified when data is not list-like") subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) subarr = _sanitize_ndim(subarr, data, dtype, index) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype( dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters ---------- array : list-like copy : bool, default False unit : str, default "ns" The timedelta unit to treat integers as multiples of. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, 'dtype'): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)): inferred_freq = data.freq data = data._data # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data) or is_string_dtype(data): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data): # treat as multiples of the given unit. If after converting to nanos, # there are fractional components left, these are truncated # (i.e. NOT rounded) mask = np.isnan(data) coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns') data = (coeff * data).astype(np.int64).view('timedelta64[ns]') data[mask] = iNaT copy = False elif is_timedelta64_dtype(data): if data.dtype != _TD_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(_TD_DTYPE) copy = False elif is_datetime64_dtype(data): # GH#23539 warnings.warn( "Passing datetime64-dtype data to TimedeltaIndex is " "deprecated, will raise a TypeError in a future " "version", FutureWarning, stacklevel=3) data = ensure_int64(data).view(_TD_DTYPE) else: raise TypeError( "dtype {dtype} cannot be converted to timedelta64[ns]".format( dtype=data.dtype)) data = np.array(data, copy=copy) assert data.dtype == 'm8[ns]', data return data, inferred_freq
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced, fill_value, box): dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture if (dtype == bytes and not boxed and fill_value is not None and fill_value is not NaT): pytest.xfail("does not upcast to object") elif dtype == "uint64" and not boxed and fill_value == iNaT: pytest.xfail("does not upcast correctly") # below: opinionated that iNaT should be interpreted as missing value elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) and fill_value == iNaT): pytest.xfail("does not cast to missing value marker correctly") elif (is_string_dtype(dtype) or dtype == bool) and not boxed and fill_value == iNaT: pytest.xfail("does not cast to missing value marker correctly") if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT: # uint64 + negative int casts to object; iNaT is considered as missing expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan elif is_integer_dtype(dtype) and fill_value == iNaT: # other integer + iNaT casts to int64 expected_dtype = np.int64 exp_val_for_scalar = iNaT elif is_integer_dtype(dtype) and fill_value is not NaT: # integer + other missing value (np.nan / None) casts to float expected_dtype = np.float64 exp_val_for_scalar = np.nan elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT): # inserting into object does not cast the value # but *does* cast None to np.nan expected_dtype = np.dtype(object) exp_val_for_scalar = fill_value elif is_datetime_or_timedelta_dtype(dtype): # datetime / timedelta cast all missing values to iNaT expected_dtype = dtype exp_val_for_scalar = iNaT elif fill_value is NaT: # NaT upcasts everything that's not datetime/timedelta to object expected_dtype = np.dtype(object) exp_val_for_scalar = NaT elif is_float_dtype(dtype) or is_complex_dtype(dtype): # float / complex + missing value (!= NaT) stays the same expected_dtype = dtype exp_val_for_scalar = np.nan else: # all other cases cast to object, and use np.nan as missing value expected_dtype = np.dtype(object) exp_val_for_scalar = np.nan # array case has same expected_dtype; but returns corresponding na-marker if is_integer_dtype(expected_dtype): # integers cannot hold NaNs; maybe_promote_with_array returns None exp_val_for_array = None elif is_datetime_or_timedelta_dtype(expected_dtype): exp_val_for_array = iNaT else: # expected_dtype = float / complex / object exp_val_for_array = np.nan _check_promote( dtype, fill_value, boxed, box_dtype, expected_dtype, exp_val_for_scalar, exp_val_for_array, )
def _str_map(self, f, na_value=None, dtype: Dtype | None = None): # TODO: de-duplicate with StringArray method. This method is moreless copy and # paste. from pandas.arrays import ( BooleanArray, IntegerArray, ) if dtype is None: dtype = self.dtype if na_value is None: na_value = self.dtype.na_value mask = isna(self) arr = np.asarray(self) if is_integer_dtype(dtype) or is_bool_dtype(dtype): constructor: type[IntegerArray] | type[BooleanArray] if is_integer_dtype(dtype): constructor = IntegerArray else: constructor = BooleanArray na_value_is_na = isna(na_value) if na_value_is_na: na_value = 1 result = lib.map_infer_mask( arr, f, mask.view("uint8"), convert=False, na_value=na_value, # error: Value of type variable "_DTypeScalar" of "dtype" cannot be # "object" # error: Argument 1 to "dtype" has incompatible type # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected # "Type[object]" dtype=np.dtype(dtype), # type: ignore[type-var,arg-type] ) if not na_value_is_na: mask[:] = False return constructor(result, mask) elif is_string_dtype(dtype) and not is_object_dtype(dtype): # i.e. StringDtype result = lib.map_infer_mask(arr, f, mask.view("uint8"), convert=False, na_value=na_value) result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True) return type(self)(result) else: # This is when the result type is object. We reach this when # -> We know the result type is truly object (e.g. .encode returns bytes # or .findall returns a list). # -> We don't know the result type. E.g. `.get` can return anything. return lib.map_infer_mask(arr, f, mask.view("uint8"))
def __new__(cls, subtype=None, closed: str_type | None = None): from pandas.core.dtypes.common import ( is_string_dtype, pandas_dtype, ) if closed is not None and closed not in { "right", "left", "both", "neither" }: raise ValueError( "closed must be one of 'right', 'left', 'both', 'neither'") if isinstance(subtype, IntervalDtype): if closed is not None and closed != subtype.closed: raise ValueError( "dtype.closed and 'closed' do not match. " "Try IntervalDtype(dtype.subtype, closed) instead.") return subtype elif subtype is None: # we are called as an empty constructor # generally for pickle compat u = object.__new__(cls) u._subtype = None u._closed = closed return u elif isinstance(subtype, str) and subtype.lower() == "interval": subtype = None else: if isinstance(subtype, str): m = cls._match.search(subtype) if m is not None: gd = m.groupdict() subtype = gd["subtype"] if gd.get("closed", None) is not None: if closed is not None: if closed != gd["closed"]: raise ValueError( "'closed' keyword does not match value " "specified in dtype string") closed = gd["closed"] try: subtype = pandas_dtype(subtype) except TypeError as err: raise TypeError("could not construct IntervalDtype") from err if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype): # GH 19016 msg = ("category, object, and string subtypes are not supported " "for IntervalDtype") raise TypeError(msg) key = str(subtype) + str(closed) try: return cls._cache_dtypes[key] except KeyError: u = object.__new__(cls) u._subtype = subtype u._closed = closed cls._cache_dtypes[key] = u return u
def read_attr_all(self, gages_ids): """read data from GAGES-II parameters: gages_ids:gages sites' ids :return out:ndarray """ dir_gage_attr = self.all_configs.get("gage_files_dir") dir_out = self.all_configs.get("out_dir") f_dict = dict() # factorize dict # each key-value pair for atts in a file (list) var_dict = dict() # all attrs var_lst = list() out_lst = list() # read all attrs var_des = pd.read_csv(os.path.join(dir_gage_attr, 'variable_descriptions.txt'), sep=',') var_des_map_values = var_des['VARIABLE_TYPE'].tolist() for i in range(len(var_des)): var_des_map_values[i] = var_des_map_values[i].lower() # sort by type key_lst = list(set(var_des_map_values)) key_lst.sort(key=var_des_map_values.index) # remove x_region_names key_lst.remove('x_region_names') for key in key_lst: # in "spreadsheets-in-csv-format" directory, the name of "flow_record" file is conterm_flowrec.txt if key == 'flow_record': key = 'flowrec' data_file = os.path.join(dir_gage_attr, 'conterm_' + key + '.txt') # remove some unused atttrs in bas_classif if key == 'bas_classif': data_temp = pd.read_csv(data_file, sep=',', dtype={'STAID': str}, usecols=range(0, 4)) else: data_temp = pd.read_csv(data_file, sep=',', dtype={'STAID': str}) if key == 'flowrec': # remove final column which is nan data_temp = data_temp.iloc[:, range(0, data_temp.shape[1] - 1)] # all attrs in files var_lst_temp = list(data_temp.columns[1:]) var_dict[key] = var_lst_temp var_lst.extend(var_lst_temp) k = 0 n_gage = len(gages_ids) out_temp = np.full( [n_gage, len(var_lst_temp)], np.nan) # 1d:sites,2d: attrs in current data_file # sites intersection,ind2 is the index of sites in conterm_ files,set them in out_temp range1 = gages_ids range2 = data_temp.iloc[:, 0].astype(str).tolist() assert (all(x < y for x, y in zip(range2, range2[1:]))) c, ind1, ind2 = np.intersect1d(range1, range2, return_indices=True) for field in var_lst_temp: if is_string_dtype( data_temp[field]): # str vars -> categorical vars value, ref = pd.factorize(data_temp.loc[ind2, field], sort=True) out_temp[:, k] = value f_dict[field] = ref.tolist() elif is_numeric_dtype(data_temp[field]): out_temp[:, k] = data_temp.loc[ind2, field].values k = k + 1 out_lst.append(out_temp) out = np.concatenate(out_lst, 1) return out, var_lst, var_dict, f_dict
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters ---------- array : list-like copy : bool, default False unit : str, default "ns" The timedelta unit to treat integers as multiples of. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, 'dtype'): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int mask = np.isnan(data) m, p = precision_from_unit(unit) base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != _TD_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(_TD_DTYPE) copy = False elif is_datetime64_dtype(data): # GH#23539 warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " "deprecated, will raise a TypeError in a future " "version", FutureWarning, stacklevel=4) data = ensure_int64(data).view(_TD_DTYPE) else: raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" .format(dtype=data.dtype)) data = np.array(data, copy=copy) if data.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") assert data.dtype == 'm8[ns]', data return data, inferred_freq
def autogen_schema(X, ordinal_max_items=2, feature_names=None, feature_types=None): """ Generates data schema for a given dataset as JSON representable. Args: X: Dataframe/ndarray to build schema from. ordinal_max_items: If a numeric column's cardinality is at most this integer, consider it as ordinal instead of continuous. feature_names: Feature names feature_types: Feature types Returns: A dictionary - schema that encapsulates column information, such as type and domain. """ schema = OrderedDict() col_number = 0 if isinstance(X, np.ndarray): log.warning( "Passing a numpy array to schema autogen when it should be dataframe." ) if feature_names is None: X = pd.DataFrame(X, columns=[ "col_" + str(i) for i in range(X.shape[1]) ]).infer_objects() else: X = pd.DataFrame(X, columns=feature_names).infer_objects() if isinstance(X, NDFrame): for name, col_dtype in zip(X.dtypes.index, X.dtypes): schema[name] = {} if is_numeric_dtype(col_dtype): # schema[name]['type'] = 'continuous' # TODO: Fix this once we know it works. if len(set(X[name])) > ordinal_max_items: schema[name]["type"] = "continuous" else: # TODO: Work with ordinal later. schema[name]["type"] = "categorical" # schema[name]['type'] = 'ordinal' # schema[name]['order'] = list(set(X[name])) elif is_string_dtype(col_dtype): schema[name]["type"] = "categorical" else: warnings.warn("Unknown column: " + name, RuntimeWarning) schema[name]["type"] = "unknown" schema[name]["column_number"] = col_number col_number += 1 # Override if feature_types is passed as arg. if feature_types is not None: for idx, name in enumerate(X.dtypes.index): schema[name]["type"] = feature_types[idx] else: raise TypeError( "GA2M only supports numpy arrays or pandas dataframes.") return schema
def test_not_string(self): # though PeriodDtype has object kind, it cannot be string assert not is_string_dtype(PeriodDtype('D'))
def sequence_to_td64ns( data, copy: bool = False, unit=None, errors="raise" ) -> tuple[np.ndarray, Tick | None]: """ Parameters ---------- data : list-like copy : bool, default False unit : str, optional The timedelta unit to treat integers as multiples of. For numeric data this defaults to ``'ns'``. Must be un-specified if the data contains a str and ``errors=="raise"``. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ assert unit not in ["Y", "y", "M"] # caller is responsible for checking inferred_freq = None if unit is not None: unit = parse_timedelta_unit(unit) data, copy = dtl.ensure_arraylike_for_datetimelike( data, copy, cls_name="TimedeltaArray" ) if isinstance(data, TimedeltaArray): inferred_freq = data.freq # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = _objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int mask = np.isnan(data) # The next few lines are effectively a vectorized 'cast_from_unit' m, p = precision_from_unit(unit or "ns") base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != TD64NS_DTYPE: # non-nano unit data = astype_overflowsafe(data, dtype=TD64NS_DTYPE) copy = False else: # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError(f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) assert data.dtype == "m8[ns]", data return data, inferred_freq