def test_datetimetz_dtype(self):
    """pandas_dtype must map tz-aware datetime64 strings onto DatetimeTZDtype."""
    tz_specs = (
        'datetime64[ns, US/Eastern]',
        'datetime64[ns, Asia/Tokyo]',
        'datetime64[ns, UTC]',
    )
    for spec in tz_specs:
        resolved = com.pandas_dtype(spec)
        expected = DatetimeTZDtype(spec)
        # Same object, equal to the constructed dtype, and equal to the
        # original string spelling.
        assert resolved is expected
        assert resolved == expected
        assert resolved == spec
def test_invalid_dtype_error(self):
    """Unknown dtype specs raise TypeError; known ones are accepted."""
    msg = 'not understood'

    for bad_spec in (pd.Timestamp, 'pd.Timestamp', list):
        with tm.assert_raises_regex(TypeError, msg):
            com.pandas_dtype(bad_spec)

    # None of these may raise.
    accepted = (
        object,
        'float64',
        np.object_,
        np.dtype('object'),
        'O',
        np.float64,
        float,
        np.dtype('float64'),
    )
    for good_spec in accepted:
        com.pandas_dtype(good_spec)
def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``, special-casing timedelta64 targets.

    * ``timedelta64[ns]`` returns ``self`` (or a copy when ``copy`` is
      truthy).
    * any other timedelta64 unit returns numeric values rather than
      timedeltas (pandas convention: this is essentially a division).
    * every other dtype is handed to ``DatetimeLikeArrayMixin.astype``.
    """
    dtype = pandas_dtype(dtype)

    if is_timedelta64_ns_dtype(dtype):
        return self.copy() if copy else self

    if is_timedelta64_dtype(dtype):
        # Non-nano unit: the result is expressed as multiples of that unit.
        if self._hasnans:
            # copy=False here avoids double-copying; the masking helper is
            # asked to convert to float64.
            raw = self._data.astype(dtype, copy=False)
            return self._maybe_mask_results(raw, fill_value=None,
                                            convert='float64')
        converted = self._data.astype(dtype, copy=copy)
        return converted.astype('i8')

    return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
def validate_dtype_freq(dtype, freq):
    """
    Reconcile an optional period dtype with an optional frequency.

    When both are given they must agree; when only ``dtype`` is given its
    frequency is used.

    Parameters
    ----------
    dtype : dtype
    freq : DateOffset or None

    Returns
    -------
    freq : DateOffset

    Raises
    ------
    ValueError : non-period dtype
    IncompatibleFrequency : mismatch between dtype and freq
    """
    offset = None if freq is None else frequencies.to_offset(freq)

    if dtype is None:
        return offset

    period_dtype = pandas_dtype(dtype)
    if not is_period_dtype(period_dtype):
        raise ValueError('dtype must be PeriodDtype')

    if offset is None:
        return period_dtype.freq

    if offset != period_dtype.freq:
        raise IncompatibleFrequency('specified freq and dtype '
                                    'are different')
    return offset
def __new__(cls, subtype=None):
    """
    Return the cached IntervalDtype for ``subtype``, creating it if needed.

    Parameters
    ----------
    subtype : the dtype of the Interval
    """
    if isinstance(subtype, IntervalDtype):
        return subtype

    is_text = isinstance(subtype, compat.string_types)
    if subtype is None or (is_text and subtype == 'interval'):
        subtype = None
    else:
        if is_text:
            # Pull the subtype out of the string form when the class
            # pattern matches; otherwise the string is used as-is.
            found = cls._match.search(subtype)
            if found is not None:
                subtype = found.group('subtype')

        from pandas.core.dtypes.common import pandas_dtype
        try:
            subtype = pandas_dtype(subtype)
        except TypeError:
            raise ValueError("could not construct IntervalDtype")

    cache_key = str(subtype)
    try:
        return cls._cache[cache_key]
    except KeyError:
        instance = object.__new__(cls)
        instance.subtype = subtype
        cls._cache[cache_key] = instance
        return instance
def test_astype(dtype):
    """astype on a period array must yield exactly the requested dtype."""
    # Ordinals have to be cast correctly for both int32 and int64.
    periods = period_array(['2000', '2001', None], freq='D')
    # pandas_dtype is needed so int32 vs. int64 is compared correctly.
    assert periods.astype(dtype).dtype == pandas_dtype(dtype)
def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``.

    Period[T] -> Period[U] is handled here through ``asfreq``; every other
    target dtype is delegated to the parent class.
    """
    dtype = pandas_dtype(dtype)
    if not is_period_dtype(dtype):
        return super(PeriodArray, self).astype(dtype, copy=copy)
    return self.asfreq(dtype.freq)
def unconvert(values, dtype, compress=None):
    """
    Rebuild array data from its serialized form.

    Parameters
    ----------
    values : ExtType, or an object with ``.encode``
        Serialized payload. An ``ExtType`` with ``code == 0`` is used
        through its ``.data`` attribute as-is; anything else is encoded
        to latin1 before decoding.
    dtype : dtype-like
        Target dtype. Categorical dtypes return ``values`` untouched and
        object dtypes return an object ndarray.
    compress : {u'zlib', u'blosc'} or falsy, default None
        Compression applied to the payload, if any.

    Returns
    -------
    ``values`` itself (categorical dtype) or a numpy array.

    Raises
    ------
    ValueError
        If ``compress`` is truthy but neither u'zlib' nor u'blosc'.
    """
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            # Fast path: build the array directly on the decompressed buffer.
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to the copying `np.frombuffer` path below

    # Copy the bytes into a numpy array.
    buf = np.frombuffer(values, dtype=dtype)
    buf = buf.copy()  # required to not mutate the original data
    buf.flags.writeable = True
    return buf
def test_dtype_equal_strict():
    """is_dtype_equal must not treat merely related dtypes as equal."""
    # we are strict on kind equality
    for narrower_int in (np.int8, np.int16, np.int32):
        assert not com.is_dtype_equal(np.int64, narrower_int)

    assert not com.is_dtype_equal(np.float64, np.float32)

    # strict w.r.t. PeriodDtype
    daily = PeriodDtype('D')
    two_daily = PeriodDtype('2D')
    assert not com.is_dtype_equal(daily, two_daily)

    # strict w.r.t. datetime64
    eastern = com.pandas_dtype('datetime64[ns, US/Eastern]')
    cet = com.pandas_dtype('datetime64[ns, CET]')
    assert not com.is_dtype_equal(eastern, cet)

    # see gh-15941: no exception should be raised
    assert not com.is_dtype_equal(None, None)
def astype(self, dtype, copy=True, how='start'):
    """
    Cast the index to ``dtype``.

    Integer targets return the int64 index (copied when ``copy`` is
    truthy), datetime64 targets go through ``to_timestamp(how=how)`` and
    are localized to the dtype's ``tz`` (``None`` if it has none), and
    period targets use ``asfreq``. Everything else is delegated to the
    parent class.
    """
    dtype = pandas_dtype(dtype)

    if is_integer_dtype(dtype):
        if copy:
            return self._int64index.copy()
        return self._int64index

    if is_datetime64_any_dtype(dtype):
        stamps = self.to_timestamp(how=how)
        return stamps.tz_localize(getattr(dtype, 'tz', None))

    if is_period_dtype(dtype):
        return self.asfreq(freq=dtype.freq)

    return super(PeriodIndex, self).astype(dtype, copy=copy)
def astype(self, dtype, copy=True):
    """
    Cast the index to ``dtype``.

    A timedelta64 dtype other than nanoseconds produces a plain ``Index``
    (the conversion is essentially a division); all other dtypes are
    delegated to the parent class.
    """
    dtype = pandas_dtype(dtype)

    wants_other_unit = (is_timedelta64_dtype(dtype)
                        and not is_timedelta64_ns_dtype(dtype))
    if not wants_other_unit:
        return super(TimedeltaIndex, self).astype(dtype, copy=copy)

    converted = self.values.astype(dtype, copy=copy)
    if self.hasnans:
        # With missing values the masking helper is asked for float64.
        masked = self._maybe_mask_results(converted, convert='float64')
        return Index(masked, name=self.name)
    return Index(converted.astype('i8'), name=self.name)
def astype(self, dtype, copy=True):
    """
    Cast the index to ``dtype``, rejecting conversions that cannot work.

    Raises
    ------
    TypeError
        If ``needs_i8_conversion(dtype)`` is true.
    ValueError
        If ``dtype`` is an integer dtype and the index has NA values.
    """
    dtype = pandas_dtype(dtype)

    if needs_i8_conversion(dtype):
        template = ('Cannot convert Float64Index to dtype {dtype}; integer '
                    'values are required for conversion')
        raise TypeError(template.format(dtype=dtype))

    # GH 13149
    if is_integer_dtype(dtype) and self.hasnans:
        raise ValueError('Cannot convert NA to integer')

    return super(Float64Index, self).astype(dtype, copy=copy)
def astype(self, dtype, copy=True, how='start'):
    """
    Cast the index to ``dtype``.

    datetime64 targets are produced with ``to_timestamp(how=how)`` and
    localized to the dtype's ``tz`` (``None`` if it has none). All other
    targets are delegated to the parent class, which does not receive
    ``how``.
    """
    dtype = pandas_dtype(dtype)

    if not is_datetime64_any_dtype(dtype):
        # TODO: should probably raise on `how` here, so we don't ignore it.
        return super(PeriodIndex, self).astype(dtype, copy=copy)

    # 'how' is index-specific, isn't part of the EA interface.
    target_tz = getattr(dtype, 'tz', None)
    return self.to_timestamp(how=how).tz_localize(target_tz)
def astype(self, dtype, copy=True):
    """
    Cast the index to ``dtype``.

    The non-nanosecond timedelta64 check is repeated here so that a
    numeric ``Index`` can be returned for those dtypes. Everything else
    goes to ``DatetimeIndexOpsMixin.astype``.
    """
    dtype = pandas_dtype(dtype)

    wants_other_unit = (is_timedelta64_dtype(dtype)
                        and not is_timedelta64_ns_dtype(dtype))
    if not wants_other_unit:
        return DatetimeIndexOpsMixin.astype(self, dtype, copy=copy)

    converted = self._data.astype(dtype, copy=copy)
    if not self.hasnans:
        # Without missing values the result is re-cast to int64.
        converted = converted.astype('i8')
    return Index(converted, name=self.name)
def astype(self, dtype, copy=True):
    """
    Cast the index to ``dtype``, rejecting conversions that cannot work.

    Raises
    ------
    TypeError
        If ``needs_i8_conversion(dtype)`` is true.
    ValueError
        If ``dtype`` is a non-extension integer dtype and the index has
        NA values.
    """
    dtype = pandas_dtype(dtype)

    if needs_i8_conversion(dtype):
        template = ('Cannot convert Float64Index to dtype {dtype}; integer '
                    'values are required for conversion')
        raise TypeError(template.format(dtype=dtype))

    plain_integer = (is_integer_dtype(dtype)
                     and not is_extension_array_dtype(dtype))
    if plain_integer and self.hasnans:
        # TODO(jreback); this can change once we have an EA Index type
        # GH 13149
        raise ValueError('Cannot convert NA to integer')

    return super().astype(dtype, copy=copy)
def astype(self, dtype, copy=True, how='start'):
    """
    Cast the index to ``dtype``.

    datetime64 targets are special-cased through ``to_timestamp``; any
    other dtype is obtained by casting the underlying values and wrapping
    the result in an ``Index``.
    """
    dtype = pandas_dtype(dtype)

    if not is_datetime64_any_dtype(dtype):
        # Fallback: astype the values, then wrap them without copying again.
        casted = self._data.astype(dtype, copy=copy)
        return Index(casted, name=self.name, dtype=dtype, copy=False)

    # 'how' is index-specific, isn't part of the EA interface.
    target_tz = getattr(dtype, 'tz', None)
    return self.to_timestamp(how=how).tz_localize(target_tz)
def _validate_td64_dtype(dtype):
    """
    Normalize ``dtype`` and require it to equal ``_TD_DTYPE``.

    A unit-less ``np.dtype("timedelta64")`` is accepted with a
    FutureWarning and replaced by ``_TD_DTYPE``.

    Returns
    -------
    The validated dtype.

    Raises
    ------
    ValueError
        If the (possibly replaced) dtype is not equal to ``_TD_DTYPE``.
    """
    dtype = pandas_dtype(dtype)
    if is_dtype_equal(dtype, np.dtype("timedelta64")):
        dtype = _TD_DTYPE
        # NOTE(review): the line breaks inside this literal were not
        # recoverable from the collapsed source -- confirm against history.
        msg = textwrap.dedent("""\
            Passing in 'timedelta' dtype with no precision is deprecated
            and will raise in a future version. Please
            pass in 'timedelta64[ns]' instead.""")
        warnings.warn(msg, FutureWarning, stacklevel=4)

    if not is_dtype_equal(dtype, _TD_DTYPE):
        raise ValueError(_BAD_DTYPE.format(dtype=dtype))

    return dtype
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None,
                verify_integrity=True):
    """
    Build an instance from left/right bounds after validating them.

    ``closed`` falls back to 'right' when falsy. When ``dtype`` is given
    it must be an interval dtype, and a non-None subtype is applied to
    both sides. ``_validate`` is run on the result unless
    ``verify_integrity`` is falsy.

    Raises
    ------
    TypeError
        ``dtype`` is not an interval dtype, or the bounds have a
        categorical/string dtype.
    ValueError
        The bounds differ in type, are period indexes, or are datetime
        indexes whose time zones differ.
    """
    result = IntervalMixin.__new__(cls)

    closed = closed or 'right'
    left = ensure_index(left, copy=copy)
    right = ensure_index(right, copy=copy)

    if dtype is not None:
        # GH 19262: dtype must be an IntervalDtype to override inferred
        dtype = pandas_dtype(dtype)
        if not is_interval_dtype(dtype):
            raise TypeError(
                'dtype must be an IntervalDtype, got {dtype}'.format(
                    dtype=dtype))
        if dtype.subtype is not None:
            left = left.astype(dtype.subtype)
            right = right.astype(dtype.subtype)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    elif is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    # Each check below raises, so they can be written as independent guards.
    if type(left) != type(right):
        template = ('must not have differing left [{ltype}] and right '
                    '[{rtype}] types')
        raise ValueError(template.format(ltype=type(left).__name__,
                                         rtype=type(right).__name__))

    if is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
        # GH 19016
        raise TypeError('category, object, and string subtypes are not '
                        'supported for IntervalArray')

    if isinstance(left, ABCPeriodIndex):
        raise ValueError(
            'Period dtypes are not supported, use a PeriodIndex instead')

    if (isinstance(left, ABCDatetimeIndex)
            and str(left.tz) != str(right.tz)):
        template = ("left and right must have the same time zone, got "
                    "'{left_tz}' and '{right_tz}'")
        raise ValueError(template.format(left_tz=left.tz,
                                         right_tz=right.tz))

    result._left = left
    result._right = right
    result._closed = closed
    if verify_integrity:
        result._validate()
    return result
def astype(self, dtype, copy=True):
    """
    Cast the index to a float, integer or object dtype.

    Raises
    ------
    ValueError
        Integer target while the index has NaN values.
    TypeError
        Any target other than float, integer or object.
    """
    dtype = pandas_dtype(dtype)

    # Work out what the underlying values are cast to; the cast itself is
    # the same call in every supported case.
    if is_float_dtype(dtype):
        cast_to = dtype
    elif is_integer_dtype(dtype):
        if self.hasnans:
            raise ValueError('cannot convert float NaN to integer')
        cast_to = dtype
    elif is_object_dtype(dtype):
        cast_to = 'object'
    else:
        raise TypeError('Setting %s dtype to anything other than '
                        'float64 or object is not supported' %
                        self.__class__)

    values = self._values.astype(cast_to, copy=copy)
    return Index(values, name=self.name, dtype=dtype)
def astype(self, dtype, copy=True, how='start'):
    """
    Cast the index to object, integer, datetime64 (naive or tz-aware) or
    period dtype.

    Raises
    ------
    ValueError
        For any other target dtype.
    """
    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return self.asobject

    if is_integer_dtype(dtype):
        return self._int64index.copy() if copy else self._int64index

    if is_datetime64_dtype(dtype):
        return self.to_timestamp(how=how)

    if is_datetime64tz_dtype(dtype):
        naive = self.to_timestamp(how=how)
        return naive.tz_localize(dtype.tz)

    if is_period_dtype(dtype):
        return self.asfreq(freq=dtype.freq)

    raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def astype(self, dtype, copy=True):
    """
    Cast the index to a float, integer, object or categorical dtype.

    Raises
    ------
    ValueError
        Integer target while the index has NaN values.
    TypeError
        Any other target dtype.
    """
    dtype = pandas_dtype(dtype)

    if is_float_dtype(dtype):
        cast_to = dtype
    elif is_integer_dtype(dtype):
        if self.hasnans:
            raise ValueError('cannot convert float NaN to integer')
        cast_to = dtype
    elif is_object_dtype(dtype):
        cast_to = 'object'
    elif is_categorical_dtype(dtype):
        # Categorical targets get their own index type.
        return CategoricalIndex(self, name=self.name, dtype=dtype,
                                copy=copy)
    else:
        raise TypeError(('Setting {cls} dtype to anything other than '
                         'float64, object, or category is not supported')
                        .format(cls=self.__class__))

    values = self._values.astype(cast_to, copy=copy)
    return Index(values, name=self.name, dtype=dtype)
def __new__(cls, subtype=None):
    """
    Return the IntervalDtype for ``subtype``, using the class cache.

    Parameters
    ----------
    subtype : the dtype of the Interval
    """
    from pandas.core.dtypes.common import (
        is_categorical_dtype, is_string_dtype, pandas_dtype)

    if isinstance(subtype, IntervalDtype):
        return subtype

    if subtype is None:
        # we are called as an empty constructor
        # generally for pickle compat; this instance is not cached
        blank = object.__new__(cls)
        blank.subtype = None
        return blank

    is_text = isinstance(subtype, compat.string_types)
    if is_text and subtype.lower() == 'interval':
        subtype = None
    else:
        if is_text:
            found = cls._match.search(subtype)
            if found is not None:
                subtype = found.group('subtype')

        try:
            subtype = pandas_dtype(subtype)
        except TypeError:
            raise TypeError("could not construct IntervalDtype")

    if is_categorical_dtype(subtype) or is_string_dtype(subtype):
        # GH 19016
        raise TypeError('category, object, and string subtypes are not '
                        'supported for IntervalDtype')

    cache_key = str(subtype)
    try:
        return cls._cache[cache_key]
    except KeyError:
        instance = object.__new__(cls)
        instance.subtype = subtype
        cls._cache[cache_key] = instance
        return instance
def astype(self, dtype, copy=True, how='start'):
    """
    Cast the index to object, integer, datetime64 (naive or tz-aware),
    period or categorical dtype.

    Raises
    ------
    TypeError
        For any other target dtype.
    """
    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return self._box_values_as_index()

    if is_integer_dtype(dtype):
        return self._int64index.copy() if copy else self._int64index

    if is_datetime64_dtype(dtype):
        return self.to_timestamp(how=how)

    if is_datetime64tz_dtype(dtype):
        naive = self.to_timestamp(how=how)
        return naive.tz_localize(dtype.tz)

    if is_period_dtype(dtype):
        return self.asfreq(freq=dtype.freq)

    if is_categorical_dtype(dtype):
        return CategoricalIndex(self.values, name=self.name, dtype=dtype,
                                copy=copy)

    raise TypeError('Cannot cast PeriodIndex to dtype %s' % dtype)
def astype(self, dtype, copy=True):
    """
    Cast to an ExtensionArray or NumPy array with dtype 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.

    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only if the old dtype does not match the
        new dtype.

    Returns
    -------
    array : ExtensionArray or ndarray
        ExtensionArray or NumPy ndarray with 'dtype' for its dtype.

    Raises
    ------
    TypeError
        If the interval subtypes are incompatible, or the values cannot
        be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)
    if is_interval_dtype(dtype):
        if dtype == self.dtype:
            return self.copy() if copy else self

        # need to cast to different subtype
        try:
            new_left = self.left.astype(dtype.subtype)
            new_right = self.right.astype(dtype.subtype)
        except TypeError:
            msg = ('Cannot convert {dtype} to {new_dtype}; subtypes are '
                   'incompatible')
            raise TypeError(msg.format(dtype=self.dtype, new_dtype=dtype))
        return self._shallow_copy(new_left, new_right)
    elif is_categorical_dtype(dtype):
        # Forward the requested dtype so its categories / ordered settings
        # are honoured instead of being re-inferred from the values.
        return Categorical(np.asarray(self), dtype=dtype)
    # TODO: This try/except will be repeated.
    try:
        return np.asarray(self).astype(dtype, copy=copy)
    except (TypeError, ValueError):
        msg = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``.

    Object, string, integer, categorical and period targets each have a
    dedicated path; anything left over is converted with ``np.asarray``.

    Raises
    ------
    TypeError
        For a float target, or for a datetime/timedelta target that
        differs from ``self.dtype``.
    """
    # TODO: Figure out something better here...
    # We have DatetimeLikeArrayMixin ->
    #     super(...), which ends up being... DatetimeIndexOpsMixin?
    # this is complicated.
    # need a pandas_astype(arr, dtype).
    from pandas import Categorical

    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return np.asarray(self, dtype=object)

    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return self._format_native_types()

    if is_integer_dtype(dtype):
        ints = self._data
        if ints.dtype != dtype:
            # int32 vs. int64
            ints = ints.astype(dtype)
        elif copy:
            ints = ints.copy()
        return ints

    changes_datetimelike = (is_datetime_or_timedelta_dtype(dtype)
                            and not is_dtype_equal(self.dtype, dtype))
    if changes_datetimelike or is_float_dtype(dtype):
        # disallow conversion between datetime/timedelta,
        # and conversions for any datetimelike to float
        template = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(template.format(name=type(self).__name__,
                                        dtype=dtype))

    if is_categorical_dtype(dtype):
        return Categorical(self, dtype=dtype)

    if is_period_dtype(dtype):
        return self.asfreq(dtype.freq)

    return np.asarray(self, dtype=dtype)
def __init__(self, dtype=np.float64, fill_value=None):
    # type: (Union[str, np.dtype, 'ExtensionDtype', type], Any) -> None
    """
    Store the subtype and fill value of a sparse dtype.

    An instance of this same class may be passed as ``dtype``; its
    subtype is used, and its fill value too when none is given. String
    dtypes are stored as object dtype. A missing ``fill_value`` defaults
    to ``na_value_for_dtype(dtype)``.

    Raises
    ------
    ValueError
        If the resulting fill value is not a scalar.
    """
    from pandas.core.dtypes.missing import na_value_for_dtype
    from pandas.core.dtypes.common import (
        pandas_dtype, is_string_dtype, is_scalar
    )

    if isinstance(dtype, type(self)):
        # Unwrap an existing instance.
        fill_value = dtype.fill_value if fill_value is None else fill_value
        dtype = dtype.subtype

    subtype = pandas_dtype(dtype)
    if is_string_dtype(subtype):
        subtype = np.dtype('object')

    if fill_value is None:
        fill_value = na_value_for_dtype(subtype)

    if not is_scalar(fill_value):
        raise ValueError("fill_value must be a scalar. Got {} "
                         "instead".format(fill_value))

    self._dtype = subtype
    self._fill_value = fill_value
def astype(self, dtype, copy=True):
    """
    Cast the index to object, timedelta64, integer or categorical dtype.

    Raises
    ------
    TypeError
        For any other target dtype.
    """
    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return self._box_values_as_index()

    if is_timedelta64_ns_dtype(dtype):
        # Identity test against True: other truthy values do not copy.
        return self.copy() if copy is True else self

    if is_timedelta64_dtype(dtype):
        # return an index (essentially this is division)
        converted = self.values.astype(dtype, copy=copy)
        if self.hasnans:
            masked = self._maybe_mask_results(converted, convert='float64')
            return Index(masked, name=self.name)
        return Index(converted.astype('i8'), name=self.name)

    if is_integer_dtype(dtype):
        as_int64 = self.values.astype('i8', copy=copy)
        return Index(as_int64, dtype='i8', name=self.name)

    if is_categorical_dtype(dtype):
        return CategoricalIndex(self.values, name=self.name, dtype=dtype,
                                copy=copy)

    raise TypeError('Cannot cast TimedeltaIndex to dtype %s' % dtype)
def astype(self, dtype, copy=True):
    """
    Cast to ``dtype``.

    A target of the same dtype class rebuilds the array from ``_data``
    with the target's ``context``; everything else is delegated to the
    parent class.
    """
    dtype = pandas_dtype(dtype)
    if not isinstance(dtype, type(self.dtype)):
        return super().astype(dtype, copy=copy)
    return type(self)(self._data, context=dtype.context)
def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
    """
    Cast the elements of an array to a given dtype in a nan-safe manner.

    Parameters
    ----------
    arr : ndarray
    dtype : np.dtype
        Anything that is not already an ``np.dtype`` is passed through
        ``pandas_dtype`` first; extension dtypes are dispatched to their
        array type.
    copy : bool, default True
        If False, a view will be attempted but may fail, if
        e.g. the item sizes don't align.
    skipna: bool, default False
        Whether or not we should skip NaN when casting as a string-type.

    Raises
    ------
    ValueError
        The dtype was a datetime64/timedelta64 dtype, but it had no unit;
        or NaT / non-finite values were asked to become integers.
    TypeError
        A datetime64 or timedelta64 array was cast to an unsupported dtype.
    """
    # dispatch on extension dtype if needed
    if is_extension_array_dtype(dtype):
        return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy)

    if not isinstance(dtype, np.dtype):
        dtype = pandas_dtype(dtype)

    if issubclass(dtype.type, str):
        return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)

    elif is_datetime64_dtype(arr):
        if is_object_dtype(dtype):
            return tslib.ints_to_pydatetime(arr.view(np.int64))
        elif dtype == np.int64:
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        # allow frequency conversions
        if dtype.kind == "M":
            return arr.astype(dtype)

        raise TypeError(
            f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]")

    elif is_timedelta64_dtype(arr):
        if is_object_dtype(dtype):
            return tslibs.ints_to_pytimedelta(arr.view(np.int64))
        elif dtype == np.int64:
            if isna(arr).any():
                raise ValueError("Cannot convert NaT values to integer")
            return arr.view(dtype)

        if dtype not in [_INT64_DTYPE, _TD_DTYPE]:

            # allow frequency conversions
            # we return a float here!
            if dtype.kind == "m":
                mask = isna(arr)
                result = arr.astype(dtype).astype(np.float64)
                result[mask] = np.nan
                return result
        elif dtype == _TD_DTYPE:
            return arr.astype(_TD_DTYPE, copy=copy)

        # Anything that did not return above is an unsupported target.
        raise TypeError(
            f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

    elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(
            dtype, np.integer):

        # Validation only: a finite float array falls through to the
        # generic cast at the bottom of the function.
        if not np.isfinite(arr).all():
            raise ValueError(
                "Cannot convert non-finite values (NA or inf) to integer")

    elif is_object_dtype(arr):

        # work around NumPy brokenness, #1987
        if np.issubdtype(dtype.type, np.integer):
            return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)

        # if we have a datetime/timedelta array of objects
        # then coerce to a proper dtype and recall astype_nansafe

        elif is_datetime64_dtype(dtype):
            from pandas import to_datetime
            return astype_nansafe(to_datetime(arr).values, dtype, copy=copy)
        elif is_timedelta64_dtype(dtype):
            from pandas import to_timedelta
            return astype_nansafe(to_timedelta(arr).values, dtype, copy=copy)

    # Reached by every branch above that neither returned nor raised.
    if dtype.name in ("datetime64", "timedelta64"):
        msg = (f"The '{dtype.name}' dtype has no unit. Please pass in "
               f"'{dtype.name}[ns]' instead.")
        raise ValueError(msg)

    if copy or is_object_dtype(arr) or is_object_dtype(dtype):
        # Explicit copy, or required since NumPy can't view from / to object.
        return arr.astype(dtype, copy=True)

    return arr.view(dtype)
def sanitize_array(data, index, dtype=None, copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.

    Parameters
    ----------
    data : array-like, list, tuple, range or scalar
        Masked arrays are filled (or copied) first, then the underlying
        array is extracted with ``extract_array``.
    index : object with ``len`` or None
        Its length is used to broadcast a scalar or a 1-element array.
    dtype : dtype-like, optional
        Passed through ``pandas_dtype`` when given.
    copy : bool, default False
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``; for non-empty lists/tuples a failed
        cast is re-raised only when this is truthy.

    Returns
    -------
    The sanitized array. An ExtensionArray input returns early (cast
    and/or copied as requested). A 0-dim result with ``index=None`` is
    returned as ``subarr.item()``.

    Raises
    ------
    Exception
        If ``data`` is an ndarray with more than one dimension.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy,
                                   raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarrays,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            # astype is called with its default copy behaviour, so no
            # extra copy is made afterwards. Previously a following
            # `if copy: subarr = data.copy()` discarded this cast.
            subarr = data.astype(dtype)
        elif copy:
            # Copy the (possibly unwrapped) values, not the wrapper.
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                # Best effort: fall back to object dtype and re-infer.
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                # Mixed frequencies: keep the object array as it is.
                pass

    return subarr
def astype(self, dtype=None, copy=True):
    """
    Change the dtype of a SparseArray.

    The output will always be a SparseArray. To convert to a dense
    ndarray with a certain dtype, use :meth:`numpy.asarray`.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        For SparseDtype, this changes the dtype of
        ``self.sp_values`` and the ``self.fill_value``.

        For other dtypes, this only changes the dtype of
        ``self.sp_values``.

    copy : bool, default True
        Whether to ensure a copy is made, even if not necessary.

    Returns
    -------
    SparseArray

    Examples
    --------
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    >>> arr.astype(np.dtype('int32'))
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Using a NumPy dtype with a different kind (e.g. float) will coerce
    just ``self.sp_values``.

    >>> arr.astype(np.dtype('float64'))
    ... # doctest: +NORMALIZE_WHITESPACE
    [0, 0, 1.0, 2.0]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Use a SparseDtype if you wish to be change the fill value as well.

    >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
    ... # doctest: +NORMALIZE_WHITESPACE
    [nan, nan, 1.0, 2.0]
    Fill: nan
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """
    requested = pandas_dtype(dtype)
    if isinstance(requested, SparseDtype):
        sparse_dtype = requested
    else:
        # A plain dtype keeps the current fill value.
        sparse_dtype = SparseDtype(requested, fill_value=self.fill_value)

    new_values = astype_nansafe(self.sp_values, sparse_dtype.subtype,
                                copy=copy)
    if copy and new_values is self.sp_values:
        # The cast handed back the very same object; copy explicitly.
        new_values = new_values.copy()

    return self._simple_new(new_values, self.sp_index, sparse_dtype)
def _convert_to_ndarrays(
    self,
    dct: dict,
    na_values,
    na_fvalues,
    verbose: bool = False,
    converters=None,
    dtypes=None,
):
    """
    Convert each column in ``dct`` and return the results keyed by column.

    Parameters
    ----------
    dct : dict
        Maps column key to the raw values of that column.
    na_values, na_fvalues
        Forwarded to ``_get_na_values`` per column when ``self.na_filter``
        is truthy; otherwise empty sets are used.
    verbose : bool, default False
        Print the number of NA values filled for each column that has any.
    converters : mapping or None
        Optional per-column callable. When one applies it is run on the
        raw values and any dtype for that column is ignored (with a
        ParserWarning).
    dtypes : dict, single dtype or None
        A dict is looked up per column; anything else applies to every
        column.

    Returns
    -------
    dict
        Column key -> converted values.
    """
    result = {}
    for c, values in dct.items():
        conv_f = None if converters is None else converters.get(c, None)
        if isinstance(dtypes, dict):
            cast_type = dtypes.get(c, None)
        else:
            # single dtype or None
            cast_type = dtypes

        if self.na_filter:
            col_na_values, col_na_fvalues = _get_na_values(
                c, na_values, na_fvalues, self.keep_default_na)
        else:
            col_na_values, col_na_fvalues = set(), set()

        if conv_f is not None:
            # conv_f applied to data before inference
            if cast_type is not None:
                warnings.warn(
                    ("Both a converter and dtype were specified "
                     f"for column {c} - only the converter will be used."),
                    ParserWarning,
                    stacklevel=find_stack_level(),
                )

            try:
                values = lib.map_infer(values, conv_f)
            except ValueError:
                # Retry with a mask built from `na_values`, which is passed
                # to the masked variant of the mapper.
                # error: Argument 2 to "isin" has incompatible type "List[Any]";
                # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                mask = algorithms.isin(
                    values, list(na_values)  # type: ignore[arg-type]
                ).view(np.uint8)
                values = lib.map_infer_mask(values, conv_f, mask)

            cvals, na_count = self._infer_types(values,
                                                set(col_na_values) | col_na_fvalues,
                                                try_num_bool=False)
        else:
            is_ea = is_extension_array_dtype(cast_type)
            is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
            # skip inference if specified dtype is object
            # or casting to an EA
            try_num_bool = not (cast_type and is_str_or_ea_dtype)

            # general type inference and conversion
            cvals, na_count = self._infer_types(
                values, set(col_na_values) | col_na_fvalues, try_num_bool)

            # type specified in dtype param or cast_type is an EA
            if cast_type and (not is_dtype_equal(cvals, cast_type)
                              or is_extension_array_dtype(cast_type)):
                if not is_ea and na_count > 0:
                    try:
                        if is_bool_dtype(cast_type):
                            raise ValueError(
                                f"Bool column has NA values in column {c}")
                    except (AttributeError, TypeError):
                        # invalid input to is_bool_dtype
                        pass
                cast_type = pandas_dtype(cast_type)
                cvals = self._cast_types(cvals, cast_type, c)

        result[c] = cvals
        if verbose and na_count:
            print(f"Filled {na_count} NA values in column {c!s}")
    return result
def __init__(
    self,
    data,
    sparse_index=None,
    index=None,
    fill_value=None,
    kind="integer",
    dtype=None,
    copy=False,
):
    """
    Build the sparse values, sparse index and dtype from ``data``.

    ``data`` may be another instance of this class, a scalar, None, or
    something ``sanitize_array`` can handle. ``index`` may only be given
    together with a scalar, which is then repeated ``len(index)`` times.

    Raises
    ------
    Exception
        If ``index`` is given and ``data`` is not a scalar.
    AssertionError
        If ``sparse_index`` is given and its ``npoints`` differs from the
        number of values.
    """
    # A SparseDtype supplies the fill value when none was given.
    if fill_value is None and isinstance(dtype, SparseDtype):
        fill_value = dtype.fill_value

    if isinstance(data, type(self)):
        # disable normal inference on dtype, sparse_index, & fill_value
        if sparse_index is None:
            sparse_index = data.sp_index
        if fill_value is None:
            fill_value = data.fill_value
        if dtype is None:
            dtype = data.dtype
        # TODO: make kind=None, and use data.kind?
        data = data.sp_values

    # Handle use-provided dtype
    if isinstance(dtype, str):
        # Two options: dtype='int', regular numpy dtype
        # or dtype='Sparse[int]', a sparse dtype
        try:
            dtype = SparseDtype.construct_from_string(dtype)
        except TypeError:
            dtype = pandas_dtype(dtype)

    # From here on `dtype` is the subtype, not the sparse wrapper.
    if isinstance(dtype, SparseDtype):
        if fill_value is None:
            fill_value = dtype.fill_value
        dtype = dtype.subtype

    if index is not None and not is_scalar(data):
        raise Exception("must only pass scalars with an index ")

    if is_scalar(data):
        if index is not None:
            if data is None:
                data = np.nan

        if index is not None:
            npoints = len(index)
        elif sparse_index is None:
            npoints = 1
        else:
            npoints = sparse_index.length

        # For scalar input the dtype is always inferred from the value,
        # replacing whatever was passed in.
        dtype = infer_dtype_from_scalar(data)[0]
        data = construct_1d_arraylike_from_scalar(data, npoints, dtype)

    if dtype is not None:
        dtype = pandas_dtype(dtype)

    # TODO: disentangle the fill_value dtype inference from
    # dtype inference
    if data is None:
        # XXX: What should the empty dtype be? Object or float?
        data = np.array([], dtype=dtype)

    if not is_array_like(data):
        try:
            # probably shared code in sanitize_series
            data = sanitize_array(data, index=None)
        except ValueError:
            # NumPy may raise a ValueError on data like [1, []]
            # we retry with object dtype here.
            if dtype is None:
                dtype = object
                data = np.atleast_1d(np.asarray(data, dtype=dtype))
            else:
                raise

    if copy:
        # TODO: avoid double copy when dtype forces cast.
        data = data.copy()

    if fill_value is None:
        fill_value_dtype = data.dtype if dtype is None else dtype
        if fill_value_dtype is None:
            fill_value = np.nan
        else:
            fill_value = na_value_for_dtype(fill_value_dtype)

    if isinstance(data, type(self)) and sparse_index is None:
        sparse_index = data._sparse_index
        sparse_values = np.asarray(data.sp_values, dtype=dtype)
    elif sparse_index is None:
        # No index supplied: derive values and index from the dense data.
        sparse_values, sparse_index, fill_value = make_sparse(
            data, kind=kind, fill_value=fill_value, dtype=dtype)
    else:
        # Index supplied: `data` is taken to be the sparse values already.
        sparse_values = np.asarray(data, dtype=dtype)
        if len(sparse_values) != sparse_index.npoints:
            raise AssertionError(
                f"Non array-like type {type(sparse_values)} must "
                "have the same length as the index")
    self._sparse_index = sparse_index
    self._sparse_values = sparse_values
    self._dtype = SparseDtype(sparse_values.dtype, fill_value)
def test_period_dtype(self, dtype):
    """pandas_dtype must resolve period strings to the matching PeriodDtype."""
    resolved = com.pandas_dtype(dtype)
    expected = PeriodDtype(dtype)
    # Same object, equal to the constructed dtype, equal to the string.
    assert resolved is expected
    assert resolved == expected
    assert resolved == dtype
"period[D]", "period[3M]", "period[U]", "Period[D]", "Period[3M]", "Period[U]", ], ) def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) is PeriodDtype(dtype) assert com.pandas_dtype(dtype) == PeriodDtype(dtype) assert com.pandas_dtype(dtype) == dtype dtypes = dict( datetime_tz=com.pandas_dtype("datetime64[ns, US/Eastern]"), datetime=com.pandas_dtype("datetime64[ns]"), timedelta=com.pandas_dtype("timedelta64[ns]"), period=PeriodDtype("D"), integer=np.dtype(np.int64), float=np.dtype(np.float64), object=np.dtype(np.object), category=com.pandas_dtype("category"), ) @pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) @pytest.mark.parametrize("name2,dtype2", list(dtypes.items()),
def test_period_dtype(self, dtype):
    """pandas_dtype must resolve period strings to the matching PeriodDtype."""
    resolved = com.pandas_dtype(dtype)
    expected = PeriodDtype(dtype)
    # Same object, equal to the constructed dtype, equal to the string.
    assert resolved is expected
    assert resolved == expected
    assert resolved == dtype
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Parameters
    ----------
    obj : dict-like
        Decoded payload. Its ``u'typ'`` entry selects how the remaining
        entries are turned back into an object.

    Returns
    -------
    object
        The reconstructed object. ``obj`` itself is returned unchanged
        when it has no ``u'typ'`` entry or the value is not handled here.

    Notes
    -----
    Several branches resolve a class through ``globals()[obj[u'klass']]``,
    i.e. the payload chooses which module-level name gets called. Only
    decode data from a trusted source.
    """
    typ = obj.get(u'typ')
    if typ is None:
        return obj
    elif typ == u'timestamp':
        # payloads may carry the frequency under u'offset' instead
        freq = obj[u'freq'] if 'freq' in obj else obj[u'offset']
        return Timestamp(obj[u'value'], tz=obj[u'tz'], freq=freq)
    elif typ == u'nat':
        return NaT
    elif typ == u'period':
        return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
    elif typ == u'index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype, obj.get(u'compress'))
        return Index(data, dtype=dtype, name=obj[u'name'])
    elif typ == u'range_index':
        return RangeIndex(obj[u'start'], obj[u'stop'], obj[u'step'],
                          name=obj[u'name'])
    elif typ == u'multi_index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype, obj.get(u'compress'))
        data = [tuple(x) for x in data]
        return MultiIndex.from_tuples(data, names=obj[u'names'])
    elif typ == u'period_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        name = obj[u'name']
        freq = obj[u'freq']
        return PeriodIndex(PeriodArray(data, freq), name=name)
    elif typ == u'datetime_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        result = DatetimeIndex(data, name=obj[u'name'], freq=obj[u'freq'])
        tz = obj[u'tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result
    elif typ in (u'interval_index', 'interval_array'):
        # SECURITY: class name comes from the payload (see Notes)
        return globals()[obj[u'klass']].from_arrays(obj[u'left'],
                                                    obj[u'right'],
                                                    obj[u'closed'],
                                                    name=obj[u'name'])
    elif typ == u'category':
        # SECURITY: class name comes from the payload (see Notes)
        from_codes = globals()[obj[u'klass']].from_codes
        return from_codes(codes=obj[u'codes'],
                          categories=obj[u'categories'],
                          ordered=obj[u'ordered'])
    elif typ == u'interval':
        return Interval(obj[u'left'], obj[u'right'], obj[u'closed'])
    elif typ == u'series':
        dtype = dtype_for(obj[u'dtype'])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u'index']
        result = Series(unconvert(obj[u'data'], dtype, obj[u'compress']),
                        index=index,
                        dtype=pd_dtype,
                        name=obj[u'name'])
        return result
    elif typ == u'block_manager':
        axes = obj[u'axes']

        def create_block(b):
            values = _safe_reshape(unconvert(
                b[u'values'], dtype_for(b[u'dtype']),
                b[u'compress']), b[u'shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u'locs' in b:
                placement = b[u'locs']
            else:
                placement = axes[0].get_indexer(b[u'items'])

            if is_datetime64tz_dtype(b[u'dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b[u'dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b[u'klass']),
                              placement=placement,
                              dtype=b[u'dtype'])

        blocks = [create_block(b) for b in obj[u'blocks']]
        # SECURITY: class name comes from the payload (see Notes)
        return globals()[obj[u'klass']](BlockManager(blocks, axes))
    elif typ == u'datetime':
        return parse(obj[u'data'])
    elif typ == u'datetime64':
        return np.datetime64(parse(obj[u'data']))
    elif typ == u'date':
        return parse(obj[u'data']).date()
    elif typ == u'timedelta':
        return timedelta(*obj[u'data'])
    elif typ == u'timedelta64':
        return np.timedelta64(int(obj[u'data']))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no branch here, so they fall through and are returned undecoded.
    elif typ == u'block_index':
        # SECURITY: class name comes from the payload (see Notes)
        return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
                                        obj[u'blengths'])
    elif typ == u'int_index':
        # SECURITY: class name comes from the payload (see Notes)
        return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
    elif typ == u'ndarray':
        # np.sctypeDict is the supported name; np.typeDict was only a
        # deprecated alias of the same mapping.
        return unconvert(obj[u'data'], np.sctypeDict[obj[u'dtype']],
                         obj.get(u'compress')).reshape(obj[u'shape'])
    elif typ == u'np_scalar':
        if obj.get(u'sub_typ') == u'np_complex':
            return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
        else:
            dtype = dtype_for(obj[u'dtype'])
            try:
                return dtype(obj[u'data'])
            except (ValueError, TypeError):
                # dtype is not directly callable with the data; go through
                # its scalar type instead
                return dtype.type(obj[u'data'])
    elif typ == u'np_complex':
        return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')

    # unrecognised 'typ': hand the payload back untouched
    return obj
def test_categorical_dtype(self): assert com.pandas_dtype("category") == CategoricalDtype()
def test_numpy_string_dtype(self): # do not parse freq-like string as period dtype assert com.pandas_dtype("U") == np.dtype("U") assert com.pandas_dtype("S") == np.dtype("S")
def test_numpy_dtype(self, dtype): assert com.pandas_dtype(dtype) == np.dtype(dtype)
def get_dtype(dtype, coerce_int=None):
    """
    Resolve ``dtype`` through ``pandas_dtype``.

    Returns None instead when ``coerce_int`` is exactly False and the
    substring "int" occurs in ``dtype``.
    """
    skip_integer = coerce_int is False and "int" in dtype
    return None if skip_integer else pandas_dtype(dtype)
def __init__(self, *dtypes):
    """Store the ``.type`` of every given dtype, in order, as a tuple."""
    super().__init__()
    resolved_types = [pandas_dtype(spec).type for spec in dtypes]
    self.dtypes = tuple(resolved_types)
def test_pandas_dtype_valid(self, dtype): assert com.pandas_dtype(dtype) == dtype
def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
            periods=None, copy=False, name=None, tz=None, dtype=None,
            **kwargs):
    """
    Build a period index from ordinals, a range description or data.

    Parameters
    ----------
    data : PeriodIndex, ndarray, index, iterable or None
        When None, ``ordinal`` or the range arguments are used instead.
    ordinal : array-like, optional
        Converted to an int64 array; only consulted when ``data`` is None.
    freq : optional
        Taken from ``dtype`` when not given; otherwise it must equal
        ``dtype.freq`` if a dtype is also passed.
    start, end, periods : optional
        Forwarded to ``cls._generate_range`` when both ``data`` and
        ``ordinal`` are None. A float ``periods`` is truncated with
        ``int``.
    copy : bool, default False
        Not referenced in this method body.
    name : optional
        Defaults to ``data.name`` when ``data`` has that attribute.
    tz : optional
        Forwarded to ``dt64arr_to_periodarr`` for datetime64 data.
    dtype : optional
        Must resolve to a period dtype.
    **kwargs
        Forwarded (as a dict) to ``cls._generate_range``.

    Raises
    ------
    TypeError
        If ``periods`` is neither a float nor an integer, or if ``data``
        is non-empty and inferred as floating.
    ValueError
        If ``dtype`` is not a period dtype.
    IncompatibleFrequency
        If ``freq`` and ``dtype.freq`` differ.
    """
    if periods is not None:
        if is_float(periods):
            periods = int(periods)
        elif not is_integer(periods):
            msg = 'periods must be a number, got {periods}'
            raise TypeError(msg.format(periods=periods))

    if name is None and hasattr(data, 'name'):
        name = data.name

    if dtype is not None:
        dtype = pandas_dtype(dtype)
        if not is_period_dtype(dtype):
            raise ValueError('dtype must be PeriodDtype')
        if freq is None:
            freq = dtype.freq
        elif freq != dtype.freq:
            msg = 'specified freq and dtype are different'
            raise IncompatibleFrequency(msg)

    # coerce freq to freq object, otherwise it can be coerced elementwise
    # which is slow
    if freq:
        freq = Period._maybe_convert_freq(freq)

    if data is None:
        if ordinal is not None:
            data = np.asarray(ordinal, dtype=np.int64)
        else:
            data, freq = cls._generate_range(start, end, periods,
                                             freq, kwargs)
        return cls._from_ordinals(data, name=name, freq=freq)

    if isinstance(data, PeriodIndex):
        if freq is None or freq == data.freq:  # no freq change
            freq = data.freq
            data = data._values
        else:
            # different freq requested: convert the existing values
            base1, _ = _gfc(data.freq)
            base2, _ = _gfc(freq)
            data = period.period_asfreq_arr(data._values,
                                            base1, base2, 1)
        return cls._simple_new(data, name=name, freq=freq)

    # not array / index
    if not isinstance(data, (np.ndarray, PeriodIndex,
                             DatetimeIndex, Int64Index)):
        if is_scalar(data) or isinstance(data, Period):
            cls._scalar_data_error(data)

        # other iterable of some kind
        if not isinstance(data, (list, tuple)):
            data = list(data)

        data = np.asarray(data)

    # datetime other than period
    if is_datetime64_dtype(data.dtype):
        data = dt64arr_to_periodarr(data, freq, tz)
        return cls._from_ordinals(data, name=name, freq=freq)

    # check not floats
    if infer_dtype(data) == 'floating' and len(data) > 0:
        raise TypeError("PeriodIndex does not allow "
                        "floating point in construction")

    # anything else, likely an array of strings or periods
    data = _ensure_object(data)
    # fall back to the freq extracted from the data when none was given
    freq = freq or period.extract_freq(data)
    data = period.extract_ordinals(data, freq)
    return cls._from_ordinals(data, name=name, freq=freq)
def sanitize_array(
    data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False
):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.

    Parameters
    ----------
    data : scalar, list, tuple, range, ndarray, masked array or ExtensionArray
    index : optional
        Only its length is used, to broadcast a scalar or a one-element
        result.
    dtype : optional
        Resolved through ``pandas_dtype`` when given.
    copy : bool, default False
    raise_cast_failure : bool, default False
        Forwarded to ``_try_cast``. The float-ndarray-to-integer case
        passes True regardless.

    Returns
    -------
    ndarray or ExtensionArray
        ExtensionArray input is returned as an ExtensionArray (after an
        optional ``astype``/``copy``). When the result is 0-dimensional,
        ``data`` is not a list and ``index`` is None, ``subarr.item()``
        is returned instead of an array.

    Raises
    ------
    Exception
        If ``data`` is an ndarray and the result has more than one
        dimension.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                # cast refused: keep the float data rather than corrupt it
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy by definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype
                )

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred == "period":
                from pandas.core.arrays import period_array

                try:
                    subarr = period_array(subarr)
                except IncompatibleFrequency:
                    # mixed frequencies: keep the object array
                    pass

    return subarr
def __new__(cls, subtype=None, closed: str_type | None = None):
    """
    Return the interval dtype for ``subtype`` and ``closed``.

    Parameters
    ----------
    subtype : IntervalDtype, str, None or anything ``pandas_dtype`` accepts
        An existing IntervalDtype is returned as-is. A string is matched
        against ``cls._match``; the string "interval" (any case) means
        no subtype.
    closed : {"right", "left", "both", "neither"}, optional

    Returns
    -------
    IntervalDtype
        Instances are cached in ``cls._cache_dtypes`` under
        ``str(subtype) + str(closed)``, except when ``subtype`` is passed
        as None, which always creates a new, uncached instance.

    Raises
    ------
    ValueError
        If ``closed`` is not one of the allowed values, or conflicts with
        the ``closed`` carried by ``subtype`` (dtype or dtype string).
    TypeError
        If ``subtype`` cannot be resolved, or resolves to a categorical
        or string dtype.
    """
    # local import; NOTE(review): presumably avoids a circular import -- confirm
    from pandas.core.dtypes.common import (
        is_string_dtype,
        pandas_dtype,
    )

    if closed is not None and closed not in {
        "right", "left", "both", "neither"
    }:
        raise ValueError(
            "closed must be one of 'right', 'left', 'both', 'neither'")

    if isinstance(subtype, IntervalDtype):
        if closed is not None and closed != subtype.closed:
            raise ValueError(
                "dtype.closed and 'closed' do not match. "
                "Try IntervalDtype(dtype.subtype, closed) instead.")
        return subtype
    elif subtype is None:
        # we are called as an empty constructor
        # generally for pickle compat
        u = object.__new__(cls)
        u._subtype = None
        u._closed = closed
        return u
    elif isinstance(subtype, str) and subtype.lower() == "interval":
        subtype = None
    else:
        if isinstance(subtype, str):
            m = cls._match.search(subtype)
            if m is not None:
                gd = m.groupdict()
                subtype = gd["subtype"]
                if gd.get("closed", None) is not None:
                    if closed is not None:
                        if closed != gd["closed"]:
                            raise ValueError(
                                "'closed' keyword does not match value "
                                "specified in dtype string")
                    # the value parsed from the string becomes `closed`
                    closed = gd["closed"]

        try:
            subtype = pandas_dtype(subtype)
        except TypeError as err:
            raise TypeError("could not construct IntervalDtype") from err

    if CategoricalDtype.is_dtype(subtype) or is_string_dtype(subtype):
        # GH 19016
        msg = ("category, object, and string subtypes are not supported "
               "for IntervalDtype")
        raise TypeError(msg)

    key = str(subtype) + str(closed)
    try:
        return cls._cache_dtypes[key]
    except KeyError:
        # first request for this combination: build it and remember it
        u = object.__new__(cls)
        u._subtype = subtype
        u._closed = closed
        cls._cache_dtypes[key] = u
        return u
def _cast_types(self, values, cast_type, column):
    """
    Cast values to specified type

    Parameters
    ----------
    values : ndarray
    cast_type : string or np.dtype
        dtype to cast values to
    column : string
        column name - used only for error reporting

    Returns
    -------
    converted : ndarray
    """
    if is_categorical_dtype(cast_type):
        has_known_categories = (
            isinstance(cast_type, CategoricalDtype)
            and cast_type.categories is not None
        )
        if not (is_object_dtype(values) or has_known_categories):
            # TODO: this is for consistency with
            # c-parser which parses all categories
            # as strings
            values = astype_nansafe(values, np.dtype(str))

        categories = Index(values).unique().dropna()
        codes = categories.get_indexer(values)
        return Categorical._from_inferred_categories(
            categories, codes, cast_type, true_values=self.true_values)

    # use the EA's implementation of casting
    if is_extension_array_dtype(cast_type):
        # ensure cast_type is an actual dtype and not a string
        cast_type = pandas_dtype(cast_type)
        array_type = cast_type.construct_array_type()
        try:
            if not is_bool_dtype(cast_type):
                return array_type._from_sequence_of_strings(
                    values, dtype=cast_type)
            # boolean arrays also need the configured true/false markers
            return array_type._from_sequence_of_strings(
                values,
                dtype=cast_type,
                true_values=self.true_values,
                false_values=self.false_values,
            )
        except NotImplementedError as err:
            raise NotImplementedError(
                f"Extension Array: {array_type} must implement "
                "_from_sequence_of_strings in order to be used in parser methods"
            ) from err

    try:
        return astype_nansafe(values, cast_type, copy=True, skipna=True)
    except ValueError as err:
        raise ValueError(
            f"Unable to convert column {column} to type {cast_type}"
        ) from err
def _cmp_method(self, other, op):
    """
    Compare this array with ``other`` using the comparison ``op``.

    Parameters
    ----------
    other : Interval, list-like or other scalar
        A list-like must have the same length as ``self``.
    op : callable from ``operator``
        ``eq``, ``ne``, ``gt``, ``ge`` and ``lt`` are matched explicitly;
        any other operator takes the final "equal or less" branch.

    Returns
    -------
    Boolean result of the comparison, or whatever ``invalid_comparison``
    returns when ``other`` cannot be compared as intervals.

    Raises
    ------
    ValueError
        If ``other`` is list-like with a different length.
    TypeError
        Re-raised from an element-wise comparison unless the element
        is ``NA``.
    """
    # ensure pandas array for list-like and eliminate non-interval scalars
    if is_list_like(other):
        if len(self) != len(other):
            raise ValueError("Lengths must match to compare")
        other = array(other)
    elif not isinstance(other, Interval):
        # non-interval scalar -> no matches
        return invalid_comparison(self, other, op)

    # determine the dtype of the elements we want to compare
    if isinstance(other, Interval):
        other_dtype = pandas_dtype("interval")
    elif not is_categorical_dtype(other.dtype):
        other_dtype = other.dtype
    else:
        # for categorical defer to categories for dtype
        other_dtype = other.categories.dtype

        # extract intervals if we have interval categories with matching closed
        if is_interval_dtype(other_dtype):
            if self.closed != other.categories.closed:
                return invalid_comparison(self, other, op)

            other = other.categories.take(
                other.codes, allow_fill=True,
                fill_value=other.categories._na_value)

    # interval-like -> need same closed and matching endpoints
    if is_interval_dtype(other_dtype):
        if self.closed != other.closed:
            return invalid_comparison(self, other, op)
        elif not isinstance(other, Interval):
            other = type(self)(other)

        if op is operator.eq:
            return (self._left == other.left) & (self._right == other.right)
        elif op is operator.ne:
            return (self._left != other.left) | (self._right != other.right)
        elif op is operator.gt:
            # order by left endpoint, ties broken by right endpoint
            return (self._left > other.left) | (
                (self._left == other.left) & (self._right > other.right))
        elif op is operator.ge:
            return (self == other) | (self > other)
        elif op is operator.lt:
            return (self._left < other.left) | (
                (self._left == other.left) & (self._right < other.right))
        else:
            # operator.le
            return (self == other) | (self < other)

    # non-interval/non-object dtype -> no matches
    if not is_object_dtype(other_dtype):
        return invalid_comparison(self, other, op)

    # object dtype -> iteratively check for intervals
    result = np.zeros(len(self), dtype=bool)
    for i, obj in enumerate(other):
        try:
            result[i] = op(self[i], obj)
        except TypeError:
            if obj is NA:
                # comparison with np.nan returns NA
                # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
                # an NA element only counts as a match for `!=`
                result[i] = op is operator.ne
            else:
                raise
    return result
"period[D]", "period[3M]", "period[U]", "Period[D]", "Period[3M]", "Period[U]", ], ) def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) is PeriodDtype(dtype) assert com.pandas_dtype(dtype) == PeriodDtype(dtype) assert com.pandas_dtype(dtype) == dtype dtypes = { "datetime_tz": com.pandas_dtype("datetime64[ns, US/Eastern]"), "datetime": com.pandas_dtype("datetime64[ns]"), "timedelta": com.pandas_dtype("timedelta64[ns]"), "period": PeriodDtype("D"), "integer": np.dtype(np.int64), "float": np.dtype(np.float64), "object": np.dtype(object), "category": com.pandas_dtype("category"), } @pytest.mark.parametrize("name1,dtype1", list(dtypes.items()), ids=lambda x: str(x)) @pytest.mark.parametrize("name2,dtype2", list(dtypes.items()),
def _simple_new(cls, left, right, closed=None, copy=False, dtype=None,
                verify_integrity=True):
    """
    Build an instance from its left and right endpoints.

    Parameters
    ----------
    left, right
        Endpoints; each is passed through ``ensure_index``.
    closed : optional
        Falls back to "right" when falsy.
    copy : bool, default False
        Forwarded to ``ensure_index``.
    dtype : optional
        Must resolve to an interval dtype. When its ``subtype`` is not
        None, both sides are cast to that subtype.
    verify_integrity : bool, default True
        Whether to call ``result._validate()`` before returning.

    Raises
    ------
    TypeError
        If ``dtype`` is not an interval dtype, or the endpoints have a
        categorical or string dtype.
    ValueError
        If the two sides end up as different index types, are period
        indexes, or are datetime indexes with different time zones.
    """
    result = IntervalMixin.__new__(cls)

    closed = closed or "right"
    left = ensure_index(left, copy=copy)
    right = ensure_index(right, copy=copy)

    if dtype is not None:
        # GH 19262: dtype must be an IntervalDtype to override inferred
        dtype = pandas_dtype(dtype)
        if not is_interval_dtype(dtype):
            msg = f"dtype must be an IntervalDtype, got {dtype}"
            raise TypeError(msg)
        elif dtype.subtype is not None:
            left = left.astype(dtype.subtype)
            right = right.astype(dtype.subtype)

    # coerce dtypes to match if needed
    if is_float_dtype(left) and is_integer_dtype(right):
        right = right.astype(left.dtype)
    elif is_float_dtype(right) and is_integer_dtype(left):
        left = left.astype(right.dtype)

    # exact class comparison: a subclass on one side counts as a mismatch
    if type(left) != type(right):
        msg = (f"must not have differing left [{type(left).__name__}] and "
               f"right [{type(right).__name__}] types")
        raise ValueError(msg)
    elif is_categorical_dtype(left.dtype) or is_string_dtype(left.dtype):
        # GH 19016
        msg = ("category, object, and string subtypes are not supported "
               "for IntervalArray")
        raise TypeError(msg)
    elif isinstance(left, ABCPeriodIndex):
        msg = "Period dtypes are not supported, use a PeriodIndex instead"
        raise ValueError(msg)
    elif isinstance(left, ABCDatetimeIndex) and str(left.tz) != str(right.tz):
        msg = ("left and right must have the same time zone, got "
               f"'{left.tz}' and '{right.tz}'")
        raise ValueError(msg)

    # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray
    left = ensure_wrapped_if_datetimelike(left)
    left = extract_array(left, extract_numpy=True)
    right = ensure_wrapped_if_datetimelike(right)
    right = extract_array(right, extract_numpy=True)

    lbase = getattr(left, "_ndarray", left).base
    rbase = getattr(right, "_ndarray", right).base
    if lbase is not None and lbase is rbase:
        # If these share data, then setitem could corrupt our IA
        right = right.copy()

    result._left = left
    result._right = right
    result._closed = closed
    if verify_integrity:
        result._validate()
    return result
def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None,
            periods=None, copy=False, name=None, tz=None, dtype=None,
            **kwargs):
    """
    Build a period index from ordinals, a range description or data.

    Parameters
    ----------
    data : PeriodIndex, ndarray, index, iterable or None
        When None, ``ordinal`` or the range arguments are used instead.
    ordinal : array-like, optional
        Converted to an int64 array; only consulted when ``data`` is None.
    freq : optional
        Taken from ``dtype`` when not given; otherwise it must equal
        ``dtype.freq`` if a dtype is also passed.
    start, end, periods : optional
        Forwarded to ``cls._generate_range`` when both ``data`` and
        ``ordinal`` are None. A float ``periods`` is truncated with
        ``int``.
    copy : bool, default False
        Not referenced in this method body.
    name : optional
        Defaults to ``data.name`` when ``data`` has that attribute.
    tz : optional
        Forwarded to ``dt64arr_to_periodarr`` for datetime64 data.
    dtype : optional
        Must resolve to a period dtype.
    **kwargs
        Forwarded (as a dict) to ``cls._generate_range``.

    Raises
    ------
    TypeError
        If ``periods`` is neither a float nor an integer, or if ``data``
        is non-empty and inferred as floating.
    ValueError
        If ``dtype`` is not a period dtype.
    IncompatibleFrequency
        If ``freq`` and ``dtype.freq`` differ.
    """
    if periods is not None:
        if is_float(periods):
            periods = int(periods)
        elif not is_integer(periods):
            msg = 'periods must be a number, got {periods}'
            raise TypeError(msg.format(periods=periods))

    if name is None and hasattr(data, 'name'):
        name = data.name

    if dtype is not None:
        dtype = pandas_dtype(dtype)
        if not is_period_dtype(dtype):
            raise ValueError('dtype must be PeriodDtype')
        if freq is None:
            freq = dtype.freq
        elif freq != dtype.freq:
            msg = 'specified freq and dtype are different'
            raise IncompatibleFrequency(msg)

    # coerce freq to freq object, otherwise it can be coerced elementwise
    # which is slow
    if freq:
        freq = Period._maybe_convert_freq(freq)

    if data is None:
        if ordinal is not None:
            data = np.asarray(ordinal, dtype=np.int64)
        else:
            data, freq = cls._generate_range(start, end, periods,
                                             freq, kwargs)
        return cls._from_ordinals(data, name=name, freq=freq)

    if isinstance(data, PeriodIndex):
        if freq is None or freq == data.freq:  # no freq change
            freq = data.freq
            data = data._values
        else:
            # different freq requested: convert the existing values
            base1, _ = _gfc(data.freq)
            base2, _ = _gfc(freq)
            data = period.period_asfreq_arr(data._values,
                                            base1, base2, 1)
        return cls._simple_new(data, name=name, freq=freq)

    # not array / index
    if not isinstance(data, (np.ndarray, PeriodIndex,
                             DatetimeIndex, Int64Index)):
        if is_scalar(data) or isinstance(data, Period):
            cls._scalar_data_error(data)

        # other iterable of some kind
        if not isinstance(data, (list, tuple)):
            data = list(data)

        data = np.asarray(data)

    # datetime other than period
    if is_datetime64_dtype(data.dtype):
        data = dt64arr_to_periodarr(data, freq, tz)
        return cls._from_ordinals(data, name=name, freq=freq)

    # check not floats
    if infer_dtype(data) == 'floating' and len(data) > 0:
        raise TypeError("PeriodIndex does not allow "
                        "floating point in construction")

    # anything else, likely an array of strings or periods
    data = _ensure_object(data)
    # fall back to the freq extracted from the data when none was given
    freq = freq or period.extract_freq(data)
    data = period.extract_ordinals(data, freq)
    return cls._from_ordinals(data, name=name, freq=freq)
def test_datetimetz_dtype(self, dtype):
    """A tz-aware dtype string resolves to the DatetimeTZDtype built from it."""
    # identity: the resolved dtype is the very object DatetimeTZDtype returns
    resolved = com.pandas_dtype(dtype)
    constructed = DatetimeTZDtype(dtype)
    assert resolved is constructed

    # equality against a freshly built dtype and against the raw string
    resolved = com.pandas_dtype(dtype)
    constructed = DatetimeTZDtype(dtype)
    assert resolved == constructed

    resolved = com.pandas_dtype(dtype)
    assert resolved == dtype
def unconvert(values, dtype, compress=None):
    """
    Rebuild array data from its serialized representation.

    Parameters
    ----------
    values : str or ExtType
        An ``ExtType`` with ``code == 0`` contributes its ``data`` as-is.
        Anything else is encoded to bytes with ``latin1``.
    dtype : dtype-like
        Target dtype; the ``base`` of ``pandas_dtype(dtype)`` is used for
        the resulting array.
    compress : {'zlib', 'blosc'}, optional
        Compression the payload was stored with. Falsy means uncompressed.

    Returns
    -------
    ``values`` unchanged for a categorical ``dtype``, an object ndarray
    for an object ``dtype``, otherwise an ndarray built from the
    (decompressed) bytes.

    Raises
    ------
    ValueError
        If ``compress`` is truthy but neither 'zlib' nor 'blosc'.
    """
    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == 'zlib':
            zlib = import_optional_dependency(
                "zlib",
                extra="zlib is required when `compress='zlib'`."
            )
            decompress = zlib.decompress
        elif compress == 'blosc':
            # the hint must name blosc, the dependency actually missing
            blosc = import_optional_dependency(
                "blosc",
                extra="blosc is required when `compress='blosc'`."
            )
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to the copying `np.frombuffer` path below

    # Copy the bytes into a numpy array.
    buf = np.frombuffer(values, dtype=dtype)
    buf = buf.copy()  # required to not mutate the original data
    buf.flags.writeable = True
    return buf
def test_invalid_dtype_error(self, box):
    """An unsupported dtype spec raises TypeError mentioning 'not understood'."""
    expected_message = 'not understood'
    with tm.assert_raises_regex(TypeError, expected_message):
        com.pandas_dtype(box)
def astype(self, dtype: Optional[Dtype] = None, copy=True):
    """
    Change the dtype of a SparseArray.

    The output will always be a SparseArray. To convert to a dense
    ndarray with a certain dtype, use :meth:`numpy.asarray`.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
        For SparseDtype, this changes the dtype of
        ``self.sp_values`` and the ``self.fill_value``.

        For other dtypes, this only changes the dtype of
        ``self.sp_values``.

    copy : bool, default True
        Whether to ensure a copy is made, even if not necessary.

    Returns
    -------
    SparseArray

    Examples
    --------
    >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    >>> arr.astype(np.dtype('int32'))
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Using a NumPy dtype with a different kind (e.g. float) will coerce
    just ``self.sp_values``.

    >>> arr.astype(np.dtype('float64'))
    ... # doctest: +NORMALIZE_WHITESPACE
    [0.0, 0.0, 1.0, 2.0]
    Fill: 0.0
    IntIndex
    Indices: array([2, 3], dtype=int32)

    Use a SparseDtype if you wish to change the fill value as well.

    >>> arr.astype(SparseDtype("float64", fill_value=np.nan))
    ... # doctest: +NORMALIZE_WHITESPACE
    [nan, nan, 1.0, 2.0]
    Fill: nan
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """
    # same dtype requested: nothing to convert
    if is_dtype_equal(dtype, self._dtype):
        return self.copy() if copy else self

    target_dtype = self.dtype.update_dtype(dtype)
    values_dtype = pandas_dtype(target_dtype._subtype_with_str)

    # TODO copy=False is broken for astype_nansafe with int -> float, so cannot
    # passthrough copy keyword: https://github.com/pandas-dev/pandas/issues/34456
    converted = astype_nansafe(self.sp_values, values_dtype, copy=True)
    if converted is self.sp_values and copy:
        converted = converted.copy()

    return self._simple_new(converted, self.sp_index, target_dtype)
assert com.pandas_dtype(dtype) == dtype def test_categorical_dtype(self): assert com.pandas_dtype('category') == CategoricalDtype() @pytest.mark.parametrize('dtype', [ 'period[D]', 'period[3M]', 'period[U]', 'Period[D]', 'Period[3M]', 'Period[U]' ]) def test_period_dtype(self, dtype): assert com.pandas_dtype(dtype) is PeriodDtype(dtype) assert com.pandas_dtype(dtype) == PeriodDtype(dtype) assert com.pandas_dtype(dtype) == dtype dtypes = dict(datetime_tz=com.pandas_dtype('datetime64[ns, US/Eastern]'), datetime=com.pandas_dtype('datetime64[ns]'), timedelta=com.pandas_dtype('timedelta64[ns]'), period=PeriodDtype('D'), integer=np.dtype(np.int64), float=np.dtype(np.float64), object=np.dtype(np.object), category=com.pandas_dtype('category')) @pytest.mark.parametrize('name1,dtype1', list(dtypes.items()), ids=lambda x: str(x)) @pytest.mark.parametrize('name2,dtype2', list(dtypes.items()), ids=lambda x: str(x))
def astype(self, dtype, copy: bool = True) -> ArrayLike:
    """
    Cast to a NumPy array or ExtensionArray with 'dtype'.

    Parameters
    ----------
    dtype : str or dtype
        Typecode or data-type to which the array is cast.
    copy : bool, default True
        Whether to copy the data, even if not necessary. If False,
        a copy is made only if the old dtype does not match the
        new dtype.

    Returns
    -------
    ndarray or ExtensionArray
        NumPy ndarray, BooleanArray or IntegerArray with 'dtype' for its dtype.

    Raises
    ------
    TypeError
        if incompatible type with a BooleanDtype, equivalent of same_kind
        casting
    """
    from pandas.core.arrays.string_ import StringDtype

    dtype = pandas_dtype(dtype)

    if isinstance(dtype, BooleanDtype):
        values, mask = coerce_to_array(self, copy=copy)
        return BooleanArray(values, mask, copy=False) if copy else self

    if isinstance(dtype, StringDtype):
        string_array_type = dtype.construct_array_type()
        return string_array_type._from_sequence(self, copy=False)

    if is_bool_dtype(dtype):
        # astype_nansafe converts np.nan to True
        if self._hasna:
            raise ValueError("cannot convert float NaN to bool")
        return self._data.astype(dtype, copy=copy)

    if is_extension_array_dtype(dtype) and is_integer_dtype(dtype):
        from pandas.core.arrays import IntegerArray

        converted = self._data.astype(dtype.numpy_dtype)
        return IntegerArray(converted, self._mask.copy(), copy=False)

    # for integer, error if there are missing values
    if is_integer_dtype(dtype) and self._hasna:
        raise ValueError("cannot convert NA to integer")

    # for float dtype, ensure we use np.nan before casting (numpy cannot
    # deal with pd.NA)
    default_na = self._na_value
    na_value = np.nan if is_float_dtype(dtype) else default_na

    # coerce
    return self.to_numpy(dtype=dtype, na_value=na_value, copy=False)
def test_categorical_dtype(self): assert com.pandas_dtype('category') == CategoricalDtype()
def test_datetimetz_dtype(self, dtype): assert com.pandas_dtype( dtype) == DatetimeTZDtype.construct_from_string(dtype) assert com.pandas_dtype(dtype) == dtype
def test_invalid_dtype_error(self, box):
    """An unsupported dtype spec raises TypeError mentioning "not understood"."""
    expected_message = "not understood"
    with pytest.raises(TypeError, match=expected_message):
        com.pandas_dtype(box)