def _isfinite(values):
    """
    Return a boolean mask that is True where ``values`` is *not* finite.

    Datetime/timedelta input is masked with ``isna`` instead of a finiteness
    check.  Complex, float, integer and boolean input is handed directly to
    ``np.isfinite``; any other dtype is cast to ``float64`` first.
    """
    if is_datetime_or_timedelta_dtype(values):
        # datetime-like data is checked for missingness, not finiteness
        return isna(values)

    dtype_checks = (is_complex_dtype, is_float_dtype,
                    is_integer_dtype, is_bool_dtype)
    natively_supported = any(check(values) for check in dtype_checks)

    candidate = values if natively_supported else values.astype('float64')
    return ~np.isfinite(candidate)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None, compute_mask=True):
    """
    Utility returning ``(values, mask, dtype, dtype_max, fill_value)``.

    The values are unwrapped, optionally copied (``copy=True`` forces the
    copy) and, when ``skipna`` is set, the masked positions are overwritten
    with the resolved fill value.  ``skipna`` always forces a mask to be
    computed when none was supplied.
    """
    # skipping missing values is impossible without a mask
    need_mask = compute_mask or skipna

    if is_datetime64tz_dtype(values):
        # com.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so this case must be handled separately from the rest
        dtype = values.dtype
        values = getattr(values, "_values", values)
    else:
        values = com.values_from_object(values)
        dtype = values.dtype

    if mask is None and need_mask:
        mask = _isfinite(values) if isfinite else isna(values)

    # the int64 view of timedelta64/datetime64 data must be taken only
    # after ``mask`` has been derived from the original values
    datetimelike = (is_datetime_or_timedelta_dtype(values) or
                    is_datetime64tz_dtype(values))
    if datetimelike:
        values = getattr(values, "asi8", values)
        values = values.view(np.int64)

    can_hold_na = _na_ok_dtype(dtype)

    # resolve the fill value (it may require an alternative dtype)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if copy:
        values = values.copy()

    if skipna:
        if can_hold_na:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # report a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
def test_is_datetime_or_timedelta_dtype():
    """Check which inputs ``com.is_datetime_or_timedelta_dtype`` accepts."""
    check = com.is_datetime_or_timedelta_dtype

    rejected = [
        int,
        str,
        pd.Series([1, 2]),
        np.array(['a', 'b']),
        DatetimeTZDtype("ns", "US/Eastern"),
    ]
    for candidate in rejected:
        assert not check(candidate)

    accepted = [
        np.datetime64,
        np.timedelta64,
        np.array([], dtype=np.timedelta64),
        np.array([], dtype=np.datetime64),
    ]
    for candidate in accepted:
        assert check(candidate)
def test_is_datetime_or_timedelta_dtype():
    """Check which inputs ``com.is_datetime_or_timedelta_dtype`` accepts."""
    is_dt_or_td = com.is_datetime_or_timedelta_dtype

    plain_inputs = (int, str, pd.Series([1, 2]), np.array(['a', 'b']))
    for obj in plain_inputs:
        assert not is_dt_or_td(obj)

    # TODO(jreback), this is slightly suspect
    tz_dtype = DatetimeTZDtype("ns", "US/Eastern")
    assert not is_dt_or_td(tz_dtype)

    datetimelike_inputs = (
        np.datetime64,
        np.timedelta64,
        np.array([], dtype=np.timedelta64),
        np.array([], dtype=np.datetime64),
    )
    for obj in datetimelike_inputs:
        assert is_dt_or_td(obj)
def _bn_ok_dtype(dt, name):
    """
    Decide whether the bottleneck routine ``name`` may be used for dtype ``dt``.

    Object and datetime/timedelta dtypes are always rejected, and ``'nansum'``
    is additionally rejected for dtypes narrower than 8 bytes.
    """
    # Bottleneck chokes on datetime64
    if is_object_dtype(dt) or is_datetime_or_timedelta_dtype(dt):
        return False

    # bottleneck does not properly upcast during the sum
    # so can overflow
    if name == 'nansum' and dt.itemsize < 8:
        return False

    return True
def _get_next_label(label):
    """
    Return the value immediately following ``label``.

    Parameters
    ----------
    label : scalar
        ``Timestamp``/``Timedelta`` instances and values with a
        datetime-like dtype are advanced by one nanosecond, integers by 1,
        and floats to the next representable float towards +inf.

    Returns
    -------
    scalar
        The successor of ``label``.

    Raises
    ------
    TypeError
        If the dtype of ``label`` is none of the supported kinds.
    """
    # fall back to the Python type when the label carries no dtype attribute
    dtype = getattr(label, 'dtype', type(label))
    if isinstance(label, (Timestamp, Timedelta)):
        dtype = 'datetime64'

    if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype):
        return label + np.timedelta64(1, 'ns')
    elif is_integer_dtype(dtype):
        return label + 1
    elif is_float_dtype(dtype):
        # ``np.infty`` was removed in NumPy 2.0; ``np.inf`` is the same value
        return np.nextafter(label, np.inf)
    else:
        raise TypeError('cannot determine next label for type {typ!r}'
                        .format(typ=type(label)))
def _get_prev_label(label):
    """
    Return the value immediately preceding ``label``.

    Parameters
    ----------
    label : scalar
        ``Timestamp``/``Timedelta`` instances and values with a
        datetime/timedelta dtype are moved back by one nanosecond, integers
        by 1, and floats to the next representable float towards -inf.

    Returns
    -------
    scalar
        The predecessor of ``label``.

    Raises
    ------
    TypeError
        If the dtype of ``label`` is none of the supported kinds.
    """
    # fall back to the Python type when the label carries no dtype attribute
    dtype = getattr(label, 'dtype', type(label))
    if isinstance(label, (Timestamp, Timedelta)):
        dtype = 'datetime64'

    if is_datetime_or_timedelta_dtype(dtype):
        return label - np.timedelta64(1, 'ns')
    elif is_integer_dtype(dtype):
        return label - 1
    elif is_float_dtype(dtype):
        # ``np.infty`` was removed in NumPy 2.0; ``np.inf`` is the same value
        return np.nextafter(label, -np.inf)
    else:
        # NOTE(review): the message says "next" although this computes the
        # previous label; kept unchanged in case callers match on the text.
        raise TypeError('cannot determine next label for type %r'
                        % type(label))
def astype(self, dtype, copy=True):
    """
    Cast this datetime-like index to ``dtype``.

    Object dtype yields the boxed values as an index, string dtype yields an
    object Index of the formatted values, and integer dtype yields an ``i8``
    Index.  Casting to a different datetime/timedelta dtype or to any float
    dtype raises ``TypeError``; everything else is delegated to the parent
    class.
    """
    if is_object_dtype(dtype):
        return self._box_values_as_index()

    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return Index(self.format(), name=self.name, dtype=object)

    if is_integer_dtype(dtype):
        as_int = self.values.astype('i8', copy=copy)
        return Index(as_int, name=self.name, dtype='i8')

    # disallow conversion between datetime/timedelta,
    # and conversions for any datetimelike to float
    switches_datetimelike_kind = (is_datetime_or_timedelta_dtype(dtype) and
                                  not is_dtype_equal(self.dtype, dtype))
    if switches_datetimelike_kind or is_float_dtype(dtype):
        template = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(template.format(name=type(self).__name__,
                                        dtype=dtype))

    return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
def _bn_ok_dtype(dt, name):
    """
    Decide whether the bottleneck routine ``name`` may be used for dtype ``dt``.

    Object and datetime/timedelta dtypes are always rejected; ``'nansum'``
    and ``'nanprod'`` are rejected for every dtype.
    """
    # Bottleneck chokes on datetime64
    if is_object_dtype(dt) or is_datetime_or_timedelta_dtype(dt):
        return False

    # GH 15507
    # bottleneck does not properly upcast during the sum
    # so can overflow

    # GH 9422
    # further we also want to preserve NaN when all elements
    # are NaN, unlike bottleneck/numpy which consider this
    # to be 0
    return name not in ('nansum', 'nanprod')
def astype(self, dtype, copy=True):
    """
    Cast this datetime-like index to ``dtype``.

    Object dtype yields the boxed values as an index, string dtype yields an
    object Index of the formatted values, and integer dtype yields an ``i8``
    Index.  Casting to a different datetime/timedelta dtype or to any float
    dtype raises ``TypeError``; everything else is delegated to the parent
    class.
    """
    if is_object_dtype(dtype):
        return self._box_values_as_index()

    wants_plain_string = (is_string_dtype(dtype) and
                          not is_categorical_dtype(dtype))
    if wants_plain_string:
        return Index(self.format(), name=self.name, dtype=object)

    if is_integer_dtype(dtype):
        # TODO(DatetimeArray): use self._values here.
        # Can't use ._values currently, because that returns a
        # DatetimeIndex, which throws us in an infinite loop.
        int_values = self.values.astype('i8', copy=copy)
        return Index(int_values, name=self.name, dtype='i8')

    # disallow conversion between datetime/timedelta,
    # and conversions for any datetimelike to float
    forbidden = ((is_datetime_or_timedelta_dtype(dtype) and
                  not is_dtype_equal(self.dtype, dtype)) or
                 is_float_dtype(dtype))
    if forbidden:
        msg = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))

    return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
def _convert_bin_to_datelike_type(bins, dtype):
    """
    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype
    is datelike

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
           datelike
    """
    datelike = (is_datetime64tz_dtype(dtype) or
                is_datetime_or_timedelta_dtype(dtype))
    if not datelike:
        # nothing to convert; hand the bins back untouched
        return bins

    # rebuild the bins from their int64 representation with the data's dtype
    return Index(bins.astype(np.int64), dtype=dtype)
def astype(self, dtype, copy=True):
    """
    Cast this datetime-like array to ``dtype``.

    ``dtype`` is first normalised with ``pandas_dtype``.  Object, string,
    integer, categorical and period targets each have a dedicated
    conversion; casting to a different datetime/timedelta dtype or to any
    float dtype raises ``TypeError``; anything else goes through
    ``np.asarray``.
    """
    # TODO: Figure out something better here...
    # We have DatetimeLikeArrayMixin ->
    #     super(...), which ends up being... DatetimeIndexOpsMixin?
    # this is complicated.
    # need a pandas_astype(arr, dtype).
    from pandas import Categorical

    dtype = pandas_dtype(dtype)

    if is_object_dtype(dtype):
        return np.asarray(self, dtype=object)

    if is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return self._format_native_types()

    if is_integer_dtype(dtype):
        raw = self._data
        if raw.dtype != dtype:
            # int32 vs. int64
            return raw.astype(dtype)
        return raw.copy() if copy else raw

    # disallow conversion between datetime/timedelta,
    # and conversions for any datetimelike to float
    switches_datetimelike_kind = (is_datetime_or_timedelta_dtype(dtype) and
                                  not is_dtype_equal(self.dtype, dtype))
    if switches_datetimelike_kind or is_float_dtype(dtype):
        template = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(template.format(name=type(self).__name__,
                                        dtype=dtype))

    if is_categorical_dtype(dtype):
        return Categorical(self, dtype=dtype)

    if is_period_dtype(dtype):
        return self.asfreq(dtype.freq)

    return np.asarray(self, dtype=dtype)
def test_maybe_promote_any_numpy_dtype_with_na(any_numpy_dtype_reduced,
                                               fill_value, box):
    """
    Check promotion of every (reduced) numpy dtype with a missing-value fill.

    The test first marks known-failing combinations as xfail, then derives
    the expected dtype and the expected scalar/array fill markers from the
    input dtype and ``fill_value``, and finally delegates the comparison to
    ``_check_promote``.
    """
    dtype = np.dtype(any_numpy_dtype_reduced)
    boxed, box_dtype = box  # read from parametrized fixture

    # combinations that are expected to fail; all of them require the
    # un-boxed (scalar) fill value
    if (dtype == bytes and not boxed and fill_value is not None and
            fill_value is not NaT):
        pytest.xfail("does not upcast to object")
    elif dtype == "uint64" and not boxed and fill_value == iNaT:
        pytest.xfail("does not upcast correctly")
    # below: opinionated that iNaT should be interpreted as missing value
    elif (not boxed and (is_float_dtype(dtype) or is_complex_dtype(dtype)) and
          fill_value == iNaT):
        pytest.xfail("does not cast to missing value marker correctly")
    elif (is_string_dtype(dtype) or
          dtype == bool) and not boxed and fill_value == iNaT:
        pytest.xfail("does not cast to missing value marker correctly")

    # the branches below are order-dependent: earlier, more specific cases
    # shadow the later, more general ones
    if is_integer_dtype(dtype) and dtype == "uint64" and fill_value == iNaT:
        # uint64 + negative int casts to object; iNaT is considered as missing
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = np.nan
    elif is_integer_dtype(dtype) and fill_value == iNaT:
        # other integer + iNaT casts to int64
        expected_dtype = np.int64
        exp_val_for_scalar = iNaT
    elif is_integer_dtype(dtype) and fill_value is not NaT:
        # integer + other missing value (np.nan / None) casts to float
        expected_dtype = np.float64
        exp_val_for_scalar = np.nan
    elif is_object_dtype(dtype) and (fill_value == iNaT or fill_value is NaT):
        # inserting into object does not cast the value
        # but *does* cast None to np.nan
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = fill_value
    elif is_datetime_or_timedelta_dtype(dtype):
        # datetime / timedelta cast all missing values to iNaT
        expected_dtype = dtype
        exp_val_for_scalar = iNaT
    elif fill_value is NaT:
        # NaT upcasts everything that's not datetime/timedelta to object
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = NaT
    elif is_float_dtype(dtype) or is_complex_dtype(dtype):
        # float / complex + missing value (!= NaT) stays the same
        expected_dtype = dtype
        exp_val_for_scalar = np.nan
    else:
        # all other cases cast to object, and use np.nan as missing value
        expected_dtype = np.dtype(object)
        exp_val_for_scalar = np.nan

    # array case has same expected_dtype; but returns corresponding na-marker
    if is_integer_dtype(expected_dtype):
        # integers cannot hold NaNs; maybe_promote_with_array returns None
        exp_val_for_array = None
    elif is_datetime_or_timedelta_dtype(expected_dtype):
        exp_val_for_array = iNaT
    else:  # expected_dtype = float / complex / object
        exp_val_for_array = np.nan

    _check_promote(
        dtype,
        fill_value,
        boxed,
        box_dtype,
        expected_dtype,
        exp_val_for_scalar,
        exp_val_for_array,
    )
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a manager from array-like ``values``.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
        The data to wrap.
    index, columns
        Requested axis labels or None; for Series input missing labels are
        taken from the Series, otherwise they are resolved by ``_get_axes``.
    dtype : DtypeObj or None
        Requested dtype.  When given and different from the dtype of the
        prepared values, the values are cast to it.
    copy : bool
        Forwarded to the copying/casting helpers.
    typ : str
        ``"array"`` returns an ``ArrayManager``; any other value falls
        through to ``create_block_manager_from_blocks``.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))

        # column-wise construction; nothing below applies to this case
        return arrays_to_mgr(values, columns, index, columns, dtype=dtype,
                             typ=typ)

    if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # cast on the flattened data, then restore the original shape
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(flat, None, dtype=dtype, copy=copy,
                                    raise_cast_failure=True)
        else:
            try:
                values = construct_1d_ndarray_preserving_na(flat, dtype=dtype,
                                                            copy=False)
            except IntCastingNaNError:
                # following Series, we ignore the dtype and retain floating
                # values instead of casting nans to meaningless ints
                pass

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0], values.shape[1], index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # try datetime-like inference on a copy of every column
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i].copy()))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i].copy() for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # from here on the values are handled in transposed orientation
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dtlike_vals = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dtlike_vals]

            # TODO: What about re-joining object columns?
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            nb = new_block(datelike_vals, placement=slice(len(columns)),
                           ndim=2)
            block_values = [nb]
    else:
        nb = new_block(values, placement=slice(len(columns)), ndim=2)
        block_values = [nb]

    # with no columns there is nothing to store, whatever was built above
    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index])
def _metric_aggs(
    self,
    query_compiler,
    func,
    field_types=None,
    numeric_only=None,
    keep_original_dtype=False,
):
    """
    Run the metric aggregation ``func`` over the relevant fields.

    Parameters
    ----------
    query_compiler:
        Supplies the field mappings, the client and the index pattern used
        for the search.
    func:
        Aggregation passed through to ``Query.metric_aggs`` for each field.
    field_types: str, default None
        if `aggregatable` use only field_names whose fields in elasticsearch
        are aggregatable. If `None`, use only numeric fields.
    numeric_only:
        When falsy, timestamp fields are requested in addition to the
        numeric and boolean ones.
    keep_original_dtype : bool, default False
        if `True` the output values should keep the same domain as the input
        values, i.e. booleans should be booleans

    Returns
    -------
    pandas.Series
        Series containing results of `func` applied to the field_name(s)

    Raises
    ------
    NotImplementedError
        If the resolved tasks impose a size.
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Can not count field matches if size is set {size}")

    body = Query(query_params["query"])
    mappings = query_compiler._mappings

    def run_search():
        # only aggregation results are needed, hence size=0
        return query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

    results = {}

    # some metrics aggs (including cardinality) work on all aggregatable
    # fields, therefore we include an optional all parameter on operations
    # that call _metric_aggs
    if field_types == "aggregatable":
        aggregatable_field_names = mappings.aggregatable_field_names()

        for agg_field in aggregatable_field_names.keys():
            body.metric_aggs(agg_field, func, agg_field)

        response = run_search()

        # Results are of the form
        # "aggregations" : {
        #   "customer_full_name.keyword" : {
        #     "value" : 10
        #   }
        # }

        # map aggregatable (e.g. x.keyword) to field_name
        for agg_field, field_name in aggregatable_field_names.items():
            results[field_name] = response["aggregations"][agg_field]["value"]
    else:
        if numeric_only:
            pd_dtypes, source_fields, date_formats = (
                mappings.metric_source_fields(include_bool=True))
        else:
            # The only non-numerics we support are bool and timestamps
            # currently; strings are not supported by metric aggs in ES
            # TODO - sum isn't supported for Timestamp in pandas - although
            # ES does attempt to do it
            pd_dtypes, source_fields, date_formats = (
                mappings.metric_source_fields(
                    include_bool=True, include_timestamp=True))

        for source_field in source_fields:
            body.metric_aggs(source_field, func, source_field)

        response = run_search()

        # Results are of the form
        # "aggregations" : {
        #   "AvgTicketPrice" : {
        #     "value" : 628.2536888148849
        #   },
        #   "timestamp": {
        #     "value": 1.5165624455644382E12,
        #     "value_as_string": "2018-01-21T19:20:45.564Z"
        #   }
        # }
        field_info = zip(pd_dtypes, source_fields, date_formats)
        for pd_dtype, source_field, date_format in field_info:
            if is_datetime_or_timedelta_dtype(pd_dtype):
                results[source_field] = elasticsearch_date_to_pandas_date(
                    response["aggregations"][source_field]["value_as_string"],
                    date_format)
            elif keep_original_dtype:
                results[source_field] = pd_dtype.type(
                    response["aggregations"][source_field]["value"])
            else:
                results[source_field] = (
                    response["aggregations"][source_field]["value"])

    return pd.Series(data=results, index=results.keys())
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    """
    Build a manager from array-like ``values``.

    Parameters
    ----------
    values : ndarray, list, Series, Index or ExtensionArray
        The data to wrap.
    index, columns
        Requested axis labels or None; for Series input missing labels are
        taken from the Series, otherwise they are resolved by ``_get_axes``.
    dtype : DtypeObj or None
        Requested dtype.  When given and different from the dtype of the
        prepared values, the values are cast to it via ``sanitize_array``.
    copy : bool
        Forwarded to the copying/casting helpers.
    typ : str
        ``"array"`` returns an ``ArrayManager``; any other value falls
        through to ``create_block_manager_from_blocks``.

    Returns
    -------
    Manager
    """
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        # column-wise construction; nothing below applies to this case
        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        # cast on the flattened data, then restore the original shape
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        # (cast failures are not raised for float values -> integer dtype)
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(flat, None, dtype=dtype, copy=copy,
                                raise_cast_failure=rcf)

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0], values.shape[1], index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            # try datetime-like inference column by column
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i]))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    # from here on the values are handled in transposed orientation
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            # at least one row was converted: store one block per row
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    # with no columns there is nothing to store, whatever was built above
    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index],
                                            verify_integrity=False)
def _view_if_needed(values):
    """
    Return an ``int64`` view of datetime/timedelta ``values``; any other
    input is returned unchanged.
    """
    needs_view = is_datetime_or_timedelta_dtype(values)
    return values.view(np.int64) if needs_view else values
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Raises
    ------
    ValueError
        If `downcast` is not one of the accepted values.
    TypeError
        If `arg` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    See Also
    --------
    pandas.DataFrame.astype : Cast argument to a specified dtype.
    pandas.to_datetime : Convert argument to datetime.
    pandas.to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    # remember the container kind so the result can be re-wrapped at the end
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        # deliberate best-effort: for errors='ignore'/'coerce' the values
        # obtained so far are returned instead of propagating the failure
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not len(values) or
                                         np.min(values) >= 0):
            # np.min raises ValueError on empty input, so an empty array is
            # treated as trivially non-negative
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
def maybe_promote(dtype, fill_value=np.nan):
    """
    Find the minimal dtype that can hold both the given dtype and fill_value.

    Parameters
    ----------
    dtype : np.dtype or ExtensionDtype
    fill_value : scalar, default np.nan
        A non-scalar fill value is only accepted together with object dtype.

    Returns
    -------
    dtype
        Upcasted from dtype argument if necessary.
    fill_value
        Upcasted from fill_value argument if necessary.

    Raises
    ------
    ValueError
        If ``fill_value`` is not a scalar and ``dtype`` is not object dtype.
    """
    if not is_scalar(fill_value) and not is_object_dtype(dtype):
        # with object dtype there is nothing to promote, and the user can
        #  pass pretty much any weird fill_value they like
        raise ValueError("fill_value must be a scalar")

    # if we passed an array here, determine the fill value by dtype
    if isinstance(fill_value, np.ndarray):
        if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
            fill_value = fill_value.dtype.type("NaT", "ns")
        else:

            # we need to change to object type as our
            # fill_value is of object type
            if fill_value.dtype == np.object_:
                dtype = np.dtype(np.object_)
            fill_value = np.nan

        if dtype == np.object_ or dtype.kind in ["U", "S"]:
            # We treat string-like dtypes as object, and _always_ fill
            #  with np.nan
            fill_value = np.nan
            dtype = np.dtype(np.object_)

    # returns tuple of (dtype, fill_value)
    if issubclass(dtype.type, np.datetime64):
        if isinstance(fill_value, datetime) and fill_value.tzinfo is not None:
            # Trying to insert tzaware into tznaive, have to cast to object
            dtype = np.dtype(np.object_)
        elif is_integer(fill_value) or (is_float(fill_value) and
                                        not isna(fill_value)):
            dtype = np.dtype(np.object_)
        else:
            try:
                fill_value = tslibs.Timestamp(fill_value).to_datetime64()
            except (TypeError, ValueError):
                dtype = np.dtype(np.object_)
    elif issubclass(dtype.type, np.timedelta64):
        if (is_integer(fill_value) or
                (is_float(fill_value) and not np.isnan(fill_value)) or
                isinstance(fill_value, str)):
            # TODO: What about str that can be a timedelta?
            dtype = np.dtype(np.object_)
        else:
            try:
                fv = tslibs.Timedelta(fill_value)
            except ValueError:
                dtype = np.dtype(np.object_)
            else:
                if fv is NaT:
                    # NaT has no `to_timedelta64` method
                    fill_value = np.timedelta64("NaT", "ns")
                else:
                    fill_value = fv.to_timedelta64()
    elif is_datetime64tz_dtype(dtype):
        if isna(fill_value):
            fill_value = NaT
        elif not isinstance(fill_value, datetime):
            dtype = np.dtype(np.object_)
        elif fill_value.tzinfo is None:
            dtype = np.dtype(np.object_)
        elif not tz_compare(fill_value.tzinfo, dtype.tz):
            # TODO: sure we want to cast here?
            dtype = np.dtype(np.object_)

    elif is_extension_array_dtype(dtype) and isna(fill_value):
        fill_value = dtype.na_value

    elif is_float(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            dtype = np.dtype(np.float64)

        elif dtype.kind == "f":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.float64 and dtype is np.float32
                dtype = mst

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

    elif is_bool(fill_value):
        if not issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

    elif is_integer(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, np.integer):
            # NOTE(review): NumPy 2 no longer accepts Python scalars in
            # np.can_cast -- confirm the supported NumPy range for this file
            if not np.can_cast(fill_value, dtype):
                # upcast to prevent overflow
                mst = np.min_scalar_type(fill_value)
                dtype = np.promote_types(dtype, mst)
                if dtype.kind == "f":
                    # Case where we disagree with numpy
                    dtype = np.dtype(np.object_)

    elif is_complex(fill_value):
        if issubclass(dtype.type, np.bool_):
            dtype = np.dtype(np.object_)

        elif issubclass(dtype.type, (np.integer, np.floating)):
            mst = np.min_scalar_type(fill_value)
            dtype = np.promote_types(dtype, mst)

        elif dtype.kind == "c":
            mst = np.min_scalar_type(fill_value)
            if mst > dtype:
                # e.g. mst is np.complex128 and dtype is np.complex64
                dtype = mst

    elif fill_value is None:
        if is_float_dtype(dtype) or is_complex_dtype(dtype):
            fill_value = np.nan
        elif is_integer_dtype(dtype):
            # return an np.dtype instance like every other branch does,
            # rather than the bare scalar class np.float64
            dtype = np.dtype(np.float64)
            fill_value = np.nan
        elif is_datetime_or_timedelta_dtype(dtype):
            fill_value = dtype.type("NaT", "ns")
        else:
            dtype = np.dtype(np.object_)
            fill_value = np.nan
    else:
        dtype = np.dtype(np.object_)

    # in case we have a string that looked like a number
    if is_extension_array_dtype(dtype):
        pass
    elif issubclass(np.dtype(dtype).type, (bytes, str)):
        dtype = np.dtype(np.object_)

    fill_value = _ensure_dtype_type(fill_value, dtype)
    return dtype, fill_value
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Raises
    ------
    ValueError
        If `downcast` is not one of the accepted values.
    TypeError
        If `arg` has more than one dimension.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    See Also
    --------
    pandas.DataFrame.astype : Cast argument to a specified dtype.
    pandas.to_datetime : Convert argument to datetime.
    pandas.to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    # remember the container kind so the result can be re-wrapped at the end
    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = ensure_object(values)
            coerce_numeric = errors not in ('ignore', 'raise')
            values = lib.maybe_convert_numeric(values, set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        # deliberate best-effort: for errors='ignore'/'coerce' the values
        # obtained so far are returned instead of propagating the failure
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and (not len(values) or
                                         np.min(values) >= 0):
            # np.min raises ValueError on empty input, so an empty array is
            # treated as trivially non-negative
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Prepare an array for a nan-aware reduction.

    Returns the (possibly int64-viewed) values together with the mask, the
    original dtype, a platform independent dtype and the fill value resolved
    for that dtype.

    The values are copied, and the masked positions overwritten with the
    fill value, only when ``skipna`` is True, a mask is available and the
    resolved fill value is not None.

    For input arrays of boolean or integer dtypes, copies will only occur if
    a precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        Input array to potentially compute a mask for.
    skipna : bool
        Whether NaNs should be skipped.
    fill_value : Any
        Value to fill NaNs with.
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities.
    mask : Optional[np.ndarray]
        nan-mask if known.

    Returns
    -------
    values : ndarray
        Potential copy of the input value array.
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute.
    dtype : dtype
        dtype for values.
    dtype_max : dtype
        Platform independent dtype.
    fill_value : Any
        Fill value used.
    """
    # _get_values is only called from within nanops, and always with a scalar
    # fill_value.  The maybe_upcast_putmask call below relies on that.
    assert is_scalar(fill_value)

    # The mask has to be resolved while ``values`` still carries its original
    # dtype, i.e. before any int64 view is taken further down.
    mask = _maybe_get_mask(values, skipna, mask)

    if not is_datetime64tz_dtype(values):
        values = lib.values_from_object(values)
        dtype = values.dtype
    else:
        # lib.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so the tz-aware case is unwrapped by hand to keep its dtype.
        dtype = values.dtype
        values = getattr(values, "_values", values)

    is_datetimelike = is_datetime_or_timedelta_dtype(
        values
    ) or is_datetime64tz_dtype(values)
    if is_datetimelike:
        # timedelta64/datetime64 data is reduced through its int64 view
        values = getattr(values, "asi8", values).view(np.int64)

    can_hold_fill = _na_ok_dtype(dtype)

    # Resolve the fill value (an alternative may be needed for this dtype).
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    if skipna and mask is not None and fill_value is not None:
        values = values.copy()
        if can_hold_fill:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # Pick a platform independent precision dtype.
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True, mask=None):
    """
    Return the values view, mask, dtype, dtype_max and fill value.

    Parameters
    ----------
    values : array-like
        Data to prepare.
    skipna : bool
        When True, masked positions are overwritten with the fill value.
    fill_value : optional
        Explicit fill value, handed to ``_get_fill_value``.
    fill_value_typ : optional
        Fill value type, handed to ``_get_fill_value``.
    isfinite : bool, default False
        When no mask is given, build it with ``_isfinite`` instead of
        ``isna``.
    copy : bool, default True
        Copy the values first; when False the fill is written without
        taking a copy beforehand.
    mask : optional
        Precomputed mask; computed here when None.

    Returns
    -------
    tuple
        ``(values, mask, dtype, dtype_max, fill_value)``
    """
    if not is_datetime64tz_dtype(values):
        values = com.values_from_object(values)
        dtype = values.dtype
    else:
        # com.values_from_object returns M8[ns] dtype instead of tz-aware,
        # so the tz-aware case is unwrapped by hand to keep its dtype.
        dtype = values.dtype
        values = getattr(values, "_values", values)

    if mask is None:
        mask = _isfinite(values) if isfinite else isna(values)

    is_datetimelike = (is_datetime_or_timedelta_dtype(values)
                       or is_datetime64tz_dtype(values))
    if is_datetimelike:
        # The switch from timedelta64/datetime64 to int64 must come after
        # the mask has been determined above.
        values = getattr(values, "asi8", values).view(np.int64)

    can_hold_fill = _na_ok_dtype(dtype)

    # Resolve the fill value (an alternative may be needed for this dtype).
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if copy:
        values = values.copy()

    if skipna:
        if can_hold_fill:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # Pick a platform independent precision dtype.
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : {'integer', 'signed', 'unsigned', 'float'}, default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")

    # Tag the kind of input so the converted values can be re-wrapped in the
    # matching container at the end; None means "return the values as-is".
    input_kind = None

    if isinstance(arg, ABCSeries):
        input_kind = "series"
        values = arg.values
    elif isinstance(arg, ABCIndexClass):
        input_kind = "index"
        values = arg.asi8 if needs_i8_conversion(arg.dtype) else arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        # Non-numeric scalars go through the array path as a 1-element array.
        input_kind = "scalar"
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    input_dtype = getattr(values, "dtype", None)
    if not is_numeric_dtype(input_dtype):
        if is_datetime_or_timedelta_dtype(input_dtype):
            values = values.astype(np.int64)
        else:
            values = ensure_object(values)
            should_coerce = errors not in ("ignore", "raise")
            try:
                values = lib.maybe_convert_numeric(
                    values, set(), coerce_numeric=should_coerce
                )
            except (ValueError, TypeError):
                # For 'ignore' the unconverted object values are kept.
                if errors == "raise":
                    raise

    # Downcasting is attempted only when a method was requested and the data
    # actually ended up with a numerical dtype.
    if downcast is not None and is_numeric_dtype(values.dtype):
        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values) or np.min(values) >= 0):
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            # pandas support goes only to np.float32, as float dtypes
            # smaller than that are extremely rare and not well supported,
            # so every code ahead of float32 is dropped.
            float_codes = np.typecodes["Float"]
            first_supported = float_codes.index(np.dtype(np.float32).char)
            typecodes = float_codes[first_supported:]
        else:
            typecodes = None

        if typecodes is not None:
            # Candidates are tried from smallest to largest.
            for code in typecodes:
                candidate = np.dtype(code)
                if candidate.itemsize > values.dtype.itemsize:
                    continue
                values = maybe_downcast_numeric(values, candidate)
                if values.dtype == candidate:
                    # successful conversion
                    break

    if input_kind == "series":
        return arg._constructor(values, index=arg.index, name=arg.name)
    if input_kind == "index":
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        return pd.Index(values, name=arg.name)
    if input_kind == "scalar":
        return values[0]
    return values
def is_timestamp(self) -> bool:
    """Report whether ``self.pd_dtype`` is a datetime or timedelta dtype."""
    dtype = self.pd_dtype
    return is_datetime_or_timedelta_dtype(dtype)
def _resample_time(trj: TrajaDataFrame, step_time: Union[float, int]): if not is_datetime_or_timedelta_dtype(trj.index): raise Exception(f"{trj.index.dtype} is not datetime or timedelta.") return trj.resample(step_time).agg({"x": np.mean, "y": np.mean})
def _view_if_needed(values):
    """Return an int64 view of datetime/timedelta values, else the input."""
    needs_int_view = is_datetime_or_timedelta_dtype(values)
    return values.view(np.int64) if needs_int_view else values