def test_is_datetime_dtypes(self): ts = pd.date_range('20130101', periods=3) tsa = pd.date_range('20130101', periods=3, tz='US/Eastern') assert is_datetime64_dtype('datetime64') assert is_datetime64_dtype('datetime64[ns]') assert is_datetime64_dtype(ts) assert not is_datetime64_dtype(tsa) assert not is_datetime64_ns_dtype('datetime64') assert is_datetime64_ns_dtype('datetime64[ns]') assert is_datetime64_ns_dtype(ts) assert is_datetime64_ns_dtype(tsa) assert is_datetime64_any_dtype('datetime64') assert is_datetime64_any_dtype('datetime64[ns]') assert is_datetime64_any_dtype(ts) assert is_datetime64_any_dtype(tsa) assert not is_datetime64tz_dtype('datetime64') assert not is_datetime64tz_dtype('datetime64[ns]') assert not is_datetime64tz_dtype(ts) assert is_datetime64tz_dtype(tsa) for tz in ['US/Eastern', 'UTC']: dtype = 'datetime64[ns, {}]'.format(tz) assert not is_datetime64_dtype(dtype) assert is_datetime64tz_dtype(dtype) assert is_datetime64_ns_dtype(dtype) assert is_datetime64_any_dtype(dtype)
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True, mask=None, compute_mask=True): """ utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value copy = True will force the copy """ if skipna: compute_mask = True if is_datetime64tz_dtype(values): # com.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = com.values_from_object(values) dtype = values.dtype if mask is None and compute_mask: if isfinite: mask = _isfinite(values) else: mask = isna(values) if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) if skipna: if copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, changed = maybe_upcast_putmask(values, mask, fill_value) elif copy: values = values.copy() # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def test_compat(self): assert is_datetime64tz_dtype(self.dtype) assert is_datetime64tz_dtype('datetime64[ns, US/Eastern]') assert is_datetime64_any_dtype(self.dtype) assert is_datetime64_any_dtype('datetime64[ns, US/Eastern]') assert is_datetime64_ns_dtype(self.dtype) assert is_datetime64_ns_dtype('datetime64[ns, US/Eastern]') assert not is_datetime64_dtype(self.dtype) assert not is_datetime64_dtype('datetime64[ns, US/Eastern]')
def test_compat(self): self.assertTrue(is_datetime64tz_dtype(self.dtype)) self.assertTrue(is_datetime64tz_dtype('datetime64[ns, US/Eastern]')) self.assertTrue(is_datetime64_any_dtype(self.dtype)) self.assertTrue(is_datetime64_any_dtype('datetime64[ns, US/Eastern]')) self.assertTrue(is_datetime64_ns_dtype(self.dtype)) self.assertTrue(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')) self.assertFalse(is_datetime64_dtype(self.dtype)) self.assertFalse(is_datetime64_dtype('datetime64[ns, US/Eastern]'))
def test_dst(self): dr1 = date_range('2013-01-01', periods=3, tz='US/Eastern') s1 = Series(dr1, name='A') assert is_datetime64tz_dtype(s1) with tm.assert_produces_warning(FutureWarning): assert is_datetimetz(s1) dr2 = date_range('2013-08-01', periods=3, tz='US/Eastern') s2 = Series(dr2, name='A') assert is_datetime64tz_dtype(s2) with tm.assert_produces_warning(FutureWarning): assert is_datetimetz(s2) assert s1.dtype == s2.dtype
def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, ABCDataFrame): return NotImplemented left, right = _align_method_SERIES(left, right) res_name = _get_series_op_result_name(left, right) if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_categorical_dtype(left): raise TypeError("{typ} cannot perform the operation " "{op}".format(typ=type(left).__name__, op=str_rep)) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = rvalues.values result = safe_na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None)
def _wrap_results(result, dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): tz = getattr(dtype, 'tz', None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan result = tslibs.Timestamp(result, tz=tz) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: result = np.nan # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = tslibs.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result
def backfill_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): name = 'backfill_inplace_{name}'.format(name=dtype.name) _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_1d_datetime elif is_integer_dtype(values): values = ensure_float64(values) _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_1d [{name}]' .format(name=dtype.name)) if mask is None: mask = isna(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values
def pad_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): name = 'pad_2d_inplace_{name}'.format(name=dtype.name) _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_2d_datetime elif is_integer_dtype(values): values = ensure_float64(values) _method = algos.pad_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for pad_2d [{name}]' .format(name=dtype.name)) if mask is None: mask = isna(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def __new__(cls, data): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is # appropriate. Since we're checking the dtypes anyway, we'll just # do all the validation here. from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data))) orig = data if is_categorical_dtype(data) else None if orig is not None: data = Series(orig.values.categories, name=orig.name, copy=False) try: if is_datetime64_dtype(data.dtype): return DatetimeProperties(data, orig) elif is_datetime64tz_dtype(data.dtype): return DatetimeProperties(data, orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(data, orig) else: if is_period_arraylike(data): return PeriodProperties(data, orig) if is_datetime_arraylike(data): return DatetimeProperties(data, orig) except Exception: pass # we raise an attribute error anyway raise AttributeError("Can only use .dt accessor with datetimelike " "values")
def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ closed = 'right' if right else 'left' if is_datetime64tz_dtype(dtype): formatter = partial(Timestamp, tz=dtype.tz) adjust = lambda x: x - Timedelta('1ns') elif is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta('1ns') elif is_timedelta64_dtype(dtype): formatter = Timedelta adjust = lambda x: x - Timedelta('1ns') else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) adjust = lambda x: x - 10 ** (-precision) breaks = [formatter(b) for b in bins] labels = IntervalIndex.from_breaks(breaks, closed=closed) if right and include_lowest: # we will adjust the left hand side by precision to # account that we are all right closed v = adjust(labels[0].left) i = IntervalIndex([Interval(v, labels[0].right, closed='right')]) labels = i.append(labels[1:]) return labels
def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: mask = isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def convert_pandas_type_to_json_field(arr, dtype=None): dtype = dtype or arr.dtype if arr.name is None: name = 'values' else: name = arr.name field = {'name': name, 'type': as_json_table_type(dtype)} if is_categorical_dtype(arr): if hasattr(arr, 'categories'): cats = arr.categories ordered = arr.ordered else: cats = arr.cat.categories ordered = arr.cat.ordered field['constraints'] = {"enum": list(cats)} field['ordered'] = ordered elif is_period_dtype(arr): field['freq'] = arr.freqstr elif is_datetime64tz_dtype(arr): if hasattr(arr, 'dt'): field['tz'] = arr.dt.tz.zone else: field['tz'] = arr.tz.zone return field
def pad_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): name = 'pad_inplace_{name}'.format(name=dtype.name) _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_1d_datetime elif is_integer_dtype(values): values = ensure_float64(values) _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object elif is_timedelta64_dtype(values): # NaTs are treated identically to datetime64, so we can dispatch # to that implementation _method = _pad_1d_datetime if _method is None: raise ValueError('Invalid dtype for pad_1d [{name}]' .format(name=dtype.name)) if mask is None: mask = isna(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values
def __sub__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer Parameters ---------- bins : list-like of bins dtype : dtype of data Raises ------ ValueError if bins are not of a compat dtype to dtype """ bins_dtype = infer_dtype(bins, skipna=False) if is_timedelta64_dtype(dtype): if bins_dtype in ['timedelta', 'timedelta64']: bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: raise ValueError("bins must be of datetime64 dtype") return bins
def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, ABCDataFrame): return NotImplemented left, right = _align_method_SERIES(left, right) res_name = _get_series_op_result_name(left, right) if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = getattr(rvalues, 'values', rvalues) result = safe_na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None)
def to_numpy(self): """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index. Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) """ if (is_extension_array_dtype(self.dtype) or is_datetime64tz_dtype(self.dtype)): # TODO(DatetimeArray): remove the second clause. return np.asarray(self._values) return self._values
def test_basic(self): assert is_datetime64tz_dtype(self.dtype) dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr, name='A') # dtypes assert is_datetime64tz_dtype(s.dtype) assert is_datetime64tz_dtype(s) assert not is_datetime64tz_dtype(np.dtype('float64')) assert not is_datetime64tz_dtype(1.0) assert is_datetimetz(s) assert is_datetimetz(s.dtype) assert not is_datetimetz(np.dtype('float64')) assert not is_datetimetz(1.0)
def test_basic(self): self.assertTrue(is_datetime64tz_dtype(self.dtype)) dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr, name='A') # dtypes self.assertTrue(is_datetime64tz_dtype(s.dtype)) self.assertTrue(is_datetime64tz_dtype(s)) self.assertFalse(is_datetime64tz_dtype(np.dtype('float64'))) self.assertFalse(is_datetime64tz_dtype(1.0)) self.assertTrue(is_datetimetz(s)) self.assertTrue(is_datetimetz(s.dtype)) self.assertFalse(is_datetimetz(np.dtype('float64'))) self.assertFalse(is_datetimetz(1.0))
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot add {dtype}-dtype to {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_period_dtype(other): # if self is a TimedeltaArray and other is a PeriodArray with # a timedelta-like (i.e. Tick) freq, this operation is valid. # Defer to the PeriodArray implementation. # In remaining cases, this will end up raising TypeError. return NotImplemented elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def test_value_counts_unique_nunique(self): for orig in self.objs: o = orig.copy() klass = type(o) values = o._values if isinstance(values, Index): # reset name not to affect latter process values.name = None # create repeated values, 'n'th element is repeated by n+1 times # skip boolean, because it only has 2 values at most if isinstance(o, Index) and o.is_boolean(): continue elif isinstance(o, Index): expected_index = Index(o[::-1]) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' else: expected_index = Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) # take-based repeat indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1)) rep = values.take(indices) o = klass(rep, index=idx, name='a') # check values has the same dtype as the original assert o.dtype == orig.dtype expected_s = Series(range(10, 0, -1), index=expected_index, dtype='int64', name='a') result = o.value_counts() tm.assert_series_equal(result, expected_s) assert result.index.name is None assert result.name == 'a' result = o.unique() if isinstance(o, Index): assert isinstance(result, o.__class__) tm.assert_index_equal(result, orig) elif is_datetime64tz_dtype(o): # datetimetz Series returns array of Timestamp assert result[0] == orig[0] for r in result: assert isinstance(r, Timestamp) tm.assert_numpy_array_equal( result.astype(object), orig._values.astype(object)) else: tm.assert_numpy_array_equal(result, orig.values) assert o.nunique() == len(np.unique(o.values))
def _get_prev_label(label): dtype = getattr(label, 'dtype', type(label)) if isinstance(label, (Timestamp, Timedelta)): dtype = 'datetime64' if is_datetime_or_timedelta_dtype(dtype) or is_datetime64tz_dtype(dtype): return label - np.timedelta64(1, 'ns') elif is_integer_dtype(dtype): return label - 1 elif is_float_dtype(dtype): return np.nextafter(label, -np.infty) else: raise TypeError('cannot determine next label for type {typ!r}' .format(typ=type(label)))
def maybe_to_datetimelike(data, copy=False): """ return a DelegatedClass of a Series that is datetimelike (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods) raise TypeError if this is not possible. Parameters ---------- data : Series copy : boolean, default False copy the input data Returns ------- DelegatedClass """ from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data))) index = data.index name = data.name orig = data if is_categorical_dtype(data) else None if orig is not None: data = orig.values.categories if is_datetime64_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) elif is_datetime64tz_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'), index, data.name, orig=orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) else: if is_period_arraylike(data): return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name, orig=orig) if is_datetime_arraylike(data): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data)))
def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datelike(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other) or is_period_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot add {dtype}-dtype to {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_categorical_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
def wrapper(self, other): meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if isinstance(other, (datetime, np.datetime64, compat.string_types)): if isinstance(other, (datetime, np.datetime64)): # GH#18435 strings get a pass from tzawareness compat self._assert_tzawareness_compat(other) try: other = _to_m8(other, tz=self.tz) except ValueError: # string that cannot be parsed to Timestamp return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif lib.is_scalar(other): return ops.invalid_comparison(self, other, op) else: if isinstance(other, list): # FIXME: This can break for object-dtype with mixed types other = type(self)(other) elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): # Following Timestamp convention, __eq__ is all-False # and __ne__ is all True, others raise TypeError. return ops.invalid_comparison(self, other, op) if is_object_dtype(other): result = op(self.astype('O'), np.array(other)) elif not (is_datetime64_dtype(other) or is_datetime64tz_dtype(other)): # e.g. is_timedelta64_dtype(other) return ops.invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) result = meth(self, np.asarray(other)) result = com.values_from_object(result) # Make sure to pass an array to result[...]; indexing with # Series breaks with older version of numpy o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def nanmean(values, axis=None, skipna=True, mask=None): """ Compute the mean of the element along an axis ignoring NaNs Parameters ---------- values : ndarray axis: int, optional skipna : bool, default True mask : ndarray[bool], optional nan-mask if known Returns ------- result : float Unless input is a float array, in which case use the same precision as the input array. Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nanmean(s) 1.5 """ values, mask, dtype, dtype_max, _ = _get_values( values, skipna, 0, mask=mask) dtype_sum = dtype_max dtype_count = np.float64 if (is_integer_dtype(dtype) or is_timedelta64_dtype(dtype) or is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype)): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype count = _get_counts(mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): with np.errstate(all="ignore"): # suppress division by zero warnings the_mean = the_sum / count ct_mask = count == 0 if ct_mask.any(): the_mean[ct_mask] = np.nan else: the_mean = the_sum / count if count > 0 else np.nan return _wrap_results(the_mean, dtype)
def astype(self, dtype, copy=True, how='start'): dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_integer_dtype(dtype): if copy: return self._int64index.copy() else: return self._int64index elif is_datetime64_dtype(dtype): return self.to_timestamp(how=how) elif is_datetime64tz_dtype(dtype): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
def _assert_tzawareness_compat(self, other): # adapted from _Timestamp._assert_tzawareness_compat other_tz = getattr(other, 'tzinfo', None) if is_datetime64tz_dtype(other): # Get tzinfo from Series dtype other_tz = other.dtype.tz if other is NaT: # pd.NaT quacks both aware and naive pass elif self.tz is None: if other_tz is not None: raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects.') elif other_tz is None: raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects')
def _convert_from_pandas(self, pdf: "PandasDataFrameLike", schema: Union[StructType, str, List[str]], timezone: str) -> List: """ Convert a pandas.DataFrame to list of records that can be used to make a DataFrame Returns ------- list list of records """ import pandas as pd from pyspark.sql import SparkSession assert isinstance(self, SparkSession) if timezone is not None: from pyspark.sql.pandas.types import _check_series_convert_timestamps_tz_local from pandas.core.dtypes.common import is_datetime64tz_dtype, is_timedelta64_dtype copied = False if isinstance(schema, StructType): for field in schema: # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): s = _check_series_convert_timestamps_tz_local( pdf[field.name], timezone) if s is not pdf[field.name]: if not copied: # Copy once if the series is modified to prevent the original # Pandas DataFrame from being updated pdf = pdf.copy() copied = True pdf[field.name] = s else: should_localize = not is_timestamp_ntz_preferred() for column, series in pdf.iteritems(): s = series if should_localize and is_datetime64tz_dtype( s.dtype) and s.dt.tz is not None: s = _check_series_convert_timestamps_tz_local( series, timezone) if s is not series: if not copied: # Copy once if the series is modified to prevent the original # Pandas DataFrame from being updated pdf = pdf.copy() copied = True pdf[column] = s for column, series in pdf.iteritems(): if is_timedelta64_dtype(series): if not copied: pdf = pdf.copy() copied = True # Explicitly set the timedelta as object so the output of numpy records can # hold the timedelta instances as are. Otherwise, it converts to the internal # numeric values. ser = pdf[column] pdf[column] = pd.Series(ser.dt.to_pytimedelta(), index=ser.index, dtype="object", name=ser.name) # Convert pandas.DataFrame to list of numpy records np_records = pdf.to_records(index=False) # Check if any columns need to be fixed for Spark to infer properly if len(np_records) > 0: record_dtype = self._get_numpy_record_dtype(np_records[0]) if record_dtype is not None: return [r.astype(record_dtype).tolist() for r in np_records] # Convert list of numpy records to python lists return [r.tolist() for r in np_records]
def _create_from_pandas_with_arrow(self, pdf: "PandasDataFrameLike", schema: Union[StructType, List[str]], timezone: str) -> "DataFrame": """ Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the data types will be used to coerce the data in Pandas to Arrow conversion. """ from pyspark.sql import SparkSession from pyspark.sql.dataframe import DataFrame assert isinstance(self, SparkSession) from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer from pyspark.sql.types import TimestampType from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type from pyspark.sql.pandas.utils import ( require_minimum_pandas_version, require_minimum_pyarrow_version, ) require_minimum_pandas_version() require_minimum_pyarrow_version() from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype # type: ignore[attr-defined] import pyarrow as pa # Create the Spark schema from list of names passed in with Arrow types if isinstance(schema, (list, tuple)): arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False) struct = StructType() prefer_timestamp_ntz = is_timestamp_ntz_preferred() for name, field in zip(schema, arrow_schema): struct.add(name, from_arrow_type(field.type, prefer_timestamp_ntz), nullable=field.nullable) schema = struct # Determine arrow types to coerce data when creating batches if isinstance(schema, StructType): arrow_types = [to_arrow_type(f.dataType) for f in schema.fields] elif isinstance(schema, DataType): raise ValueError( "Single data type %s is not supported with Arrow" % str(schema)) else: # Any timestamps must be coerced to be compatible with Spark arrow_types = [ to_arrow_type(TimestampType()) if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None for t in pdf.dtypes ] # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism ) # round int up pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types) ] for pdf_slice in pdf_slices] jsqlContext = self._wrapped._jsqlContext # type: ignore[attr-defined] safecheck = self._wrapped._conf.arrowSafeTypeConversion( ) # type: ignore[attr-defined] col_by_name = True # col by name only applies to StructType columns, can't happen here ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name) @no_type_check def reader_func(temp_filename): return self._jvm.PythonSQLUtils.readArrowStreamFromFile( jsqlContext, temp_filename) @no_type_check def create_RDD_server(): return self._jvm.ArrowRDDServer(jsqlContext) # Create Spark DataFrame from Arrow stream file, using one batch per partition jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server) assert self._jvm is not None jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext) df = DataFrame(jdf, self._wrapped) df._schema = schema return df
def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range("20130101", periods=3, tz="US/Eastern") s = Series(dr) assert s.dtype.name == "datetime64[ns, US/Eastern]" assert s.dtype == "datetime64[ns, US/Eastern]" assert is_datetime64tz_dtype(s.dtype) assert "datetime64[ns, US/Eastern]" in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == "datetime64[ns]" exp = pd.DatetimeIndex(result) exp = exp.tz_localize("UTC").tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D") result = s[0] assert result == Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern", freq="D") result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) result = s.iloc[0:1] assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) # short str assert "datetime64[ns, US/Eastern]" in str(s) # formatting with NaT result = s.shift() assert "datetime64[ns, US/Eastern]" in str(result) assert "NaT" in str(result) # long str t = Series(date_range("20130101", periods=1000, tz="US/Eastern")) assert "datetime64[ns, US/Eastern]" in str(t) result = pd.DatetimeIndex(s, freq="infer") tm.assert_index_equal(result, dr) # inference s = Series([ pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ]) assert s.dtype == "datetime64[ns, US/Pacific]" assert lib.infer_dtype(s, skipna=True) == "datetime64" s = Series([ pd.Timestamp("2013-01-01 13:00:00-0800", tz="US/Pacific"), pd.Timestamp("2013-01-02 14:00:00-0800", tz="US/Eastern"), ]) assert s.dtype == "object" assert lib.infer_dtype(s, skipna=True) == "datetime" # with all NaT s = Series(pd.NaT, index=[0, 1], dtype="datetime64[ns, US/Eastern]") expected = Series(pd.DatetimeIndex(["NaT", "NaT"], tz="US/Eastern")) assert_series_equal(s, expected)
def _cython_operation(self, kind: str, values, how: str, axis, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations") if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, is_datetimelike, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def _convert_listlike_datetimes( arg, format: Optional[str], name: Hashable = None, tz: Optional[Timezone] = None, unit: Optional[str] = None, errors: Optional[str] = None, infer_datetime_format: bool = False, dayfirst: Optional[bool] = None, yearfirst: Optional[bool] = None, exact: bool = True, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike of dates Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be parsed name : object None or string for the Index name tz : object None or 'utc' unit : string None or string of the frequency of the passed data errors : string error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' infer_datetime_format : bool, default False inferring format behavior from to_datetime dayfirst : boolean dayfirst parsing behavior from to_datetime yearfirst : boolean yearfirst parsing behavior from to_datetime exact : bool, default True exact format matching behavior from to_datetime Returns ------- Index-like of parsed dates """ if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype="O") arg_dtype = getattr(arg, "dtype", None) # these are shortcutable if is_datetime64tz_dtype(arg_dtype): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg_dtype): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass elif tz: # DatetimeArray, DatetimeIndex return arg.tz_localize(tz) return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") return _to_datetime_with_unit(arg, unit, name, tz, errors) elif getattr(arg, "ndim", 1) > 1: raise TypeError( "arg must be a string, datetime, list, tuple, 1-d array, or Series" ) # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg try: arg, _ = maybe_convert_dtype(arg, copy=False) except TypeError: if errors == "coerce": result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) return DatetimeIndex(result, name=name) elif errors == "ignore": # error: Incompatible types in assignment (expression has type # "Index", variable has type "ExtensionArray") result = Index(arg, name=name) # type: ignore[assignment] return result raise arg = ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None # error: Incompatible types in assignment (expression has type "None", variable has # type "ExtensionArray") result = None # type: ignore[assignment] if format is not None: # error: Incompatible types in assignment (expression has type # "Optional[Index]", variable has type "ndarray") result = _to_datetime_with_format( # type: ignore[assignment] arg, orig_arg, name, tz, format, exact, errors, infer_datetime_format ) if result is not None: return result if result is None: assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, dayfirst=dayfirst, yearfirst=yearfirst, utc=utc, errors=errors, require_iso8601=require_iso8601, allow_object=True, ) if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) return DatetimeIndex._simple_new(dta, name=name) utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name)
def test_is_datetime64tz_dtype(): assert not com.is_datetime64tz_dtype(object) assert not com.is_datetime64tz_dtype([1, 2, 3]) assert not com.is_datetime64tz_dtype(pd.DatetimeIndex([1, 2, 3])) assert com.is_datetime64tz_dtype( pd.DatetimeIndex(["2000"], tz="US/Eastern"))
def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na if self.is_na: if getattr(self.block, "is_object", False): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None if getattr(self.block, "is_datetimetz", False) or is_datetime64tz_dtype(empty_dtype): if self.block is None: array = empty_dtype.construct_array_type() # TODO(EA2D): special case unneeded with 2D EAs return array(np.full(self.shape[1], fill_value.value), dtype=empty_dtype) elif getattr(self.block, "is_categorical", False): pass elif getattr(self.block, "is_extension", False): pass else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool and not self.block.is_categorical: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values elif self.block.is_extension: values = self.block.values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax, fill_value=fill_value) return values
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na if self.is_valid_na_for(empty_dtype): blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype("object"): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None if is_datetime64tz_dtype(empty_dtype): # TODO(EA2D): special case unneeded with 2D EAs i8values = np.full(self.shape[1], fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_extension_array_dtype(blk_dtype): pass elif isinstance(empty_dtype, ExtensionDtype): cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols empty_arr = -1 * np.ones((nrows, ), dtype=np.intp) return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) else: # NB: we should never get here with empty_dtype integer or bool; # if we did, the missing_arr.fill would cast to gibberish missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool and not isinstance(self.block.values, Categorical): # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values elif self.block.is_extension: values = self.block.values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax) return values
def test_constructor_with_datetime_tz(self): # 8260 # support datetime64 with tz dr = date_range('20130101', periods=3, tz='US/Eastern') s = Series(dr) assert s.dtype.name == 'datetime64[ns, US/Eastern]' assert s.dtype == 'datetime64[ns, US/Eastern]' assert is_datetime64tz_dtype(s.dtype) assert 'datetime64[ns, US/Eastern]' in str(s) # export result = s.values assert isinstance(result, np.ndarray) assert result.dtype == 'datetime64[ns]' exp = pd.DatetimeIndex(result) exp = exp.tz_localize('UTC').tz_convert(tz=s.dt.tz) tm.assert_index_equal(dr, exp) # indexing result = s.iloc[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[0] assert result == Timestamp('2013-01-01 00:00:00-0500', tz='US/Eastern', freq='D') result = s[Series([True, True, False], index=s.index)] assert_series_equal(result, s[0:2]) result = s.iloc[0:1] assert_series_equal(result, Series(dr[0:1])) # concat result = pd.concat([s.iloc[0:1], s.iloc[1:]]) assert_series_equal(result, s) # short str assert 'datetime64[ns, US/Eastern]' in str(s) # formatting with NaT result = s.shift() assert 'datetime64[ns, US/Eastern]' in str(result) assert 'NaT' in str(result) # long str t = Series(date_range('20130101', periods=1000, tz='US/Eastern')) assert 'datetime64[ns, US/Eastern]' in str(t) result = pd.DatetimeIndex(s, freq='infer') tm.assert_index_equal(result, dr) # inference s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Pacific')]) assert s.dtype == 'datetime64[ns, US/Pacific]' assert lib.infer_dtype(s, skipna=False) == 'datetime64' s = Series([pd.Timestamp('2013-01-01 13:00:00-0800', tz='US/Pacific'), pd.Timestamp('2013-01-02 14:00:00-0800', tz='US/Eastern')]) assert s.dtype == 'object' assert lib.infer_dtype(s, skipna=False) == 'datetime' # with all NaT s = Series(pd.NaT, index=[0, 1], dtype='datetime64[ns, US/Eastern]') expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected)
def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64), np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): upcast_cls = "object" elif is_datetime64_dtype(dtype): upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), np.datetime64("NaT", "ns") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.dtype(np.float64), np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, unit=None, errors=None, infer_datetime_format=None, dayfirst=None, yearfirst=None, exact=None): """ Helper function for to_datetime. Performs the conversions of 1D listlike of dates Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be parced box : boolean True boxes result as an Index-like, False returns an ndarray name : object None or string for the Index name tz : object None or 'utc' unit : string None or string of the frequency of the passed data errors : string error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' infer_datetime_format : boolean inferring format behavior from to_datetime dayfirst : boolean dayfirst parsing behavior from to_datetime yearfirst : boolean yearfirst parsing behavior from to_datetime exact : boolean exact format matching behavior from to_datetime Returns ------- ndarray of parsed dates Returns: - Index-like if box=True - ndarray of Timestamps if box=False """ from pandas import DatetimeIndex from pandas.core.arrays import DatetimeArray from pandas.core.arrays.datetimes import (maybe_convert_dtype, objects_to_datetime64ns) if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == 'utc': arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index result = Index(result, name=name) else: result = DatetimeIndex(result, name=name) # GH 23758: We may still need to localize the result with tz # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) # result will be naive but in UTC try: result = result.tz_localize('UTC').tz_convert(tz_parsed) except AttributeError: # Regular Index from 'ignore' path return result if tz is not None: if result.tz is None: result = result.tz_localize(tz) else: result = result.tz_convert(tz) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg arg, _ = maybe_convert_dtype(arg, copy=False) arg = ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None tz_parsed = None result = None if format is not None: try: # shortcut formatting here if format == '%Y%m%d': try: # pass orig_arg as float-dtype may have been converted to # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result, timezones = array_strptime(arg, format, exact=exact, errors=errors) if '%Z' in format or '%z' in format: return _return_parsed_timezone_results( result, timezones, box, tz, name) except tslibs.OutOfBoundsDatetime: if errors == 'raise': raise elif errors == 'coerce': result = np.empty(arg.shape, dtype='M8[ns]') iresult = result.view('i8') iresult.fill(tslibs.iNaT) else: result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise elif errors == 'coerce': result = np.empty(arg.shape, dtype='M8[ns]') iresult = result.view('i8') iresult.fill(tslibs.iNaT) else: result = arg except ValueError as e: # Fallback to try to convert datetime objects if timezone-aware # datetime objects are found without passing `utc=True` try: values, tz = conversion.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if result is None: assert format is None or infer_datetime_format utc = tz == 'utc' result, tz_parsed = objects_to_datetime64ns( arg, dayfirst=dayfirst, yearfirst=yearfirst, utc=utc, errors=errors, require_iso8601=require_iso8601, allow_object=True) if tz_parsed is not None: if box: # We can take a shortcut since the datetime64 numpy array # is in UTC return DatetimeIndex._simple_new(result, name=name, tz=tz_parsed) else: # Convert the datetime64 numpy array to an numpy array # of datetime objects result = [ Timestamp(ts, tz=tz_parsed).to_pydatetime() for ts in result ] return np.array(result, dtype=object) if box: # Ensure we return an Index in all cases where box=True if is_datetime64_dtype(result): return DatetimeIndex(result, tz=tz, name=name) elif is_object_dtype(result): # e.g. an Index of datetime objects from pandas import Index return Index(result, name=name) return result
def _ea_wrap_cython_operation( self, values: ExtensionArray, min_count: int, ngroups: int, comp_ids: np.ndarray, **kwargs, ) -> ArrayLike: """ If we have an ExtensionArray, unwrap, call _cython_operation, and re-wrap if appropriate. """ # TODO: general case implementation overridable by EAs. orig_values = values if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype): # All of the functions implemented here are ordinal, so we can # operate on the tz-naive equivalents npvalues = values.view("M8[ns]") res_values = self._cython_op_ndim_compat( # error: Argument 1 to "_cython_op_ndim_compat" of # "WrappedCythonOp" has incompatible type # "Union[ExtensionArray, ndarray]"; expected "ndarray" npvalues, # type: ignore[arg-type] min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=None, **kwargs, ) if self.how in ["rank"]: # i.e. how in WrappedCythonOp.cast_blocklist, since # other cast_blocklist methods dont go through cython_operation # preserve float64 dtype return res_values res_values = res_values.astype("i8", copy=False) # error: Too many arguments for "ExtensionArray" result = type(orig_values)( # type: ignore[call-arg] res_values, dtype=orig_values.dtype ) return result elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype): # IntegerArray or BooleanArray npvalues = values.to_numpy("float64", na_value=np.nan) res_values = self._cython_op_ndim_compat( npvalues, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=None, **kwargs, ) if self.how in ["rank"]: # i.e. how in WrappedCythonOp.cast_blocklist, since # other cast_blocklist methods dont go through cython_operation return res_values dtype = self.get_result_dtype(orig_values.dtype) # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" # has no attribute "construct_array_type" cls = dtype.construct_array_type() # type: ignore[union-attr] return cls._from_sequence(res_values, dtype=dtype) elif is_float_dtype(values.dtype): # FloatingArray # error: "ExtensionDtype" has no attribute "numpy_dtype" npvalues = values.to_numpy( values.dtype.numpy_dtype, # type: ignore[attr-defined] na_value=np.nan, ) res_values = self._cython_op_ndim_compat( npvalues, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=None, **kwargs, ) if self.how in ["rank"]: # i.e. how in WrappedCythonOp.cast_blocklist, since # other cast_blocklist methods dont go through cython_operation return res_values dtype = self.get_result_dtype(orig_values.dtype) # error: Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" # has no attribute "construct_array_type" cls = dtype.construct_array_type() # type: ignore[union-attr] return cls._from_sequence(res_values, dtype=dtype) raise NotImplementedError( f"function is not implemented for this dtype: {values.dtype}" )
def maybe_promote(dtype, fill_value=np.nan): """ Find the minimal dtype that can hold both the given dtype and fill_value. Parameters ---------- dtype : np.dtype or ExtensionDtype fill_value : scalar, default np.nan Returns ------- dtype Upcasted from dtype argument if necessary. fill_value Upcasted from fill_value argument if necessary. """ if not is_scalar(fill_value) and not is_object_dtype(dtype): # with object dtype there is nothing to promote, and the user can # pass pretty much any weird fill_value they like raise ValueError("fill_value must be a scalar") # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): fill_value = fill_value.dtype.type("NaT", "ns") else: # we need to change to object type as our # fill_value is of object type if fill_value.dtype == np.object_: dtype = np.dtype(np.object_) fill_value = np.nan if dtype == np.object_ or dtype.kind in ["U", "S"]: # We treat string-like dtypes as object, and _always_ fill # with np.nan fill_value = np.nan dtype = np.dtype(np.object_) # returns tuple of (dtype, fill_value) if issubclass(dtype.type, np.datetime64): if isinstance(fill_value, datetime) and fill_value.tzinfo is not None: # Trying to insert tzaware into tznaive, have to cast to object dtype = np.dtype(np.object_) elif is_integer(fill_value) or (is_float(fill_value) and not isna(fill_value)): dtype = np.dtype(np.object_) else: try: fill_value = tslibs.Timestamp(fill_value).to_datetime64() except (TypeError, ValueError): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.timedelta64): if (is_integer(fill_value) or (is_float(fill_value) and not np.isnan(fill_value)) or isinstance(fill_value, str)): # TODO: What about str that can be a timedelta? dtype = np.dtype(np.object_) else: try: fv = tslibs.Timedelta(fill_value) except ValueError: dtype = np.dtype(np.object_) else: if fv is NaT: # NaT has no `to_timedelta64` method fill_value = np.timedelta64("NaT", "ns") else: fill_value = fv.to_timedelta64() elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = NaT elif not isinstance(fill_value, datetime): dtype = np.dtype(np.object_) elif fill_value.tzinfo is None: dtype = np.dtype(np.object_) elif not tz_compare(fill_value.tzinfo, dtype.tz): # TODO: sure we want to cast here? dtype = np.dtype(np.object_) elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): dtype = np.dtype(np.float64) elif dtype.kind == "f": mst = np.min_scalar_type(fill_value) if mst > dtype: # e.g. mst is np.float64 and dtype is np.float32 dtype = mst elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) elif is_bool(fill_value): if not issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, np.integer): if not np.can_cast(fill_value, dtype): # upcast to prevent overflow mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) if dtype.kind == "f": # Case where we disagree with numpy dtype = np.dtype(np.object_) elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.dtype(np.object_) elif issubclass(dtype.type, (np.integer, np.floating)): mst = np.min_scalar_type(fill_value) dtype = np.promote_types(dtype, mst) elif dtype.kind == "c": mst = np.min_scalar_type(fill_value) if mst > dtype: # e.g. mst is np.complex128 and dtype is np.complex64 dtype = mst elif fill_value is None: if is_float_dtype(dtype) or is_complex_dtype(dtype): fill_value = np.nan elif is_integer_dtype(dtype): dtype = np.float64 fill_value = np.nan elif is_datetime_or_timedelta_dtype(dtype): fill_value = dtype.type("NaT", "ns") else: dtype = np.dtype(np.object_) fill_value = np.nan else: dtype = np.dtype(np.object_) # in case we have a string that looked like a number if is_extension_array_dtype(dtype): pass elif issubclass(np.dtype(dtype).type, (bytes, str)): dtype = np.dtype(np.object_) fill_value = _ensure_dtype_type(fill_value, dtype) return dtype, fill_value
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.sub) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_categorical_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
def dict_to_mgr( data: dict, index, columns, *, dtype: DtypeObj | None = None, typ: str = "block", copy: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. Used in DataFrame.__init__ """ arrays: Sequence[Any] | Series if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict index = _extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): nan_dtype: DtypeObj if dtype is None or (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.flexible)): # GH#1783 nan_dtype = np.dtype("object") else: nan_dtype = dtype val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() arrays = list(arrays) else: keys = list(data.keys()) columns = data_names = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ arr if not isinstance(arr, Index) else arr._data for arr in arrays ] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] if copy: # arrays_to_mgr (via form_blocks) won't make copies for EAs # dtype attr check to exclude EADtype-castable strs arrays = [ x if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) else x.copy() for x in arrays ] # TODO: can we get rid of the dt64tz special case above? return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy)
def _get_values( values: np.ndarray, skipna: bool, fill_value: Any = None, fill_value_typ: Optional[str] = None, mask: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]: """ Utility to get the values view, mask, dtype, dtype_max, and fill_value. If both mask and fill_value/fill_value_typ are not None and skipna is True, the values array will be copied. For input arrays of boolean or integer dtypes, copies will only occur if a precomputed mask, a fill_value/fill_value_typ, and skipna=True are provided. Parameters ---------- values : ndarray input array to potentially compute mask for skipna : bool boolean for whether NaNs should be skipped fill_value : Any value to fill NaNs with fill_value_typ : str Set to '+inf' or '-inf' to handle dtype-specific infinities mask : Optional[np.ndarray] nan-mask if known Returns ------- values : ndarray Potential copy of input value array mask : Optional[ndarray[bool]] Mask for values, if deemed necessary to compute dtype : dtype dtype for values dtype_max : dtype platform independent dtype fill_value : Any fill value used """ # In _get_values is only called from within nanops, and in all cases # with scalar fill_value. This guarantee is important for the # maybe_upcast_putmask call below assert is_scalar(fill_value) mask = _maybe_get_mask(values, skipna, mask) if is_datetime64tz_dtype(values): # lib.values_from_object returns M8[ns] dtype instead of tz-aware, # so this case must be handled separately from the rest dtype = values.dtype values = getattr(values, "_values", values) else: values = lib.values_from_object(values) dtype = values.dtype if is_datetime_or_timedelta_dtype(values) or is_datetime64tz_dtype(values): # changing timedelta64/datetime64 to int64 needs to happen after # finding `mask` above values = getattr(values, "asi8", values) values = values.view(np.int64) dtype_ok = _na_ok_dtype(dtype) # get our fill value (in case we need to provide an alternative # dtype for it) fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ) copy = (mask is not None) and (fill_value is not None) if skipna and copy: values = values.copy() if dtype_ok: np.putmask(values, mask, fill_value) # promote if needed else: values, _ = maybe_upcast_putmask(values, mask, fill_value) # return a platform independent precision dtype dtype_max = dtype if is_integer_dtype(dtype) or is_bool_dtype(dtype): dtype_max = np.int64 elif is_float_dtype(dtype): dtype_max = np.float64 return values, mask, dtype, dtype_max, fill_value
def __init__( self, data, sparse_index=None, index=None, fill_value=None, kind="integer", dtype=None, copy=False, ): if fill_value is None and isinstance(dtype, SparseDtype): fill_value = dtype.fill_value if isinstance(data, type(self)): # disable normal inference on dtype, sparse_index, & fill_value if sparse_index is None: sparse_index = data.sp_index if fill_value is None: fill_value = data.fill_value if dtype is None: dtype = data.dtype # TODO: make kind=None, and use data.kind? data = data.sp_values # Handle use-provided dtype if isinstance(dtype, str): # Two options: dtype='int', regular numpy dtype # or dtype='Sparse[int]', a sparse dtype try: dtype = SparseDtype.construct_from_string(dtype) except TypeError: dtype = pandas_dtype(dtype) if isinstance(dtype, SparseDtype): if fill_value is None: fill_value = dtype.fill_value dtype = dtype.subtype if index is not None and not is_scalar(data): raise Exception("must only pass scalars with an index") if is_scalar(data): if index is not None: if data is None: data = np.nan if index is not None: npoints = len(index) elif sparse_index is None: npoints = 1 else: npoints = sparse_index.length dtype = infer_dtype_from_scalar(data)[0] data = construct_1d_arraylike_from_scalar(data, npoints, dtype) if dtype is not None: dtype = pandas_dtype(dtype) # TODO: disentangle the fill_value dtype inference from # dtype inference if data is None: # TODO: What should the empty dtype be? Object or float? data = np.array([], dtype=dtype) if not is_array_like(data): try: # probably shared code in sanitize_series data = sanitize_array(data, index=None) except ValueError: # NumPy may raise a ValueError on data like [1, []] # we retry with object dtype here. if dtype is None: dtype = object data = np.atleast_1d(np.asarray(data, dtype=dtype)) else: raise if copy: # TODO: avoid double copy when dtype forces cast. data = data.copy() if fill_value is None: fill_value_dtype = data.dtype if dtype is None else dtype if fill_value_dtype is None: fill_value = np.nan else: fill_value = na_value_for_dtype(fill_value_dtype) if isinstance(data, type(self)) and sparse_index is None: sparse_index = data._sparse_index sparse_values = np.asarray(data.sp_values, dtype=dtype) elif sparse_index is None: data = extract_array(data, extract_numpy=True) if not isinstance(data, np.ndarray): # EA if is_datetime64tz_dtype(data.dtype): warnings.warn( f"Creating SparseArray from {data.dtype} data " "loses timezone information. Cast to object before " "sparse to retain timezone information.", UserWarning, stacklevel=2, ) data = np.asarray(data, dtype="datetime64[ns]") data = np.asarray(data) sparse_values, sparse_index, fill_value = make_sparse( data, kind=kind, fill_value=fill_value, dtype=dtype) else: sparse_values = np.asarray(data, dtype=dtype) if len(sparse_values) != sparse_index.npoints: raise AssertionError( f"Non array-like type {type(sparse_values)} must " "have the same length as the index") self._sparse_index = sparse_index self._sparse_values = sparse_values self._dtype = SparseDtype(sparse_values.dtype, fill_value)
def test_value_counts_unique_nunique_null(self): for null_obj in [np.nan, None]: for orig in self.objs: o = orig.copy() klass = type(o) values = o._ndarray_values if not self._allow_na_ops(o): continue # special assign to the numpy array if is_datetime64tz_dtype(o): if isinstance(o, DatetimeIndex): v = o.asi8 v[0:2] = iNaT values = o._shallow_copy(v) else: o = o.copy() o[0:2] = iNaT values = o._values elif needs_i8_conversion(o): values[0:2] = iNaT values = o._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == o.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(o, (DatetimeIndex, PeriodIndex)): expected_index = o.copy() expected_index.name = None # attach name to klass o = klass(values.repeat(range(1, len(o) + 1))) o.name = 'a' else: if is_datetime64tz_dtype(o): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None o = o.repeat(range(1, len(o) + 1)) o.name = 'a' # check values has the same dtype as the original assert o.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(o), dtype=np.bool) nanloc[:3] = True if isinstance(o, Index): tm.assert_numpy_array_equal(pd.isna(o), nanloc) else: exp = Series(nanloc, o.index, name='a') tm.assert_series_equal(pd.isna(o), exp) expected_s_na = Series(list(range(10, 2, -1)) + [3], index=expected_index[9:0:-1], dtype='int64', name='a') expected_s = Series(list(range(10, 2, -1)), index=expected_index[9:1:-1], dtype='int64', name='a') result_s_na = o.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == 'a' result_s = o.value_counts() tm.assert_series_equal(o.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == 'a' result = o.unique() if isinstance(o, Index): tm.assert_index_equal(result, Index(values[1:], name='a')) elif is_datetime64tz_dtype(o): # unable to compare NaT / nan vals = values[2:].astype(object).values tm.assert_numpy_array_equal(result[1:], vals) assert result[0] is pd.NaT else: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert o.nunique() == 8 assert o.nunique(dropna=False) == 9
def _cython_operation(self, kind: str, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError( "{dtype} dtype not supported".format(dtype=values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {how} operations".format( how=how)) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {how} operations". format(how=how)) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: try: values = ensure_float64(values) except TypeError: if lib.infer_dtype(values, skipna=False) == "complex": values = values.astype(complex) else: raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = "object" labels, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, labels, func, is_datetimelike, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: names = self._name_functions[how]() # type: Optional[List[str]] else: names = None if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def _convert_listlike_datetimes( arg, format: Optional[str], name: Hashable = None, tz: Optional[Timezone] = None, unit: Optional[str] = None, errors: Optional[str] = None, infer_datetime_format: Optional[bool] = None, dayfirst: Optional[bool] = None, yearfirst: Optional[bool] = None, exact: Optional[bool] = None, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike of dates Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be parsed name : object None or string for the Index name tz : object None or 'utc' unit : string None or string of the frequency of the passed data errors : string error handing behaviors from to_datetime, 'raise', 'coerce', 'ignore' infer_datetime_format : boolean inferring format behavior from to_datetime dayfirst : boolean dayfirst parsing behavior from to_datetime yearfirst : boolean yearfirst parsing behavior from to_datetime exact : boolean exact format matching behavior from to_datetime Returns ------- Index-like of parsed dates """ if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype="O") arg_dtype = getattr(arg, "dtype", None) # these are shortcutable if is_datetime64tz_dtype(arg_dtype): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": arg = arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg_dtype): if not isinstance(arg, (DatetimeArray, DatetimeIndex)): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass elif tz: # DatetimeArray, DatetimeIndex return arg.tz_localize(tz) return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, "_values", arg) # GH 30050 pass an ndarray to tslib.array_with_unit_to_datetime # because it expects an ndarray argument if isinstance(arg, IntegerArray): result = arg.astype(f"datetime64[{unit}]") tz_parsed = None else: result, tz_parsed = tslib.array_with_unit_to_datetime( arg, unit, errors=errors) if errors == "ignore": result = Index(result, name=name) else: result = DatetimeIndex(result, name=name) # GH 23758: We may still need to localize the result with tz # GH 25546: Apply tz_parsed first (from arg), then tz (from caller) # result will be naive but in UTC try: result = result.tz_localize("UTC").tz_convert(tz_parsed) except AttributeError: # Regular Index from 'ignore' path return result if tz is not None: if result.tz is None: result = result.tz_localize(tz) else: result = result.tz_convert(tz) return result elif getattr(arg, "ndim", 1) > 1: raise TypeError( "arg must be a string, datetime, list, tuple, 1-d array, or Series" ) # warn if passing timedelta64, raise for PeriodDtype # NB: this must come after unit transformation orig_arg = arg try: arg, _ = maybe_convert_dtype(arg, copy=False) except TypeError: if errors == "coerce": result = np.array(["NaT"], dtype="datetime64[ns]").repeat(len(arg)) return DatetimeIndex(result, name=name) elif errors == "ignore": result = Index(arg, name=name) return result raise arg = ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None tz_parsed = None result = None if format is not None: try: # shortcut formatting here if format == "%Y%m%d": try: # pass orig_arg as float-dtype may have been converted to # datetime64[ns] orig_arg = ensure_object(orig_arg) result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, OutOfBoundsDatetime) as err: raise ValueError( "cannot convert the input to '%Y%m%d' date format" ) from err # fallback if result is None: try: result, timezones = array_strptime(arg, format, exact=exact, errors=errors) if "%Z" in format or "%z" in format: return _return_parsed_timezone_results( result, timezones, tz, name) except OutOfBoundsDatetime: if errors == "raise": raise elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") iresult.fill(iNaT) else: result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == "raise": raise elif errors == "coerce": result = np.empty(arg.shape, dtype="M8[ns]") iresult = result.view("i8") iresult.fill(iNaT) else: result = arg except ValueError as e: # Fallback to try to convert datetime objects if timezone-aware # datetime objects are found without passing `utc=True` try: values, tz = conversion.datetime_to_datetime64(arg) dta = DatetimeArray(values, dtype=tz_to_dtype(tz)) return DatetimeIndex._simple_new(dta, name=name) except (ValueError, TypeError): raise e if result is None: assert format is None or infer_datetime_format utc = tz == "utc" result, tz_parsed = objects_to_datetime64ns( arg, dayfirst=dayfirst, yearfirst=yearfirst, utc=utc, errors=errors, require_iso8601=require_iso8601, allow_object=True, ) if tz_parsed is not None: # We can take a shortcut since the datetime64 numpy array # is in UTC dta = DatetimeArray(result, dtype=tz_to_dtype(tz_parsed)) return DatetimeIndex._simple_new(dta, name=name) utc = tz == "utc" return _box_as_indexlike(result, utc=utc, name=name)
def cut( x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates="raise", ): """ Bin values into discrete intervals. Use `cut` when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. For example, `cut` could convert ages to groups of age ranges. Supports binning into an equal number of bins, or a pre-specified array of bins. Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. * int : Defines the number of equal-width bins in the range of `x`. The range of `x` is extended by .1% on each side to include the minimum and maximum values of `x`. * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of `x` is done. * IntervalIndex : Defines the exact bins to be used. Note that IntervalIndex for `bins` must be non-overlapping. right : bool, default True Indicates whether `bins` includes the rightmost edge or not. If ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]`` indicate (1,2], (2,3], (3,4]. This argument is ignored when `bins` is an IntervalIndex. labels : array or bool, optional Specifies the labels for the returned bins. Must be the same length as the resulting bins. If False, returns only integer indicators of the bins. This affects the type of the output container (see below). This argument is ignored when `bins` is an IntervalIndex. retbins : bool, default False Whether to return the bins or not. Useful when bins is provided as a scalar. precision : int, default 3 The precision at which to store and display the bins labels. include_lowest : bool, default False Whether the first interval should be left-inclusive or not. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. .. versionadded:: 0.23.0 Returns ------- out : Categorical, Series, or ndarray An array-like object representing the respective bin for each value of `x`. The type depends on the value of `labels`. * True (default) : returns a Series for Series `x` or a Categorical for all other inputs. The values stored within are Interval dtype. * sequence of scalars : returns a Series for Series `x` or a Categorical for all other inputs. The values stored within are whatever the type in the sequence is. * False : returns an ndarray of integers. bins : numpy.ndarray or IntervalIndex. The computed or specified bins. Only returned when `retbins=True`. For scalar or sequence `bins`, this is an ndarray with the computed bins. If set `duplicates=drop`, `bins` will drop non-unique bin. For an IntervalIndex `bins`, this is equal to `bins`. See Also -------- qcut : Discretize variable into equal-sized buckets based on rank or based on sample quantiles. Categorical : Array type for storing data that come from a fixed set of values. Series : One-dimensional array with axis labels (including time series). IntervalIndex : Immutable Index implementing an ordered, sliceable set. Notes ----- Any NA values will be NA in the result. Out of bounds values will be NA in the resulting Series or Categorical object. Examples -------- Discretize into three equal-sized bins. >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) ... # doctest: +ELLIPSIS [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) ... # doctest: +ELLIPSIS ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ... Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] ... array([0.994, 3. , 5. , 7. ])) Discovers the same bins, but assign them specific labels. Notice that the returned Categorical's categories are `labels` and is ordered. >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), ... 3, labels=["bad", "medium", "good"]) [bad, good, medium, medium, good, bad] Categories (3, object): [bad < medium < good] ``labels=False`` implies you just want the bins back. >>> pd.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3]) Passing a Series as an input returns a Series with categorical dtype: >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, 3) ... # doctest: +ELLIPSIS a (1.992, 4.667] b (1.992, 4.667] c (4.667, 7.333] d (7.333, 10.0] e (7.333, 10.0] dtype: category Categories (3, interval[float64]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. It is used to map numerically to intervals based on bins. >>> s = pd.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False) ... # doctest: +ELLIPSIS (a 0.0 b 1.0 c 2.0 d 3.0 e 4.0 dtype: float64, array([0, 2, 4, 6, 8])) Use `drop` optional when bins is not unique >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True, ... right=False, duplicates='drop') ... # doctest: +ELLIPSIS (a 0.0 b 1.0 c 2.0 d 3.0 e 3.0 dtype: float64, array([0, 2, 4, 6, 8])) Passing an IntervalIndex for `bins` results in those categories exactly. Notice that values not covered by the IntervalIndex are set to NaN. 0 is to the left of the first bin (which is closed on the right), and 1.5 falls between two bins. >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)]) >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins) [NaN, (0, 1], NaN, (2, 3], (4, 5]] Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]] """ # NOTE: this binning code is changed a bit from histogram for var(x) == 0 # for handling the cut for datetime and timedelta objects x_is_series, series_index, name, x = _preprocess_for_cut(x) x, dtype = _coerce_to_type(x) if not np.iterable(bins): if is_scalar(bins) and bins < 1: raise ValueError("`bins` should be a positive integer.") try: # for array-like sz = x.size except AttributeError: x = np.asarray(x) sz = x.size if sz == 0: raise ValueError("Cannot cut empty array") rng = (nanops.nanmin(x), nanops.nanmax(x)) mn, mx = [mi + 0.0 for mi in rng] if np.isinf(mn) or np.isinf(mx): # GH 24314 raise ValueError("cannot specify integer `bins` when input data " "contains infinity") elif mn == mx: # adjust end points before binning mn -= 0.001 * abs(mn) if mn != 0 else 0.001 mx += 0.001 * abs(mx) if mx != 0 else 0.001 bins = np.linspace(mn, mx, bins + 1, endpoint=True) else: # adjust end points after binning bins = np.linspace(mn, mx, bins + 1, endpoint=True) adj = (mx - mn) * 0.001 # 0.1% of the range if right: bins[0] -= adj else: bins[-1] += adj elif isinstance(bins, IntervalIndex): if bins.is_overlapping: raise ValueError("Overlapping IntervalIndex is not accepted.") else: if is_datetime64tz_dtype(bins): bins = np.asarray(bins, dtype=_NS_DTYPE) else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) # GH 26045: cast to float64 to avoid an overflow if (np.diff(bins.astype("float64")) < 0).any(): raise ValueError("bins must increase monotonically.") fac, bins = _bins_to_cuts( x, bins, right=right, labels=labels, precision=precision, include_lowest=include_lowest, dtype=dtype, duplicates=duplicates, ) return _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name, dtype)
def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime if dtype is not None: if isinstance(dtype, str): dtype = np.dtype(dtype) is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) is_timedelta64 = is_timedelta64_dtype(dtype) if is_datetime64 or is_datetime64tz or is_timedelta64: # Force the dtype if needed. msg = (f"The '{dtype.name}' dtype has no unit. " f"Please pass in '{dtype.name}[ns]' instead.") if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("M8[ns]"): if dtype.name == "datetime64": raise ValueError(msg) dtype = _NS_DTYPE else: raise TypeError( f"cannot convert datetimelike to dtype [{dtype}]") elif is_datetime64tz: # our NaT doesn't support tz's # this will coerce to DatetimeIndex with # a matching dtype below if is_scalar(value) and isna(value): value = [value] elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] if dtype <= np.dtype("m8[ns]"): if dtype.name == "timedelta64": raise ValueError(msg) dtype = _TD_DTYPE else: raise TypeError( f"cannot convert timedeltalike to dtype [{dtype}]") if is_scalar(value): if value == iNaT or isna(value): value = iNaT else: value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) if value.ndim == 0: value = iNaT # we have an array of datetime or timedeltas & nulls elif np.prod( value.shape) or not is_dtype_equal(value.dtype, dtype): try: if is_datetime64: value = to_datetime(value, errors=errors) # GH 25843: Remove tz information since the dtype # didn't specify one if value.tz is not None: value = value.tz_localize(None) value = value._values elif is_datetime64tz: # The string check can be removed once issue #13712 # is solved. String data that is passed with a # datetime64tz is assumed to be naive which should # be localized to the timezone. is_dt_string = is_string_dtype(value) value = to_datetime(value, errors=errors).array if is_dt_string: # Strings here are naive, so directly localize value = value.tz_localize(dtype.tz) else: # Numeric values are UTC at this point, # so localize and convert value = value.tz_localize("UTC").tz_convert( dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values except OutOfBoundsDatetime: raise except (AttributeError, ValueError, TypeError): pass # coerce datetimelike to object elif is_datetime64_dtype(value) and not is_datetime64_dtype(dtype): if is_object_dtype(dtype): if value.dtype != _NS_DTYPE: value = value.astype(_NS_DTYPE) ints = np.asarray(value).view("i8") return tslib.ints_to_pydatetime(ints) # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") else: is_array = isinstance(value, np.ndarray) # catch a datetime/timedelta that is not of ns variety # and no coercion specified if is_array and value.dtype.kind in ["M", "m"]: dtype = value.dtype if dtype.kind == "M" and dtype != _NS_DTYPE: value = tslibs.conversion.ensure_datetime64ns(value) elif dtype.kind == "m" and dtype != _TD_DTYPE: value = to_timedelta(value) # only do this if we have an array and the dtype of the array is not # setup already we are not an integer/object, so don't bother with this # conversion elif not (is_array and not (issubclass(value.dtype.type, np.integer) or value.dtype == np.object_)): value = maybe_infer_to_datetimelike(value) return value
def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = conversion.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e
def test_value_counts_unique_nunique_null(self, null_obj, index_or_series_obj): orig = index_or_series_obj obj = orig.copy() klass = type(obj) values = obj._ndarray_values num_values = len(orig) if not allow_na_ops(obj): pytest.skip("type doesn't allow for NA operations") elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)): pytest.skip(f"values of {klass} cannot be changed") elif isinstance(orig, pd.MultiIndex): pytest.skip("MultiIndex doesn't support isna") elif orig.duplicated().any(): pytest.xfail( "The test implementation isn't flexible enough to deal" " with duplicated values. This isn't a bug in the" " application code, but in the test code.") # special assign to the numpy array if is_datetime64tz_dtype(obj): if isinstance(obj, DatetimeIndex): v = obj.asi8 v[0:2] = iNaT values = obj._shallow_copy(v) else: obj = obj.copy() obj[0:2] = pd.NaT values = obj._values elif needs_i8_conversion(obj): values[0:2] = iNaT values = obj._shallow_copy(values) else: values[0:2] = null_obj # check values has the same dtype as the original assert values.dtype == obj.dtype # create repeated values, 'n'th element is repeated by n+1 # times if isinstance(obj, (DatetimeIndex, PeriodIndex)): expected_index = obj.copy() expected_index.name = None # attach name to klass obj = klass(values.repeat(range(1, len(obj) + 1))) obj.name = "a" else: if isinstance(obj, DatetimeIndex): expected_index = orig._values._shallow_copy(values) else: expected_index = Index(values) expected_index.name = None obj = obj.repeat(range(1, len(obj) + 1)) obj.name = "a" # check values has the same dtype as the original assert obj.dtype == orig.dtype # check values correctly have NaN nanloc = np.zeros(len(obj), dtype=np.bool) nanloc[:3] = True if isinstance(obj, Index): tm.assert_numpy_array_equal(pd.isna(obj), nanloc) else: exp = Series(nanloc, obj.index, name="a") tm.assert_series_equal(pd.isna(obj), exp) expected_data = list(range(num_values, 2, -1)) expected_data_na = expected_data.copy() if expected_data_na: expected_data_na.append(3) expected_s_na = Series( expected_data_na, index=expected_index[num_values - 1:0:-1], dtype="int64", name="a", ) expected_s = Series( expected_data, index=expected_index[num_values - 1:1:-1], dtype="int64", name="a", ) result_s_na = obj.value_counts(dropna=False) tm.assert_series_equal(result_s_na, expected_s_na) assert result_s_na.index.name is None assert result_s_na.name == "a" result_s = obj.value_counts() tm.assert_series_equal(obj.value_counts(), expected_s) assert result_s.index.name is None assert result_s.name == "a" result = obj.unique() if isinstance(obj, Index): tm.assert_index_equal(result, Index(values[1:], name="a")) elif is_datetime64tz_dtype(obj): # unable to compare NaT / nan tm.assert_extension_array_equal(result[1:], values[2:]) assert result[0] is pd.NaT elif len(obj) > 0: tm.assert_numpy_array_equal(result[1:], values[2:]) assert pd.isna(result[0]) assert result.dtype == orig.dtype assert obj.nunique() == max(0, num_values - 2) assert obj.nunique(dropna=False) == max(0, num_values - 1)
def to_numpy(self, dtype=None, copy=False): """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 Parameters ---------- dtype : str or numpy.dtype, optional The dtype to pass to :meth:`numpy.asarray` copy : bool, default False Whether to ensure that the returned value is a not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming ``copy=False``). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and default return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns] datetime64[ns] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) Specify the `dtype` to control how datetime-aware data is represented. Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], dtype=object) Or ``dtype='datetime64[ns]'`` to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") ... # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ if is_datetime64tz_dtype(self.dtype) and dtype is None: # note: this is going to change very soon. # I have a WIP PR making this unnecessary, but it's # a bit out of scope for the DatetimeArray PR. dtype = "object" result = np.asarray(self._values, dtype=dtype) # TODO(GH-24345): Avoid potential double copy if copy: result = result.copy() return result