def __sub__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def test_transform_casting(self): # 13046 data = """ idx A ID3 DATETIME 0 B-028 b76cd912ff "2014-10-08 13:43:27" 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" 2 B-076 1a682034f8 "2014-10-08 14:29:01" 3 B-023 b76cd912ff "2014-10-08 18:39:34" 4 B-023 f88g8d7sds "2014-10-08 18:40:18" 5 B-033 b76cd912ff "2014-10-08 18:44:30" 6 B-032 b76cd912ff "2014-10-08 18:46:00" 7 B-037 b76cd912ff "2014-10-08 18:52:15" 8 B-046 db959faf02 "2014-10-08 18:59:59" 9 B-053 b76cd912ff "2014-10-08 19:17:48" 10 B-065 b76cd912ff "2014-10-08 19:21:38" """ df = pd.read_csv(StringIO(data), sep='\s+', index_col=[0], parse_dates=['DATETIME']) result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.dtype) result = df[['ID3', 'DATETIME']].groupby('ID3').transform( lambda x: x.diff()) assert is_timedelta64_dtype(result.DATETIME.dtype)
def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot add {dtype}-dtype to {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_period_dtype(other): # if self is a TimedeltaArray and other is a PeriodArray with # a timedelta-like (i.e. Tick) freq, this operation is valid. # Defer to the PeriodArray implementation. # In remaining cases, this will end up raising TypeError. return NotImplemented elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def infer_freq(index, warn=True): """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed. Parameters ---------- index : DatetimeIndex or TimedeltaIndex if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- str or None None if no discernible frequency TypeError if the index is not datetime-like ValueError if there are less than three values. """ import pandas as pd if isinstance(index, ABCSeries): values = index._values if not (is_datetime64_dtype(values) or is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible dtype " "on a Series of {dtype}".format(dtype=index.dtype)) index = values if is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif is_timedelta64_dtype(index): # Allow TimedeltaIndex and TimedeltaArray inferer = _TimedeltaFrequencyInferer(index, warn=warn) return inferer.get_freq() if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError("cannot infer freq from a non-convertible index " "type {type}".format(type=type(index))) index = index.values if not isinstance(index, pd.DatetimeIndex): try: index = pd.DatetimeIndex(index) except AmbiguousTimeError: index = pd.DatetimeIndex(index.asi8) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq()
def test_is_timedelta(self): assert is_timedelta64_dtype('timedelta64') assert is_timedelta64_dtype('timedelta64[ns]') assert not is_timedelta64_ns_dtype('timedelta64') assert is_timedelta64_ns_dtype('timedelta64[ns]') tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') assert is_timedelta64_dtype(tdi) assert is_timedelta64_ns_dtype(tdi) assert is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]')) # Conversion to Int64Index: assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64')) assert not is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]'))
def test_is_timedelta(self): self.assertTrue(is_timedelta64_dtype('timedelta64')) self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) self.assertFalse(is_timedelta64_ns_dtype('timedelta64')) self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]')) tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') self.assertTrue(is_timedelta64_dtype(tdi)) self.assertTrue(is_timedelta64_ns_dtype(tdi)) self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) # Conversion to Int64Index: self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')))
def _add_delta(self, other): """ Add a timedelta-like, Tick or TimedeltaIndex-like object to self, yielding an int64 numpy array Parameters ---------- delta : {timedelta, np.timedelta64, Tick, TimedeltaIndex, ndarray[timedelta64]} Returns ------- result : ndarray[int64] Notes ----- The result's name is set outside of _add_delta by the calling method (__add__ or __sub__), if necessary (i.e. for Indexes). """ if isinstance(other, (Tick, timedelta, np.timedelta64)): new_values = self._add_timedeltalike_scalar(other) elif is_timedelta64_dtype(other): # ndarray[timedelta64] or TimedeltaArray/index new_values = self._add_delta_tdi(other) return new_values
def _add_delta(self, delta): """ Add a timedelta-like, Tick, or TimedeltaIndex-like object to self. Parameters ---------- delta : timedelta, np.timedelta64, Tick, TimedeltaArray, TimedeltaIndex Returns ------- result : same type as self Notes ----- The result's name is set outside of _add_delta by the calling method (__add__ or __sub__) """ if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) elif isinstance(delta, TimedeltaArrayMixin): new_values = self._add_delta_tdi(delta) elif is_timedelta64_dtype(delta): # ndarray[timedelta64] --> wrap in TimedeltaArray/Index delta = type(self)(delta) new_values = self._add_delta_tdi(delta) else: raise TypeError("cannot add the type {0} to a TimedeltaIndex" .format(type(delta))) return type(self)(new_values, freq='infer')
def _wrap_results(result, dtype, fill_value=None): """ wrap our results if needed """ if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): if fill_value is None: # GH#24293 fill_value = iNaT if not isinstance(result, np.ndarray): tz = getattr(dtype, 'tz', None) assert not isna(fill_value), "Expected non-null fill_value" if result == fill_value: result = np.nan result = tslibs.Timestamp(result, tz=tz) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): if result == fill_value: result = np.nan # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = tslibs.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result
def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, ABCDataFrame): return NotImplemented left, right = _align_method_SERIES(left, right) res_name = _get_series_op_result_name(left, right) if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = getattr(rvalues, 'values', rvalues) result = safe_na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None)
def nansum(values, axis=None, skipna=True, min_count=0, mask=None): """ Sum the elements along an axis ignoring NaNs Parameters ---------- values : ndarray[dtype] axis: int, optional skipna : bool, default True min_count: int, default 0 mask : ndarray[bool], optional nan-mask if known Returns ------- result : dtype Examples -------- >>> import pandas.core.nanops as nanops >>> s = pd.Series([1, 2, np.nan]) >>> nanops.nansum(s) 3.0 """ values, mask, dtype, dtype_max, _ = _get_values(values, skipna, 0, mask=mask) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask, min_count=min_count) return _wrap_results(the_sum, dtype)
def backfill_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if is_float_dtype(values): name = 'backfill_inplace_{name}'.format(name=dtype.name) _method = getattr(algos, name, None) elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_1d_datetime elif is_integer_dtype(values): values = ensure_float64(values) _method = algos.backfill_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_inplace_object elif is_timedelta64_dtype(values): # NaTs are treated identically to datetime64, so we can dispatch # to that implementation _method = _backfill_1d_datetime if _method is None: raise ValueError('Invalid dtype for backfill_1d [{name}]' .format(name=dtype.name)) if mask is None: mask = isna(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values
def __mul__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._data * other freq = None if self.freq is not None and not isna(other): freq = self.freq * other return type(self)(result, freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self) and not is_timedelta64_dtype(other): # Exclude timedelta64 here so we correctly raise TypeError # for that instead of ValueError raise ValueError("Cannot multiply with unequal lengths") if is_object_dtype(other): # this multiplication will succeed only if all elements of other # are int or float scalars, so we will end up with # timedelta64[ns]-dtyped result result = [self[n] * other[n] for n in range(len(self))] result = np.array(result) return type(self)(result) # numpy will accept float or int dtype, raise TypeError for others result = self._data * other return type(self)(result)
def __new__(cls, data): # CombinedDatetimelikeProperties isn't really instantiated. Instead # we need to choose which parent (datetime or timedelta) is # appropriate. Since we're checking the dtypes anyway, we'll just # do all the validation here. from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data))) orig = data if is_categorical_dtype(data) else None if orig is not None: data = Series(orig.values.categories, name=orig.name, copy=False) try: if is_datetime64_dtype(data.dtype): return DatetimeProperties(data, orig) elif is_datetime64tz_dtype(data.dtype): return DatetimeProperties(data, orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(data, orig) else: if is_period_arraylike(data): return PeriodProperties(data, orig) if is_datetime_arraylike(data): return DatetimeProperties(data, orig) except Exception: pass # we raise an attribute error anyway raise AttributeError("Can only use .dt accessor with datetimelike " "values")
def __rsub__(self, other): if is_datetime64_dtype(other) and is_timedelta64_dtype(self): # ndarray[datetime64] cannot be subtracted from self, so # we need to wrap in DatetimeIndex and flip the operation from pandas import DatetimeIndex return DatetimeIndex(other) - self return -(self - other)
def wrapper(left, right, name=name, na_op=na_op): if isinstance(right, ABCDataFrame): return NotImplemented left, right = _align_method_SERIES(left, right) res_name = _get_series_op_result_name(left, right) if is_datetime64_dtype(left) or is_datetime64tz_dtype(left): result = dispatch_to_index_op(op, left, right, pd.DatetimeIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, index=left.index, name=res_name, dtype=result.dtype) elif is_categorical_dtype(left): raise TypeError("{typ} cannot perform the operation " "{op}".format(typ=type(left).__name__, op=str_rep)) lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): rvalues = rvalues.values result = safe_na_op(lvalues, rvalues) return construct_result(left, result, index=left.index, name=res_name, dtype=None)
def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer Parameters ---------- bins : list-liek of bins dtype : dtype of data Raises ------ ValueError if bins are not of a compat dtype to dtype """ bins_dtype = infer_dtype(bins) if is_timedelta64_dtype(dtype): if bins_dtype in ['timedelta', 'timedelta64']: bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: raise ValueError("bins must be of datetime64 dtype") return bins
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcut-able if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format( unit)).astype('timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) except ValueError: if errors == 'ignore': return arg else: # This else-block accounts for the cases when errors='raise' # and errors='coerce'. If errors == 'raise', these errors # should be raised. If errors == 'coerce', we shouldn't # expect any errors to be raised, since all parsing errors # cause coercion to pd.NaT. However, if an error / bug is # introduced that causes an Exception to be raised, we would # like to surface it. raise if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ closed = 'right' if right else 'left' if is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta('1ns') elif is_timedelta64_dtype(dtype): formatter = Timedelta adjust = lambda x: x - Timedelta('1ns') else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) adjust = lambda x: x - 10 ** (-precision) breaks = [formatter(b) for b in bins] labels = IntervalIndex.from_breaks(breaks, closed=closed) if right and include_lowest: # we will adjust the left hand side by precision to # account that we are all right closed v = adjust(labels[0].left) i = IntervalIndex.from_intervals( [Interval(v, labels[0].right, closed='right')]) labels = i.append(labels[1:]) return labels
def _add_delta(self, delta): """ Add a timedelta-like, DateOffset, or TimedeltaIndex-like object to self. Parameters ---------- delta : {timedelta, np.timedelta64, DateOffset, TimedelaIndex, ndarray[timedelta64]} Returns ------- result : same type as self Notes ----- The result's name is set outside of _add_delta by the calling method (__add__ or __sub__) """ from pandas.core.arrays.timedeltas import TimedeltaArrayMixin if isinstance(delta, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(delta) elif is_timedelta64_dtype(delta): if not isinstance(delta, TimedeltaArrayMixin): delta = TimedeltaArrayMixin(delta) new_values = self._add_delta_tdi(delta) else: new_values = self.astype('O') + delta tz = 'UTC' if self.tz is not None else None result = type(self)(new_values, tz=tz, freq='infer') if self.tz is not None and self.tz is not utc: result = result.tz_convert(self.tz) return result
def _maybe_cast_slice_bound(self, label, side, kind): """ If label is a string, cast it to timedelta according to resolution. Parameters ---------- label : object side : {'left', 'right'} kind : {'ix', 'loc', 'getitem'} Returns ------- label : object """ assert kind in ['ix', 'loc', 'getitem', None] if isinstance(label, compat.string_types): parsed = Timedelta(label) lbound = parsed.round(parsed.resolution) if side == 'left': return lbound else: return (lbound + to_offset(parsed.resolution) - Timedelta(1, 'ns')) elif ((is_integer(label) or is_float(label)) and not is_timedelta64_dtype(label)): self._invalid_indexer('slice', label) return label
def __add__(self, other): from pandas.core.index import Index from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset other = lib.item_from_zerodim(other) if isinstance(other, ABCSeries): return NotImplemented elif is_timedelta64_dtype(other): return self._add_delta(other) elif isinstance(other, (DateOffset, timedelta)): return self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects return self._add_offset_array(other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if hasattr(other, '_add_delta'): return other._add_delta(self) raise TypeError("cannot add TimedeltaIndex and {typ}" .format(typ=type(other))) elif is_integer(other): return self.shift(other) elif isinstance(other, (datetime, np.datetime64)): return self._add_datelike(other) elif isinstance(other, Index): return self._add_datelike(other) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") else: # pragma: no cover return NotImplemented
def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): if is_integer_dtype(other): return other elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically # but ufunc may pass integer to _add_delta return other # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr))
def __sub__(self, other): from pandas.core.index import Index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.tseries.offsets import DateOffset if is_timedelta64_dtype(other): return self._add_delta(-other) elif isinstance(self, TimedeltaIndex) and isinstance(other, Index): if not isinstance(other, TimedeltaIndex): raise TypeError("cannot subtract TimedeltaIndex and {typ}" .format(typ=type(other).__name__)) return self._add_delta(-other) elif isinstance(other, DatetimeIndex): return self._sub_datelike(other) elif isinstance(other, Index): raise TypeError("cannot subtract {typ1} and {typ2}" .format(typ1=type(self).__name__, typ2=type(other).__name__)) elif isinstance(other, (DateOffset, timedelta)): return self._add_delta(-other) elif is_integer(other): return self.shift(-other) elif isinstance(other, (datetime, np.datetime64)): return self._sub_datelike(other) elif isinstance(other, Period): return self._sub_period(other) else: # pragma: no cover return NotImplemented
def _add_delta(self, other): """ Add a timedelta-like, Tick, or TimedeltaIndex-like object to self. Parameters ---------- other : {timedelta, np.timedelta64, Tick, TimedeltaIndex, ndarray[timedelta64]} Returns ------- result : same type as self """ if not isinstance(self.freq, Tick): # We cannot add timedelta-like to non-tick PeriodArray raise IncompatibleFrequency("Input has different freq from " "{cls}(freq={freqstr})" .format(cls=type(self).__name__, freqstr=self.freqstr)) # TODO: standardize across datetimelike subclasses whether to return # i8 view or _shallow_copy if isinstance(other, (Tick, timedelta, np.timedelta64)): new_values = self._add_delta_td(other) return self._shallow_copy(new_values) elif is_timedelta64_dtype(other): # ndarray[timedelta64] or TimedeltaArray/index new_values = self._add_delta_tdi(other) return self._shallow_copy(new_values) else: # pragma: no cover raise TypeError(type(other).__name__)
def _addsub_int_array(self, other, op): """ Add or subtract array-like of integers equivalent to applying `_time_shift` pointwise. Parameters ---------- other : Index, ExtensionArray, np.ndarray integer-dtype op : {operator.add, operator.sub} Returns ------- result : same class as self """ # _addsub_int_array is overriden by PeriodArray assert not is_period_dtype(self) assert op in [operator.add, operator.sub] if self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") elif isinstance(self.freq, Tick): # easy case where we can convert to timedelta64 operation td = Timedelta(self.freq) return op(self, td * other) # We should only get here with DatetimeIndex; dispatch # to _addsub_offset_array assert not is_timedelta64_dtype(self) return op(self, np.array(other) * self.freq)
def astype(self, dtype, copy=True): # We handle # --> timedelta64[ns] # --> timedelta64 # DatetimeLikeArrayMixin super call handles other cases dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): # by pandas convention, converting to non-nano timedelta64 # returns an int64-dtyped array with ints representing multiples # of the desired timedelta unit. This is essentially division if self._hasnans: # avoid double-copying result = self._data.astype(dtype, copy=False) values = self._maybe_mask_results(result, fill_value=None, convert='float64') return values result = self._data.astype(dtype, copy=copy) return result.astype('i8') elif is_timedelta64_ns_dtype(dtype): if copy: return self.copy() return self return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)
def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, periods=None, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): if isinstance(data, TimedeltaIndex) and freq is None and name is None: if copy: return data.copy() else: return data._shallow_copy() freq, freq_infer = dtl.maybe_infer_freq(freq) if data is None: # TODO: Remove this block and associated kwargs; GH#20535 result = cls._generate_range(start, end, periods, freq, closed=closed) result.name = name return result if unit is not None: data = to_timedelta(data, unit=unit, box=False) if is_scalar(data): raise ValueError('TimedeltaIndex() must be called with a ' 'collection of some kind, {data} was passed' .format(data=repr(data))) # convert if not already if getattr(data, 'dtype', None) != _TD_DTYPE: data = to_timedelta(data, unit=unit, box=False) elif copy: data = np.array(data, copy=True) data = np.array(data, copy=False) if data.dtype == np.object_: data = array_to_timedelta64(data) if data.dtype != _TD_DTYPE: if is_timedelta64_dtype(data): # non-nano unit # TODO: watch out for overflows data = data.astype(_TD_DTYPE) else: data = ensure_int64(data).view(_TD_DTYPE) assert data.dtype == 'm8[ns]', data.dtype subarr = cls._simple_new(data, name=name, freq=freq) # check that we are matching freqs if verify_integrity and len(subarr) > 0: if freq is not None and not freq_infer: cls._validate_frequency(subarr, freq) if freq_infer: inferred = subarr.inferred_freq if inferred: subarr.freq = to_offset(inferred) return subarr
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_integer_dtype(other) and self.freq is None: # GH#19123 raise NullFrequencyError("Cannot shift with no freq") elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
def test_numpy_array_all_dtypes(any_numpy_dtype): ser = pd.Series(dtype=any_numpy_dtype) result = ser.array if is_datetime64_dtype(any_numpy_dtype): assert isinstance(result, DatetimeArray) elif is_timedelta64_dtype(any_numpy_dtype): assert isinstance(result, TimedeltaArray) else: assert isinstance(result, PandasArray)
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ["transform", "aggregate"] orig_values = values # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values) or is_sparse(values): raise NotImplementedError("{} dtype not supported".format( values.dtype)) elif is_datetime64_any_dtype(values): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( "datetime64 type does not support {} operations".format( how)) elif is_timedelta64_dtype(values): if how in ["prod", "cumprod"]: raise NotImplementedError( "timedelta64 type does not support {} operations".format( how)) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? kludge can be avoided when # 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: try: values = ensure_float64(values) except TypeError: if lib.infer_dtype(values, skipna=False) == "complex": values = values.astype(complex) else: raise func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = "{kind}{itemsize}".format( kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = "object" labels, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate( result, counts, values, labels, func, is_numeric, is_datetimelike, min_count, ) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype): result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) return result, names
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. if is_bool_array(vals): vals = vals.astype('u8') elif (is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)): vals = vals.view('i8').astype('u8', copy=False) elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = _hash.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = _hash.hash_object_array( vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals
def test_is_timedelta64_dtype(): assert not com.is_timedelta64_dtype(object) assert not com.is_timedelta64_dtype(None) assert not com.is_timedelta64_dtype([1, 2, 3]) assert not com.is_timedelta64_dtype(np.array([], dtype=np.datetime64)) assert not com.is_timedelta64_dtype("0 days") assert not com.is_timedelta64_dtype("0 days 00:00:00") assert not com.is_timedelta64_dtype(["0 days 00:00:00"]) assert not com.is_timedelta64_dtype("NO DATE") assert com.is_timedelta64_dtype(np.timedelta64) assert com.is_timedelta64_dtype(pd.Series([], dtype="timedelta64[ns]")) assert com.is_timedelta64_dtype(pd.to_timedelta(["0 days", "1 days"]))
def __floordiv__(self, other): if is_scalar(other): if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # dispatch to Timedelta implementation result = other.__rfloordiv__(self._data) return result # at this point we should only have numeric scalars; anything # else will raise result = self.asi8 // other result[self._isnan] = iNaT freq = None if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other if freq.nanos == 0 and self.freq.nanos != 0: # e.g. if self.freq is Nano(1) then dividing by 2 # rounds down to zero freq = None return type(self)(result.view("m8[ns]"), freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") elif is_timedelta64_dtype(other.dtype): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): result = result.astype(np.float64) result[mask] = np.nan return result elif is_object_dtype(other.dtype): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) if lib.infer_dtype(result, skipna=False) == "timedelta": result, _ = sequence_to_td64ns(result) return type(self)(result) return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): result = self._data // other return type(self)(result) else: dtype = getattr(other, "dtype", type(other).__name__) raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
def __truediv__(self, other): # timedelta / X is well-defined for timedelta-like or numeric X if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # specifically timedelta64-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # otherwise, dispatch to Timedelta implementation return self._ndarray / other elif lib.is_scalar(other): # assume it is numeric result = self._ndarray / other freq = None if self.freq is not None: # Tick division is not implemented, so operate on Timedelta freq = self.freq.delta / other return type(self)(result, freq=freq) if not hasattr(other, "dtype"): # e.g. list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide vectors with unequal lengths") elif is_timedelta64_dtype(other.dtype): # let numpy handle it return self._ndarray / other elif is_object_dtype(other.dtype): # We operate on raveled arrays to avoid problems in inference # on NaT srav = self.ravel() orav = other.ravel() result = [srav[n] / orav[n] for n in range(len(srav))] result = np.array(result).reshape(self.shape) # We need to do dtype inference in order to keep DataFrame ops # behavior consistent with Series behavior inferred = lib.infer_dtype(result, skipna=False) if inferred == "timedelta": flat = result.ravel() result = type(self)._from_sequence(flat).reshape(result.shape) elif inferred == "floating": result = result.astype(float) elif inferred == "datetime": # GH#39750 this occurs when result is all-NaT, in which case # we want to interpret these NaTs as td64. # We construct an all-td64NaT result. result = self * np.nan return result else: result = self._ndarray / other return type(self)(result)
def __floordiv__(self, other): if is_scalar(other): if isinstance(other, self._recognized_scalars): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # dispatch to Timedelta implementation result = other.__rfloordiv__(self._ndarray) return result # at this point we should only have numeric scalars; anything # else will raise result = self._ndarray // other freq = None if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other if freq.nanos == 0 and self.freq.nanos != 0: # e.g. if self.freq is Nano(1) then dividing by 2 # rounds down to zero freq = None return type(self)(result, freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") elif is_timedelta64_dtype(other.dtype): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): result = result.astype(np.float64) np.putmask(result, mask, np.nan) return result elif is_object_dtype(other.dtype): # error: Incompatible types in assignment (expression has type # "List[Any]", variable has type "ndarray") srav = self.ravel() orav = other.ravel() res_list = [srav[n] // orav[n] for n in range(len(srav))] result_flat = np.asarray(res_list) inferred = lib.infer_dtype(result_flat, skipna=False) result = result_flat.reshape(self.shape) if inferred == "timedelta": result, _ = sequence_to_td64ns(result) return type(self)(result) if inferred == "datetime": # GH#39750 occurs when result is all-NaT, which in this # case should be interpreted as td64nat. This can only # occur when self is all-td64nat return self * np.nan return result elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype): result = self._ndarray // other return type(self)(result) else: dtype = getattr(other, "dtype", type(other).__name__) raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
def __init__( self, index: Index, grouper=None, obj: Optional[FrameOrSeries] = None, name=None, level=None, sort: bool = True, observed: bool = False, in_axis: bool = False, dropna: bool = True, ): self.name = name self.level = level self.grouper = _convert_grouper(index, grouper) self.all_grouper = None self.index = index self.sort = sort self.obj = obj self.observed = observed self.in_axis = in_axis self.dropna = dropna # right place for this? if isinstance(grouper, (Series, Index)) and name is None: self.name = grouper.name if isinstance(grouper, MultiIndex): self.grouper = grouper._values # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level if level is not None: if not isinstance(level, int): if level not in index.names: raise AssertionError(f"Level {level} not in index") level = index.names.index(level) if self.name is None: self.name = index.names[level] ( self.grouper, self._codes, self._group_index, ) = index._get_grouper_for_level(self.grouper, level) # a passed Grouper like, directly get the grouper in the same way # as single grouper groupby, use the group_info to get codes elif isinstance(self.grouper, Grouper): # get the new grouper; we already have disambiguated # what key/level refer to exactly, don't need to # check again as we have by this point converted these # to an actual value (rather than a pd.Grouper) _, grouper, _ = self.grouper._get_grouper(self.obj, validate=False) if self.name is None: self.name = grouper.result_index.name self.obj = self.grouper.obj self.grouper = grouper._get_grouper() else: if self.grouper is None and self.name is not None and self.obj is not None: self.grouper = self.obj[self.name] elif isinstance(self.grouper, (list, tuple)): self.grouper = com.asarray_tuplesafe(self.grouper) # a passed Categorical elif is_categorical_dtype(self.grouper): self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes self._codes = self.grouper.codes if observed: codes = algorithms.unique1d(self.grouper.codes) codes = codes[codes != -1] if sort or self.grouper.ordered: codes = np.sort(codes) else: codes = np.arange(len(categories)) self._group_index = CategoricalIndex( Categorical.from_codes(codes=codes, categories=categories, ordered=self.grouper.ordered), name=self.name, ) # we are done if isinstance(self.grouper, Grouping): self.grouper = self.grouper.grouper # no level passed elif not isinstance(self.grouper, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): grper = pprint_thing(self.grouper) errmsg = ("Grouper result violates len(labels) == " f"len(data)\nresult: {grper}") self.grouper = None # Try for sanity raise AssertionError(errmsg) # if we have a date/time-like grouper, make sure that we have # Timestamps like if getattr(self.grouper, "dtype", None) is not None: if is_datetime64_dtype(self.grouper): self.grouper = self.grouper.astype("datetime64[ns]") elif is_timedelta64_dtype(self.grouper): self.grouper = self.grouper.astype("timedelta64[ns]")
def _arith_method(self, other, op): """ Parameters ---------- other : Any op : callable that accepts 2 params perform the binary op """ if isinstance(other, ABCTimedeltaIndex): # Defer to TimedeltaIndex implementation return NotImplemented elif isinstance(other, (timedelta, np.timedelta64)): # GH#19333 is_integer evaluated True on timedelta64, # so we need to catch these explicitly return op(self._int64index, other) elif is_timedelta64_dtype(other): # Must be an np.ndarray; GH#22390 return op(self._int64index, other) if op in [ operator.pow, ops.rpow, operator.mod, ops.rmod, ops.rfloordiv, divmod, ops.rdivmod, ]: return op(self._int64index, other) step = False if op in [operator.mul, ops.rmul, operator.truediv, ops.rtruediv]: step = op other = extract_array(other, extract_numpy=True) attrs = self._get_attributes_dict() left, right = self, other try: # apply if we have an override if step: with np.errstate(all="ignore"): rstep = step(left.step, right) # we don't have a representable op # so return a base index if not is_integer(rstep) or not rstep: raise ValueError else: rstep = left.step with np.errstate(all="ignore"): rstart = op(left.start, right) rstop = op(left.stop, right) result = type(self)(rstart, rstop, rstep, **attrs) # for compat with numpy / Int64Index # even if we can represent as a RangeIndex, return # as a Float64Index if we have float-like descriptors if not all(is_integer(x) for x in [rstart, rstop, rstep]): result = result.astype("float64") return result except (ValueError, TypeError, ZeroDivisionError): # Defer to Int64Index implementation return op(self._int64index, other)
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values.dtype): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations") elif is_timedelta64_dtype(values.dtype): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations") if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? # TODO(EA2D):kludge can be avoided when 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups, ) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, codes, func, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( orig_values.dtype): # We need to use the constructors directly for these dtypes # since numpy won't recognize them # https://github.com/pandas-dev/pandas/issues/31471 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) if is_extension_array_dtype(orig_values.dtype): result = maybe_cast_result(result=result, obj=orig_values, how=how) return result, names
def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): upcast_cls = "object" elif is_datetime64_dtype(dtype): upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], tslibs.NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), tslibs.iNaT elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ Can we compare values of the given dtype to our own? """ return is_timedelta64_dtype(dtype)
def _convert_from_pandas(self, pdf: "PandasDataFrameLike", schema: Union[StructType, str, List[str]], timezone: str) -> List: """ Convert a pandas.DataFrame to list of records that can be used to make a DataFrame Returns ------- list list of records """ import pandas as pd from pyspark.sql import SparkSession assert isinstance(self, SparkSession) if timezone is not None: from pyspark.sql.pandas.types import _check_series_convert_timestamps_tz_local from pandas.core.dtypes.common import is_datetime64tz_dtype, is_timedelta64_dtype copied = False if isinstance(schema, StructType): for field in schema: # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): s = _check_series_convert_timestamps_tz_local( pdf[field.name], timezone) if s is not pdf[field.name]: if not copied: # Copy once if the series is modified to prevent the original # Pandas DataFrame from being updated pdf = pdf.copy() copied = True pdf[field.name] = s else: should_localize = not is_timestamp_ntz_preferred() for column, series in pdf.iteritems(): s = series if should_localize and is_datetime64tz_dtype( s.dtype) and s.dt.tz is not None: s = _check_series_convert_timestamps_tz_local( series, timezone) if s is not series: if not copied: # Copy once if the series is modified to prevent the original # Pandas DataFrame from being updated pdf = pdf.copy() copied = True pdf[column] = s for column, series in pdf.iteritems(): if is_timedelta64_dtype(series): if not copied: pdf = pdf.copy() copied = True # Explicitly set the timedelta as object so the output of numpy records can # hold the timedelta instances as are. Otherwise, it converts to the internal # numeric values. ser = pdf[column] pdf[column] = pd.Series(ser.dt.to_pytimedelta(), index=ser.index, dtype="object", name=ser.name) # Convert pandas.DataFrame to list of numpy records np_records = pdf.to_records(index=False) # Check if any columns need to be fixed for Spark to infer properly if len(np_records) > 0: record_dtype = self._get_numpy_record_dtype(np_records[0]) if record_dtype is not None: return [r.astype(record_dtype).tolist() for r in np_records] # Convert list of numpy records to python lists return [r.tolist() for r in np_records]
def _cython_operation(self, kind, values, how, axis, min_count=-1, **kwargs): assert kind in ['transform', 'aggregate'] # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values): raise NotImplementedError( "categoricals are not support in cython ops ATM") elif is_datetime64_any_dtype(values): if how in ['add', 'prod', 'cumsum', 'cumprod']: raise NotImplementedError( "datetime64 type does not support {} " "operations".format(how)) elif is_timedelta64_dtype(values): if how in ['prod', 'cumprod']: raise NotImplementedError( "timedelta64 type does not support {} " "operations".format(how)) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True values = values.swapaxes(0, axis) if arity > 1: raise NotImplementedError("arity of more than 1 is not " "supported for the 'how' argument") out_shape = (self.ngroups, ) + values.shape[1:] is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view('int64') is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_float64(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int64_or_float64(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) try: func = self._get_cython_function(kind, how, values, is_numeric) except NotImplementedError: if is_numeric: values = ensure_float64(values) func = self._get_cython_function(kind, how, values, is_numeric) else: raise if how == 'rank': out_dtype = 'float' else: if is_numeric: out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) else: out_dtype = 'object' labels, _, _ = self.group_info if kind == 'aggregate': result = _maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, labels, func, is_numeric, is_datetimelike, min_count) elif kind == 'transform': result = _maybe_fill(np.empty_like(values, dtype=out_dtype), fill_value=np.nan) # TODO: min_count result = self._transform(result, values, labels, func, is_numeric, is_datetimelike, **kwargs) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype('float64') result[mask] = np.nan if (kind == 'aggregate' and self._filter_empty_groups and not counts.all()): if result.ndim == 2: try: result = lib.row_bool_subset(result, (counts > 0).view(np.uint8)) except ValueError: result = lib.row_bool_subset_object( ensure_object(result), (counts > 0).view(np.uint8)) else: result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if how in self._name_functions: # TODO names = self._name_functions[how]() else: names = None if swapped: result = result.swapaxes(0, axis) return result, names
def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): """ Parameters ---------- data : list-like copy : bool, default False unit : str, optional The timedelta unit to treat integers as multiples of. For numeric data this defaults to ``'ns'``. Must be un-specified if the data contains a str and ``errors=="raise"``. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None if unit is not None: unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, "dtype"): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data elif isinstance(data, IntegerArray): data = data.to_numpy("int64", na_value=tslibs.iNaT) # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int mask = np.isnan(data) m, p = precision_from_unit(unit or "ns") base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]") data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != TD64NS_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(TD64NS_DTYPE) copy = False else: # This includes datetime64-dtype, see GH#23539, GH#29794 raise TypeError( f"dtype {data.dtype} cannot be converted to timedelta64[ns]") data = np.array(data, copy=copy) assert data.dtype == "m8[ns]", data return data, inferred_freq
def toPandas(self) -> "PandasDataFrameLike": """ Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``. This is only available if Pandas is installed and available. .. versionadded:: 1.3.0 Notes ----- This method should only be used if the resulting Pandas ``pandas.DataFrame`` is expected to be small, as all the data is loaded into the driver's memory. Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental. Examples -------- >>> df.toPandas() # doctest: +SKIP age name 0 2 Alice 1 5 Bob """ from pyspark.sql.dataframe import DataFrame assert isinstance(self, DataFrame) from pyspark.sql.pandas.utils import require_minimum_pandas_version require_minimum_pandas_version() import numpy as np import pandas as pd from pandas.core.dtypes.common import is_timedelta64_dtype jconf = self.sparkSession._jconf timezone = jconf.sessionLocalTimeZone() if jconf.arrowPySparkEnabled(): use_arrow = True try: from pyspark.sql.pandas.types import to_arrow_schema from pyspark.sql.pandas.utils import require_minimum_pyarrow_version require_minimum_pyarrow_version() to_arrow_schema(self.schema) except Exception as e: if jconf.arrowPySparkFallbackEnabled(): msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true; however, " "failed by the reason below:\n %s\n" "Attempting non-optimization as " "'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to " "true." % str(e)) warn(msg) use_arrow = False else: msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and will not continue because automatic fallback " "with 'spark.sql.execution.arrow.pyspark.fallback.enabled' has been set to " "false.\n %s" % str(e)) warn(msg) raise # Try to use Arrow optimization when the schema is supported and the required version # of PyArrow is found, if 'spark.sql.execution.arrow.pyspark.enabled' is enabled. if use_arrow: try: from pyspark.sql.pandas.types import ( _check_series_localize_timestamps, _convert_map_items_to_dict, ) import pyarrow # Rename columns to avoid duplicated column names. tmp_column_names = [ "col_{}".format(i) for i in range(len(self.columns)) ] self_destruct = jconf.arrowPySparkSelfDestructEnabled() batches = self.toDF(*tmp_column_names)._collect_as_arrow( split_batches=self_destruct) if len(batches) > 0: table = pyarrow.Table.from_batches(batches) # Ensure only the table has a reference to the batches, so that # self_destruct (if enabled) is effective del batches # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type # values, but we should use datetime.date to match the behavior with when # Arrow optimization is disabled. pandas_options = {"date_as_object": True} if self_destruct: # Configure PyArrow to use as little memory as possible: # self_destruct - free columns as they are converted # split_blocks - create a separate Pandas block for each column # use_threads - convert one column at a time pandas_options.update({ "self_destruct": True, "split_blocks": True, "use_threads": False, }) pdf = table.to_pandas(**pandas_options) # Rename back to the original column names. pdf.columns = self.columns for field in self.schema: if isinstance(field.dataType, TimestampType): pdf[field. name] = _check_series_localize_timestamps( pdf[field.name], timezone) elif isinstance(field.dataType, MapType): pdf[field.name] = _convert_map_items_to_dict( pdf[field.name]) return pdf else: corrected_panda_types = {} for index, field in enumerate(self.schema): pandas_type = PandasConversionMixin._to_corrected_pandas_type( field.dataType) corrected_panda_types[tmp_column_names[index]] = ( np.object0 if pandas_type is None else pandas_type) pdf = pd.DataFrame(columns=tmp_column_names).astype( dtype=corrected_panda_types) pdf.columns = self.columns return pdf except Exception as e: # We might have to allow fallback here as well but multiple Spark jobs can # be executed. So, simply fail in this case for now. msg = ( "toPandas attempted Arrow optimization because " "'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has " "reached the error below and can not continue. Note that " "'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an " "effect on failures in the middle of " "computation.\n %s" % str(e)) warn(msg) raise # Below is toPandas without Arrow optimization. pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns) column_counter = Counter(self.columns) corrected_dtypes: List[Optional[Type]] = [None] * len(self.schema) for index, field in enumerate(self.schema): # We use `iloc` to access columns with duplicate column names. if column_counter[field.name] > 1: pandas_col = pdf.iloc[:, index] else: pandas_col = pdf[field.name] pandas_type = PandasConversionMixin._to_corrected_pandas_type( field.dataType) # SPARK-21766: if an integer field is nullable and has null values, it can be # inferred by pandas as a float column. If we convert the column with NaN back # to integer type e.g., np.int16, we will hit an exception. So we use the # pandas-inferred float type, rather than the corrected type from the schema # in this case. if pandas_type is not None and not (isinstance( field.dataType, IntegralType) and field.nullable and pandas_col.isnull().any()): corrected_dtypes[index] = pandas_type # Ensure we fall back to nullable numpy types. if isinstance(field.dataType, IntegralType) and pandas_col.isnull().any(): corrected_dtypes[index] = np.float64 if isinstance(field.dataType, BooleanType) and pandas_col.isnull().any(): corrected_dtypes[ index] = np.object # type: ignore[attr-defined] df = pd.DataFrame() for index, t in enumerate(corrected_dtypes): column_name = self.schema[index].name # We use `iloc` to access columns with duplicate column names. if column_counter[column_name] > 1: series = pdf.iloc[:, index] else: series = pdf[column_name] # No need to cast for non-empty series for timedelta. The type is already correct. should_check_timedelta = is_timedelta64_dtype(t) and len(pdf) == 0 if (t is not None and not is_timedelta64_dtype(t)) or should_check_timedelta: series = series.astype(t, copy=False) with catch_warnings(): from pandas.errors import PerformanceWarning simplefilter(action="ignore", category=PerformanceWarning) # `insert` API makes copy of data, # we only do it for Series of duplicate column names. # `pdf.iloc[:, index] = pdf.iloc[:, index]...` doesn't always work # because `iloc` could return a view or a copy depending by context. if column_counter[column_name] > 1: df.insert(index, column_name, series, allow_duplicates=True) else: df[column_name] = series if timezone is None: return df else: from pyspark.sql.pandas.types import _check_series_convert_timestamps_local_tz for field in self.schema: # TODO: handle nested timestamps, such as ArrayType(TimestampType())? if isinstance(field.dataType, TimestampType): df[field.name] = _check_series_convert_timestamps_local_tz( df[field.name], timezone) return df
def infer_freq(index, warn: bool = True) -> str | None: """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed. Parameters ---------- index : DatetimeIndex or TimedeltaIndex If passed a Series will use the values of the series (NOT THE INDEX). warn : bool, default True Returns ------- str or None None if no discernible frequency. Raises ------ TypeError If the index is not datetime-like. ValueError If there are fewer than three values. Examples -------- >>> idx = pd.date_range(start='2020/12/01', end='2020/12/30', periods=30) >>> pd.infer_freq(idx) 'D' """ from pandas.core.api import ( DatetimeIndex, Float64Index, Index, Int64Index, ) if isinstance(index, ABCSeries): values = index._values if not (is_datetime64_dtype(values) or is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible dtype " f"on a Series of {index.dtype}") index = values inferer: _FrequencyInferer if not hasattr(index, "dtype"): pass elif is_period_dtype(index.dtype): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif is_timedelta64_dtype(index.dtype): # Allow TimedeltaIndex and TimedeltaArray inferer = _TimedeltaFrequencyInferer(index, warn=warn) return inferer.get_freq() if isinstance(index, Index) and not isinstance(index, DatetimeIndex): if isinstance(index, (Int64Index, Float64Index)): raise TypeError( f"cannot infer freq from a non-convertible index type {type(index)}" ) index = index._values if not isinstance(index, DatetimeIndex): index = DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq()
def __floordiv__(self, other): if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): return NotImplemented other = lib.item_from_zerodim(other) if is_scalar(other): if isinstance(other, (timedelta, np.timedelta64, Tick)): other = Timedelta(other) if other is NaT: # treat this specifically as timedelta-NaT result = np.empty(self.shape, dtype=np.float64) result.fill(np.nan) return result # dispatch to Timedelta implementation result = other.__rfloordiv__(self._data) return result # at this point we should only have numeric scalars; anything # else will raise result = self.asi8 // other result[self._isnan] = iNaT freq = None if self.freq is not None: # Note: freq gets division, not floor-division freq = self.freq / other return type(self)(result.view('m8[ns]'), freq=freq) if not hasattr(other, "dtype"): # list, tuple other = np.array(other) if len(other) != len(self): raise ValueError("Cannot divide with unequal lengths") elif is_timedelta64_dtype(other): other = type(self)(other) # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): result = result.astype(np.int64) result[mask] = np.nan return result elif is_object_dtype(other): result = [self[n] // other[n] for n in range(len(self))] result = np.array(result) if lib.infer_dtype(result, skipna=False) == 'timedelta': result, _ = sequence_to_td64ns(result) return type(self)(result) return result elif is_integer_dtype(other) or is_float_dtype(other): result = self._data // other return type(self)(result) else: dtype = getattr(other, "dtype", type(other).__name__) raise TypeError("Cannot divide {typ} by {cls}" .format(typ=dtype, cls=type(self).__name__))
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): """ Parameters ---------- array : list-like copy : bool, default False unit : str, default "ns" The timedelta unit to treat integers as multiples of. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. Returns ------- converted : numpy.ndarray The sequence converted to a numpy array with dtype ``timedelta64[ns]``. inferred_freq : Tick or None The inferred frequency of the sequence. Raises ------ ValueError : Data cannot be converted to timedelta64[ns]. Notes ----- Unlike `pandas.to_timedelta`, if setting ``errors=ignore`` will not cause errors to be ignored; they are caught and subsequently ignored at a higher level. """ inferred_freq = None unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, 'dtype'): # e.g. list, tuple if np.ndim(data) == 0: # i.e. generator data = list(data) data = np.array(data, copy=False) elif isinstance(data, ABCSeries): data = data._values elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): # no need to make a copy, need to convert if string-dtyped data = objects_to_td64ns(data, unit=unit, errors=errors) copy = False elif is_integer_dtype(data.dtype): # treat as multiples of the given unit data, copy_made = ints_to_td64ns(data, unit=unit) copy = copy and not copy_made elif is_float_dtype(data.dtype): # cast the unit, multiply base/frace separately # to avoid precision issues from float -> int mask = np.isnan(data) m, p = precision_from_unit(unit) base = data.astype(np.int64) frac = data - base if p: frac = np.round(frac, p) data = (base * m + (frac * m).astype(np.int64)).view('timedelta64[ns]') data[mask] = iNaT copy = False elif is_timedelta64_dtype(data.dtype): if data.dtype != _TD_DTYPE: # non-nano unit # TODO: watch out for overflows data = data.astype(_TD_DTYPE) copy = False elif is_datetime64_dtype(data): # GH#23539 warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " "deprecated, will raise a TypeError in a future " "version", FutureWarning, stacklevel=4) data = ensure_int64(data).view(_TD_DTYPE) else: raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]" .format(dtype=data.dtype)) data = np.array(data, copy=copy) if data.ndim != 1: raise ValueError("Only 1-dimensional input arrays are supported.") assert data.dtype == 'm8[ns]', data return data, inferred_freq
def __sub__(self, other): from pandas import Index other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datelike(other) elif is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datelike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.sub) elif isinstance(other, Index): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_categorical_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if result is NotImplemented: return NotImplemented elif not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) res_name = ops.get_op_result_name(self, other) result.name = res_name return result
def plot( trj: TrajaDataFrame, n_coords: Optional[int] = None, show_time: bool = False, accessor: Optional[traja.TrajaAccessor] = None, ax=None, **kwargs, ) -> matplotlib.collections.PathCollection: """Plot trajectory for single animal over period. Args: trj (:class:`traja.TrajaDataFrame`): trajectory n_coords (int, optional): Number of coordinates to plot show_time (bool): Show colormap as time accessor (:class:`~traja.accessor.TrajaAccessor`, optional): TrajaAccessor instance ax (:class:`~matplotlib.axes.Axes`): axes for plotting interactive (bool): show plot immediately **kwargs: additional keyword arguments to :meth:`matplotlib.axes.Axes.scatter` Returns: collection (:class:`~matplotlib.collections.PathCollection`): collection that was plotted """ import matplotlib.patches as patches from matplotlib.path import Path after_plot_args, kwargs = _get_after_plot_args(**kwargs) GRAY = "#999999" xlim = kwargs.pop("xlim", None) ylim = kwargs.pop("ylim", None) if not xlim or not ylim: xlim, ylim = traja.trajectory._get_xylim(trj) title = kwargs.pop("title", None) time_units = kwargs.pop("time_units", "s") fps = kwargs.pop("fps", None) figsize = kwargs.pop("figsize", None) coords = trj[["x", "y"]] time_col = traja.trajectory._get_time_col(trj) if time_col == "index": is_datetime = True else: is_datetime = is_datetime64_any_dtype(trj[time_col]) if time_col else False if n_coords is None: # Plot all coords start, end = 0, len(coords) verts = coords.iloc[start:end].values else: # Plot first `n_coords` verts = coords.iloc[:n_coords].values n_coords = len(verts) codes = [Path.MOVETO] + [Path.LINETO] * (len(verts) - 1) path = Path(verts, codes) if not ax: fig, ax = plt.subplots(figsize=figsize) fig.canvas.draw() patch = patches.PathPatch(path, edgecolor=GRAY, facecolor="none", lw=3, alpha=0.3) ax.add_patch(patch) xs, ys = zip(*verts) if time_col == "index": # DatetimeIndex determines color colors = [ind for ind, x in enumerate(trj.index[:n_coords])] elif time_col and time_col != "index": # `time_col` determines color colors = [ind for ind, x in enumerate(trj[time_col].iloc[:n_coords])] else: # Frame count determines color colors = trj.index[:n_coords] if time_col: # TODO: Calculate fps if not in datetime vmin = min(colors) vmax = max(colors) if is_datetime: # Show timestamps without units time_units = "" else: # Index/frame count is our only reference vmin = trj.index[0] vmax = trj.index[n_coords - 1] if not show_time: time_units = "" label = f"Time ({time_units})" if time_units else "" collection = ax.scatter( xs, ys, c=colors, s=kwargs.pop("s", 1), cmap=plt.cm.viridis, alpha=0.7, vmin=vmin, vmax=vmax, **kwargs, ) ax.set_xlim(xlim) ax.set_ylim(ylim) if kwargs.pop("invert_yaxis", None): plt.gca().invert_yaxis() _label_axes(trj, ax) ax.set_title(title) ax.set_aspect("equal") # Number of color bar ticks CBAR_TICKS = 10 if n_coords > 20 else n_coords indices = np.linspace(0, n_coords - 1, CBAR_TICKS, endpoint=True, dtype=int) cbar = plt.colorbar( collection, fraction=0.046, pad=0.04, orientation="vertical", label=label ) # Get colorbar labels from time if time_col == "index": if is_datetime64_any_dtype(trj.index): cbar_labels = ( trj.index[indices].strftime("%Y-%m-%d %H:%M:%S").values.astype(str) ) elif is_timedelta64_dtype(trj.index): if time_units in ("s", "", None): cbar_labels = [round(x, 2) for x in trj.index[indices].total_seconds()] else: logger.error("Time unit {} not yet implemented".format(time_units)) else: raise NotImplementedError( "Indexing on {} is not yet implemented".format(type(trj.index)) ) elif time_col and is_timedelta64_dtype(trj[time_col]): cbar_labels = trj[time_col].iloc[indices].dt.total_seconds().values cbar_labels = ["%.2f" % number for number in cbar_labels] elif time_col and is_datetime: cbar_labels = ( trj[time_col] .iloc[indices] .dt.strftime("%Y-%m-%d %H:%M:%S") .values.astype(str) ) else: # Convert frames to time if time_col: cbar_labels = trj[time_col].iloc[indices].values else: cbar_labels = trj.index[indices].values cbar_labels = np.round(cbar_labels, 6) if fps is not None and fps > 0 and fps != 1 and show_time: cbar_labels = cbar_labels / fps cbar.set_ticks(indices) cbar.set_ticklabels(cbar_labels) plt.tight_layout() _process_after_plot_args(**after_plot_args) return collection