def test_transform_casting(self): # 13046 data = """ idx A ID3 DATETIME 0 B-028 b76cd912ff "2014-10-08 13:43:27" 1 B-054 4a57ed0b02 "2014-10-08 14:26:19" 2 B-076 1a682034f8 "2014-10-08 14:29:01" 3 B-023 b76cd912ff "2014-10-08 18:39:34" 4 B-023 f88g8d7sds "2014-10-08 18:40:18" 5 B-033 b76cd912ff "2014-10-08 18:44:30" 6 B-032 b76cd912ff "2014-10-08 18:46:00" 7 B-037 b76cd912ff "2014-10-08 18:52:15" 8 B-046 db959faf02 "2014-10-08 18:59:59" 9 B-053 b76cd912ff "2014-10-08 19:17:48" 10 B-065 b76cd912ff "2014-10-08 19:21:38" """ df = pd.read_csv(StringIO(data), sep='\s+', index_col=[0], parse_dates=['DATETIME']) result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff()) assert is_timedelta64_dtype(result.dtype) result = df[['ID3', 'DATETIME']].groupby('ID3').transform( lambda x: x.diff()) assert is_timedelta64_dtype(result.DATETIME.dtype)
def __init__(self, left, right, name, na_op): super(_TimeOp, self).__init__(left, right, name, na_op) lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) # left self.is_offset_lhs = self._is_offset(left) self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) self.is_datetime64_lhs = is_datetime64_dtype(lvalues) self.is_datetime64tz_lhs = is_datetime64tz_dtype(lvalues) self.is_datetime_lhs = (self.is_datetime64_lhs or self.is_datetime64tz_lhs) self.is_integer_lhs = left.dtype.kind in ['i', 'u'] self.is_floating_lhs = left.dtype.kind == 'f' # right self.is_offset_rhs = self._is_offset(right) self.is_datetime64_rhs = is_datetime64_dtype(rvalues) self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) self.is_datetime_rhs = (self.is_datetime64_rhs or self.is_datetime64tz_rhs) self.is_timedelta_rhs = is_timedelta64_dtype(rvalues) self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') self.is_floating_rhs = rvalues.dtype.kind == 'f' self._validate(lvalues, rvalues, name) self.lvalues, self.rvalues = self._convert_for_datetime(lvalues, rvalues)
def test_is_timedelta(self): self.assertTrue(is_timedelta64_dtype('timedelta64')) self.assertTrue(is_timedelta64_dtype('timedelta64[ns]')) self.assertFalse(is_timedelta64_ns_dtype('timedelta64')) self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]')) tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64') self.assertTrue(is_timedelta64_dtype(tdi)) self.assertTrue(is_timedelta64_ns_dtype(tdi)) self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]'))) # Conversion to Int64Index: self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64'))) self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')))
def get_op(cls, left, right, name, na_op): """ Get op dispatcher, returns _Op or _TimeOp. If ``left`` and ``right`` are appropriate for datetime arithmetic with operation ``name``, processes them and returns a ``_TimeOp`` object that stores all the required values. Otherwise, it will generate either a ``_Op``, indicating that the operation is performed via normal numpy path. """ is_timedelta_lhs = is_timedelta64_dtype(left) is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): # avoid repated alignment if not left.index.equals(right.index): left, right = left.align(right, copy=False) index, lidx, ridx = left.index.join(right.index, how='outer', return_indexers=True) # if DatetimeIndex have different tz, convert to UTC left.index = index right.index = index if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op)
def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): if is_integer_dtype(other): return other elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos elif is_integer(other): # integer is passed to .shift via # _add_datetimelike_methods basically # but ufunc may pass integer to _add_delta return other # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr))
def _format_label(x, precision=3, dtype=None): fmt_str = "%%.%dg" % precision if is_datetime64_dtype(dtype): return to_datetime(x, unit="ns") if is_timedelta64_dtype(dtype): return to_timedelta(x, unit="ns") if np.isinf(x): return str(x) elif is_float(x): frac, whole = np.modf(x) sgn = "-" if x < 0 else "" whole = abs(whole) if frac != 0.0: val = fmt_str % frac # rounded up or down if "." not in val: if x < 0: return "%d" % (-whole - 1) else: return "%d" % (whole + 1) if "e" in val: return _trim_zeros(fmt_str % x) else: val = _trim_zeros(val) if "." in val: return sgn + ".".join(("%d" % whole, val.split(".")[1])) else: # pragma: no cover return sgn + ".".join(("%d" % whole, val)) else: return sgn + "%0.f" % whole else: return str(x)
def _format_labels(bins, precision, right=True, include_lowest=False, dtype=None): """ based on the dtype, return our labels """ closed = 'right' if right else 'left' if is_datetime64_dtype(dtype): formatter = Timestamp adjust = lambda x: x - Timedelta('1ns') elif is_timedelta64_dtype(dtype): formatter = Timedelta adjust = lambda x: x - Timedelta('1ns') else: precision = _infer_precision(precision, bins) formatter = lambda x: _round_frac(x, precision) adjust = lambda x: x - 10**(-precision) breaks = [formatter(b) for b in bins] labels = IntervalIndex.from_breaks(breaks, closed=closed) if right and include_lowest: # we will adjust the left hand side by precision to # account that we are all right closed v = adjust(labels[0].left) i = IntervalIndex.from_intervals( [Interval(v, labels[0].right, closed='right')]) labels = i.append(labels[1:]) return labels
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcut-able if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format( unit)).astype('timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) except ValueError: if errors == 'ignore': return arg else: # This else-block accounts for the cases when errors='raise' # and errors='coerce'. If errors == 'raise', these errors # should be raised. If errors == 'coerce', we shouldn't # expect any errors to be raised, since all parsing errors # cause coercion to pd.NaT. However, if an error / bug is # introduced that causes an Exception to be raised, we would # like to surface it. raise if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcut-able if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format(unit)).astype( 'timedelta64[ns]', copy=False) else: try: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) except ValueError: if errors == 'ignore': return arg else: # This else-block accounts for the cases when errors='raise' # and errors='coerce'. If errors == 'raise', these errors # should be raised. If errors == 'coerce', we shouldn't # expect any errors to be raised, since all parsing errors # cause coercion to pd.NaT. However, if an error / bug is # introduced that causes an Exception to be raised, we would # like to surface it. raise if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def _convert_bin_to_numeric_type(bins, dtype): """ if the passed bin is of datetime/timedelta type, this method converts it to integer Parameters ---------- bins : list-liek of bins dtype : dtype of data Raises ------ ValueError if bins are not of a compat dtype to dtype """ bins_dtype = infer_dtype(bins) if is_timedelta64_dtype(dtype): if bins_dtype in ['timedelta', 'timedelta64']: bins = to_timedelta(bins).view(np.int64) else: raise ValueError("bins must be of timedelta64 dtype") elif is_datetime64_dtype(dtype): if bins_dtype in ['datetime', 'datetime64']: bins = to_datetime(bins).view(np.int64) else: raise ValueError("bins must be of datetime64 dtype") return bins
def _hashtable_algo(f, values, return_dtype=None): """ f(HashTable, type_caster) -> result """ dtype = values.dtype if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) elif is_signed_integer_dtype(dtype): return f(htable.Int64HashTable, _ensure_int64) elif is_unsigned_integer_dtype(dtype): return f(htable.UInt64HashTable, _ensure_uint64) elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) # its cheaper to use a String Hash Table than Object if lib.infer_dtype(values) in ['string']: return f(htable.StringHashTable, _ensure_object) # use Object return f(htable.PyObjectHashTable, _ensure_object)
def _maybe_convert_timedelta(self, other): if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if nanos % offset_nanos == 0: return nanos // offset_nanos elif isinstance(other, offsets.DateOffset): freqstr = other.rule_code base = frequencies.get_base_alias(freqstr) if base == self.freq.rule_code: return other.n msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) elif isinstance(other, np.ndarray): if is_integer_dtype(other): return other elif is_timedelta64_dtype(other): offset = frequencies.to_offset(self.freq) if isinstance(offset, offsets.Tick): nanos = tslib._delta_to_nanoseconds(other) offset_nanos = tslib._delta_to_nanoseconds(offset) if (nanos % offset_nanos).all() == 0: return nanos // offset_nanos # raise when input doesn't have freq msg = "Input has different freq from PeriodIndex(freq={0})" raise IncompatibleFrequency(msg.format(self.freqstr))
def _format_label(x, precision=3, dtype=None): fmt_str = '%%.%dg' % precision if is_datetime64_dtype(dtype): return to_datetime(x, unit='ns') if is_timedelta64_dtype(dtype): return to_timedelta(x, unit='ns') if np.isinf(x): return str(x) elif is_float(x): frac, whole = np.modf(x) sgn = '-' if x < 0 else '' whole = abs(whole) if frac != 0.0: val = fmt_str % frac # rounded up or down if '.' not in val: if x < 0: return '%d' % (-whole - 1) else: return '%d' % (whole + 1) if 'e' in val: return _trim_zeros(fmt_str % x) else: val = _trim_zeros(val) if '.' in val: return sgn + '.'.join(('%d' % whole, val.split('.')[1])) else: # pragma: no cover return sgn + '.'.join(('%d' % whole, val)) else: return sgn + '%0.f' % whole else: return str(x)
def nansum(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask) return _wrap_results(the_sum, dtype)
def maybe_to_datetimelike(data, copy=False): """ return a DelegatedClass of a Series that is datetimelike (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods) raise TypeError if this is not possible. Parameters ---------- data : Series copy : boolean, default False copy the input data Returns ------- DelegatedClass """ from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data))) index = data.index name = data.name orig = data if is_categorical_dtype(data) else None if orig is not None: data = orig.values.categories if is_datetime64_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) elif is_datetime64tz_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'), index, data.name, orig=orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) else: if is_period_arraylike(data): return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name, orig=orig) if is_datetime_arraylike(data): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) raise TypeError("cannot convert an object of type {0} to a " "datetimelike index".format(type(data)))
def infer_freq(index, warn=True): """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed. Parameters ---------- index : DatetimeIndex or TimedeltaIndex if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- freq : string or None None if no discernible frequency TypeError if the index is not datetime-like ValueError if there are less than three values. """ import pandas as pd if isinstance(index, ABCSeries): values = index._values if not (is_datetime64_dtype(values) or is_timedelta64_dtype(values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible " "dtype on a Series of {0}".format(index.dtype)) index = values if is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif isinstance(index, pd.TimedeltaIndex): inferer = _TimedeltaFrequencyInferer(index, warn=warn) return inferer.get_freq() if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError("cannot infer freq from a non-convertible index " "type {0}".format(type(index))) index = index.values if not isinstance(index, pd.DatetimeIndex): try: index = pd.DatetimeIndex(index) except AmbiguousTimeError: index = pd.DatetimeIndex(index.asi8) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq()
def isin(comps, values): """ Compute the isin boolean array Parameters ---------- comps: array-like values: array-like Returns ------- boolean array same length as comps """ if not is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) if not isinstance(values, np.ndarray): values = list(values) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: f = lambda x, y: np.in1d(x, np.asarray(list(y))) else: f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing if is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') elif is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') elif is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) return f(comps, values)
def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ if is_float_dtype(dtype): return f(htable.Float64HashTable, _ensure_float64) elif is_integer_dtype(dtype): return f(htable.Int64HashTable, _ensure_int64) elif is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) elif is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, _ensure_int64).view(return_dtype) else: return f(htable.PyObjectHashTable, _ensure_object)
def _coerce_to_type(x): """ if the passed data is of datetime/timedelta type, this method converts it to integer so that cut method can handle it """ dtype = None if is_timedelta64_dtype(x): x = to_timedelta(x).view(np.int64) dtype = np.timedelta64 elif is_datetime64_dtype(x): x = to_datetime(x).view(np.int64) dtype = np.datetime64 return x, dtype
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or is_period_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_timedelta64_ns_dtype(dtype): if copy is True: return self.copy() return self elif is_timedelta64_dtype(dtype): # return an index (essentially this is division) result = self.values.astype(dtype, copy=copy) if self.hasnans: return Index(self._maybe_mask_results(result, convert="float64"), name=self.name) return Index(result.astype("i8"), name=self.name) elif is_integer_dtype(dtype): return Index(self.values.astype("i8", copy=copy), dtype="i8", name=self.name) raise ValueError("Cannot cast TimedeltaIndex to dtype %s" % dtype)
def f(values, axis=None, skipna=True, **kwds): if len(self.kwargs) > 0: for k, v in compat.iteritems(self.kwargs): if k not in kwds: kwds[k] = v try: if self.zero_value is not None and values.size == 0: if values.ndim == 1: # wrap the 0's if needed if is_timedelta64_dtype(values): return lib.Timedelta(0) return 0 else: result_shape = (values.shape[:axis] + values.shape[axis + 1:]) result = np.empty(result_shape) result.fill(0) return result if (_USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name)): result = bn_func(values, axis=axis, **kwds) # prefer to treat inf/-inf as NA, but must compute the func # twice :( if _has_infs(result): result = alt(values, axis=axis, skipna=skipna, **kwds) else: result = alt(values, axis=axis, skipna=skipna, **kwds) except Exception: try: result = alt(values, axis=axis, skipna=skipna, **kwds) except ValueError as e: # we want to transform an object array # ValueError message to the more typical TypeError # e.g. this is normally a disallowed function on # object arrays that contain strings if is_object_dtype(values): raise TypeError(e) raise return result
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't # ask numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we # can manage it. if is_bool_array(vals): vals = vals.astype('u8') elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals) or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object # dtypes, then hash and rename categories. We allow skipping the # categorization when the values are known/likely to be unique. if categorize: codes, categories = pd.factorize(vals, sort=False) cat = pd.Categorical(codes, pd.Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) vals = hash_object_array(vals, hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals
def _wrap_results(result, dtype): """ wrap our results if needed """ if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): result = lib.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = lib.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result
def _convert_listlike(arg, box, unit, name=None): if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcutable if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format( unit)).astype('timedelta64[ns]', copy=False) else: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def _convert_listlike(arg, box, unit, name=None): if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'): arg = np.array(list(arg), dtype='O') # these are shortcutable if is_timedelta64_dtype(arg): value = arg.astype('timedelta64[ns]') elif is_integer_dtype(arg): value = arg.astype('timedelta64[{0}]'.format(unit)).astype( 'timedelta64[ns]', copy=False) else: value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, errors=errors) value = value.astype('timedelta64[ns]', copy=False) if box: from pandas import TimedeltaIndex value = TimedeltaIndex(value, unit='ns', name=name) return value
def astype(self, dtype, copy=True): dtype = np.dtype(dtype) if is_object_dtype(dtype): return self.asobject elif is_timedelta64_ns_dtype(dtype): if copy is True: return self.copy() return self elif is_timedelta64_dtype(dtype): # return an index (essentially this is division) result = self.values.astype(dtype, copy=copy) if self.hasnans: return Index(self._maybe_mask_results(result, convert='float64'), name=self.name) return Index(result.astype('i8'), name=self.name) elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), dtype='i8', name=self.name) raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype)
def nanmean(values, axis=None, skipna=True): values, mask, dtype, dtype_max = _get_values(values, skipna, 0) dtype_sum = dtype_max dtype_count = np.float64 if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype): dtype_sum = np.float64 elif is_float_dtype(dtype): dtype_sum = dtype dtype_count = dtype count = _get_counts(mask, axis, dtype=dtype_count) the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, 'ndim', False): the_mean = the_sum / count ct_mask = count == 0 if ct_mask.any(): the_mean[ct_mask] = np.nan else: the_mean = the_sum / count if count > 0 else np.nan return _wrap_results(the_mean, dtype)
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer Returns ------- labels : the indexer to the original array uniques : ndarray (1-d) or Index the unique values. Index is returned when passed values is Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ from pandas import Index, Series, DatetimeIndex vals = np.asarray(values) # localize to UTC is_datetimetz_type = is_datetimetz(values) if is_datetimetz_type: values = DatetimeIndex(values) vals = values.asi8 is_datetime = is_datetime64_dtype(vals) is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel, True) labels = _ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel, assume_unique=True) if is_datetimetz_type: # reset tz uniques = values._shallow_copy(uniques) elif is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer Returns ------- labels : the indexer to the original array uniques : ndarray (1-d) or Index the unique values. Index is returned when passed values is Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ from pandas import Index, Series, DatetimeIndex vals = np.asarray(values) # localize to UTC is_datetimetz_type = is_datetimetz(values) if is_datetimetz_type: values = DatetimeIndex(values) vals = values.tz_localize(None) is_datetime = is_datetime64_dtype(vals) is_timedelta = is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel, True) labels = _ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(_ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort(np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types)]]) sorter = _ensure_platform_int(t.lookup( _ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetimetz_type: # reset tz uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize( values.tz) elif is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques
def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import to_timedelta ovalues = values supplied_dtype = None if not is_list_like(values): values = np.array([values]) # if this is a Series that contains relevant dtype info, then use this # instead of the inferred type; this avoids coercing Series([NaT], # dtype='datetime64[ns]') to Series([NaT], dtype='timedelta64[ns]') elif (isinstance(values, pd.Series) and (is_timedelta64_dtype(values) or is_datetime64_dtype(values))): supplied_dtype = values.dtype inferred_type = supplied_dtype or lib.infer_dtype(values) if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or is_datetimetz(inferred_type)): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (supplied_dtype is None and other is not None and (other.dtype in ('timedelta64[ns]', 'datetime64[ns]')) and isnull(values).all()): values = np.empty(values.shape, dtype='timedelta64[ns]') values[:] = iNaT # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() # datetime with tz elif (isinstance(ovalues, datetime.datetime) and hasattr(ovalues, 'tz')): values = pd.DatetimeIndex(values) # datetime array with tz elif is_datetimetz(values): if isinstance(values, ABCSeries): values = values._values elif not (isinstance(values, (np.ndarray, ABCSeries)) and is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = to_timedelta(values, errors='coerce', box=False) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif inferred_type == 'floating': if (isnull(values).all() and name in ('__add__', '__radd__', '__sub__', '__rsub__')): values = np.empty(values.shape, dtype=other.dtype) values[:] = iNaT return values elif self._is_offset(values): return values else: raise TypeError("incompatible type [{0}] for a datetime/timedelta" " operation".format(np.array(values).dtype)) return values
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. if is_bool_array(vals): vals = vals.astype('u8') elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals) or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) vals = _hash.hash_object_array(vals, hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. if is_categorical_dtype(vals.dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(vals.dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. if is_bool_array(vals): vals = vals.astype('u8') elif (is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)): vals = vals.view('i8').astype('u8', copy=False) elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = _hash.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = _hash.hash_object_array(vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals