def time_delta_contains(series: pd.Series, state: dict) -> bool:
    """Report whether ``series`` has a timedelta64 dtype.

    ``state`` is not used by this check.

    Example:
        >>> x = pd.Series([pd.Timedelta(days=i) for i in range(3)])
        >>> x in visions.Timedelta
        True
    """
    has_timedelta_dtype = pdt.is_timedelta64_dtype(series)
    return has_timedelta_dtype
def time_ref_unset(self) -> xr.DataArray:
    """Return a copy with the reference time added to a timedelta ``time``.

    The wrapped object is copied first and the copy is always what gets
    returned. The ``time`` values are only shifted when the copy reports a
    truthy ``weldx.time_ref`` and its ``time`` has a timedelta64 dtype.
    """
    shifted = self._obj.copy()
    reference = shifted.weldx.time_ref

    if not (reference and is_timedelta64_dtype(shifted.time)):
        return shifted

    shifted["time"] = shifted.time.data + reference
    # Put the attributes of the original time coordinate back on the new one.
    shifted.time.attrs = self._obj.time.attrs
    return shifted
def is_timedelta_dtype(df: pd.DataFrame) -> pd.Series:
    """Flag, per column, whether the column has a timedelta dtype.

    This wrapper lets the dtype check run over a whole DataFrame rather than
    one series at a time. It exists as a module-level function to work around
    dill, which fails to pickle local contexts in nested lambda statements.
    """
    column_check = types.is_timedelta64_dtype
    return df.apply(column_check, result_type="expand")
def pandas_col_to_ibis_type(col):
    """Pick the ``dt`` type that corresponds to the dtype of ``col``.

    Checks run in a fixed order: timezone-aware datetimes, plain datetimes
    (nanosecond units only), timedeltas (encoded as int64 with a printed
    warning), categoricals, booleans, the fixed-width numeric types and
    finally object columns, which are delegated to ``_infer_object_dtype``.

    Raises ``com.IbisTypeError`` for non-nanosecond datetimes, unsigned 64-bit
    integers and any dtype that matches none of the checks.
    """
    import numpy as np

    dty = col.dtype

    # datetime types
    if pdcom.is_datetime64tz_dtype(dty):
        return dt.Timestamp(str(dty.tz))

    if pdcom.is_datetime64_dtype(dty):
        if not pdcom.is_datetime64_ns_dtype(dty):
            raise com.IbisTypeError(
                "Column {0} has dtype {1}, which is datetime64-like but does "
                "not use nanosecond units".format(col.name, dty))
        return dt.timestamp

    if pdcom.is_timedelta64_dtype(dty):
        print("Warning: encoding a timedelta64 as an int64")
        return dt.int64

    if pdcom.is_categorical_dtype(dty):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(dty):
        return dt.boolean

    # simple numerical types; the target is looked up by name only on a match.
    # Unsigned types map to the next wider signed type.
    numeric_targets = (
        (np.int8, "int8"),
        (np.int16, "int16"),
        (np.int32, "int32"),
        (np.int64, "int64"),
        (np.float32, "float"),
        (np.float64, "double"),
        (np.uint8, "int16"),
        (np.uint16, "int32"),
        (np.uint32, "int64"),
    )
    for numpy_type, target_name in numeric_targets:
        if issubclass(dty.type, numpy_type):
            return getattr(dt, target_name)

    if issubclass(dty.type, np.uint64):
        raise com.IbisTypeError(
            "Column {} is an unsigned int64".format(col.name))

    if pdcom.is_object_dtype(dty):
        return _infer_object_dtype(col)

    raise com.IbisTypeError("Column {0} is dtype {1}".format(col.name, dty))
def from_values(cls, initial_value, values=None, closed="left"):
    """
    Construct :class:`Stairs` from :class:`pandas.Series`.

    Parameters
    ----------
    initial_value : float
        The value of the step function at negative infinity.
    values : :class:`pandas.Series`
        The step function values' when approaching the change points from
        the right. Must be a non-empty, numeric Series whose index is
        monotonic increasing and free of infinite values.
    closed : {"left", "right"}
        Indicates whether the half-open intervals comprising the step
        function should be interpreted as left-closed or right-closed.

    Returns
    -------
    :class:`Stairs`

    Raises
    ------
    ValueError
        If ``values`` is not a non-empty Series, if its index contains
        infinite values or is not monotonic increasing, or if ``values`` or
        ``initial_value`` is not numeric.

    Notes
    -----
    Infinite entries in ``values`` (both ``inf`` and ``-inf``) are replaced
    with NaN and a warning is issued.
    """
    if not isinstance(values, pd.Series) or values.empty:
        raise ValueError("values must be a not empty Series")

    index = values.index
    if not (
        is_numeric_dtype(index)
        or is_datetime64_dtype(index)
        or is_timedelta64_dtype(index)
    ):
        warnings.warn("The index of data is not numeric, or time based")

    if np.isinf(index).any():
        raise ValueError("Invalid value for Series index")
    if not is_numeric_dtype(values) or not is_number(initial_value):
        raise ValueError("Invalid dtype for from_values()")
    if not index.is_monotonic_increasing:
        raise ValueError("Series index must be monotonic")

    if np.isinf(values).any():
        # np.isinf flags both signs, so both must be replaced for the
        # warning below to be truthful.
        values = values.replace([np.inf, -np.inf], np.nan)
        warnings.warn(
            "Infinity values detected and have been converted to NaN")

    new_instance = cls(closed=closed)
    new_instance.initial_value = initial_value
    new_instance._data = values.to_frame("value")
    new_instance._valid_deltas = False
    new_instance._valid_values = True
    return new_instance
def time_ref(self, value: pd.Timestamp):
    """Switch the wrapped object to a new reference time, IN PLACE.

    Nothing happens when the object has no ``time`` coordinate. When there is
    no current reference time (or ``time`` is not a timedelta), the value is
    simply stored as the new reference.

    TODO: should None be allowed and pass through or raise TypeError ?
    """
    if "time" not in self._obj.coords:
        return

    value = _as_valid_timestamp(value)
    current_ref = self._obj.weldx.time_ref

    if not (current_ref and is_timedelta64_dtype(self._obj.time)):
        self._obj.time.attrs["time_ref"] = value
        return

    if value == current_ref:
        return

    saved_attrs = self._obj.time.attrs
    shift = value - current_ref
    self._obj["time"] = self._obj.time.data - shift
    self._obj.time.attrs = saved_attrs  # restore old attributes !
    self._obj.time.attrs["time_ref"] = value  # set new time_ref value
def _spacing(da, dims): """ Verify correct spacing and return the spacing for each axis :param da: :return: """ delta_x = [] for d in dims: coord = da[d] diff = np.diff(coord) if is_timedelta64_dtype(diff): # convert to seconds so we get hertz diff = diff.astype('timedelta64[s]').astype('f8') delta = diff[0] if not np.allclose(diff, diff[0]): raise ValueError("Can't take Fourier transform because" "coodinate %s is not evenly spaced" % d) delta_x.append(delta) return delta_x
def pandas_iter(
    df: pd.DataFrame, columns: List[str], mask: Optional[np.ndarray] = None
) -> Generator[List[Any], None, None]:
    """Yield the rows of ``df`` restricted to ``columns``, one tuple per row.

    Datetime columns are yielded as ``pd.Timestamp`` objects and timedelta
    columns as ``pd.Timedelta`` objects; every other column yields the
    entries of its ``.values`` array unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to iterate over.
    columns : list of str
        Column labels to include, in output order.
    mask : np.ndarray, optional
        Indexer applied to every selected column (``series[mask]``) before
        iterating. When ``None`` all rows are used.

    Yields
    ------
    tuple
        One entry per requested column (the rows come from ``zip``).
    """
    arrays = []
    for column in columns:
        srs = df.loc[:, column]
        if mask is not None:
            srs = srs[mask]

        # The "any"/generic predicates already cover the nanosecond dtypes.
        if is_datetime64_any_dtype(srs):
            arrays.append(map(pd.Timestamp, srs.values))
        elif is_timedelta64_dtype(srs):
            arrays.append(map(pd.Timedelta, srs.values))
        else:
            arrays.append(srs.values)
    yield from zip(*arrays)
def time_ref(self, value: types_timestamp_like):
    """Switch the wrapped object to a new reference time, IN PLACE.

    ``None`` is rejected with a ``TypeError``. Nothing else happens when the
    object has no ``time`` coordinate. When there is no current reference
    time (or ``time`` is not a timedelta), the value is simply stored as the
    new reference.
    """
    if value is None:
        raise TypeError("'None' is not allowed as value.")

    if "time" not in self._obj.coords:
        return

    value = Time(value).as_timestamp()
    current_ref = self._obj.weldx.time_ref

    if not (current_ref and is_timedelta64_dtype(self._obj.time)):
        self._obj.time.attrs["time_ref"] = value
        return

    if value == current_ref:
        return

    saved_attrs = self._obj.time.attrs
    shift = value - current_ref
    self._obj["time"] = self._obj.time.data - shift
    self._obj.time.attrs = saved_attrs  # restore old attributes !
    self._obj.time.attrs["time_ref"] = value  # set new time_ref value
def to_pandas_time_index(
    time: Union[
        pint.Quantity,
        np.ndarray,
        pd.TimedeltaIndex,
        pd.DatetimeIndex,
        xr.DataArray,
        "tf.LocalCoordinateSystem",
    ],
) -> Union[pd.TimedeltaIndex, pd.DatetimeIndex]:
    """Convert a time variable to the corresponding pandas time index type.

    Parameters
    ----------
    time :
        Variable that should be converted.

    Returns
    -------
    Union[pandas.TimedeltaIndex, pandas.DatetimeIndex] :
        Time union of all input objects

    Raises
    ------
    TypeError
        If the input cannot be turned into either pandas index type.

    """
    from weldx.transformations import LocalCoordinateSystem

    index_types = (pd.DatetimeIndex, pd.TimedeltaIndex)
    _input_type = type(time)

    if isinstance(time, index_types):
        return time

    if isinstance(time, LocalCoordinateSystem):
        return to_pandas_time_index(time.time)

    if isinstance(time, pint.Quantity):
        base = "s"  # using low base unit could cause rounding errors
        if not np.iterable(time):  # catch zero-dim arrays
            time = np.expand_dims(time, 0)
        magnitudes = time.to(base).magnitude
        return pd.TimedeltaIndex(data=magnitudes, unit=base)

    if isinstance(time, (xr.DataArray, xr.Dataset)):
        if "time" in time.coords:
            time = time.time
        time_index = pd.Index(time.values)
        if is_timedelta64_dtype(time_index) and time.weldx.time_ref:
            time_index = time_index + time.weldx.time_ref
        return time_index

    # Wrap scalars (and single strings) so pd.Index receives a sequence.
    if isinstance(time, str) or not np.iterable(time):
        time = [time]
    time = pd.Index(time)

    if isinstance(time, index_types):
        return time

    # try manual casting for object dtypes (i.e. strings), should avoid integers
    # warning: this allows something like ["1","2","3"] which will be ns !!
    if is_object_dtype(time):
        for index_cls in index_types:
            try:
                return index_cls(time)
            except (ValueError, TypeError):
                pass

    raise TypeError(
        f"Could not convert {_input_type} "
        f"to pd.DatetimeIndex or pd.TimedeltaIndex"
    )
def timedelta_func(series):
    """Return True when the dtype of ``series`` is a timedelta64 dtype."""
    return bool(pdtypes.is_timedelta64_dtype(series.dtype))
def censor(x, range=(0, 1), only_finite=True):
    """
    Convert any values outside of range to a **NULL** type object.

    Parameters
    ----------
    x : array_like
        Values to manipulate
    range : tuple
        (min, max) giving desired output range
    only_finite : bool
        If True (the default), will only modify
        finite values.

    Returns
    -------
    x : array_like
        Censored array

    Examples
    --------
    >>> a = [1, 2, np.inf, 3, 4, -np.inf, 5]
    >>> censor(a, (0, 10))
    [1, 2, inf, 3, 4, -inf, 5]
    >>> censor(a, (0, 10), False)
    [1, 2, nan, 3, 4, nan, 5]
    >>> censor(a, (2, 4))
    [nan, 2, inf, 3, 4, -inf, nan]

    Notes
    -----
    All values in ``x`` should be of the same type. ``only_finite`` parameter
    is not considered for Datetime and Timedelta types.

    The **NULL** type object depends on the type of values in **x**.

    - :class:`float` - :py:`float('nan')`
    - :class:`int` - :py:`float('nan')`
    - :class:`datetime.datetime` : :py:`np.datetime64(NaT)`
    - :class:`datetime.timedelta` : :py:`np.timedelta64(NaT)`

    """
    if len(x) == 0:
        return x

    py_time_types = (datetime.datetime, datetime.timedelta)
    np_pd_time_types = (pd.Timestamp, pd.Timedelta,
                        np.datetime64, np.timedelta64)
    x0 = first_element(x)
    has_dtype = hasattr(x, 'dtype')

    # Yes, we want type not isinstance
    if type(x0) in py_time_types:
        return _censor_with(x, range, 'NaT')

    if not has_dtype and isinstance(x0, np_pd_time_types):
        return _censor_with(x, range, type(x0)('NaT'))

    # Pick the null marker; the order of these checks is significant.
    x_array = np.asarray(x)
    if pdtypes.is_number(x0) and not isinstance(x0, np.timedelta64):
        null = float('nan')
    elif com.is_datetime_arraylike(x_array):
        null = pd.Timestamp('NaT')
    elif pdtypes.is_datetime64_dtype(x_array):
        null = np.datetime64('NaT')
    elif isinstance(x0, pd.Timedelta):
        null = pd.Timedelta('NaT')
    elif pdtypes.is_timedelta64_dtype(x_array):
        null = np.timedelta64('NaT')
    else:
        raise ValueError(
            "Do not know how to censor values of type "
            "{}".format(type(x0)))

    # Start from "everything counts as finite" and narrow it down only when
    # asked to and when the values support a finiteness test.
    finite = np.repeat(True, len(x))
    if only_finite:
        try:
            finite = np.isfinite(x)
        except TypeError:
            pass

    if has_dtype:
        outside = (x < range[0]) | (x > range[1])
        bool_idx = finite & outside
        x = x.copy()
        x[bool_idx] = null
        return x

    return [val if (range[0] <= val <= range[1]) or not f else null
            for val, f in zip(x, finite)]
category_count = df[y].value_counts().count() if category_count == 1: # it is helpful to separate this case in order to save unnecessary calculation time return df, "target_is_constant" if _dtype_represents_categories(df[y]) and (category_count == len(df[y])): # it is important to separate this case in order to save unnecessary calculation time return df, "target_is_id" if _dtype_represents_categories(df[y]): return df, "classification" if is_numeric_dtype(df[y]): # this check needs to be after is_bool_dtype (which is part of _dtype_represents_categories) because bool is considered numeric by pandas return df, "regression" if is_datetime64_any_dtype(df[y]) or is_timedelta64_dtype(df[y]): # IDEA: show warning # raise TypeError( # f"The target column {y} has the dtype {df[y].dtype} which is not supported. A possible solution might be to convert {y} to a string column" # ) return df, "target_is_datetime" # IDEA: show warning # raise Exception( # f"Could not infer a valid task based on the target {y}. The dtype {df[y].dtype} is not yet supported" # ) # pragma: no cover return df, "target_data_type_not_supported" def _feature_is_id(df, x): "Returns Boolean if the feature column x is an ID"
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Categorical, datetime and timedelta columns keep their dtype. Columns
    whose dtype is not recognized are returned unmodified after a warning is
    written via ``wstderr``.

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory. If False, integers are still reduced to the smallest
        signed integer type that holds the column's range.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    Raises
    ------
    ValueError
        If an integer column holds values that fit none of ``INT_DTYPES``.

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8 * int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                break
        else:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # The smallest fitting int is always the baseline, so the column is
        # shrunk whether or not a categorical is considered.
        new_dtype = int_dtype

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # Rebind rather than pass ``inplace=True``: that keyword was
            # removed from ``reorder_categories`` in pandas 2.0.
            new_col = new_col.cat.reorder_categories(
                fsort(new_col.cat.categories))
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col
def is_timedelta(value):
    """Tell whether ``value`` is a timedelta or holds timedelta64 data.

    Lists and tuples are converted to a ``pd.Series`` first, so the check
    applies to the dtype pandas gives that Series.
    """
    candidate = pd.Series(value) if isinstance(value, (list, tuple)) else value
    if is_timedelta64_dtype(candidate):
        return True
    return isinstance(candidate, timedelta)
def duration(s1: pd.Series,
             s2: pd.Series = None,
             unit: Union[str, None] = None,
             round: Union[bool, int] = 2,
             freq: str = 'd') -> pd.Series:
    ''' calculate duration between two columns (series)

    Parameters
    ----------
    s1
        'from' datetime series
    s2
        'to' datetime series.
        Default None. If None, defaults to today.
    unit
        default None - returns the difference as a timedelta

        'years' (based on 365.25 days per year),
        'months' (based on 30 day month)

        Any other value is passed to ``pd.Timedelta(1, unit)``, e.g.:
            - 'W', 'D', 'T', 'S', 'L', 'U', or 'N'
            - 'days' or 'day'
            - 'hours', 'hour', 'hr', or 'h'
            - 'minutes', 'minute', 'min', or 'm'
            - 'seconds', 'second', or 'sec'
            - 'milliseconds', 'millisecond', 'millis', or 'milli'
            - 'microseconds', 'microsecond', 'micros', or 'micro'
            - 'nanoseconds', 'nanosecond', 'nanos', 'nano', or 'ns'.

        check out pandas
        `timedelta object <https://pandas.pydata.org/docs/reference/api/pandas.Timedelta.html>`_
        for details.
    round
        Default 2. If the duration result is numeric, it is rounded to this
        number of decimals. Pass False (or None) to skip rounding.
    freq
        Default is 'd'(days). If the duration result is a pd.Timedelta dtype,
        the value can be 'rounded' using this frequency parameter.

        Must be a fixed frequency like 'S' (second) not 'ME' (month end).
        For a list of valid values, check out
        `pandas offset aliases <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_

    Returns
    -------
    series
        if unit is None - series is of data type timedelta64
        otherwise a numeric (float) series.

    Examples
    --------

    .. code-block::

        %%piper
        sample_data()
        >> select(['-countries', '-regions', '-ids', '-values_1', '-values_2'])
        >> assign(new_date_col=pd.to_datetime('2018-01-01'))
        >> assign(duration = lambda x: duration(x.new_date_col, x.order_dates, unit='months'))
        >> assign(duration_dates_age = lambda x: duration(x['dates']))
        >> head(tablefmt='plain')

            dates       order_dates  new_date_col    duration  duration_dates_age
         0  2020-01-01  2020-01-07   2018-01-01            25  452 days
         1  2020-01-02  2020-01-08   2018-01-01            25  451 days
         2  2020-01-03  2020-01-09   2018-01-01            25  450 days
         3  2020-01-04  2020-01-10   2018-01-01            25  449 days

    '''
    if s2 is None:
        s2 = datetime.today()

    elapsed = s2 - s1

    if unit is None:
        result = elapsed
    elif unit == 'years':
        result = elapsed / pd.Timedelta(365.25, 'd')
    elif unit == 'months':
        result = elapsed / pd.Timedelta(30, 'd')
    else:
        result = elapsed / pd.Timedelta(1, unit)

    if is_numeric_dtype(result):
        # False/None mean "do not round"; passing False on to Series.round
        # would be read as 0 decimals (False == 0) and drop the fraction.
        if round is not False and round is not None:
            result = result.round(round)
    elif is_timedelta64_dtype(result):
        result = result.dt.round(freq=freq)

    return result
def contains_op(cls, series: pd.Series) -> bool:
    """Return whether ``series`` has a timedelta64 dtype (``cls`` is unused)."""
    is_timedelta_series = pdt.is_timedelta64_dtype(series)
    return is_timedelta_series
def convert_col_dtype(col, int_to_category=True, force_fp32=True):
    """Convert datatypes for columns according to "sensible" rules for the
    tasks in this module:

    * integer types are reduced to smallest integer type without losing
      information, or to a categorical if that uses less memory (roughly)
    * float types are all made the same: either the type of the first element,
      or all are reduced to single precision
    * object types that contain strings are converted to categoricals
    * object types that contain numbers are converted according to the rules
      above to either floats, shortest-possible ints, or a categorical
    * bool types are forced to ``numpy.dtype('bool')``

    Categorical, datetime and timedelta columns keep their dtype. Columns
    whose dtype is not recognized are returned unmodified after a warning is
    written via ``wstderr``.

    Parameters
    ----------
    col : pandas.Series
        Column

    int_to_category : bool
        Whether to convert integer types to categoricals in the case that this
        will save memory. If False, integers are still reduced to the smallest
        signed integer type that holds the column's range.

    force_fp32 : bool
        Force all floating-point data types to be single precision (fp32). If
        False, the type of the first element is used instead (for all values in
        the column).

    Returns
    -------
    col : pandas.Series

    Raises
    ------
    ValueError
        If an integer column holds values that fit none of ``INT_DTYPES``.

    """
    from pisa.utils.fileio import fsort

    categorical_dtype = CategoricalDtype()

    recognized_dtype = False
    original_dtype = col.dtype
    col_name = col.name

    if len(col) == 0: #pylint: disable=len-as-condition
        return col

    first_item = col.iloc[0]

    # Default: keep current dtype
    new_dtype = original_dtype

    if (is_categorical_dtype(original_dtype)
            or is_datetime64_any_dtype(original_dtype)
            or is_timedelta64_dtype(original_dtype)
            or is_timedelta64_ns_dtype(original_dtype)):
        recognized_dtype = True
        new_dtype = original_dtype
    elif is_object_dtype(original_dtype):
        if isinstance(first_item, basestring):
            recognized_dtype = True
            new_dtype = categorical_dtype
        # NOTE: Must check bool before int since bools look like ints (but not
        # vice versa)
        elif isinstance(first_item, BOOL_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('bool')
        elif isinstance(first_item, INT_TYPES + UINT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype('int')
        elif isinstance(first_item, FLOAT_TYPES):
            recognized_dtype = True
            new_dtype = np.dtype(type(first_item))

    # Convert ints to either shortest int possible or categorical,
    # whichever is smaller (use int if same size)
    if new_dtype in INT_DTYPES + UINT_DTYPES:
        recognized_dtype = True
        # See how large an int would be necessary
        col_min, col_max = col.min(), col.max()
        int_dtype = None
        for int_dtype in INT_DTYPES:
            exponent = 8*int_dtype.itemsize - 1
            min_representable = -2 ** exponent
            max_representable = (2 ** exponent) - 1
            if col_min >= min_representable and col_max <= max_representable:
                break
        else:
            raise ValueError('Value(s) in column "%s" exceed %s bounds'
                             % (col_name, int_dtype))

        # The smallest fitting int is always the baseline, so the column is
        # shrunk whether or not a categorical is considered.
        new_dtype = int_dtype

        # Check if categorical is probably smaller than int dtype; note that
        # the below is not perfect (i.e. is not based on exact internal
        # representation of categoricals in Pandas...) but should get us pretty
        # close, so that at least order-of-magnitude efficiencies will be
        # found)
        if int_to_category:
            num_unique = len(col.unique())
            category_bytes = int(np.ceil(np.log2(num_unique) / 8))
            if category_bytes < int_dtype.itemsize:
                new_dtype = categorical_dtype

    elif new_dtype in FLOAT_DTYPES:
        recognized_dtype = True
        if force_fp32:
            new_dtype = np.dtype('float32')
        else:
            new_dtype = np.dtype(type(first_item))

    elif new_dtype in BOOL_DTYPES:
        recognized_dtype = True
        new_dtype = np.dtype('bool')

    if not recognized_dtype:
        wstderr('WARNING: Not modifying column "%s" with unhandled dtype "%s"'
                ' and/or sub-type "%s"\n'
                % (col_name, original_dtype.name, type(first_item)))

    if is_dtype_equal(new_dtype, original_dtype):
        if isinstance(first_item, basestring):
            return col.cat.reorder_categories(fsort(col.cat.categories))
        return col

    if is_categorical_dtype(new_dtype):
        new_col = col.astype('category')
        if isinstance(first_item, basestring):
            # Rebind rather than pass ``inplace=True``: that keyword was
            # removed from ``reorder_categories`` in pandas 2.0.
            new_col = new_col.cat.reorder_categories(
                fsort(new_col.cat.categories))
        return new_col

    try:
        return col.astype(new_dtype)
    except ValueError:
        wstderr('WARNING: Could not convert column "%s" to dtype "%s"; keeping'
                ' original dtype "%s"\n'
                % (col_name, new_dtype, original_dtype))
        return col