def __init__(self, left, right, name, na_op): # need to make sure that we are aligning the data if isinstance(left, pd.Series) and isinstance(right, pd.Series): left, right = left.align(right,copy=False) lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) self.name = name self.na_op = na_op # left self.left = left self.is_offset_lhs = self._is_offset(left) self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) self.is_datetime64_lhs = is_datetime64_dtype(lvalues) self.is_datetime64tz_lhs = is_datetime64tz_dtype(lvalues) self.is_datetime_lhs = self.is_datetime64_lhs or self.is_datetime64tz_lhs self.is_integer_lhs = left.dtype.kind in ['i', 'u'] self.is_floating_lhs = left.dtype.kind == 'f' # right self.right = right self.is_offset_rhs = self._is_offset(right) self.is_datetime64_rhs = is_datetime64_dtype(rvalues) self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) self.is_datetime_rhs = self.is_datetime64_rhs or self.is_datetime64tz_rhs self.is_timedelta_rhs = is_timedelta64_dtype(rvalues) self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') self.is_floating_rhs = rvalues.dtype.kind == 'f' self._validate(lvalues, rvalues, name) self.lvalues, self.rvalues = self._convert_for_datetime(lvalues, rvalues)
def __init__(self, left, right, name, na_op): super(_TimeOp, self).__init__(left, right, name, na_op) lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) # left self.is_offset_lhs = self._is_offset(left) self.is_timedelta_lhs = is_timedelta64_dtype(lvalues) self.is_datetime64_lhs = is_datetime64_dtype(lvalues) self.is_datetime64tz_lhs = is_datetime64tz_dtype(lvalues) self.is_datetime_lhs = (self.is_datetime64_lhs or self.is_datetime64tz_lhs) self.is_integer_lhs = left.dtype.kind in ['i', 'u'] self.is_floating_lhs = left.dtype.kind == 'f' # right self.is_offset_rhs = self._is_offset(right) self.is_datetime64_rhs = is_datetime64_dtype(rvalues) self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues) self.is_datetime_rhs = (self.is_datetime64_rhs or self.is_datetime64tz_rhs) self.is_timedelta_rhs = is_timedelta64_dtype(rvalues) self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') self.is_floating_rhs = rvalues.dtype.kind == 'f' self._validate(lvalues, rvalues, name) self.lvalues, self.rvalues = self._convert_for_datetime(lvalues, rvalues)
def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) return arg arg = com._ensure_object(arg) try: if format is not None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def backfill_2d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if com.is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): _method = _backfill_2d_datetime elif com.is_integer_dtype(values): values = com._ensure_float64(values) _method = algos.backfill_2d_inplace_float64 elif values.dtype == np.object_: _method = algos.backfill_2d_inplace_object if _method is None: raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name) if mask is None: mask = com.isnull(values) mask = mask.view(np.uint8) if np.all(values.shape): _method(values, mask, limit=limit) else: # for test coverage pass return values
def _sqlalchemy_type(self, col): from sqlalchemy.types import (BigInteger, Float, Text, Boolean, DateTime, Date, Time, Interval) if com.is_datetime64_dtype(col): try: tz = col.tzinfo return DateTime(timezone=True) except: return DateTime if com.is_timedelta64_dtype(col): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) return BigInteger elif com.is_float_dtype(col): return Float elif com.is_integer_dtype(col): # TODO: Refine integer size. return BigInteger elif com.is_bool_dtype(col): return Boolean inferred = lib.infer_dtype(com._ensure_object(col)) if inferred == 'date': return Date if inferred == 'time': return Time return Text
def get_dtype_kinds(l): """ Parameters ---------- l : list of arrays Returns ------- a set of kinds that exist in this list of arrays """ typs = set() for arr in l: dtype = arr.dtype if com.is_categorical_dtype(dtype): typ = "category" elif com.is_sparse(arr): typ = "sparse" elif com.is_datetimetz(arr): typ = "datetimetz" elif com.is_datetime64_dtype(dtype): typ = "datetime" elif com.is_timedelta64_dtype(dtype): typ = "timedelta" elif com.is_object_dtype(dtype): typ = "object" elif com.is_bool_dtype(dtype): typ = "bool" else: typ = dtype.kind typs.add(typ) return typs
def get_dtype_kinds(l): """ Parameters ---------- l : list of arrays Returns ------- a set of kinds that exist in this list of arrays """ typs = set() for arr in l: dtype = arr.dtype if com.is_categorical_dtype(dtype): typ = 'category' elif com.is_sparse(arr): typ = 'sparse' elif com.is_datetimetz(arr): typ = 'datetimetz' elif com.is_datetime64_dtype(dtype): typ = 'datetime' elif com.is_timedelta64_dtype(dtype): typ = 'timedelta' elif com.is_object_dtype(dtype): typ = 'object' elif com.is_bool_dtype(dtype): typ = 'bool' else: typ = dtype.kind typs.add(typ) return typs
def _wrap_results(result, dtype): """ wrap our results if needed """ if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): result = lib.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): # this is a scalar timedelta result! # we have series convert then take the element (scalar) # as series will do the right thing in py3 (and deal with numpy # 1.6.2 bug in that it results dtype of timedelta64[us] from pandas import Series # coerce float to results if is_float(result): result = int(result) result = Series([result], dtype='timedelta64[ns]') else: result = result.view(dtype) return result
def _sqlalchemy_type(self, arr_or_dtype): from sqlalchemy.types import (BigInteger, Float, Text, Boolean, DateTime, Date, Interval) if arr_or_dtype is date: return Date if com.is_datetime64_dtype(arr_or_dtype): try: tz = arr_or_dtype.tzinfo return DateTime(timezone=True) except: return DateTime if com.is_timedelta64_dtype(arr_or_dtype): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) return BigInteger elif com.is_float_dtype(arr_or_dtype): return Float elif com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. return BigInteger elif com.is_bool_dtype(arr_or_dtype): return Boolean return Text
def describe_1d(data): count = data.count() leng = len(data) distinct_count = data.nunique(dropna=False) if count > distinct_count > 1: mode = data.mode().iloc[0] else: mode = data[0] results_data = {'count': count, 'distinct_count': distinct_count, 'p_missing': 1 - count / leng, 'n_missing': leng - count, 'is_unique': distinct_count == leng, 'mode': mode, 'p_unique': distinct_count / count} try: # pandas 0.17 onwards results_data['memorysize'] = data.memory_usage() except: results_data['memorysize'] = 0 result = pd.Series(results_data, name=data.name) if distinct_count <= 1: result = result.append(describe_constant_1d(data)) elif com.is_numeric_dtype(data): result = result.append(describe_numeric_1d(data, result)) elif com.is_datetime64_dtype(data): result = result.append(describe_date_1d(data, result)) elif distinct_count == leng: result = result.append(describe_unique_1d(data)) else: result = result.append(describe_categorical_1d(data)) return result
def maybe_to_datetimelike(data, copy=False): """ return a DelegatedClass of a Series that is datetimelike (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods) raise TypeError if this is not possible. Parameters ---------- data : Series copy : boolean, default False copy the input data Returns ------- DelegatedClass """ from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) index = data.index if is_datetime64_dtype(data.dtype) or is_datetime64tz_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=data.name) else: if is_period_arraylike(data): return PeriodProperties(PeriodIndex(data, copy=copy), index, name=data.name) if is_datetime_arraylike(data): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=data.name) raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))
def format_array(values, formatter, float_format=None, na_rep='NaN', digits=None, space=None, justify='right'): if com.is_float_dtype(values.dtype): fmt_klass = FloatArrayFormatter elif com.is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter elif com.is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter else: fmt_klass = GenericArrayFormatter if space is None: space = get_option("print.column_space") if float_format is None: float_format = get_option("print.float_format") if digits is None: digits = get_option("print.precision") fmt_obj = fmt_klass(values, digits, na_rep=na_rep, float_format=float_format, formatter=formatter, space=space, justify=justify) return fmt_obj.get_result()
def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'): """Plot an histogram from the data and return the AxesSubplot object. Parameters ---------- series: Series, default None The data to plot figsize: a tuple (width, height) in inches, default (6,4) The size of the figure. facecolor: str The color code. Returns ------- matplotlib.AxesSubplot, The plot. """ if com.is_datetime64_dtype(series): # TODO: These calls should be merged fig = plt.figure(figsize=figsize) plot = fig.add_subplot(111) plot.set_ylabel('Frequency') try: plot.hist(series.values, facecolor=facecolor, bins=bins) except TypeError: # matplotlib 1.4 can't plot dates so will show empty plot instead pass else: plot = series.plot(kind='hist', figsize=figsize, facecolor=facecolor, bins=bins) # TODO when running on server, send this off to a different thread return plot
def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import to_timedelta coerce = True if not is_list_like(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in ('datetime64', 'datetime', 'date', 'time'): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (other is not None and other.dtype == 'timedelta64[ns]' and all(isnull(v) for v in values)): values = np.empty(values.shape, dtype=other.dtype) values[:] = iNaT # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() elif not (isinstance(values, (np.ndarray, pd.Series)) and is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = to_timedelta(values, coerce=coerce) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__'): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif isinstance(values[0], pd.DateOffset): # handle DateOffsets os = np.array([getattr(v, 'delta', None) for v in values]) mask = isnull(os) if mask.any(): raise TypeError("cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format( ', '.join([com.pprint_thing(v) for v in values[mask]]))) values = to_timedelta(os, coerce=coerce) elif inferred_type == 'floating': # all nan, so ok, use the other dtype (e.g. timedelta or datetime) if isnull(values).all(): values = np.empty(values.shape, dtype=other.dtype) values[:] = iNaT else: raise TypeError( 'incompatible type [{0}] for a datetime/timedelta ' 'operation'.format(np.array(values).dtype)) else: raise TypeError("incompatible type [{0}] for a datetime/timedelta" " operation".format(np.array(values).dtype)) return values
def describe_1d(data): # Is unique # Percent missing names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize'] count = data.count() leng = len(data) distinct_count = data.nunique(dropna=False) if count > distinct_count > 1: mode = data.mode().iloc[0] else: mode = data[0] results_data = [count, distinct_count, 1 - count / leng, leng - count, distinct_count == leng, mode, distinct_count / count, data.memory_usage()] result = pd.Series(results_data, index=names, name=data.name) if distinct_count <= 1: result = result.append(describe_constant_1d(data)) elif com.is_numeric_dtype(data): result = result.append(describe_numeric_1d(data, result)) elif com.is_datetime64_dtype(data): result = result.append(describe_date_1d(data, result)) elif distinct_count == leng: result = result.append(describe_unique_1d(data)) else: result = result.append(describe_categorical_1d(data)) return result
def na_value_for_dtype(dtype): """ Return a dtype compat na value Parameters ---------- dtype : string / dtype Returns ------- dtype compat na value """ from pandas.core import common as com from pandas import NaT dtype = pandas_dtype(dtype) if (com.is_datetime64_dtype(dtype) or com.is_datetime64tz_dtype(dtype) or com.is_timedelta64_dtype(dtype)): return NaT elif com.is_float_dtype(dtype): return np.nan elif com.is_integer_dtype(dtype): return 0 elif com.is_bool_dtype(dtype): return False return np.nan
def get_op(cls, left, right, name, na_op): """ Get op dispatcher, returns _Op or _TimeOp. If ``left`` and ``right`` are appropriate for datetime arithmetic with operation ``name``, processes them and returns a ``_TimeOp`` object that stores all the required values. Otherwise, it will generate either a ``_Op``, indicating that the operation is performed via normal numpy path. """ is_timedelta_lhs = is_timedelta64_dtype(left) is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) if isinstance(left, ABCSeries) and isinstance(right, ABCSeries): # avoid repated alignment if not left.index.equals(right.index): left, right = left.align(right, copy=False) index, lidx, ridx = left.index.join(right.index, how='outer', return_indexers=True) # if DatetimeIndex have different tz, convert to UTC left.index = index right.index = index if not (is_datetime_lhs or is_timedelta_lhs): return _Op(left, right, name, na_op) else: return _TimeOp(left, right, name, na_op)
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : na_sentinel: int, default -1 Value to mark "not found" Returns ------- """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort(np.array([ e for i, e in enumerate(uniques) if f(e) ],dtype=object)) for f in [ lambda x: not isinstance(x,string_types), lambda x: isinstance(x,string_types) ] ]) sorter = com._ensure_platform_int(t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques
def infer_freq(index, warn=True): """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed Parameters ---------- index : DatetimeIndex if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- freq : string or None None if no discernible frequency TypeError if the index is not datetime-like """ import pandas as pd if isinstance(index, com.ABCSeries): values = index.values if not (com.is_datetime64_dtype(index.values) or values.dtype == object): raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype)) index = values if isinstance(index, pd.PeriodIndex): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index))) index = index.values index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq()
def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import to_timedelta ovalues = values if not is_list_like(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in ('datetime64', 'datetime', 'date', 'time'): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (other is not None and other.dtype == 'timedelta64[ns]' and all(isnull(v) for v in values)): values = np.empty(values.shape, dtype=other.dtype) values[:] = iNaT # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() # datetime with tz elif isinstance(ovalues, datetime.datetime) and hasattr(ovalues,'tz'): values = pd.DatetimeIndex(values) # datetime array with tz elif com.is_datetimetz(values): if isinstance(values, pd.Series): values = values._values elif not (isinstance(values, (np.ndarray, pd.Series)) and is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = to_timedelta(values, errors='coerce') elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__'): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif inferred_type == 'floating': # all nan, so ok, use the other dtype (e.g. timedelta or datetime) if isnull(values).all(): values = np.empty(values.shape, dtype=other.dtype) values[:] = iNaT else: raise TypeError( 'incompatible type [{0}] for a datetime/timedelta ' 'operation'.format(np.array(values).dtype)) elif self._is_offset(values): return values else: raise TypeError("incompatible type [{0}] for a datetime/timedelta" " operation".format(np.array(values).dtype)) return values
def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import _possibly_cast_to_timedelta coerce = "compat" if pd._np_version_under1p7 else True if not is_list_like(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in ("datetime64", "datetime", "date", "time"): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if other is not None and other.dtype == "timedelta64[ns]" and all(isnull(v) for v in values): values = np.empty(values.shape, dtype=other.dtype) values[:] = tslib.iNaT # a datetlike elif not (isinstance(values, (pa.Array, pd.Series)) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif isinstance(values, pd.DatetimeIndex): values = values.to_series() elif inferred_type in ("timedelta", "timedelta64"): # have a timedelta, convert to to ns here values = _possibly_cast_to_timedelta(values, coerce=coerce) elif inferred_type == "integer": # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == "m": values = values.astype("timedelta64[ns]") elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ("__truediv__", "__div__", "__mul__"): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif isinstance(values[0], pd.DateOffset): # handle DateOffsets os = pa.array([getattr(v, "delta", None) for v in values]) mask = isnull(os) if mask.any(): raise TypeError( "cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format(", ".join([com.pprint_thing(v) for v in values[mask]])) ) values = _possibly_cast_to_timedelta(os, coerce=coerce) elif inferred_type == "floating": # all nan, so ok, use the other dtype (e.g. timedelta or datetime) if isnull(values).all(): values = np.empty(values.shape, dtype=other.dtype) values[:] = tslib.iNaT else: raise TypeError( "incompatible type [{0}] for a datetime/timedelta " "operation".format(pa.array(values).dtype) ) else: raise TypeError( "incompatible type [{0}] for a datetime/timedelta" " operation".format(pa.array(values).dtype) ) return values
def __init__(self, left, right, name): self.name = name lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name) self.is_timedelta_lhs = com.is_timedelta64_dtype(left) self.is_datetime_lhs = com.is_datetime64_dtype(left) self.is_integer_lhs = left.dtype.kind in ['i','u'] self.is_datetime_rhs = com.is_datetime64_dtype(rvalues) self.is_timedelta_rhs = (com.is_timedelta64_dtype(rvalues) or (not self.is_datetime_rhs and pd._np_version_under1p7)) self.is_integer_rhs = rvalues.dtype.kind in ('i','u') self._validate() self._convert_for_datetime(lvalues, rvalues)
def _convert_listlike(arg, box): if isinstance(arg, (list,tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) return arg arg = com._ensure_object(arg) try: if format is not None: result = None # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg) except: raise ValueError("cannot convert the input to '%Y%m%d' date format") # fallback if result is None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _unpickle_array(bytes): arr = read_array(BytesIO(bytes)) # All datetimes should be stored as M8[ns]. When unpickling with # numpy1.6, it will read these as M8[us]. So this ensures all # datetime64 types are read as MS[ns] if com.is_datetime64_dtype(arr): arr = arr.view(com._NS_DTYPE) return arr
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : array-like sort : boolean, default True Sort by values order :,optional na_sentinel : int, default -1 Value to mark "not found" Examples -------- >>> factorize([12,3,8,5,9,7,11],sort=True,order=None,na_sentinel=-1) (array([6, 0, 3, 1, 4, 2, 5]), array([ 3, 5, 7, 8, 9, 11, 12], dtype=int64)) >>> factorize([12,3,8,5,9,7,10],sort=False,order=None,na_sentinel=-1) (array([0, 1, 2, 3, 4, 5, 6]), array([12, 3, 8, 5, 9, 7, 10], dtype=int64)) >>> factorize([12,3,8,5,9,7,10,10],sort=False,order=None,na_sentinel=-1) (array([0, 1, 2, 3, 4, 5, 6, 6]), array([12, 3, 8, 5, 9, 7, 10], dtype=int64)) Returns ------- a tuple labels of each number in array form ,corresponding number without duplication """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: sorter = uniques.argsort() reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.view('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques
def pandas_col_to_ibis_type(col): import pandas.core.common as pdcom import ibis.expr.datatypes as dt import numpy as np dty = col.dtype # datetime types if pdcom.is_datetime64_dtype(dty): if pdcom.is_datetime64_ns_dtype(dty): return 'timestamp' else: raise com.IbisTypeError("Column {0} has dtype {1}, which is " "datetime64-like but does " "not use nanosecond units" .format(col.name, dty)) if pdcom.is_timedelta64_dtype(dty): print("Warning: encoding a timedelta64 as an int64") return 'int64' if pdcom.is_categorical_dtype(dty): return dt.Category(len(col.cat.categories)) if pdcom.is_bool_dtype(dty): return 'boolean' # simple numerical types if issubclass(dty.type, np.int8): return 'int8' if issubclass(dty.type, np.int16): return 'int16' if issubclass(dty.type, np.int32): return 'int32' if issubclass(dty.type, np.int64): return 'int64' if issubclass(dty.type, np.float32): return 'float' if issubclass(dty.type, np.float64): return 'double' if issubclass(dty.type, np.uint8): return 'int16' if issubclass(dty.type, np.uint16): return 'int32' if issubclass(dty.type, np.uint32): return 'int64' if issubclass(dty.type, np.uint64): raise com.IbisTypeError("Column {0} is an unsigned int64" .format(col.name)) if pdcom.is_object_dtype(dty): # TODO: overly broad? return 'string' raise com.IbisTypeError("Column {0} is dtype {1}" .format(col.name, dty))
def isin(comps, values): """ Compute the isin boolean array Parameters ---------- comps: array-like values: array-like Returns ------- boolean array same length as comps """ if not com.is_list_like(comps): raise TypeError( "only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__) ) comps = np.asarray(comps) if not com.is_list_like(values): raise TypeError( "only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__) ) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: f = lambda x, y: np.in1d(x, np.asarray(list(y))) else: f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing if com.is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view("i8") comps = comps.view("i8") elif com.is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view("i8") comps = comps.view("i8") elif com.is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) return f(comps, values)
def __init__(self, left, right, name): self.name = name # need to make sure that we are aligning the data if isinstance(left, pd.Series) and isinstance(right, pd.Series): left, right = left.align(right,copy=False) self.left = left self.right = right lvalues = self._convert_to_array(left, name=name) rvalues = self._convert_to_array(right, name=name, other=lvalues) self.is_timedelta_lhs = is_timedelta64_dtype(left) self.is_datetime_lhs = is_datetime64_dtype(left) self.is_integer_lhs = left.dtype.kind in ['i', 'u'] self.is_datetime_rhs = is_datetime64_dtype(rvalues) self.is_timedelta_rhs = is_timedelta64_dtype(rvalues) self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u') self._validate() self._convert_for_datetime(lvalues, rvalues)
def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.is_datetime64_dtype(values): f = func_map['int64'] values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) else: f = func_map['generic'] values = com._ensure_object(values) return f, values
def _get_data_algo(values, func_map): if com.is_float_dtype(values): f = func_map["float64"] values = com._ensure_float64(values) elif com.is_datetime64_dtype(values): f = func_map["int64"] values = values.view("i8") elif com.is_integer_dtype(values): f = func_map["int64"] values = com._ensure_int64(values) else: f = func_map["generic"] values = com._ensure_object(values) return f, values
def _hashtable_algo(f, dtype, return_dtype=None): """ f(HashTable, type_caster) -> result """ if com.is_float_dtype(dtype): return f(htable.Float64HashTable, com._ensure_float64) elif com.is_integer_dtype(dtype): return f(htable.Int64HashTable, com._ensure_int64) elif com.is_datetime64_dtype(dtype): return_dtype = return_dtype or 'M8[ns]' return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) elif com.is_timedelta64_dtype(dtype): return_dtype = return_dtype or 'm8[ns]' return f(htable.Int64HashTable, com._ensure_int64).view(return_dtype) else: return f(htable.PyObjectHashTable, com._ensure_object)
def infer_freq(index, warn=True): """ Infer the most likely frequency given the input index. If the frequency is uncertain, a warning will be printed. Parameters ---------- index : DatetimeIndex or TimedeltaIndex if passed a Series will use the values of the series (NOT THE INDEX) warn : boolean, default True Returns ------- freq : string or None None if no discernible frequency TypeError if the index is not datetime-like ValueError if there are less than three values. """ import pandas as pd if isinstance(index, com.ABCSeries): values = index.values if not (com.is_datetime64_dtype(index.values) or com.is_timedelta64_dtype(index.values) or values.dtype == object): raise TypeError( "cannot infer freq from a non-convertible dtype on a Series of {0}" .format(index.dtype)) index = values if com.is_period_arraylike(index): raise TypeError("PeriodIndex given. Check the `freq` attribute " "instead of using infer_freq.") elif isinstance(index, pd.TimedeltaIndex): inferer = _TimedeltaFrequencyInferer(index, warn=warn) return inferer.get_freq() if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex): if isinstance(index, (pd.Int64Index, pd.Float64Index)): raise TypeError( "cannot infer freq from a non-convertible index type {0}". format(type(index))) index = index.values index = pd.DatetimeIndex(index) inferer = _FrequencyInferer(index, warn=warn) return inferer.get_freq()
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : na_sentinel: int, default -1 Value to mark "not found" Returns ------- """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: sorter = uniques.argsort() reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.view('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques
def isin(comps, values): """ Compute the isin boolean array Parameters ---------- comps: array-like values: array-like Returns ------- boolean array same length as comps """ if not com.is_list_like(comps): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(comps).__name__)) comps = np.asarray(comps) if not com.is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a " "[{0}]".format(type(values).__name__)) # GH11232 # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: f = lambda x, y: np.in1d(x, np.asarray(list(y))) else: f = lambda x, y: lib.ismember_int64(x, set(y)) # may need i8 conversion for proper membership testing if com.is_datetime64_dtype(comps): from pandas.tseries.tools import to_datetime values = to_datetime(values)._values.view('i8') comps = comps.view('i8') elif com.is_timedelta64_dtype(comps): from pandas.tseries.timedeltas import to_timedelta values = to_timedelta(values)._values.view('i8') comps = comps.view('i8') elif com.is_int64_dtype(comps): pass else: f = lambda x, y: lib.ismember(x, set(values)) return f(comps, values)
def _convert_listlike(arg, box): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError: pass return arg arg = com._ensure_object(arg) try: if format is not None: result = None # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: result = tslib.array_strptime(arg, format, coerce=coerce) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def date_to_index(date, date_index): """ Looks up a Parameters ---------- date : string, datetime or datetime64 Date to use when returning the index date_index : 1-d array of datetime64 Index data containing datetime64 values Returns ------- index : int Index location Notes ----- Assumes dates are increasing and unique. Uses last value interpolation if a date is not in the series so that the value returned satisfies date_index[index] is the largest date less than or equal to date. """ if not is_datetime64_dtype(date_index): raise ValueError('date_index must be a datetime64 array') if not np.all((np.diff(date_index.values).astype(dtype=np.int64)) > 0): raise ValueError('date_index is not monotonic and unique') if not isinstance(date, (dt.datetime, np.datetime64, str)): raise ValueError("date must be a datetime, datetime64 or string") elif isinstance(date, dt.datetime): date = np.datetime64(date) elif isinstance(date, str): orig_date = date date = np.datetime64(to_datetime(date, coerce=True)) if date == NaT: raise ValueError('date:' + orig_date + ' cannot be parsed to a date.') date_index = np.asarray(date_index) locs = np.nonzero(date_index <= date)[0] if locs.shape[0] == 0: raise ValueError('All dates in date_index occur after date') else: return locs.max()
def as_json_table_type(x): """ Convert a NumPy / pandas type to its corresponding json_table. Parameters ---------- x : array or dtype Returns ------- t : str the Table Schema data types Notes ----- This table shows the relationship between NumPy / pandas dtypes, and Table Schema dtypes. ============== ================= Pandas type Table Schema type ============== ================= int64 integer float64 number bool boolean datetime64[ns] datetime timedelta64[ns] duration object str categorical any =============== ================= """ if is_integer_dtype(x): return 'integer' elif is_bool_dtype(x): return 'boolean' elif is_numeric_dtype(x): return 'number' elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)): return 'datetime' elif is_timedelta64_dtype(x): return 'duration' elif is_categorical_dtype(x): return 'any' elif is_string_dtype(x): return 'string' else: return 'any'
def _convert_f(arg): arg = com._ensure_object(arg) try: result = lib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError, e: try: values, tz = lib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def maybe_to_datetimelike(data, copy=False): """ return a DelegatedClass of a Series that is datetimelike (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods) raise TypeError if this is not possible. Parameters ---------- data : Series copy : boolean, default False copy the input data Returns ------- DelegatedClass """ from pandas import Series if not isinstance(data, Series): raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data))) index = data.index name = data.name orig = data if is_categorical_dtype(data) else None if orig is not None: data = orig.values.categories if is_datetime64_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) elif is_datetime64tz_dtype(data.dtype): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'), index, data.name, orig=orig) elif is_timedelta64_dtype(data.dtype): return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) else: if is_period_arraylike(data): return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name, orig=orig) if is_datetime_arraylike(data): return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name, orig=orig) raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))
def maybe_convert_for_time_op(cls, left, right, name, na_op): """ if ``left`` and ``right`` are appropriate for datetime arithmetic with operation ``name``, processes them and returns a ``_TimeOp`` object that stores all the required values. Otherwise, it will generate either a ``NotImplementedError`` or ``None``, indicating that the operation is unsupported for datetimes (e.g., an unsupported r_op) or that the data is not the right type for time ops. """ # decide if we can do it is_timedelta_lhs = is_timedelta64_dtype(left) is_datetime_lhs = (is_datetime64_dtype(left) or is_datetime64tz_dtype(left)) if not (is_datetime_lhs or is_timedelta_lhs): return None return cls(left, right, name, na_op)
def _convert_to_array(self, values, name=None): """converts values to ndarray""" from pandas.tseries.timedeltas import _possibly_cast_to_timedelta coerce = 'compat' if pd._np_version_under1p7 else True if not is_list_like(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in ('datetime64', 'datetime', 'date', 'time'): # a datetlike if not (isinstance(values, (pa.Array, pd.Series)) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif isinstance(values, pd.DatetimeIndex): values = values.to_series() elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = _possibly_cast_to_timedelta(values, coerce=coerce) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__'): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif isinstance(values[0], pd.DateOffset): # handle DateOffsets os = pa.array([getattr(v, 'delta', None) for v in values]) mask = isnull(os) if mask.any(): raise TypeError( "cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format(','.join( [com.pprint_thing(v) for v in values[mask]]))) values = _possibly_cast_to_timedelta(os, coerce=coerce) else: raise TypeError( "incompatible type [{0}] for a datetime/timedelta operation". format(pa.array(values).dtype)) return values
def maybe_convert_for_time_op(cls, left, right, name): """ if ``left`` and ``right`` are appropriate for datetime arithmetic with operation ``name``, processes them and returns a ``_TimeOp`` object that stores all the required values. Otherwise, it will generate either a ``NotImplementedError`` or ``None``, indicating that the operation is unsupported for datetimes (e.g., an unsupported r_op) or that the data is not the right type for time ops. """ # decide if we can do it is_timedelta_lhs = com.is_timedelta64_dtype(left) is_datetime_lhs = com.is_datetime64_dtype(left) if not (is_datetime_lhs or is_timedelta_lhs): return None # rops are allowed. No need for special checks, just strip off # r part. if name.startswith('__r'): name = "__" + name[3:] return cls(left, right, name)
def to_datetime(arg, errors='ignore', dayfirst=False, box=True): """ Convert argument to datetime Parameters ---------- arg : string, datetime, array of strings (with possible NAs) errors : {'ignore', 'raise'}, default 'ignore' Errors are ignored by default (values left untouched) Returns ------- ret : datetime if parsing succeeded """ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = lib.array_to_datetime(com._ensure_object(arg.values), raise_=errors == 'raise', dayfirst=dayfirst) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') result = lib.array_to_datetime(com._ensure_object(arg), raise_=errors == 'raise', dayfirst=dayfirst) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result) return result try: if not arg: return arg return _dtparser.parse(arg, dayfirst=dayfirst) except Exception: if errors == 'raise': raise return arg
def _wrap_results(result, dtype): """ wrap our results if needed """ if is_datetime64_dtype(dtype): if not isinstance(result, np.ndarray): result = lib.Timestamp(result) else: result = result.view(dtype) elif is_timedelta64_dtype(dtype): if not isinstance(result, np.ndarray): # raise if we have a timedelta64[ns] which is too large if np.fabs(result) > _int64_max: raise ValueError("overflow in timedelta operation") result = lib.Timedelta(result, unit='ns') else: result = result.astype('i8').view(dtype) return result
def _sqlalchemy_type(self, arr_or_dtype): from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval if arr_or_dtype is date: return Date if com.is_datetime64_dtype(arr_or_dtype): try: tz = arr_or_dtype.tzinfo return DateTime(timezone=True) except: return DateTime if com.is_timedelta64_dtype(arr_or_dtype): return Interval elif com.is_float_dtype(arr_or_dtype): return Float elif com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. return Integer elif com.is_bool(arr_or_dtype): return Boolean return Text
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : sequence sort : order : Returns ------- """ values = np.asarray(values) is_datetime = com.is_datetime64_dtype(values) (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables) table = hash_klass(len(values)) uniques = vec_klass() labels = table.get_labels(values, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: sorter = uniques.argsort() reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.view('M8[ns]') return labels, uniques
def pad_1d(values, limit=None, mask=None, dtype=None): if dtype is None: dtype = values.dtype _method = None if com.is_float_dtype(values): _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values): _method = _pad_1d_datetime elif com.is_integer_dtype(values): values = com._ensure_float64(values) _method = algos.pad_inplace_float64 elif values.dtype == np.object_: _method = algos.pad_inplace_object if _method is None: raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name) if mask is None: mask = com.isnull(values) mask = mask.view(np.uint8) _method(values, mask, limit=limit) return values
def describe_1d(data): count = data.count() leng = len(data) distinct_count = data.nunique(dropna=False) if count > distinct_count > 1: mode = data.mode().iloc[0] else: mode = data[0] results_data = { 'count': count, 'distinct_count': distinct_count, 'p_missing': 1 - count / leng, 'n_missing': leng - count, 'is_unique': distinct_count == leng, 'mode': mode, 'p_unique': distinct_count / count } try: # pandas 0.17 onwards results_data['memorysize'] = data.memory_usage() except: results_data['memorysize'] = 0 result = pd.Series(results_data, name=data.name) if distinct_count <= 1: result = result.append(describe_constant_1d(data)) elif com.is_numeric_dtype(data): result = result.append(describe_numeric_1d(data, result)) elif com.is_datetime64_dtype(data): result = result.append(describe_date_1d(data, result)) elif distinct_count == leng: result = result.append(describe_unique_1d(data)) else: result = result.append(describe_categorical_1d(data)) return result
def _sqlalchemy_type(self, arr_or_dtype): from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval if arr_or_dtype is date: return Date if com.is_datetime64_dtype(arr_or_dtype): try: tz = arr_or_dtype.tzinfo return DateTime(timezone=True) except: return DateTime if com.is_timedelta64_dtype(arr_or_dtype): warnings.warn("the 'timedelta' type is not supported, and will be " "written as integer values (ns frequency) to the " "database.", UserWarning) return Integer elif com.is_float_dtype(arr_or_dtype): return Float elif com.is_integer_dtype(arr_or_dtype): # TODO: Refine integer size. return Integer elif com.is_bool(arr_or_dtype): return Boolean return Text
def _get_data_algo(values, func_map): mask = None if com.is_float_dtype(values): f = func_map['float64'] values = com._ensure_float64(values) elif com.is_datetime64_dtype(values): # if we have NaT, punt to object dtype mask = com.isnull(values) if mask.ravel().any(): f = func_map['generic'] values = com._ensure_object(values) values[mask] = np.nan else: f = func_map['int64'] values = values.view('i8') elif com.is_integer_dtype(values): f = func_map['int64'] values = com._ensure_int64(values) else: f = func_map['generic'] values = com._ensure_object(values) return f, values
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True, format=None): """ Convert argument to datetime Parameters ---------- arg : string, datetime, array of strings (with possible NAs) errors : {'ignore', 'raise'}, default 'ignore' Errors are ignored by default (values left untouched) dayfirst : boolean, default False If True parses dates with the day first, eg 20/01/2005 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well) box : boolean, default True If True returns a DatetimeIndex, if False returns ndarray of values format : string, default None strftime to parse time, eg "%d/%m/%Y" Returns ------- ret : datetime if parsing succeeded """ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex def _convert_f(arg): arg = com._ensure_object(arg) try: if format is not None: result = tslib.array_strptime(arg, format) else: result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = arg.values if not com.is_datetime64_dtype(values): values = _convert_f(values) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e return arg try: return _convert_f(arg) except ValueError: raise return arg try: if not arg: return arg default = datetime(1, 1, 1) return parse(arg, dayfirst=dayfirst, default=default) except Exception: if errors == 'raise': raise return arg
def _convert_listlike(arg, box, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError: pass return arg arg = com._ensure_object(arg) if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = ('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) if format_is_iso8601: format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, coerce=coerce) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, coerce=coerce) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg except ValueError: # Only raise this error if the user provided the # datetime format, and not when it was inferred if not infer_datetime_format: raise if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime(arg, raise_=errors == 'raise', utc=utc, dayfirst=dayfirst, coerce=coerce, unit=unit) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e
def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif format is None and com.is_integer_dtype(arg) and unit == 'ns': result = arg.astype('datetime64[ns]') if box: return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result arg = com._ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = (('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and format != '%Y') if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, unit=unit, require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e
def _convert_to_array(self, values, name=None, other=None): """converts values to ndarray""" from pandas.tseries.timedeltas import _possibly_cast_to_timedelta coerce = True if not is_list_like(values): values = np.array([values]) inferred_type = lib.infer_dtype(values) if inferred_type in ('datetime64', 'datetime', 'date', 'time'): # if we have a other of timedelta, but use pd.NaT here we # we are in the wrong path if (other is not None and other.dtype == 'timedelta64[ns]' and all(isnull(v) for v in values)): values = np.empty(values.shape, dtype=other.dtype) values[:] = tslib.iNaT # a datelike elif isinstance(values, pd.DatetimeIndex): values = values.to_series() elif not (isinstance(values, (pa.Array, pd.Series)) and com.is_datetime64_dtype(values)): values = tslib.array_to_datetime(values) elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here values = _possibly_cast_to_timedelta(values, coerce=coerce, dtype='timedelta64[ns]') elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': values = values.astype('timedelta64[ns]') elif isinstance(values, pd.PeriodIndex): values = values.to_timestamp().to_series() elif name not in ('__truediv__', '__div__', '__mul__'): raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name)) elif isinstance(values[0], pd.DateOffset): # handle DateOffsets os = pa.array([getattr(v, 'delta', None) for v in values]) mask = isnull(os) if mask.any(): raise TypeError( "cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format(', '.join( [com.pprint_thing(v) for v in values[mask]]))) values = _possibly_cast_to_timedelta(os, coerce=coerce) elif inferred_type == 'floating': # all nan, so ok, use the other dtype (e.g. timedelta or datetime) if isnull(values).all(): values = np.empty(values.shape, dtype=other.dtype) values[:] = tslib.iNaT else: raise TypeError( 'incompatible type [{0}] for a datetime/timedelta ' 'operation'.format(pa.array(values).dtype)) else: raise TypeError("incompatible type [{0}] for a datetime/timedelta" " operation".format(pa.array(values).dtype)) return values
result = DatetimeIndex(result, tz='utc' if utc else None) return result except ValueError, e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = arg.values if not com.is_datetime64_dtype(values): values = _convert_f(values) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError, e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError):
return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = _convert_f(arg.values) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError, e: try: values, tz = lib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e return arg try: return _convert_f(arg) except ValueError: raise
def test_compat(self): self.assertFalse(is_datetime64_ns_dtype(self.dtype)) self.assertFalse(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]')) self.assertFalse(is_datetime64_dtype(self.dtype)) self.assertFalse(is_datetime64_dtype('datetime64[ns, US/Eastern]'))
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : deprecated na_sentinel : int, default -1 Value to mark "not found" size_hint : hint to the hashtable sizer Returns ------- labels : the indexer to the original array uniques : ndarray (1-d) or Index the unique values. Index is returned when passed values is Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ if order is not None: msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926" warn(msg, FutureWarning, stacklevel=2) from pandas.core.index import Index from pandas.core.series import Series vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) is_timedelta = com.is_timedelta64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(size_hint or len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort( np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [ lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types) ] ]) sorter = com._ensure_platform_int( t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') elif is_timedelta: uniques = uniques.astype('m8[ns]') if isinstance(values, Index): uniques = values._shallow_copy(uniques, name=None) elif isinstance(values, Series): uniques = Index(uniques) return labels, uniques
def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif com.is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = com._ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e
def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable Parameters ---------- values : ndarray (1-d) Sequence sort : boolean, default False Sort by values order : na_sentinel: int, default -1 Value to mark "not found" Returns ------- labels : the indexer to the original array uniques : the unique values note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ from pandas.tseries.period import PeriodIndex vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) table = hash_klass(len(vals)) uniques = vec_klass() labels = table.get_labels(vals, uniques, 0, na_sentinel) labels = com._ensure_platform_int(labels) uniques = uniques.to_array() if sort and len(uniques) > 0: try: sorter = uniques.argsort() except: # unorderable in py3 if mixed str/int t = hash_klass(len(uniques)) t.map_locations(com._ensure_object(uniques)) # order ints before strings ordered = np.concatenate([ np.sort( np.array([e for i, e in enumerate(uniques) if f(e)], dtype=object)) for f in [ lambda x: not isinstance(x, string_types), lambda x: isinstance(x, string_types) ] ]) sorter = com._ensure_platform_int( t.lookup(com._ensure_object(ordered))) reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) mask = labels < 0 labels = reverse_indexer.take(labels) np.putmask(labels, mask, -1) uniques = uniques.take(sorter) if is_datetime: uniques = uniques.astype('M8[ns]') if isinstance(values, PeriodIndex): uniques = PeriodIndex(ordinal=uniques, freq=values.freq) return labels, uniques