Example #1
    def test_transform_casting(self):
        # 13046
        data = """
        idx     A         ID3              DATETIME
        0   B-028  b76cd912ff "2014-10-08 13:43:27"
        1   B-054  4a57ed0b02 "2014-10-08 14:26:19"
        2   B-076  1a682034f8 "2014-10-08 14:29:01"
        3   B-023  b76cd912ff "2014-10-08 18:39:34"
        4   B-023  f88g8d7sds "2014-10-08 18:40:18"
        5   B-033  b76cd912ff "2014-10-08 18:44:30"
        6   B-032  b76cd912ff "2014-10-08 18:46:00"
        7   B-037  b76cd912ff "2014-10-08 18:52:15"
        8   B-046  db959faf02 "2014-10-08 18:59:59"
        9   B-053  b76cd912ff "2014-10-08 19:17:48"
        10  B-065  b76cd912ff "2014-10-08 19:21:38"
        """
        df = pd.read_csv(StringIO(data), sep='\s+',
                         index_col=[0], parse_dates=['DATETIME'])

        result = df.groupby('ID3')['DATETIME'].transform(lambda x: x.diff())
        assert is_timedelta64_dtype(result.dtype)

        result = df[['ID3', 'DATETIME']].groupby('ID3').transform(
            lambda x: x.diff())
        assert is_timedelta64_dtype(result.DATETIME.dtype)
Example #2
    def __init__(self, left, right, name, na_op):
        super(_TimeOp, self).__init__(left, right, name, na_op)

        lvalues = self._convert_to_array(left, name=name)
        rvalues = self._convert_to_array(right, name=name, other=lvalues)

        # left
        self.is_offset_lhs = self._is_offset(left)
        self.is_timedelta_lhs = is_timedelta64_dtype(lvalues)
        self.is_datetime64_lhs = is_datetime64_dtype(lvalues)
        self.is_datetime64tz_lhs = is_datetime64tz_dtype(lvalues)
        self.is_datetime_lhs = (self.is_datetime64_lhs or
                                self.is_datetime64tz_lhs)
        self.is_integer_lhs = left.dtype.kind in ['i', 'u']
        self.is_floating_lhs = left.dtype.kind == 'f'

        # right
        self.is_offset_rhs = self._is_offset(right)
        self.is_datetime64_rhs = is_datetime64_dtype(rvalues)
        self.is_datetime64tz_rhs = is_datetime64tz_dtype(rvalues)
        self.is_datetime_rhs = (self.is_datetime64_rhs or
                                self.is_datetime64tz_rhs)
        self.is_timedelta_rhs = is_timedelta64_dtype(rvalues)
        self.is_integer_rhs = rvalues.dtype.kind in ('i', 'u')
        self.is_floating_rhs = rvalues.dtype.kind == 'f'

        self._validate(lvalues, rvalues, name)
        self.lvalues, self.rvalues = self._convert_for_datetime(lvalues,
                                                                rvalues)
Example #3
    def test_is_timedelta(self):
        self.assertTrue(is_timedelta64_dtype('timedelta64'))
        self.assertTrue(is_timedelta64_dtype('timedelta64[ns]'))
        self.assertFalse(is_timedelta64_ns_dtype('timedelta64'))
        self.assertTrue(is_timedelta64_ns_dtype('timedelta64[ns]'))

        tdi = TimedeltaIndex([1e14, 2e14], dtype='timedelta64')
        self.assertTrue(is_timedelta64_dtype(tdi))
        self.assertTrue(is_timedelta64_ns_dtype(tdi))
        self.assertTrue(is_timedelta64_ns_dtype(tdi.astype('timedelta64[ns]')))

        # Conversion to Int64Index:
        self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64')))
        self.assertFalse(is_timedelta64_ns_dtype(tdi.astype('timedelta64[h]')))
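
A minimal usage sketch of the two dtype checks exercised above, assuming the
public import path pandas.api.types (available in recent pandas releases):

    import pandas as pd
    from pandas.api.types import is_timedelta64_dtype, is_timedelta64_ns_dtype

    s = pd.Series(pd.to_timedelta(['1 day', '2 days']))

    is_timedelta64_dtype(s)                    # True: any timedelta64 resolution
    is_timedelta64_dtype('timedelta64[h]')     # True
    is_timedelta64_ns_dtype(s)                 # True: Series stores timedelta64[ns]
    is_timedelta64_ns_dtype('timedelta64[h]')  # False: nanosecond resolution only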
Example #4
    def get_op(cls, left, right, name, na_op):
        """
        Get op dispatcher, returns _Op or _TimeOp.

        If ``left`` and ``right`` are appropriate for datetime arithmetic with
        operation ``name``, processes them and returns a ``_TimeOp`` object
        that stores all the required values.  Otherwise, it will generate
        a ``_Op``, indicating that the operation is performed via the
        normal numpy path.
        """
        is_timedelta_lhs = is_timedelta64_dtype(left)
        is_datetime_lhs = (is_datetime64_dtype(left) or
                           is_datetime64tz_dtype(left))

        if isinstance(left, ABCSeries) and isinstance(right, ABCSeries):
            # avoid repeated alignment
            if not left.index.equals(right.index):
                left, right = left.align(right, copy=False)

                index, lidx, ridx = left.index.join(right.index, how='outer',
                                                    return_indexers=True)
                # if DatetimeIndex have different tz, convert to UTC
                left.index = index
                right.index = index

        if not (is_datetime_lhs or is_timedelta_lhs):
            return _Op(left, right, name, na_op)
        else:
            return _TimeOp(left, right, name, na_op)
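
For reference, a small sketch of what this dispatch enables at the Series level:
subtracting two datetime64 Series goes down the _TimeOp path and yields a
timedelta64 result (hypothetical data, public API only):

    import pandas as pd
    from pandas.api.types import is_timedelta64_dtype

    left = pd.Series(pd.to_datetime(['2014-10-08 13:00', '2014-10-08 14:00']))
    right = pd.Series(pd.to_datetime(['2014-10-08 12:00', '2014-10-08 12:30']))

    diff = left - right          # datetime64 - datetime64
    is_timedelta64_dtype(diff)   # True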
Example #5
 def _maybe_convert_timedelta(self, other):
     if isinstance(other, (timedelta, np.timedelta64, offsets.Tick)):
         offset = frequencies.to_offset(self.freq.rule_code)
         if isinstance(offset, offsets.Tick):
             nanos = tslib._delta_to_nanoseconds(other)
             offset_nanos = tslib._delta_to_nanoseconds(offset)
             if nanos % offset_nanos == 0:
                 return nanos // offset_nanos
     elif isinstance(other, offsets.DateOffset):
         freqstr = other.rule_code
         base = frequencies.get_base_alias(freqstr)
         if base == self.freq.rule_code:
             return other.n
         msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr)
         raise IncompatibleFrequency(msg)
     elif isinstance(other, np.ndarray):
         if is_integer_dtype(other):
             return other
         elif is_timedelta64_dtype(other):
             offset = frequencies.to_offset(self.freq)
             if isinstance(offset, offsets.Tick):
                 nanos = tslib._delta_to_nanoseconds(other)
                 offset_nanos = tslib._delta_to_nanoseconds(offset)
                 if (nanos % offset_nanos).all() == 0:
                     return nanos // offset_nanos
     elif is_integer(other):
         # integer is passed to .shift via
         # _add_datetimelike_methods basically
         # but ufunc may pass integer to _add_delta
         return other
     # raise when input doesn't have freq
     msg = "Input has different freq from PeriodIndex(freq={0})"
     raise IncompatibleFrequency(msg.format(self.freqstr))
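
Roughly, this helper is what lets a PeriodIndex be shifted by a timedelta that is
an exact multiple of its frequency, while an incompatible delta raises
IncompatibleFrequency. A hedged sketch of the user-facing behaviour:

    import pandas as pd

    pi = pd.period_range('2014-10-08', periods=3, freq='D')

    pi + pd.Timedelta('2 days')      # fine: a whole multiple of the 'D' frequency
    # pi + pd.Timedelta('36 hours')  # raises IncompatibleFrequency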
Example #6
def _format_label(x, precision=3, dtype=None):
    fmt_str = "%%.%dg" % precision

    if is_datetime64_dtype(dtype):
        return to_datetime(x, unit="ns")
    if is_timedelta64_dtype(dtype):
        return to_timedelta(x, unit="ns")
    if np.isinf(x):
        return str(x)
    elif is_float(x):
        frac, whole = np.modf(x)
        sgn = "-" if x < 0 else ""
        whole = abs(whole)
        if frac != 0.0:
            val = fmt_str % frac

            # rounded up or down
            if "." not in val:
                if x < 0:
                    return "%d" % (-whole - 1)
                else:
                    return "%d" % (whole + 1)

            if "e" in val:
                return _trim_zeros(fmt_str % x)
            else:
                val = _trim_zeros(val)
                if "." in val:
                    return sgn + ".".join(("%d" % whole, val.split(".")[1]))
                else:  # pragma: no cover
                    return sgn + ".".join(("%d" % whole, val))
        else:
            return sgn + "%0.f" % whole
    else:
        return str(x)
Example #7
def _format_labels(bins,
                   precision,
                   right=True,
                   include_lowest=False,
                   dtype=None):
    """ based on the dtype, return our labels """

    closed = 'right' if right else 'left'

    if is_datetime64_dtype(dtype):
        formatter = Timestamp
        adjust = lambda x: x - Timedelta('1ns')
    elif is_timedelta64_dtype(dtype):
        formatter = Timedelta
        adjust = lambda x: x - Timedelta('1ns')
    else:
        precision = _infer_precision(precision, bins)
        formatter = lambda x: _round_frac(x, precision)
        adjust = lambda x: x - 10**(-precision)

    breaks = [formatter(b) for b in bins]
    labels = IntervalIndex.from_breaks(breaks, closed=closed)

    if right and include_lowest:
        # we will adjust the left hand side by precision to
        # account that we are all right closed
        v = adjust(labels[0].left)

        i = IntervalIndex.from_intervals(
            [Interval(v, labels[0].right, closed='right')])
        labels = i.append(labels[1:])

    return labels
Example #8
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
    """Convert a list of objects to a timedelta index object."""

    if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
        arg = np.array(list(arg), dtype='O')

    # these are shortcut-able
    if is_timedelta64_dtype(arg):
        value = arg.astype('timedelta64[ns]')
    elif is_integer_dtype(arg):
        value = arg.astype('timedelta64[{0}]'.format(
            unit)).astype('timedelta64[ns]', copy=False)
    else:
        try:
            value = tslib.array_to_timedelta64(_ensure_object(arg),
                                               unit=unit, errors=errors)
            value = value.astype('timedelta64[ns]', copy=False)
        except ValueError:
            if errors == 'ignore':
                return arg
            else:
                # This else-block accounts for the cases when errors='raise'
                # and errors='coerce'. If errors == 'raise', these errors
                # should be raised. If errors == 'coerce', we shouldn't
                # expect any errors to be raised, since all parsing errors
                # cause coercion to pd.NaT. However, if an error / bug is
                # introduced that causes an Exception to be raised, we would
                # like to surface it.
                raise

    if box:
        from pandas import TimedeltaIndex
        value = TimedeltaIndex(value, unit='ns', name=name)
    return value
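
The branches above map onto the inputs pd.to_timedelta accepts; a minimal sketch
of the common cases:

    import numpy as np
    import pandas as pd

    # already timedelta64: just cast to nanosecond resolution
    pd.to_timedelta(np.array([1, 2], dtype='timedelta64[s]'))

    # integer dtype: interpreted in the given unit, then cast to timedelta64[ns]
    pd.to_timedelta([1, 2], unit='s')

    # everything else goes through the object/string parser
    pd.to_timedelta(['1 day', '00:00:30', pd.NaT])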
Example #9
def _convert_listlike(arg, unit='ns', box=True, errors='raise', name=None):
    """Convert a list of objects to a timedelta index object."""

    if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
        arg = np.array(list(arg), dtype='O')

    # these are shortcut-able
    if is_timedelta64_dtype(arg):
        value = arg.astype('timedelta64[ns]')
    elif is_integer_dtype(arg):
        value = arg.astype('timedelta64[{0}]'.format(unit)).astype(
            'timedelta64[ns]', copy=False)
    else:
        try:
            value = tslib.array_to_timedelta64(_ensure_object(arg),
                                               unit=unit,
                                               errors=errors)
            value = value.astype('timedelta64[ns]', copy=False)
        except ValueError:
            if errors == 'ignore':
                return arg
            else:
                # This else-block accounts for the cases when errors='raise'
                # and errors='coerce'. If errors == 'raise', these errors
                # should be raised. If errors == 'coerce', we shouldn't
                # expect any errors to be raised, since all parsing errors
                # cause coercion to pd.NaT. However, if an error / bug is
                # introduced that causes an Exception to be raised, we would
                # like to surface it.
                raise

    if box:
        from pandas import TimedeltaIndex
        value = TimedeltaIndex(value, unit='ns', name=name)
    return value
Example #10
def _convert_bin_to_numeric_type(bins, dtype):
    """
    if the passed bin is of datetime/timedelta type,
    this method converts it to integer

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    bins_dtype = infer_dtype(bins)
    if is_timedelta64_dtype(dtype):
        if bins_dtype in ['timedelta', 'timedelta64']:
            bins = to_timedelta(bins).view(np.int64)
        else:
            raise ValueError("bins must be of timedelta64 dtype")
    elif is_datetime64_dtype(dtype):
        if bins_dtype in ['datetime', 'datetime64']:
            bins = to_datetime(bins).view(np.int64)
        else:
            raise ValueError("bins must be of datetime64 dtype")

    return bins
Example #11
def _hashtable_algo(f, values, return_dtype=None):
    """
    f(HashTable, type_caster) -> result
    """

    dtype = values.dtype
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    elif is_signed_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    elif is_unsigned_integer_dtype(dtype):
        return f(htable.UInt64HashTable, _ensure_uint64)
    elif is_datetime64_dtype(dtype):
        return_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    elif is_timedelta64_dtype(dtype):
        return_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

    # it's cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) in ['string']:
        return f(htable.StringHashTable, _ensure_object)

    # use Object
    return f(htable.PyObjectHashTable, _ensure_object)
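
At the user level this is what hashtable-backed operations such as pd.unique do
with timedelta64 data: hash as int64, then view the result back as m8[ns]. A
quick sketch:

    import pandas as pd

    tds = pd.to_timedelta(['1 day', '1 day', '2 days']).values  # timedelta64[ns] ndarray
    pd.unique(tds)  # unique values, still timedelta64[ns] dtype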
Example #12
File: period.py  Project: DGrady/pandas
 def _maybe_convert_timedelta(self, other):
     if isinstance(other, (timedelta, np.timedelta64,
                           offsets.Tick, Timedelta)):
         offset = frequencies.to_offset(self.freq.rule_code)
         if isinstance(offset, offsets.Tick):
             nanos = tslib._delta_to_nanoseconds(other)
             offset_nanos = tslib._delta_to_nanoseconds(offset)
             if nanos % offset_nanos == 0:
                 return nanos // offset_nanos
     elif isinstance(other, offsets.DateOffset):
         freqstr = other.rule_code
         base = frequencies.get_base_alias(freqstr)
         if base == self.freq.rule_code:
             return other.n
         msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr)
         raise IncompatibleFrequency(msg)
     elif isinstance(other, np.ndarray):
         if is_integer_dtype(other):
             return other
         elif is_timedelta64_dtype(other):
             offset = frequencies.to_offset(self.freq)
             if isinstance(offset, offsets.Tick):
                 nanos = tslib._delta_to_nanoseconds(other)
                 offset_nanos = tslib._delta_to_nanoseconds(offset)
                 if (nanos % offset_nanos).all() == 0:
                     return nanos // offset_nanos
     # raise when input doesn't have freq
     msg = "Input has different freq from PeriodIndex(freq={0})"
     raise IncompatibleFrequency(msg.format(self.freqstr))
Example #13
def _format_label(x, precision=3, dtype=None):
    fmt_str = '%%.%dg' % precision

    if is_datetime64_dtype(dtype):
        return to_datetime(x, unit='ns')
    if is_timedelta64_dtype(dtype):
        return to_timedelta(x, unit='ns')
    if np.isinf(x):
        return str(x)
    elif is_float(x):
        frac, whole = np.modf(x)
        sgn = '-' if x < 0 else ''
        whole = abs(whole)
        if frac != 0.0:
            val = fmt_str % frac

            # rounded up or down
            if '.' not in val:
                if x < 0:
                    return '%d' % (-whole - 1)
                else:
                    return '%d' % (whole + 1)

            if 'e' in val:
                return _trim_zeros(fmt_str % x)
            else:
                val = _trim_zeros(val)
                if '.' in val:
                    return sgn + '.'.join(('%d' % whole, val.split('.')[1]))
                else:  # pragma: no cover
                    return sgn + '.'.join(('%d' % whole, val))
        else:
            return sgn + '%0.f' % whole
    else:
        return str(x)
Example #14
def _hashtable_algo(f, values, return_dtype=None):
    """
    f(HashTable, type_caster) -> result
    """

    dtype = values.dtype
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    elif is_signed_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    elif is_unsigned_integer_dtype(dtype):
        return f(htable.UInt64HashTable, _ensure_uint64)
    elif is_datetime64_dtype(dtype):
        return_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    elif is_timedelta64_dtype(dtype):
        return_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)

    # it's cheaper to use a String Hash Table than Object
    if lib.infer_dtype(values) in ['string']:
        return f(htable.StringHashTable, _ensure_object)

    # use Object
    return f(htable.PyObjectHashTable, _ensure_object)
Example #15
def nansum(values, axis=None, skipna=True):
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask)

    return _wrap_results(the_sum, dtype)
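
This is effectively why summing a timedelta Series works: the reduction runs in
float64 nanoseconds and the result is wrapped back into a Timedelta (see
_wrap_results below). A small sketch:

    import pandas as pd

    s = pd.Series(pd.to_timedelta(['1 day', '2 days', pd.NaT]))
    s.sum()  # Timedelta('3 days 00:00:00'); NaT is skipped by default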
Example #16
def nansum(values, axis=None, skipna=True):
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)
    dtype_sum = dtype_max
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    the_sum = values.sum(axis, dtype=dtype_sum)
    the_sum = _maybe_null_out(the_sum, axis, mask)

    return _wrap_results(the_sum, dtype)
Example #17
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer',
                                                ambiguous='infer'),
                                  index, data.name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'), index,
                                   name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index,
                                    name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'), index,
                                      name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
Example #18
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a "
                        "datetimelike index".format(type(data)))

    index = data.index
    name = data.name
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'),
                                  index, name=name, orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer',
                                                ambiguous='infer'),
                                  index, data.name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy,
                                                  freq='infer'), index,
                                   name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index,
                                    name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy,
                                                    freq='infer'), index,
                                      name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a "
                    "datetimelike index".format(type(data)))
Example #19
def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
      if passed a Series, the values of the series are used (NOT THE INDEX)
    warn : boolean, default True

    Returns
    -------
    freq : string or None
        None if no discernible frequency
        TypeError if the index is not datetime-like
        ValueError if there are less than three values.
    """
    import pandas as pd

    if isinstance(index, ABCSeries):
        values = index._values
        if not (is_datetime64_dtype(values) or
                is_timedelta64_dtype(values) or
                values.dtype == object):
            raise TypeError("cannot infer freq from a non-convertible "
                            "dtype on a Series of {0}".format(index.dtype))
        index = values

    if is_period_arraylike(index):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                        "instead of using infer_freq.")
    elif isinstance(index, pd.TimedeltaIndex):
        inferer = _TimedeltaFrequencyInferer(index, warn=warn)
        return inferer.get_freq()

    if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError("cannot infer freq from a non-convertible index "
                            "type {0}".format(type(index)))
        index = index.values

    if not isinstance(index, pd.DatetimeIndex):
        try:
            index = pd.DatetimeIndex(index)
        except AmbiguousTimeError:
            index = pd.DatetimeIndex(index.asi8)

    inferer = _FrequencyInferer(index, warn=warn)
    return inferer.get_freq()
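
A minimal usage sketch of infer_freq on the two index types it documents:

    import pandas as pd

    pd.infer_freq(pd.date_range('2014-10-08', periods=5, freq='H'))       # hourly
    pd.infer_freq(pd.timedelta_range(start='0s', periods=5, freq='30s'))  # every 30 seconds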
Example #20
def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
      if passed a Series, the values of the series are used (NOT THE INDEX)
    warn : boolean, default True

    Returns
    -------
    freq : string or None
        None if no discernible frequency
        TypeError if the index is not datetime-like
        ValueError if there are less than three values.
    """
    import pandas as pd

    if isinstance(index, ABCSeries):
        values = index._values
        if not (is_datetime64_dtype(values) or is_timedelta64_dtype(values)
                or values.dtype == object):
            raise TypeError("cannot infer freq from a non-convertible "
                            "dtype on a Series of {0}".format(index.dtype))
        index = values

    if is_period_arraylike(index):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                        "instead of using infer_freq.")
    elif isinstance(index, pd.TimedeltaIndex):
        inferer = _TimedeltaFrequencyInferer(index, warn=warn)
        return inferer.get_freq()

    if isinstance(index, pd.Index) and not isinstance(index, pd.DatetimeIndex):
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError("cannot infer freq from a non-convertible index "
                            "type {0}".format(type(index)))
        index = index.values

    if not isinstance(index, pd.DatetimeIndex):
        try:
            index = pd.DatetimeIndex(index)
        except AmbiguousTimeError:
            index = pd.DatetimeIndex(index.asi8)

    inferer = _FrequencyInferer(index, warn=warn)
    return inferer.get_freq()
Example #21
def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps: array-like
    values: array-like

    Returns
    -------
    boolean array same length as comps
    """

    if not is_list_like(comps):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(comps).__name__))
    comps = np.asarray(comps)
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(values).__name__))
    if not isinstance(values, np.ndarray):
        values = list(values)

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
    else:
        f = lambda x, y: lib.ismember_int64(x, set(y))

    # may need i8 conversion for proper membership testing
    if is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime
        values = to_datetime(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta
        values = to_timedelta(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_int64_dtype(comps):
        pass
    else:
        f = lambda x, y: lib.ismember(x, set(values))

    return f(comps, values)
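
The i8 conversion above is what makes membership tests on datetime/timedelta data
behave; a hedged sketch at the Series level:

    import pandas as pd

    s = pd.Series(pd.to_timedelta(['1 day', '2 days', '3 days']))
    s.isin([pd.Timedelta('1 day'), pd.Timedelta('3 days')])  # True, False, True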
Example #22
def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps: array-like
    values: array-like

    Returns
    -------
    boolean array same length as comps
    """

    if not is_list_like(comps):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(comps).__name__))
    comps = np.asarray(comps)
    if not is_list_like(values):
        raise TypeError("only list-like objects are allowed to be passed"
                        " to isin(), you passed a "
                        "[{0}]".format(type(values).__name__))
    if not isinstance(values, np.ndarray):
        values = list(values)

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
        f = lambda x, y: np.in1d(x, np.asarray(list(y)))
    else:
        f = lambda x, y: lib.ismember_int64(x, set(y))

    # may need i8 conversion for proper membership testing
    if is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime
        values = to_datetime(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta
        values = to_timedelta(values)._values.view('i8')
        comps = comps.view('i8')
    elif is_int64_dtype(comps):
        pass
    else:
        f = lambda x, y: lib.ismember(x, set(values))

    return f(comps, values)
Example #23
def _hashtable_algo(f, dtype, return_dtype=None):
    """
    f(HashTable, type_caster) -> result
    """
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    elif is_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    elif is_datetime64_dtype(dtype):
        return_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    elif is_timedelta64_dtype(dtype):
        return_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    else:
        return f(htable.PyObjectHashTable, _ensure_object)
Example #24
def _coerce_to_type(x):
    """
    if the passed data is of datetime/timedelta type,
    this method converts it to integer so that cut method can
    handle it
    """
    dtype = None

    if is_timedelta64_dtype(x):
        x = to_timedelta(x).view(np.int64)
        dtype = np.timedelta64
    elif is_datetime64_dtype(x):
        x = to_datetime(x).view(np.int64)
        dtype = np.datetime64

    return x, dtype
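
This coercion is what lets pd.cut bin datetime/timedelta data: the values are
viewed as int64 for binning and the bin edges are formatted back afterwards (see
_format_labels above). A sketch, assuming a pandas version with timedelta support
in cut:

    import pandas as pd

    td = pd.to_timedelta(['1 day', '2 days', '3 days', '4 days'])
    pd.cut(td, 2)  # two bins with Timedelta-valued interval edges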
Example #25
def _hashtable_algo(f, dtype, return_dtype=None):
    """
    f(HashTable, type_caster) -> result
    """
    if is_float_dtype(dtype):
        return f(htable.Float64HashTable, _ensure_float64)
    elif is_integer_dtype(dtype):
        return f(htable.Int64HashTable, _ensure_int64)
    elif is_datetime64_dtype(dtype):
        return_dtype = return_dtype or 'M8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    elif is_timedelta64_dtype(dtype):
        return_dtype = return_dtype or 'm8[ns]'
        return f(htable.Int64HashTable, _ensure_int64).view(return_dtype)
    else:
        return f(htable.PyObjectHashTable, _ensure_object)
Example #26
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          str
    categorical     any
    =============== =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x)
          or is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
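
These mappings are what the Table Schema writer emits per column; a hedged sketch
assuming the public pandas.io.json.build_table_schema entry point (pandas 0.20+):

    import pandas as pd
    from pandas.io.json import build_table_schema

    df = pd.DataFrame({
        'elapsed': pd.to_timedelta(['1 day', '2 days']),
        'when': pd.to_datetime(['2014-10-08', '2014-10-09']),
        'count': [1, 2],
    })
    build_table_schema(df, index=False)
    # fields: elapsed -> 'duration', when -> 'datetime', count -> 'integer'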
Example #27
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ==============  =================
    Pandas type     Table Schema type
    ==============  =================
    int64           integer
    float64         number
    bool            boolean
    datetime64[ns]  datetime
    timedelta64[ns] duration
    object          str
    categorical     any
    =============== =================
    """
    if is_integer_dtype(x):
        return 'integer'
    elif is_bool_dtype(x):
        return 'boolean'
    elif is_numeric_dtype(x):
        return 'number'
    elif (is_datetime64_dtype(x) or is_datetime64tz_dtype(x) or
          is_period_dtype(x)):
        return 'datetime'
    elif is_timedelta64_dtype(x):
        return 'duration'
    elif is_categorical_dtype(x):
        return 'any'
    elif is_string_dtype(x):
        return 'string'
    else:
        return 'any'
Example #28
    def astype(self, dtype, copy=True):
        dtype = np.dtype(dtype)

        if is_object_dtype(dtype):
            return self.asobject
        elif is_timedelta64_ns_dtype(dtype):
            if copy is True:
                return self.copy()
            return self
        elif is_timedelta64_dtype(dtype):
            # return an index (essentially this is division)
            result = self.values.astype(dtype, copy=copy)
            if self.hasnans:
                return Index(self._maybe_mask_results(result, convert="float64"), name=self.name)
            return Index(result.astype("i8"), name=self.name)
        elif is_integer_dtype(dtype):
            return Index(self.values.astype("i8", copy=copy), dtype="i8", name=self.name)
        raise ValueError("Cannot cast TimedeltaIndex to dtype %s" % dtype)
Example #29
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if self.zero_value is not None and values.size == 0:
                    if values.ndim == 1:

                        # wrap the 0's if needed
                        if is_timedelta64_dtype(values):
                            return lib.Timedelta(0)
                        return 0
                    else:
                        result_shape = (values.shape[:axis] +
                                        values.shape[axis + 1:])
                        result = np.empty(result_shape)
                        result.fill(0)
                        return result

                if (_USE_BOTTLENECK and skipna and
                        _bn_ok_dtype(values.dtype, bn_name)):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                try:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
                except ValueError as e:
                    # we want to transform an object array
                    # ValueError message to the more typical TypeError
                    # e.g. this is normally a disallowed function on
                    # object arrays that contain strings

                    if is_object_dtype(values):
                        raise TypeError(e)
                    raise

            return result
Example #30
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if self.zero_value is not None and values.size == 0:
                    if values.ndim == 1:

                        # wrap the 0's if needed
                        if is_timedelta64_dtype(values):
                            return lib.Timedelta(0)
                        return 0
                    else:
                        result_shape = (values.shape[:axis] +
                                        values.shape[axis + 1:])
                        result = np.empty(result_shape)
                        result.fill(0)
                        return result

                if (_USE_BOTTLENECK and skipna
                        and _bn_ok_dtype(values.dtype, bn_name)):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                try:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
                except ValueError as e:
                    # we want to transform an object array
                    # ValueError message to the more typical TypeError
                    # e.g. this is normally a disallowed function on
                    # object arrays that contain strings

                    if is_object_dtype(values):
                        raise TypeError(e)
                    raise

            return result
Example #31
File: hashing.py  Project: json87/dask
    def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        # For categoricals, we hash the categories, then remap the codes to the
        # hash values. (This check is above the complex check so that we don't
        # ask numpy if categorical is a subdtype of complex, as it will choke.
        if is_categorical_dtype(vals.dtype):
            return _hash_categorical(vals, encoding, hash_key)

        # we'll be working with everything as 64-bit values, so handle this
        # 128-bit value early
        if np.issubdtype(vals.dtype, np.complex128):
            return hash_array(vals.real) + 23 * hash_array(vals.imag)

        # First, turn whatever array this is into unsigned 64-bit ints, if we
        # can manage it.
        if is_bool_array(vals):
            vals = vals.astype('u8')
        elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)
               or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
            vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
        else:
            # With repeated values, it's MUCH faster to categorize object
            # dtypes, then hash and rename categories. We allow skipping the
            # categorization when the values are known/likely to be unique.
            if categorize:
                codes, categories = pd.factorize(vals, sort=False)
                cat = pd.Categorical(codes,
                                     pd.Index(categories),
                                     ordered=False,
                                     fastpath=True)
                return _hash_categorical(cat, encoding, hash_key)

            vals = hash_object_array(vals, hash_key, encoding)

        # Then, redistribute these 64-bit ints within the space of 64-bit ints
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals
Example #32
    def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        # For categoricals, we hash the categories, then remap the codes to the
        # hash values. (This check is above the complex check so that we don't
        # ask numpy if categorical is a subdtype of complex, as it will choke.
        if is_categorical_dtype(vals.dtype):
            return _hash_categorical(vals, encoding, hash_key)

        # we'll be working with everything as 64-bit values, so handle this
        # 128-bit value early
        if np.issubdtype(vals.dtype, np.complex128):
            return hash_array(vals.real) + 23 * hash_array(vals.imag)

        # First, turn whatever array this is into unsigned 64-bit ints, if we
        # can manage it.
        if is_bool_array(vals):
            vals = vals.astype('u8')
        elif ((is_datetime64_dtype(vals) or
               is_timedelta64_dtype(vals) or
               is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
            vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
        else:
            # With repeated values, it's MUCH faster to categorize object
            # dtypes, then hash and rename categories. We allow skipping the
            # categorization when the values are known/likely to be unique.
            if categorize:
                codes, categories = pd.factorize(vals, sort=False)
                cat = pd.Categorical(codes, pd.Index(categories),
                                     ordered=False, fastpath=True)
                return _hash_categorical(cat, encoding, hash_key)

            vals = hash_object_array(vals, hash_key, encoding)

        # Then, redistribute these 64-bit ints within the space of 64-bit ints
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals
Example #33
def _wrap_results(result, dtype):
    """ wrap our results if needed """

    if is_datetime64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            result = lib.Timestamp(result)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = lib.Timedelta(result, unit='ns')
        else:
            result = result.astype('i8').view(dtype)

    return result
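
The scalar branch is why reductions over datetime/timedelta data come back as
Timestamp / Timedelta objects rather than raw integers; a quick sketch:

    import pandas as pd

    pd.Series(pd.to_timedelta(['1 day', '2 days'])).max()          # Timedelta('2 days')
    pd.Series(pd.to_datetime(['2014-10-08', '2014-10-09'])).max()  # Timestamp('2014-10-09')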
Example #34
def _wrap_results(result, dtype):
    """ wrap our results if needed """

    if is_datetime64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            result = lib.Timestamp(result)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):

            # raise if we have a timedelta64[ns] which is too large
            if np.fabs(result) > _int64_max:
                raise ValueError("overflow in timedelta operation")

            result = lib.Timedelta(result, unit='ns')
        else:
            result = result.astype('i8').view(dtype)

    return result
Example #35
    def _convert_listlike(arg, box, unit, name=None):

        if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
            arg = np.array(list(arg), dtype='O')

        # these are shortcutable
        if is_timedelta64_dtype(arg):
            value = arg.astype('timedelta64[ns]')
        elif is_integer_dtype(arg):
            value = arg.astype('timedelta64[{0}]'.format(
                unit)).astype('timedelta64[ns]', copy=False)
        else:
            value = tslib.array_to_timedelta64(_ensure_object(arg),
                                               unit=unit, errors=errors)
            value = value.astype('timedelta64[ns]', copy=False)

        if box:
            from pandas import TimedeltaIndex
            value = TimedeltaIndex(value, unit='ns', name=name)
        return value
Example #36
    def _convert_listlike(arg, box, unit, name=None):

        if isinstance(arg, (list, tuple)) or not hasattr(arg, 'dtype'):
            arg = np.array(list(arg), dtype='O')

        # these are shortcutable
        if is_timedelta64_dtype(arg):
            value = arg.astype('timedelta64[ns]')
        elif is_integer_dtype(arg):
            value = arg.astype('timedelta64[{0}]'.format(unit)).astype(
                'timedelta64[ns]', copy=False)
        else:
            value = tslib.array_to_timedelta64(_ensure_object(arg),
                                               unit=unit,
                                               errors=errors)
            value = value.astype('timedelta64[ns]', copy=False)

        if box:
            from pandas import TimedeltaIndex
            value = TimedeltaIndex(value, unit='ns', name=name)
        return value
Example #37
File: tdi.py  Project: songyang0716/pandas
    def astype(self, dtype, copy=True):
        dtype = np.dtype(dtype)

        if is_object_dtype(dtype):
            return self.asobject
        elif is_timedelta64_ns_dtype(dtype):
            if copy is True:
                return self.copy()
            return self
        elif is_timedelta64_dtype(dtype):
            # return an index (essentially this is division)
            result = self.values.astype(dtype, copy=copy)
            if self.hasnans:
                return Index(self._maybe_mask_results(result,
                                                      convert='float64'),
                             name=self.name)
            return Index(result.astype('i8'), name=self.name)
        elif is_integer_dtype(dtype):
            return Index(self.values.astype('i8', copy=copy), dtype='i8',
                         name=self.name)
        raise ValueError('Cannot cast TimedeltaIndex to dtype %s' % dtype)
Example #38
def nanmean(values, axis=None, skipna=True):
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

    dtype_sum = dtype_max
    dtype_count = np.float64
    if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, 'ndim', False):
        the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)
Example #39
def nanmean(values, axis=None, skipna=True):
    values, mask, dtype, dtype_max = _get_values(values, skipna, 0)

    dtype_sum = dtype_max
    dtype_count = np.float64
    if is_integer_dtype(dtype) or is_timedelta64_dtype(dtype):
        dtype_sum = np.float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    count = _get_counts(mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is not None and getattr(the_sum, 'ndim', False):
        the_mean = the_sum / count
        ct_mask = count == 0
        if ct_mask.any():
            the_mean[ct_mask] = np.nan
    else:
        the_mean = the_sum / count if count > 0 else np.nan

    return _wrap_results(the_mean, dtype)
Example #40
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.asi8

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques,
                                    labels,
                                    na_sentinel=na_sentinel,
                                    assume_unique=True)

    if is_datetimetz_type:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
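
As the datetime/timedelta branches suggest, factorizing timedelta data hands back
the uniques with their m8[ns] dtype restored; a minimal sketch (the exact
container of the uniques varies with the input type and pandas version):

    import pandas as pd

    codes, uniques = pd.factorize(pd.to_timedelta(['1 day', '2 days', '1 day']))
    codes    # array([0, 1, 0])
    uniques  # timedelta64[ns] values: '1 days', '2 days'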
Example #41
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.tz_localize(None)

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(_ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for i, e in enumerate(uniques) if f(e)],
                                 dtype=object)) for f in
                [lambda x: not isinstance(x, string_types),
                 lambda x: isinstance(x, string_types)]])
            sorter = _ensure_platform_int(t.lookup(
                _ensure_object(ordered)))

        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetimetz_type:

        # reset tz
        uniques = DatetimeIndex(uniques.astype('M8[ns]')).tz_localize(
            values.tz)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
Example #42
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas import Index, Series, DatetimeIndex

    vals = np.asarray(values)

    # localize to UTC
    is_datetimetz_type = is_datetimetz(values)
    if is_datetimetz_type:
        values = DatetimeIndex(values)
        vals = values.asi8

    is_datetime = is_datetime64_dtype(vals)
    is_timedelta = is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel, True)

    labels = _ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        uniques, labels = safe_sort(uniques, labels, na_sentinel=na_sentinel,
                                    assume_unique=True)

    if is_datetimetz_type:
        # reset tz
        uniques = values._shallow_copy(uniques)
    elif is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
Example #43
    def _convert_to_array(self, values, name=None, other=None):
        """converts values to ndarray"""
        from pandas.tseries.timedeltas import to_timedelta

        ovalues = values
        supplied_dtype = None
        if not is_list_like(values):
            values = np.array([values])
        # if this is a Series that contains relevant dtype info, then use this
        # instead of the inferred type; this avoids coercing Series([NaT],
        # dtype='datetime64[ns]') to Series([NaT], dtype='timedelta64[ns]')
        elif (isinstance(values, pd.Series) and
              (is_timedelta64_dtype(values) or is_datetime64_dtype(values))):
            supplied_dtype = values.dtype
        inferred_type = supplied_dtype or lib.infer_dtype(values)
        if (inferred_type in ('datetime64', 'datetime', 'date', 'time') or
                is_datetimetz(inferred_type)):
            # if we have an `other` of timedelta but use pd.NaT here,
            # we are in the wrong path
            if (supplied_dtype is None and other is not None and
                (other.dtype in ('timedelta64[ns]', 'datetime64[ns]')) and
                    isnull(values).all()):
                values = np.empty(values.shape, dtype='timedelta64[ns]')
                values[:] = iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            # datetime with tz
            elif (isinstance(ovalues, datetime.datetime) and
                  hasattr(ovalues, 'tz')):
                values = pd.DatetimeIndex(values)
            # datetime array with tz
            elif is_datetimetz(values):
                if isinstance(values, ABCSeries):
                    values = values._values
            elif not (isinstance(values, (np.ndarray, ABCSeries)) and
                      is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = to_timedelta(values, errors='coerce', box=False)
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            elif name not in ('__truediv__', '__div__', '__mul__', '__rmul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif inferred_type == 'floating':
            if (isnull(values).all() and
                    name in ('__add__', '__radd__', '__sub__', '__rsub__')):
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            return values
        elif self._is_offset(values):
            return values
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(np.array(values).dtype))

        return values
Example #44
File: hashing.py  Project: kordek/pandas
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)
           or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes,
                              Index(categories),
                              ordered=False,
                              fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        vals = _hash.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
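
A usage sketch; note the import location has moved over time (pandas.tools.hashing
in 0.19.2, pandas.util in later releases), so the path below is an assumption for
a newer pandas:

    import pandas as pd
    from pandas.util import hash_array

    vals = pd.to_timedelta(['1 day', '2 days']).values  # timedelta64[ns] ndarray
    hashed = hash_array(vals)
    hashed.dtype  # dtype('uint64'): one deterministic hash per element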
Example #45
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif (is_datetime64_dtype(vals) or
          is_timedelta64_dtype(vals)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = _hash.hash_object_array(vals.astype(str).astype(object),
                                           hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals