示例#1
0
    def __init__(self, left, right, name, na_op):
        """
        Align both operands and record what kind of values each side holds.

        Parameters
        ----------
        left, right : operands of the operation
            When both are ``pd.Series`` they are aligned to each other first.
        name : operation name
            Forwarded to ``_convert_to_array`` and ``_validate``.
        na_op : stored unchanged on ``self.na_op``
        """
        if isinstance(left, pd.Series) and isinstance(right, pd.Series):
            # two Series have to share one index before their values line up
            left, right = left.align(right, copy=False)

        lhs = self._convert_to_array(left, name=name)
        rhs = self._convert_to_array(right, name=name, other=lhs)

        self.name, self.na_op = name, na_op

        # ---- left operand ----
        self.left = left
        self.is_offset_lhs = self._is_offset(left)
        self.is_timedelta_lhs = is_timedelta64_dtype(lhs)
        self.is_datetime64_lhs = is_datetime64_dtype(lhs)
        self.is_datetime64tz_lhs = is_datetime64tz_dtype(lhs)
        self.is_datetime_lhs = (self.is_datetime64_lhs or
                                self.is_datetime64tz_lhs)
        # the numeric kind is read from ``left`` itself, not the converted array
        lhs_kind = left.dtype.kind
        self.is_integer_lhs = lhs_kind in ('i', 'u')
        self.is_floating_lhs = lhs_kind == 'f'

        # ---- right operand ----
        self.right = right
        self.is_offset_rhs = self._is_offset(right)
        self.is_timedelta_rhs = is_timedelta64_dtype(rhs)
        self.is_datetime64_rhs = is_datetime64_dtype(rhs)
        self.is_datetime64tz_rhs = is_datetime64tz_dtype(rhs)
        self.is_datetime_rhs = (self.is_datetime64_rhs or
                                self.is_datetime64tz_rhs)
        rhs_kind = rhs.dtype.kind
        self.is_integer_rhs = rhs_kind in ('i', 'u')
        self.is_floating_rhs = rhs_kind == 'f'

        self._validate(lhs, rhs, name)
        converted = self._convert_for_datetime(lhs, rhs)
        self.lvalues, self.rvalues = converted
示例#2
0
文件: ops.py 项目: ankravch/pandas
    def __init__(self, left, right, name, na_op):
        """
        Initialise the base op, then classify both operands by dtype.

        Parameters
        ----------
        left, right : operands of the operation
        name : operation name
            Forwarded to ``_convert_to_array`` and ``_validate``.
        na_op : passed through to the parent initialiser
        """
        super(_TimeOp, self).__init__(left, right, name, na_op)

        left_arr = self._convert_to_array(left, name=name)
        right_arr = self._convert_to_array(right, name=name, other=left_arr)

        # flags describing the left-hand side
        self.is_offset_lhs = self._is_offset(left)
        self.is_timedelta_lhs = is_timedelta64_dtype(left_arr)
        self.is_datetime64_lhs = is_datetime64_dtype(left_arr)
        self.is_datetime64tz_lhs = is_datetime64tz_dtype(left_arr)
        self.is_datetime_lhs = (self.is_datetime64_lhs or
                                self.is_datetime64tz_lhs)
        # the numeric kind comes from ``left`` itself, not the converted array
        left_kind = left.dtype.kind
        self.is_integer_lhs = left_kind in ('i', 'u')
        self.is_floating_lhs = left_kind == 'f'

        # flags describing the right-hand side
        self.is_offset_rhs = self._is_offset(right)
        self.is_timedelta_rhs = is_timedelta64_dtype(right_arr)
        self.is_datetime64_rhs = is_datetime64_dtype(right_arr)
        self.is_datetime64tz_rhs = is_datetime64tz_dtype(right_arr)
        self.is_datetime_rhs = (self.is_datetime64_rhs or
                                self.is_datetime64tz_rhs)
        right_kind = right_arr.dtype.kind
        self.is_integer_rhs = right_kind in ('i', 'u')
        self.is_floating_rhs = right_kind == 'f'

        self._validate(left_arr, right_arr, name)
        converted = self._convert_for_datetime(left_arr, right_arr)
        self.lvalues, self.rvalues = converted
示例#3
0
文件: tools.py 项目: X1mengYu/pandas
    def _convert_listlike(arg, box):
        """
        Convert a list-like ``arg`` to datetime values.

        Parameters
        ----------
        arg : list, tuple or array-like
            Lists and tuples are first turned into an object ndarray.
        box : boolean
            When true, datetime64 results are wrapped in a DatetimeIndex.

        Notes
        -----
        ``utc``, ``format``, ``errors``, ``dayfirst``, ``coerce`` and ``unit``
        are not defined in this function; presumably they come from the
        enclosing scope -- TODO confirm.
        """

        if isinstance(arg, (list,tuple)):
            arg = np.array(arg, dtype='O')

        # already datetime64: at most box it, no parsing needed
        if com.is_datetime64_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError as e:
                    # fall back to the datetime -> datetime64 converter, which
                    # returns the values together with a tz
                    values, tz = tslib.datetime_to_datetime64(arg)
                    return DatetimeIndex._simple_new(values, None, tz=tz)

            return arg

        arg = com._ensure_object(arg)
        try:
            if format is not None:
                # an explicit format string was supplied
                result = tslib.array_strptime(arg, format)
            else:
                result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            # retry with the datetime -> datetime64 converter; if that fails
            # as well, surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#4
0
def backfill_2d(values, limit=None, mask=None, dtype=None):
    """
    Run the dtype-appropriate 2-d backfill routine on ``values``.

    Parameters
    ----------
    values : ndarray
    limit : forwarded to the fill routine
    mask : optional null mask; computed with ``com.isnull`` when omitted
    dtype : optional dtype used for routine lookup; defaults to
        ``values.dtype``

    Returns
    -------
    values, converted with ``com._ensure_float64`` when the input was of
    integer dtype

    Raises
    ------
    ValueError
        If no fill routine exists for the dtype.
    """
    dtype = values.dtype if dtype is None else dtype

    # choose the fill routine from the dtype of ``values``
    if com.is_float_dtype(values):
        filler = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None)
    elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values):
        filler = _backfill_2d_datetime
    elif com.is_integer_dtype(values):
        values = com._ensure_float64(values)
        filler = algos.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        filler = algos.backfill_2d_inplace_object
    else:
        filler = None

    if filler is None:
        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name)

    null_mask = com.isnull(values) if mask is None else mask
    null_mask = null_mask.view(np.uint8)

    # an array with a zero-length axis has nothing to fill
    if np.all(values.shape):
        filler(values, null_mask, limit=limit)
    return values
示例#5
0
文件: sql.py 项目: Jemash/pandas
    def _sqlalchemy_type(self, col):
        """
        Pick the SQLAlchemy column type used to store ``col``.

        Parameters
        ----------
        col : column data whose dtype decides the SQL type

        Returns
        -------
        A ``sqlalchemy.types`` type: the class itself in most cases, or a
        ``DateTime(timezone=True)`` instance for datetime64 data that exposes
        a ``tzinfo`` attribute.
        """
        from sqlalchemy.types import (BigInteger, Float, Text, Boolean,
            DateTime, Date, Time, Interval)

        if com.is_datetime64_dtype(col):
            # Only the presence of ``tzinfo`` matters, not its value.  Probing
            # with hasattr avoids the former bare ``except:``, which also
            # swallowed unrelated errors.
            if hasattr(col, 'tzinfo'):
                return DateTime(timezone=True)
            return DateTime
        if com.is_timedelta64_dtype(col):
            warnings.warn("the 'timedelta' type is not supported, and will be "
                          "written as integer values (ns frequency) to the "
                          "database.", UserWarning)
            return BigInteger
        elif com.is_float_dtype(col):
            return Float
        elif com.is_integer_dtype(col):
            # TODO: Refine integer size.
            return BigInteger
        elif com.is_bool_dtype(col):
            return Boolean

        # not a recognised numpy dtype: inspect the values themselves
        inferred = lib.infer_dtype(com._ensure_object(col))
        if inferred == 'date':
            return Date
        if inferred == 'time':
            return Time
        return Text
示例#6
0
def get_dtype_kinds(l):
    """
    Collect the distinct kinds of the arrays in ``l``.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """

    def _kind_of(arr):
        # first matching check wins; the order of the checks is significant
        dtype = arr.dtype
        if com.is_categorical_dtype(dtype):
            return "category"
        if com.is_sparse(arr):
            return "sparse"
        if com.is_datetimetz(arr):
            return "datetimetz"
        if com.is_datetime64_dtype(dtype):
            return "datetime"
        if com.is_timedelta64_dtype(dtype):
            return "timedelta"
        if com.is_object_dtype(dtype):
            return "object"
        if com.is_bool_dtype(dtype):
            return "bool"
        # anything else is reported by its numpy kind character
        return dtype.kind

    return set(_kind_of(arr) for arr in l)
示例#7
0
def get_dtype_kinds(l):
    """
    Collect the distinct kinds of the arrays in ``l``.

    Parameters
    ----------
    l : list of arrays

    Returns
    -------
    a set of kinds that exist in this list of arrays
    """
    kinds = set()
    for array in l:
        array_dtype = array.dtype

        # first matching check wins; the order of the checks is significant
        if com.is_categorical_dtype(array_dtype):
            kinds.add('category')
            continue
        if com.is_sparse(array):
            kinds.add('sparse')
            continue
        if com.is_datetimetz(array):
            kinds.add('datetimetz')
            continue
        if com.is_datetime64_dtype(array_dtype):
            kinds.add('datetime')
            continue
        if com.is_timedelta64_dtype(array_dtype):
            kinds.add('timedelta')
            continue
        if com.is_object_dtype(array_dtype):
            kinds.add('object')
            continue
        if com.is_bool_dtype(array_dtype):
            kinds.add('bool')
            continue

        # anything else is reported by its numpy kind character
        kinds.add(array_dtype.kind)
    return kinds
示例#8
0
文件: nanops.py 项目: 5i7788/pandas
def _wrap_results(result, dtype):
    """ wrap our results if needed """

    if is_datetime64_dtype(dtype):
        if not isinstance(result, np.ndarray):
            result = lib.Timestamp(result)
        else:
            result = result.view(dtype)
    elif is_timedelta64_dtype(dtype):
        if not isinstance(result, np.ndarray):

            # this is a scalar timedelta result!
            # we have series convert then take the element (scalar)
            # as series will do the right thing in py3 (and deal with numpy
            # 1.6.2 bug in that it results dtype of timedelta64[us]
            from pandas import Series

            # coerce float to results
            if is_float(result):
                result = int(result)
            result = Series([result], dtype='timedelta64[ns]')
        else:
            result = result.view(dtype)

    return result
示例#9
0
    def _sqlalchemy_type(self, arr_or_dtype):
        """
        Pick the SQLAlchemy column type for an array or dtype.

        Parameters
        ----------
        arr_or_dtype : array, dtype, or the ``date`` type itself

        Returns
        -------
        A ``sqlalchemy.types`` type: the class itself in most cases, or a
        ``DateTime(timezone=True)`` instance for datetime64 input that exposes
        a ``tzinfo`` attribute.  Anything unrecognised maps to ``Text``.
        """
        from sqlalchemy.types import (BigInteger, Float, Text, Boolean,
            DateTime, Date, Interval)

        if arr_or_dtype is date:
            return Date
        if com.is_datetime64_dtype(arr_or_dtype):
            # Only the presence of ``tzinfo`` matters, not its value.  Probing
            # with hasattr avoids the former bare ``except:``, which also
            # swallowed unrelated errors.
            if hasattr(arr_or_dtype, 'tzinfo'):
                return DateTime(timezone=True)
            return DateTime
        if com.is_timedelta64_dtype(arr_or_dtype):
            warnings.warn("the 'timedelta' type is not supported, and will be "
                          "written as integer values (ns frequency) to the "
                          "database.", UserWarning)
            return BigInteger
        elif com.is_float_dtype(arr_or_dtype):
            return Float
        elif com.is_integer_dtype(arr_or_dtype):
            # TODO: Refine integer size.
            return BigInteger
        elif com.is_bool_dtype(arr_or_dtype):
            return Boolean
        return Text
示例#10
0
    def describe_1d(data):
        """
        Build the summary statistics for a single Series.

        Parameters
        ----------
        data : Series

        Returns
        -------
        Series
            The generic statistics (count, distinct_count, p_missing,
            n_missing, is_unique, mode, p_unique, memorysize) plus whatever
            the type-specific ``describe_*_1d`` helper contributes.
        """
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # Take the first value by position.  ``data[0]`` is a label lookup
            # and raises KeyError when the index has no label 0.
            mode = data.iloc[0]

        results_data = {'count': count, 'distinct_count': distinct_count, 'p_missing': 1 - count / leng,
                        'n_missing': leng - count,
                        'is_unique': distinct_count == leng,
                        'mode': mode,
                        'p_unique': distinct_count / count}
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except AttributeError:
            # older pandas has no Series.memory_usage; report 0 instead
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        # append the statistics specific to the kind of data; first match wins
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
示例#11
0
文件: common.py 项目: BrenBarn/pandas
def maybe_to_datetimelike(data, copy=False):
    """
    Wrap a datetimelike Series in the matching properties class.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DatetimeProperties, TimedeltaProperties or PeriodProperties

    Raises
    ------
    TypeError
        If ``data`` is not a Series or is not datetimelike.
    """
    from pandas import Series

    def _not_convertible():
        return TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))

    if not isinstance(data, Series):
        raise _not_convertible()

    index, name, dtype = data.index, data.name, data.dtype

    def _as_datetime():
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name)

    # dtype-based detection first
    if is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        return _as_datetime()
    if is_timedelta64_dtype(dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index, name=name)

    # otherwise inspect the values themselves
    if is_period_arraylike(data):
        return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name)
    if is_datetime_arraylike(data):
        return _as_datetime()

    raise _not_convertible()
示例#12
0
def format_array(values, formatter, float_format=None, na_rep='NaN',
                 digits=None, space=None, justify='right'):
    """
    Format ``values`` with the formatter class that matches their dtype.

    ``space``, ``float_format`` and ``digits`` fall back to the
    ``print.column_space``, ``print.float_format`` and ``print.precision``
    options when they are None.

    Returns
    -------
    the result of the chosen formatter's ``get_result()``
    """
    dtype = values.dtype
    if com.is_float_dtype(dtype):
        formatter_cls = FloatArrayFormatter
    elif com.is_integer_dtype(dtype):
        formatter_cls = IntArrayFormatter
    elif com.is_datetime64_dtype(dtype):
        formatter_cls = Datetime64Formatter
    else:
        formatter_cls = GenericArrayFormatter

    # fill unspecified settings from the print options
    space = get_option("print.column_space") if space is None else space
    float_format = (get_option("print.float_format")
                    if float_format is None else float_format)
    digits = get_option("print.precision") if digits is None else digits

    return formatter_cls(values, digits, na_rep=na_rep,
                         float_format=float_format,
                         formatter=formatter, space=space,
                         justify=justify).get_result()
示例#13
0
def _plot_histogram(series, bins=10, figsize=(6, 4), facecolor='#337ab7'):
    """Plot a histogram of the data and return the plot object.

    Parameters
    ----------
    series: Series
        The data to plot
    bins: default 10
        Passed through as the ``bins`` argument of the histogram call.
    figsize: a tuple (width, height) in inches, default (6,4)
        The size of the figure.
    facecolor: str
        The color code.

    Returns
    -------
    The object returned by ``series.plot`` or, for datetime64 data, the
    subplot created here.
    """
    if not com.is_datetime64_dtype(series):
        # TODO when running on server, send this off to a different thread
        return series.plot(kind='hist', figsize=figsize,
                           facecolor=facecolor,
                           bins=bins)

    # TODO: These calls should be merged
    figure = plt.figure(figsize=figsize)
    axes = figure.add_subplot(111)
    axes.set_ylabel('Frequency')
    try:
        axes.hist(series.values, facecolor=facecolor, bins=bins)
    except TypeError:
        # matplotlib 1.4 can't plot dates so will show empty plot instead
        pass
    return axes
示例#14
0
文件: ops.py 项目: agijsberts/pandas
    def _convert_to_array(self, values, name=None, other=None):
        """
        Coerce one operand of a datetime/timedelta operation.

        Parameters
        ----------
        values : scalar or list-like operand; a scalar is wrapped in a
            one-element ndarray first
        name : operation name, used to decide whether integer operands are
            allowed and in error messages
        other : optional, the already converted other operand; its dtype is
            used when ``values`` holds only nulls

        Returns
        -------
        The converted values.  Depending on the branch this is an ndarray, a
        Series (``to_series``), or the result of ``to_timedelta``.

        Raises
        ------
        TypeError
            If the inferred type cannot take part in a datetime/timedelta
            operation.
        """
        from pandas.tseries.timedeltas import to_timedelta

        coerce = True
        if not is_list_like(values):
            values = np.array([values])
        inferred_type = lib.infer_dtype(values)

        if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
            # all-null values paired with a timedelta ``other`` (e.g. pd.NaT)
            # were inferred as datetimes; rebuild them with other's dtype,
            # filled with iNaT
            if (other is not None and other.dtype == 'timedelta64[ns]' and
                    all(isnull(v) for v in values)):
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            elif not (isinstance(values, (np.ndarray, pd.Series)) and
                      is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = to_timedelta(values, coerce=coerce)
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            # plain integers are only accepted for these operations
            elif name not in ('__truediv__', '__div__', '__mul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif isinstance(values[0], pd.DateOffset):
            # handle DateOffsets: only offsets exposing ``delta`` are usable
            os = np.array([getattr(v, 'delta', None) for v in values])
            mask = isnull(os)
            if mask.any():
                raise TypeError("cannot use a non-absolute DateOffset in "
                                "datetime/timedelta operations [{0}]".format(
                                    ', '.join([com.pprint_thing(v)
                                               for v in values[mask]])))
            values = to_timedelta(os, coerce=coerce)
        elif inferred_type == 'floating':

            # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
            # NOTE(review): ``other`` is not checked for None on this path
            if isnull(values).all():
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            else:
                raise TypeError(
                    'incompatible type [{0}] for a datetime/timedelta '
                    'operation'.format(np.array(values).dtype))
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(np.array(values).dtype))

        return values
示例#15
0
    def describe_1d(data):
        """
        Build the summary statistics for a single Series.

        Parameters
        ----------
        data : Series

        Returns
        -------
        Series
            The generic statistics listed in ``names`` followed by whatever
            the type-specific ``describe_*_1d`` helper contributes.
        """
        names = ['count', 'distinct_count', 'p_missing', 'n_missing', 'is_unique', 'mode', 'p_unique', 'memorysize']
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # Take the first value by position.  ``data[0]`` is a label lookup
            # and raises KeyError when the index has no label 0.
            mode = data.iloc[0]

        # values are listed in the same order as ``names``
        results_data = [count, distinct_count, 1 - count / leng, leng - count, distinct_count == leng, mode,
                        distinct_count / count, data.memory_usage()]
        result = pd.Series(results_data, index=names, name=data.name)

        # append the statistics specific to the kind of data; first match wins
        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
示例#16
0
文件: api.py 项目: AkiraKane/pandas
def na_value_for_dtype(dtype):
    """
    Return the missing-value placeholder that fits ``dtype``.

    Parameters
    ----------
    dtype : string / dtype

    Returns
    -------
    NaT for datetime64, datetime64tz and timedelta64 dtypes, 0 for integer
    dtypes, False for bool dtypes, np.nan otherwise
    """

    from pandas.core import common as com
    from pandas import NaT
    dtype = pandas_dtype(dtype)

    is_datelike = (com.is_datetime64_dtype(dtype) or
                   com.is_datetime64tz_dtype(dtype) or
                   com.is_timedelta64_dtype(dtype))
    if is_datelike:
        return NaT
    if com.is_float_dtype(dtype):
        return np.nan
    if com.is_integer_dtype(dtype):
        return 0
    if com.is_bool_dtype(dtype):
        return False

    # every remaining dtype uses nan as well
    return np.nan
示例#17
0
文件: ops.py 项目: ankravch/pandas
    def get_op(cls, left, right, name, na_op):
        """
        Build the op dispatcher for ``left`` and ``right``.

        Returns a ``_TimeOp`` when ``left`` has a datetime64, datetime64tz or
        timedelta64 dtype and a plain ``_Op`` otherwise.  Two Series operands
        with unequal indexes are aligned before the op object is built.
        """
        # decided on ``left`` as passed in, before any alignment
        time_like_lhs = (is_timedelta64_dtype(left) or
                         is_datetime64_dtype(left) or
                         is_datetime64tz_dtype(left))

        both_series = (isinstance(left, ABCSeries) and
                       isinstance(right, ABCSeries))
        # avoid repeated alignment when the indexes already agree
        if both_series and not left.index.equals(right.index):
            left, right = left.align(right, copy=False)

            joined, _lidx, _ridx = left.index.join(right.index, how='outer',
                                                   return_indexers=True)
            # if DatetimeIndex have different tz, convert to UTC
            left.index = joined
            right.index = joined

        if time_like_lhs:
            return _TimeOp(left, right, name, na_op)
        return _Op(left, right, name, na_op)
示例#18
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order :
        Not used by this implementation.
    na_sentinel: int, default -1
        Value to mark "not found".  The sort path recognises missing entries
        by a negative label, so a negative sentinel is expected.

    Returns
    -------
    labels : ndarray
        For each input value, the position of that value in ``uniques``;
        ``na_sentinel`` marks values that were not found.
    uniques : ndarray or PeriodIndex
        The distinct values; cast to ``M8[ns]`` for datetime64 input and
        wrapped in a PeriodIndex when ``values`` is a PeriodIndex.
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings: sort each group separately, then
            # look up where every ordered value sits in ``uniques``
            non_strings = np.sort(np.array(
                [e for e in uniques if not isinstance(e, string_types)],
                dtype=object))
            strings = np.sort(np.array(
                [e for e in uniques if isinstance(e, string_types)],
                dtype=object))
            ordered = np.concatenate([non_strings, strings])
            sorter = com._ensure_platform_int(t.lookup(com._ensure_object(ordered)))

        # reverse_indexer[old_position] == new_position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Missing entries carry a negative label.  Point them at slot 0 for
        # the lookup so that a sentinel below -len(uniques) cannot raise, then
        # restore the caller's sentinel (previously a hard-coded -1).
        mask = labels < 0
        labels = reverse_indexer.take(np.where(mask, 0, labels))
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
示例#19
0
def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed

    Parameters
    ----------
    index : DatetimeIndex
            if passed a Series will use the values of the series (NOT THE INDEX)
    warn : boolean, default True

    Returns
    -------
    freq : string or None
        None if no discernible frequency

    Raises
    ------
    TypeError
        If a Series of non-convertible dtype, a PeriodIndex, an Int64Index
        or a Float64Index is given.
    """
    import pandas as pd

    if isinstance(index, com.ABCSeries):
        series_values = index.values
        convertible = (com.is_datetime64_dtype(series_values) or
                       series_values.dtype == object)
        if not convertible:
            raise TypeError("cannot infer freq from a non-convertible dtype on a Series of {0}".format(index.dtype))
        index = series_values

    if isinstance(index, pd.PeriodIndex):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                         "instead of using infer_freq.")

    is_other_index = (isinstance(index, pd.Index) and
                      not isinstance(index, pd.DatetimeIndex))
    if is_other_index:
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError("cannot infer freq from a non-convertible index type {0}".format(type(index)))
        # other Index types are converted through their raw values
        index = index.values

    return _FrequencyInferer(pd.DatetimeIndex(index), warn=warn).get_freq()
示例#20
0
文件: ops.py 项目: ghl3/pandas
    def _convert_to_array(self, values, name=None, other=None):
        """
        Coerce one operand of a datetime/timedelta operation.

        Parameters
        ----------
        values : scalar or list-like operand; a scalar is wrapped in a
            one-element ndarray first
        name : operation name, used to decide whether integer operands are
            allowed and in error messages
        other : optional, the already converted other operand; its dtype is
            used when ``values`` holds only nulls

        Returns
        -------
        The converted values.  Depending on the branch this is an ndarray, a
        Series, a DatetimeIndex, the result of ``to_timedelta``, or the input
        itself when ``_is_offset`` accepts it.

        Raises
        ------
        TypeError
            If the inferred type cannot take part in a datetime/timedelta
            operation.
        """
        from pandas.tseries.timedeltas import to_timedelta

        # keep the unwrapped input: the scalar check below needs it
        ovalues = values
        if not is_list_like(values):
            values = np.array([values])

        inferred_type = lib.infer_dtype(values)

        if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
            # all-null values paired with a timedelta ``other`` (e.g. pd.NaT)
            # were inferred as datetimes; rebuild them with other's dtype,
            # filled with iNaT
            if (other is not None and other.dtype == 'timedelta64[ns]' and
                    all(isnull(v) for v in values)):
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            # datetime with tz
            elif isinstance(ovalues, datetime.datetime) and hasattr(ovalues,'tz'):
                values = pd.DatetimeIndex(values)
            # datetime array with tz
            elif com.is_datetimetz(values):
                if isinstance(values, pd.Series):
                    values = values._values
            elif not (isinstance(values, (np.ndarray, pd.Series)) and
                      is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = to_timedelta(values, errors='coerce')
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            # plain integers are only accepted for these operations
            elif name not in ('__truediv__', '__div__', '__mul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif inferred_type == 'floating':
            # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
            # NOTE(review): ``other`` is not checked for None on this path
            if isnull(values).all():
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = iNaT
            else:
                raise TypeError(
                    'incompatible type [{0}] for a datetime/timedelta '
                    'operation'.format(np.array(values).dtype))
        elif self._is_offset(values):
            # offsets are handed back unconverted
            return values
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(np.array(values).dtype))

        return values
示例#21
0
    def _convert_to_array(self, values, name=None, other=None):
        """
        Coerce one operand of a datetime/timedelta operation.

        Parameters
        ----------
        values : scalar or list-like operand; a scalar is wrapped in a
            one-element ndarray first
        name : operation name, used to decide whether integer operands are
            allowed and in error messages
        other : optional, the already converted other operand; its dtype is
            used when ``values`` holds only nulls

        Returns
        -------
        The converted values.  Depending on the branch this is an ndarray, a
        Series, or the result of ``_possibly_cast_to_timedelta``.

        Raises
        ------
        TypeError
            If the inferred type cannot take part in a datetime/timedelta
            operation.
        """
        from pandas.tseries.timedeltas import _possibly_cast_to_timedelta

        # coercion mode depends on the numpy version flag exposed by pandas
        coerce = "compat" if pd._np_version_under1p7 else True
        if not is_list_like(values):
            values = np.array([values])
        inferred_type = lib.infer_dtype(values)

        if inferred_type in ("datetime64", "datetime", "date", "time"):
            # all-null values paired with a timedelta ``other`` (e.g. pd.NaT)
            # were inferred as datetimes; rebuild them with other's dtype,
            # filled with iNaT
            if other is not None and other.dtype == "timedelta64[ns]" and all(isnull(v) for v in values):
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = tslib.iNaT

            # a datelike
            elif not (isinstance(values, (pa.Array, pd.Series)) and com.is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
            # NOTE(review): only reached when the check above did not match,
            # i.e. for a datetime64 pa.Array/Series that is also a
            # DatetimeIndex -- confirm this ordering is intended
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
        elif inferred_type in ("timedelta", "timedelta64"):
            # have a timedelta, convert to ns here
            values = _possibly_cast_to_timedelta(values, coerce=coerce)
        elif inferred_type == "integer":
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == "m":
                values = values.astype("timedelta64[ns]")
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            # plain integers are only accepted for these operations
            elif name not in ("__truediv__", "__div__", "__mul__"):
                raise TypeError("incompatible type for a datetime/timedelta " "operation [{0}]".format(name))
        elif isinstance(values[0], pd.DateOffset):
            # handle DateOffsets: only offsets exposing ``delta`` are usable
            os = pa.array([getattr(v, "delta", None) for v in values])
            mask = isnull(os)
            if mask.any():
                raise TypeError(
                    "cannot use a non-absolute DateOffset in "
                    "datetime/timedelta operations [{0}]".format(", ".join([com.pprint_thing(v) for v in values[mask]]))
                )
            values = _possibly_cast_to_timedelta(os, coerce=coerce)
        elif inferred_type == "floating":

            # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
            # NOTE(review): ``other`` is not checked for None on this path
            if isnull(values).all():
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = tslib.iNaT
            else:
                raise TypeError(
                    "incompatible type [{0}] for a datetime/timedelta " "operation".format(pa.array(values).dtype)
                )
        else:
            raise TypeError(
                "incompatible type [{0}] for a datetime/timedelta" " operation".format(pa.array(values).dtype)
            )

        return values
示例#22
0
文件: ops.py 项目: fish444555/pandas
    def __init__(self, left, right, name):
        """
        Convert both operands and record their datetime/timedelta/integer
        flags, then validate and run the datetime conversion.

        Parameters
        ----------
        left, right : operands of the operation
        name : operation name, forwarded to ``_convert_to_array``
        """
        self.name = name

        lhs = self._convert_to_array(left, name=name)
        rhs = self._convert_to_array(right, name=name)

        # left-hand flags are taken from ``left`` itself, not from ``lhs``
        self.is_timedelta_lhs = com.is_timedelta64_dtype(left)
        self.is_datetime_lhs = com.is_datetime64_dtype(left)
        self.is_integer_lhs = left.dtype.kind in ('i', 'u')

        # right-hand flags are taken from the converted array
        rhs_is_datetime = com.is_datetime64_dtype(rhs)
        self.is_datetime_rhs = rhs_is_datetime
        # under the old-numpy flag, any non-datetime rhs counts as timedelta
        self.is_timedelta_rhs = (
            com.is_timedelta64_dtype(rhs)
            or (not rhs_is_datetime and pd._np_version_under1p7))
        self.is_integer_rhs = rhs.dtype.kind in ('i', 'u')

        self._validate()

        self._convert_for_datetime(lhs, rhs)
示例#23
0
文件: tools.py 项目: APWaldo/pandas
    def _convert_listlike(arg, box):
        """
        Convert a list-like ``arg`` to datetime values.

        Parameters
        ----------
        arg : list, tuple or array-like
            Lists and tuples are first turned into an object ndarray.
        box : boolean
            When true, datetime64 results are wrapped in a DatetimeIndex.

        Notes
        -----
        ``utc``, ``format``, ``errors``, ``dayfirst``, ``coerce`` and ``unit``
        are not defined in this function; presumably they come from the
        enclosing scope -- TODO confirm.
        """

        if isinstance(arg, (list,tuple)):
            arg = np.array(arg, dtype='O')

        # already datetime64: at most box it, no parsing needed
        if com.is_datetime64_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError as e:
                    # fall back to the datetime -> datetime64 converter, which
                    # returns the values together with a tz
                    values, tz = tslib.datetime_to_datetime64(arg)
                    return DatetimeIndex._simple_new(values, None, tz=tz)

            return arg

        arg = com._ensure_object(arg)
        try:
            if format is not None:
                result = None

                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    # NOTE(review): bare except turns every failure of the
                    # shortcut into this ValueError, which the outer handler
                    # below then catches
                    except:
                        raise ValueError("cannot convert the input to '%Y%m%d' date format")

                # fallback: the shortcut did not apply or returned None
                if result is None:
                    result = tslib.array_strptime(arg, format)
            else:
                result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            # retry with the datetime -> datetime64 converter; if that fails
            # as well, surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#24
0
def _unpickle_array(bytes):
    """
    Rebuild an array from its serialized byte string.

    The bytes are wrapped in a ``BytesIO`` buffer and decoded with
    ``read_array``.
    """
    restored = read_array(BytesIO(bytes))

    # All datetimes should be stored as M8[ns].  When unpickling with
    # numpy 1.6 they are read back as M8[us], so every datetime64 array is
    # reinterpreted as M8[ns] here.
    if not com.is_datetime64_dtype(restored):
        return restored
    return restored.view(com._NS_DTYPE)
示例#25
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : array-like
    sort : boolean, default False
        Sort the uniques by value and relabel accordingly
    order : optional
        Not used by this implementation
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    (labels, uniques) : tuple
        labels of each number in array form, and the corresponding values
        without duplication.  ``uniques`` is viewed as ``M8[ns]`` for
        datetime64 input and is a PeriodIndex when ``values`` is one.

    Examples
    --------
    >>> factorize([12,3,8,5,9,7,11],sort=True,order=None,na_sentinel=-1)
    (array([6, 0, 3, 1, 4, 2, 5]), array([ 3,  5,  7,  8,  9, 11, 12], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6]), array([12,  3,  8,  5,  9,  7, 10], dtype=int64))
    >>> factorize([12,3,8,5,9,7,10,10],sort=False,order=None,na_sentinel=-1)
    (array([0, 1, 2, 3, 4, 5, 6, 6]), array([12,  3,  8,  5,  9,  7, 10], dtype=int64))
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old_position] -> position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Missing entries carry ``na_sentinel`` (not necessarily -1 and not
        # necessarily negative): look them up through a harmless
        # placeholder position and restore the caller's sentinel afterwards.
        mask = labels == na_sentinel
        labels = reverse_indexer.take(np.where(mask, 0, labels))
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.view('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
示例#26
0
def pandas_col_to_ibis_type(col):
    """
    Map the dtype of a pandas column to an ibis type.

    Parameters
    ----------
    col : object exposing ``dtype`` and ``name`` (``cat`` for categoricals)

    Returns
    -------
    str type name, or ``dt.Category`` for categorical columns

    Raises
    ------
    com.IbisTypeError
        for datetime64 columns that are not nanosecond based, for uint64
        columns, and for any dtype not handled below
    """
    import pandas.core.common as pdcom
    import ibis.expr.datatypes as dt
    import numpy as np

    col_dtype = col.dtype

    # datetime types
    if pdcom.is_datetime64_dtype(col_dtype):
        if not pdcom.is_datetime64_ns_dtype(col_dtype):
            raise com.IbisTypeError("Column {0} has dtype {1}, which is "
                                    "datetime64-like but does "
                                    "not use nanosecond units"
                                    .format(col.name, col_dtype))
        return 'timestamp'

    if pdcom.is_timedelta64_dtype(col_dtype):
        print("Warning: encoding a timedelta64 as an int64")
        return 'int64'

    if pdcom.is_categorical_dtype(col_dtype):
        return dt.Category(len(col.cat.categories))

    if pdcom.is_bool_dtype(col_dtype):
        return 'boolean'

    # simple numerical types, checked in this order; unsigned integers are
    # mapped to the next wider signed type
    numeric_names = (
        (np.int8, 'int8'),
        (np.int16, 'int16'),
        (np.int32, 'int32'),
        (np.int64, 'int64'),
        (np.float32, 'float'),
        (np.float64, 'double'),
        (np.uint8, 'int16'),
        (np.uint16, 'int32'),
        (np.uint32, 'int64'),
    )
    scalar_type = col_dtype.type
    for numpy_type, ibis_name in numeric_names:
        if issubclass(scalar_type, numpy_type):
            return ibis_name

    if issubclass(scalar_type, np.uint64):
        raise com.IbisTypeError("Column {0} is an unsigned int64"
                                .format(col.name))

    if pdcom.is_object_dtype(col_dtype):
        # TODO: overly broad?
        return 'string'

    raise com.IbisTypeError("Column {0} is dtype {1}"
                            .format(col.name, col_dtype))
示例#27
0
def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps: array-like
    values: array-like

    Returns
    -------
    boolean array same length as comps

    Raises
    ------
    TypeError
        if ``comps`` or ``values`` is not list-like
    """
    not_list_like = ("only list-like objects are allowed to be passed"
                     " to isin(), you passed a "
                     "[{0}]")

    if not com.is_list_like(comps):
        raise TypeError(not_list_like.format(type(comps).__name__))
    comps = np.asarray(comps)
    if not com.is_list_like(values):
        raise TypeError(not_list_like.format(type(values).__name__))

    def _via_in1d(x, y):
        return np.in1d(x, np.asarray(list(y)))

    def _via_int64_set(x, y):
        return lib.ismember_int64(x, set(y))

    def _via_object_set(x, y):
        return lib.ismember(x, set(y))

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000:
        int64_checker = _via_in1d
    else:
        int64_checker = _via_int64_set

    # may need i8 conversion for proper membership testing
    if com.is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime

        values = to_datetime(values)._values.view("i8")
        comps = comps.view("i8")
        checker = int64_checker
    elif com.is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta

        values = to_timedelta(values)._values.view("i8")
        comps = comps.view("i8")
        checker = int64_checker
    elif com.is_int64_dtype(comps):
        checker = int64_checker
    else:
        # everything else goes through the generic (object) membership test
        checker = _via_object_set

    return checker(comps, values)
示例#28
0
文件: ops.py 项目: agijsberts/pandas
    def __init__(self, left, right, name):
        """
        Store the operands of a datetime/timedelta operation and record
        dtype flags for both sides.

        Parameters
        ----------
        left, right : operands; when both are Series they are aligned first
        name : name of the operation, forwarded to ``_convert_to_array``
        """
        self.name = name

        # need to make sure that we are aligning the data
        both_are_series = (isinstance(left, pd.Series) and
                           isinstance(right, pd.Series))
        if both_are_series:
            left, right = left.align(right, copy=False)

        self.left, self.right = left, right

        lvalues = self._convert_to_array(left, name=name)
        rvalues = self._convert_to_array(right, name=name, other=lvalues)

        integer_kinds = ('i', 'u')

        # left-hand flags are taken from the (aligned) left operand itself
        self.is_timedelta_lhs = is_timedelta64_dtype(left)
        self.is_datetime_lhs = is_datetime64_dtype(left)
        self.is_integer_lhs = left.dtype.kind in integer_kinds

        # right-hand flags are taken from the converted right values
        self.is_datetime_rhs = is_datetime64_dtype(rvalues)
        self.is_timedelta_rhs = is_timedelta64_dtype(rvalues)
        self.is_integer_rhs = rvalues.dtype.kind in integer_kinds

        self._validate()

        self._convert_for_datetime(lvalues, rvalues)
示例#29
0
def _get_data_algo(values, func_map):
    """
    Select the entry of ``func_map`` that matches the dtype of ``values``
    and coerce ``values`` to the representation that entry expects.

    Returns
    -------
    (f, values) : the selected ``func_map`` entry ('float64', 'int64' or
        'generic') and the converted values
    """
    if com.is_float_dtype(values):
        return func_map['float64'], com._ensure_float64(values)

    if com.is_datetime64_dtype(values):
        # datetime64 data is handled through its int64 view
        return func_map['int64'], values.view('i8')

    if com.is_integer_dtype(values):
        return func_map['int64'], com._ensure_int64(values)

    return func_map['generic'], com._ensure_object(values)
示例#30
0
def _get_data_algo(values, func_map):
    """
    Pick the ``func_map`` entry matching the dtype of ``values`` and convert
    ``values`` accordingly.

    Returns
    -------
    (f, values) : the chosen entry ("float64", "int64" or "generic") together
        with the converted values
    """
    if com.is_float_dtype(values):
        key, converted = "float64", None
    elif com.is_datetime64_dtype(values):
        key, converted = "int64", None
    elif com.is_integer_dtype(values):
        key, converted = "int64", None
    else:
        key, converted = "generic", None

    # look the function up first, then convert
    f = func_map[key]
    if key == "float64":
        converted = com._ensure_float64(values)
    elif key == "generic":
        converted = com._ensure_object(values)
    elif com.is_datetime64_dtype(values):
        # datetime64 data is processed through its int64 view
        converted = values.view("i8")
    else:
        converted = com._ensure_int64(values)
    return f, converted
示例#31
0
文件: algorithms.py 项目: wjt/pandas
def _hashtable_algo(f, dtype, return_dtype=None):
    """
    f(HashTable, type_caster) -> result

    Calls ``f`` with the hash table class and the caster that match
    ``dtype``.  For datetime64 / timedelta64 dtypes the result is viewed as
    ``return_dtype``, which falls back to 'M8[ns]' / 'm8[ns]' respectively.
    """
    view_as = None
    if com.is_float_dtype(dtype):
        table_klass, caster = htable.Float64HashTable, com._ensure_float64
    elif com.is_integer_dtype(dtype):
        table_klass, caster = htable.Int64HashTable, com._ensure_int64
    elif com.is_datetime64_dtype(dtype):
        view_as = return_dtype or 'M8[ns]'
        table_klass, caster = htable.Int64HashTable, com._ensure_int64
    elif com.is_timedelta64_dtype(dtype):
        view_as = return_dtype or 'm8[ns]'
        table_klass, caster = htable.Int64HashTable, com._ensure_int64
    else:
        table_klass, caster = htable.PyObjectHashTable, com._ensure_object

    result = f(table_klass, caster)
    if view_as is None:
        return result
    return result.view(view_as)
示例#32
0
def infer_freq(index, warn=True):
    """
    Infer the most likely frequency given the input index. If the frequency is
    uncertain, a warning will be printed.

    Parameters
    ----------
    index : DatetimeIndex or TimedeltaIndex
            if passed a Series will use the values of the series (NOT THE INDEX)
    warn : boolean, default True

    Returns
    -------
    freq : string or None
        None if no discernible frequency

    Raises
    ------
    TypeError
        if the index is not datetime-like
    ValueError
        if there are less than three values (per the original documentation;
        not raised by this function directly)
    """
    import pandas as pd

    if isinstance(index, com.ABCSeries):
        values = index.values
        convertible = (com.is_datetime64_dtype(values)
                       or com.is_timedelta64_dtype(values)
                       or values.dtype == object)
        if not convertible:
            raise TypeError(
                "cannot infer freq from a non-convertible dtype on a Series of {0}"
                .format(index.dtype))
        index = values

    if com.is_period_arraylike(index):
        raise TypeError("PeriodIndex given. Check the `freq` attribute "
                        "instead of using infer_freq.")

    if isinstance(index, pd.TimedeltaIndex):
        return _TimedeltaFrequencyInferer(index, warn=warn).get_freq()

    # any other Index is unwrapped to its values, unless it is numeric
    is_foreign_index = (isinstance(index, pd.Index)
                        and not isinstance(index, pd.DatetimeIndex))
    if is_foreign_index:
        if isinstance(index, (pd.Int64Index, pd.Float64Index)):
            raise TypeError(
                "cannot infer freq from a non-convertible index type {0}".
                format(type(index)))
        index = index.values

    datetime_index = pd.DatetimeIndex(index)
    return _FrequencyInferer(datetime_index, warn=warn).get_freq()
示例#33
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort the uniques by value and relabel accordingly
    order :
        Not used by this implementation
    na_sentinel: int, default -1
        Value to mark "not found"

    Returns
    -------
    (labels, uniques) : tuple
        the labels produced by the hash table for each input value, and the
        collected unique values.  ``uniques`` is viewed as ``M8[ns]`` for
        datetime64 input and is a PeriodIndex when ``values`` is one.
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old_position] -> position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Missing entries carry ``na_sentinel`` (not necessarily -1 and not
        # necessarily negative): look them up through a harmless
        # placeholder position and restore the caller's sentinel afterwards.
        mask = labels == na_sentinel
        labels = reverse_indexer.take(np.where(mask, 0, labels))
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.view('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques
示例#34
0
def isin(comps, values):
    """
    Compute the isin boolean array

    Parameters
    ----------
    comps: array-like
    values: array-like

    Returns
    -------
    boolean array same length as comps

    Raises
    ------
    TypeError
        if ``comps`` or ``values`` is not list-like
    """
    message = ("only list-like objects are allowed to be passed"
               " to isin(), you passed a "
               "[{0}]")

    if not com.is_list_like(comps):
        raise TypeError(message.format(type(comps).__name__))
    comps = np.asarray(comps)
    if not com.is_list_like(values):
        raise TypeError(message.format(type(values).__name__))

    def in1d_check(x, y):
        return np.in1d(x, np.asarray(list(y)))

    def int64_set_check(x, y):
        return lib.ismember_int64(x, set(y))

    def object_set_check(x, y):
        return lib.ismember(x, set(y))

    # GH11232
    # work-around for numpy < 1.8 and comparisons on py3
    # faster for larger cases to use np.in1d
    prefer_in1d = (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000
    f = in1d_check if prefer_in1d else int64_set_check

    # may need i8 conversion for proper membership testing
    if com.is_datetime64_dtype(comps):
        from pandas.tseries.tools import to_datetime
        values = to_datetime(values)._values.view('i8')
        comps = comps.view('i8')
    elif com.is_timedelta64_dtype(comps):
        from pandas.tseries.timedeltas import to_timedelta
        values = to_timedelta(values)._values.view('i8')
        comps = comps.view('i8')
    elif not com.is_int64_dtype(comps):
        # neither datetimelike nor int64: use the generic membership test
        f = object_set_check

    return f(comps, values)
示例#35
0
文件: tools.py 项目: wudcwctw/pandas
    def _convert_listlike(arg, box):
        """
        Convert a list-like ``arg`` to datetime64 values.

        ``utc``, ``format``, ``errors``, ``dayfirst``, ``coerce`` and ``unit``
        are free variables here (not defined in this block); presumably they
        are supplied by the enclosing function.

        Parameters
        ----------
        arg : list, tuple or array-like
            lists and tuples are first turned into an object ndarray
        box : boolean
            if True, datetime64 results are wrapped in a DatetimeIndex

        Returns
        -------
        DatetimeIndex when boxing applies, otherwise the input / parser
        result as-is

        Raises
        ------
        ValueError
            if parsing fails and the ``datetime_to_datetime64`` fallback
            cannot handle the input either
        """
        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # already datetime64[ns]: at most box it
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    # boxing failed; hand the input back unchanged
                    pass

            return arg

        arg = com._ensure_object(arg)
        try:
            if format is not None:
                result = None

                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except Exception:
                        # Exception (not a bare except) so that
                        # KeyboardInterrupt / SystemExit are not turned into
                        # a ValueError
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    result = tslib.array_strptime(arg, format, coerce=coerce)
            else:
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc,
                                                 dayfirst=dayfirst,
                                                 coerce=coerce,
                                                 unit=unit)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            # last resort: try the datetime.datetime -> datetime64 converter;
            # if that fails too, surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#36
0
文件: array.py 项目: jkr2110/arch
def date_to_index(date, date_index):
    """
    Look up the position of a date in a date index.

    Parameters
    ----------
    date : string, datetime or datetime64
        Date to use when returning the index
    date_index : 1-d array of datetime64
        Index data containing datetime64 values; must expose ``.values``

    Returns
    -------
    index : int
        Index location

    Raises
    ------
    ValueError
        if ``date_index`` is not datetime64, is not strictly increasing,
        if ``date`` has an unsupported type or cannot be parsed, or if every
        entry of ``date_index`` is later than ``date``

    Notes
    -----
    Assumes dates are increasing and unique.

    Uses last value interpolation if a date is not in the series so that the
    value returned satisfies date_index[index] is the largest date less than or
    equal to date.
    """
    if not is_datetime64_dtype(date_index):
        raise ValueError('date_index must be a datetime64 array')

    if not np.all((np.diff(date_index.values).astype(dtype=np.int64)) > 0):
        raise ValueError('date_index is not monotonic and unique')
    if not isinstance(date, (dt.datetime, np.datetime64, str)):
        raise ValueError("date must be a datetime, datetime64 or string")
    elif isinstance(date, dt.datetime):
        date = np.datetime64(date)
    elif isinstance(date, str):
        parsed = to_datetime(date, coerce=True)
        # NaT never compares equal to anything (itself included), so an
        # ``== NaT`` test cannot detect a failed parse; check the identity
        # of the parser's result before converting it.
        if parsed is NaT:
            raise ValueError('date:' + date +
                             ' cannot be parsed to a date.')
        date = np.datetime64(parsed)

    date_index = np.asarray(date_index)

    locs = np.nonzero(date_index <= date)[0]
    if locs.shape[0] == 0:
        raise ValueError('All dates in date_index occur after date')
    else:
        return locs.max()
示例#37
0
def as_json_table_type(x):
    """
    Convert a NumPy / pandas type to its corresponding json_table.

    Parameters
    ----------
    x : array or dtype

    Returns
    -------
    t : str
        the Table Schema data types

    Notes
    -----
    This table shows the relationship between NumPy / pandas dtypes,
    and Table Schema dtypes.

    ===============  =================
    Pandas type      Table Schema type
    ===============  =================
    int64            integer
    float64          number
    bool             boolean
    datetime64[ns]   datetime
    timedelta64[ns]  duration
    object           string
    categorical      any
    ===============  =================

    Anything that matches none of the checks is reported as 'any'.
    """
    # the checks run in this fixed order; the first match wins
    if is_integer_dtype(x):
        return 'integer'
    if is_bool_dtype(x):
        return 'boolean'
    if is_numeric_dtype(x):
        return 'number'
    if is_datetime64_dtype(x) or is_datetime64tz_dtype(x):
        return 'datetime'
    if is_timedelta64_dtype(x):
        return 'duration'
    if is_categorical_dtype(x):
        return 'any'
    if is_string_dtype(x):
        return 'string'
    return 'any'
示例#38
0
文件: tools.py 项目: tkiran/pandas
    def _convert_f(arg):
        """
        Parse ``arg`` into datetime64 values.

        ``errors``, ``utc``, ``dayfirst`` and ``box`` are free variables here
        (not defined in this block); presumably they are supplied by the
        enclosing function.

        Returns
        -------
        the ``lib.array_to_datetime`` result, wrapped in a DatetimeIndex when
        it is datetime64 and ``box`` is true

        Raises
        ------
        ValueError
            the original parsing error, when the ``datetime_to_datetime64``
            fallback fails as well
        """
        arg = com._ensure_object(arg)

        try:
            result = lib.array_to_datetime(arg,
                                           raise_=errors == 'raise',
                                           utc=utc,
                                           dayfirst=dayfirst)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result
        # ``except X as e`` is valid on Python 2.6+ and Python 3, whereas the
        # old ``except X, e`` form is a SyntaxError on Python 3
        except ValueError as e:
            try:
                values, tz = lib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#39
0
def maybe_to_datetimelike(data, copy=False):
    """
    return a DelegatedClass of a Series that is datetimelike
      (e.g. datetime64[ns],timedelta64[ns] dtype or a Series of Periods)
    raise TypeError if this is not possible.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass

    Raises
    ------
    TypeError
        if ``data`` is not a Series or is not datetimelike

    """
    from pandas import Series

    if not isinstance(data, Series):
        raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))

    index = data.index
    name = data.name

    # for categorical data work on the categories and keep the original
    # Series around as ``orig``
    orig = data if is_categorical_dtype(data) else None
    if orig is not None:
        data = orig.values.categories

    if is_datetime64_dtype(data.dtype):
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index, name=name,
                                                orig=orig)
    elif is_datetime64tz_dtype(data.dtype):
        # pass the Series name captured above: after the categorical
        # unwrapping ``data`` is the categories index, so ``data.name`` is no
        # longer the name of the original Series
        return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer', ambiguous='infer'),
                                  index, name=name, orig=orig)
    elif is_timedelta64_dtype(data.dtype):
        return TimedeltaProperties(TimedeltaIndex(data, copy=copy, freq='infer'), index,
                                   name=name, orig=orig)
    else:
        if is_period_arraylike(data):
            return PeriodProperties(PeriodIndex(data, copy=copy), index, name=name, orig=orig)
        if is_datetime_arraylike(data):
            return DatetimeProperties(DatetimeIndex(data, copy=copy, freq='infer'), index,
                                      name=name, orig=orig)

    raise TypeError("cannot convert an object of type {0} to a datetimelike index".format(type(data)))
示例#40
0
    def maybe_convert_for_time_op(cls, left, right, name, na_op):
        """
        Build a ``cls`` instance for a datetime/timedelta operation when the
        left operand qualifies.

        If ``left`` has a datetime64, tz-aware datetime64 or timedelta64
        dtype, returns ``cls(left, right, name, na_op)``; otherwise returns
        ``None`` to signal that the data is not the right type for time ops.
        Any error for an unsupported operation would come from ``cls``
        itself.
        """
        # decide if we can do it
        left_is_datetime = (is_datetime64_dtype(left) or
                            is_datetime64tz_dtype(left))
        if left_is_datetime or is_timedelta64_dtype(left):
            return cls(left, right, name, na_op)

        return None
示例#41
0
文件: ops.py 项目: quaintm/pandas
    def _convert_to_array(self, values, name=None):
        """
        Convert ``values`` to an array usable in a datetime/timedelta
        operation.

        Parameters
        ----------
        values : scalar or list-like
            a non list-like value is wrapped in a one-element ndarray
        name : str, optional
            name of the operation; only consulted for integer input and used
            in the error message

        Returns
        -------
        the converted values

        Raises
        ------
        TypeError
            for integer input with an operation other than ``__truediv__``,
            ``__div__`` or ``__mul__``, for DateOffsets without a ``delta``,
            and for any other inferred type
        """
        from pandas.tseries.timedeltas import _possibly_cast_to_timedelta

        # coercion mode handed to _possibly_cast_to_timedelta; depends on the
        # numpy version flag
        coerce = 'compat' if pd._np_version_under1p7 else True
        if not is_list_like(values):
            values = np.array([values])
        # the branch taken below is driven by the inferred element type
        inferred_type = lib.infer_dtype(values)
        if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
            # a datetimelike
            if not (isinstance(values, (pa.Array, pd.Series))
                    and com.is_datetime64_dtype(values)):
                values = tslib.array_to_datetime(values)
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = _possibly_cast_to_timedelta(values, coerce=coerce)
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            # plain integers are only accepted for division / multiplication
            elif name not in ('__truediv__', '__div__', '__mul__'):
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif isinstance(values[0], pd.DateOffset):
            # handle DateOffsets
            # an offset without a ``delta`` attribute yields None here and is
            # rejected just below
            os = pa.array([getattr(v, 'delta', None) for v in values])
            mask = isnull(os)
            if mask.any():
                raise TypeError(
                    "cannot use a non-absolute DateOffset in "
                    "datetime/timedelta operations [{0}]".format(','.join(
                        [com.pprint_thing(v) for v in values[mask]])))
            values = _possibly_cast_to_timedelta(os, coerce=coerce)
        else:
            raise TypeError(
                "incompatible type [{0}] for a datetime/timedelta operation".
                format(pa.array(values).dtype))

        return values
示例#42
0
 def maybe_convert_for_time_op(cls, left, right, name):
     """
     Build a ``cls`` instance for a datetime/timedelta operation when the
     left operand qualifies.

     If ``left`` has a datetime64 or timedelta64 dtype, returns
     ``cls(left, right, name)`` (with a reflected operation name such as
     ``__radd__`` reduced to ``__add__``); otherwise returns ``None`` to
     signal that the data is not the right type for time ops.  Any error for
     an unsupported operation would come from ``cls`` itself.
     """
     # decide if we can do it
     qualifies = (com.is_timedelta64_dtype(left) or
                  com.is_datetime64_dtype(left))
     if not qualifies:
         return None

     # rops are allowed. No need for special checks, just strip off
     # r part.
     op_name = "__" + name[3:] if name.startswith('__r') else name
     return cls(left, right, op_name)
示例#43
0
def to_datetime(arg, errors='ignore', dayfirst=False, box=True):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        Forwarded to the parsers
    box : boolean, default True
        For ndarray / list input, wrap a datetime64 result in a
        DatetimeIndex

    Returns
    -------
    ret : datetime if parsing succeeded
    """
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    # nothing to do for None and for values that already are datetimes
    if arg is None or isinstance(arg, datetime):
        return arg

    if isinstance(arg, Series):
        converted = lib.array_to_datetime(com._ensure_object(arg.values),
                                          raise_=errors == 'raise',
                                          dayfirst=dayfirst)
        return Series(converted, index=arg.index, name=arg.name)

    if isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')
        converted = lib.array_to_datetime(com._ensure_object(arg),
                                          raise_=errors == 'raise',
                                          dayfirst=dayfirst)
        if com.is_datetime64_dtype(converted) and box:
            converted = DatetimeIndex(converted)
        return converted

    # scalar input: falsy values are returned as-is, everything else goes
    # through the date parser
    try:
        if not arg:
            return arg
        return _dtparser.parse(arg, dayfirst=dayfirst)
    except Exception:
        if errors == 'raise':
            raise
        return arg
示例#44
0
文件: nanops.py 项目: rockg/pandas
def _wrap_results(result, dtype):
    """
    wrap our results if needed

    For a datetime64 ``dtype`` a scalar result becomes a ``lib.Timestamp``
    and an ndarray is viewed as ``dtype``.  For a timedelta64 ``dtype`` a
    scalar becomes a ``lib.Timedelta`` (ValueError if its magnitude exceeds
    ``_int64_max``) and an ndarray is cast to i8 and viewed as ``dtype``.
    Any other ``dtype`` leaves ``result`` untouched.
    """
    is_array = isinstance(result, np.ndarray)

    if is_datetime64_dtype(dtype):
        return result.view(dtype) if is_array else lib.Timestamp(result)

    if is_timedelta64_dtype(dtype):
        if is_array:
            return result.astype('i8').view(dtype)

        # raise if we have a timedelta64[ns] which is too large
        if np.fabs(result) > _int64_max:
            raise ValueError("overflow in timedelta operation")
        return lib.Timedelta(result, unit='ns')

    return result
示例#45
0
    def _sqlalchemy_type(self, arr_or_dtype):
        """
        Pick the SQLAlchemy column type for an array or dtype.

        Returns ``Date`` for the ``date`` class itself, a timezone-aware
        ``DateTime`` instance (or the plain ``DateTime`` type) for datetime64
        data, ``Interval`` for timedelta64, ``Float`` / ``Integer`` /
        ``Boolean`` for the matching checks, and ``Text`` for everything
        else.
        """
        from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval

        if arr_or_dtype is date:
            return Date
        if com.is_datetime64_dtype(arr_or_dtype):
            try:
                # only probing for the attribute; its value is not needed
                arr_or_dtype.tzinfo
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt / SystemExit still propagate
                return DateTime
            return DateTime(timezone=True)
        if com.is_timedelta64_dtype(arr_or_dtype):
            return Interval
        elif com.is_float_dtype(arr_or_dtype):
            return Float
        elif com.is_integer_dtype(arr_or_dtype):
            # TODO: Refine integer size.
            return Integer
        elif com.is_bool(arr_or_dtype):
            return Boolean
        return Text
示例#46
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : sequence
    sort : boolean, default False
        Sort the uniques by value and relabel accordingly
    order :
        Not used by this implementation
    na_sentinel : int, default -1
        Forwarded to the hash table's ``get_labels``; presumably the label
        given to values that are not found

    Returns
    -------
    (labels, uniques) : tuple
        the labels produced by the hash table for each input value, and the
        collected unique values (viewed as ``M8[ns]`` for datetime64 input)
    """
    values = np.asarray(values)
    is_datetime = com.is_datetime64_dtype(values)
    (hash_klass, vec_klass), values = _get_data_algo(values, _hashtables)

    table = hash_klass(len(values))
    uniques = vec_klass()
    labels = table.get_labels(values, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        sorter = uniques.argsort()
        # reverse_indexer[old_position] -> position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Missing entries carry ``na_sentinel`` (not necessarily -1 and not
        # necessarily negative): look them up through a harmless
        # placeholder position and restore the caller's sentinel afterwards.
        mask = labels == na_sentinel
        labels = reverse_indexer.take(np.where(mask, 0, labels))
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.view('M8[ns]')

    return labels, uniques
示例#47
0
文件: missing.py 项目: yaduart/pandas
def pad_1d(values, limit=None, mask=None, dtype=None):
    """
    Run the dtype-appropriate pad routine over ``values``.

    Parameters
    ----------
    values : ndarray
    limit : optional
        forwarded to the pad routine
    mask : optional
        defaults to ``com.isnull(values)``; viewed as uint8 before use
    dtype : optional
        defaults to ``values.dtype``

    Returns
    -------
    values (converted to float64 first when they are integers)

    Raises
    ------
    ValueError
        if no pad routine exists for the dtype
    """
    if dtype is None:
        dtype = values.dtype

    pad_func = None
    if com.is_float_dtype(values):
        # e.g. pad_inplace_float64; stays None when algos has no such routine
        pad_func = getattr(algos, 'pad_inplace_%s' % dtype.name, None)
    elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values):
        pad_func = _pad_1d_datetime
    elif com.is_integer_dtype(values):
        # integers are padded through the float64 routine
        values = com._ensure_float64(values)
        pad_func = algos.pad_inplace_float64
    elif values.dtype == np.object_:
        pad_func = algos.pad_inplace_object

    if pad_func is None:
        raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name)

    if mask is None:
        mask = com.isnull(values)
    pad_func(values, mask.view(np.uint8), limit=limit)
    return values
示例#48
0
    def describe_1d(data):
        """
        Summarise a single Series.

        Computes general statistics (counts, missing / unique ratios, mode,
        memory usage) and appends the type-specific description chosen from
        the number of distinct values and the dtype.

        Parameters
        ----------
        data : Series

        Returns
        -------
        Series named after ``data`` holding the collected statistics
        """
        count = data.count()
        leng = len(data)
        distinct_count = data.nunique(dropna=False)
        if count > distinct_count > 1:
            mode = data.mode().iloc[0]
        else:
            # positional access: ``data[0]`` is a label lookup and raises
            # KeyError when the index has no label 0
            mode = data.iloc[0]

        results_data = {
            'count': count,
            'distinct_count': distinct_count,
            'p_missing': 1 - count / leng,
            'n_missing': leng - count,
            'is_unique': distinct_count == leng,
            'mode': mode,
            'p_unique': distinct_count / count
        }
        try:
            # pandas 0.17 onwards
            results_data['memorysize'] = data.memory_usage()
        except Exception:
            # best effort: report 0 when the memory usage cannot be obtained,
            # but let KeyboardInterrupt / SystemExit propagate
            results_data['memorysize'] = 0

        result = pd.Series(results_data, name=data.name)

        if distinct_count <= 1:
            result = result.append(describe_constant_1d(data))
        elif com.is_numeric_dtype(data):
            result = result.append(describe_numeric_1d(data, result))
        elif com.is_datetime64_dtype(data):
            result = result.append(describe_date_1d(data, result))
        elif distinct_count == leng:
            result = result.append(describe_unique_1d(data))
        else:
            result = result.append(describe_categorical_1d(data))
        return result
示例#49
0
文件: sql.py 项目: thorwhalen/pandas
    def _sqlalchemy_type(self, arr_or_dtype):
        """
        Pick the SQLAlchemy column type for an array or dtype.

        Returns ``Date`` for the ``date`` class itself, a timezone-aware
        ``DateTime`` instance (or the plain ``DateTime`` type) for datetime64
        data, ``Integer`` (with a UserWarning) for timedelta64, ``Float`` /
        ``Integer`` / ``Boolean`` for the matching checks, and ``Text`` for
        everything else.
        """
        from sqlalchemy.types import Integer, Float, Text, Boolean, DateTime, Date, Interval

        if arr_or_dtype is date:
            return Date
        if com.is_datetime64_dtype(arr_or_dtype):
            try:
                # only probing for the attribute; its value is not needed
                arr_or_dtype.tzinfo
            except Exception:
                # Exception rather than a bare except, so that
                # KeyboardInterrupt / SystemExit still propagate
                return DateTime
            return DateTime(timezone=True)
        if com.is_timedelta64_dtype(arr_or_dtype):
            warnings.warn("the 'timedelta' type is not supported, and will be "
                          "written as integer values (ns frequency) to the "
                          "database.", UserWarning)
            return Integer
        elif com.is_float_dtype(arr_or_dtype):
            return Float
        elif com.is_integer_dtype(arr_or_dtype):
            # TODO: Refine integer size.
            return Integer
        elif com.is_bool(arr_or_dtype):
            return Boolean
        return Text
示例#50
0
def _get_data_algo(values, func_map):
    """
    Select the entry of ``func_map`` that matches the dtype of ``values``
    and coerce ``values`` to the representation that entry expects.

    datetime64 data containing nulls is handled through the 'generic'
    (object) entry with the null positions set to NaN; otherwise it is
    handled through its int64 view.

    Returns
    -------
    (f, values) : the selected ``func_map`` entry and the converted values
    """
    if com.is_float_dtype(values):
        return func_map['float64'], com._ensure_float64(values)

    if com.is_datetime64_dtype(values):
        null_mask = com.isnull(values)
        if not null_mask.ravel().any():
            return func_map['int64'], values.view('i8')

        # if we have NaT, punt to object dtype
        generic = func_map['generic']
        as_object = com._ensure_object(values)
        as_object[null_mask] = np.nan
        return generic, as_object

    if com.is_integer_dtype(values):
        return func_map['int64'], com._ensure_int64(values)

    return func_map['generic'], com._ensure_object(values)
def to_datetime(arg,
                errors='ignore',
                dayfirst=False,
                utc=None,
                box=True,
                format=None):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y"

    Returns
    -------
    ret : datetime if parsing succeeded
        * ``None`` and ``datetime`` inputs are returned unchanged.
        * A Series input yields a Series with the same index and name.
        * An ndarray or list input yields the converted values, wrapped in
          a DatetimeIndex when they are datetime64 and ``box`` is True.
        * Any other input is parsed as a single value; if that fails and
          ``errors`` is not ``'raise'`` the input is returned unchanged.
    """
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _datetime64_fallback(values, original_error):
        # Last resort after a ValueError: try to convert the values via
        # tslib.datetime_to_datetime64; if that fails too, surface the
        # original error rather than the fallback's.
        try:
            converted, tz = tslib.datetime_to_datetime64(values)
            return DatetimeIndex._simple_new(converted, None, tz=tz)
        except (ValueError, TypeError):
            raise original_error

    def _convert_f(arg):
        arg = com._ensure_object(arg)

        try:
            if format is not None:
                result = tslib.array_strptime(arg, format)
            else:
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc,
                                                 dayfirst=dayfirst)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result
        except ValueError as e:
            return _datetime64_fallback(arg, e)

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = arg.values
        if not com.is_datetime64_dtype(values):
            values = _convert_f(values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_dtype(arg):
            # already datetime64: nothing to parse, at most box it
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError as e:
                    return _datetime64_fallback(arg, e)
            return arg

        return _convert_f(arg)

    # single value: falsy input (e.g. an empty string) is passed through,
    # everything else goes to the generic date parser
    try:
        if not arg:
            return arg
        default = datetime(1, 1, 1)
        return parse(arg, dayfirst=dayfirst, default=default)
    except Exception:
        if errors == 'raise':
            raise
        return arg
示例#52
0
文件: tools.py 项目: teja2609/pandas
    def _convert_listlike(arg, box, format):
        """
        Convert a list-like of date-like values to datetimes.

        Besides its arguments this helper reads ``utc``,
        ``infer_datetime_format``, ``dayfirst``, ``coerce``, ``exact``,
        ``errors`` and ``unit`` from the enclosing scope.

        Parameters
        ----------
        arg : list, tuple or array-like
            Lists and tuples are first turned into an object ndarray.
        box : boolean
            If True, datetime64 results are wrapped in a DatetimeIndex.
        format : string or None
            strftime-style format.  When None and ``infer_datetime_format``
            is set, a format is guessed from the data.

        Returns
        -------
        The converted values: a DatetimeIndex when the result is datetime64
        and ``box`` is True, otherwise whatever the parser produced (``arg``
        itself when it is already datetime64[ns], or when an out-of-bounds
        error is ignored).

        Raises
        ------
        ValueError
            If the values cannot be converted and the datetime64 fallback
            fails as well; the original parsing error is re-raised.
        """

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # already datetime64[ns]: nothing to parse, at most box it
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    # boxing failed: hand the input back unboxed
                    pass

            return arg

        arg = com._ensure_object(arg)

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

            if format is not None:
                # There is a special fast-path for iso8601 formatted
                # datetime strings, so in those cases don't use the inferred
                # format because this path makes process slower in this
                # special case
                format_is_iso8601 = ('%Y-%m-%dT%H:%M:%S.%f'.startswith(format)
                                     or
                                     '%Y-%m-%d %H:%M:%S.%f'.startswith(format))
                if format_is_iso8601:
                    format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, coerce=coerce)
                    except Exception:
                        # any ordinary failure is reported as a ValueError;
                        # KeyboardInterrupt/SystemExit are left to propagate
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg,
                                                      format,
                                                      exact=exact,
                                                      coerce=coerce)
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # Only raise this error if the user provided the
                        # datetime format, and not when it was inferred
                        if not infer_datetime_format:
                            raise

            # no explicit format, or the inferred format did not fit:
            # use the general parser
            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc,
                                                 dayfirst=dayfirst,
                                                 coerce=coerce,
                                                 unit=unit)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            # last resort: try the datetime64 conversion; if that fails too,
            # surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#53
0
    def _convert_listlike(arg, box, format, name=None):
        """
        Convert a list-like of date-like values to datetimes.

        Besides its arguments this helper reads ``utc``, ``unit``,
        ``infer_datetime_format``, ``dayfirst``, ``yearfirst``, ``freq``,
        ``exact`` and ``errors`` from the enclosing scope.

        Parameters
        ----------
        arg : list, tuple or array-like
            Lists and tuples are first turned into an object ndarray.
        box : boolean
            If True, datetime64 results are wrapped in a DatetimeIndex.
        format : string or None
            strftime-style format.  When None and ``infer_datetime_format``
            is set, a format is guessed from the data.
        name : object, default None
            Name given to any DatetimeIndex that is created.

        Returns
        -------
        The converted values: a DatetimeIndex when the result is datetime64
        and ``box`` is True, otherwise whatever the parser produced (``arg``
        itself when it is already datetime64[ns], or when a parsing error is
        ignored).

        Raises
        ------
        ValueError
            If the values cannot be converted and the datetime64 fallback
            fails as well; the original parsing error is re-raised.
        """

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg,
                                         tz='utc' if utc else None,
                                         name=name)
                except ValueError:
                    # boxing failed: hand the input back unboxed
                    pass

            return arg
        elif format is None and com.is_integer_dtype(arg) and unit == 'ns':
            # integers taken as nanoseconds can be cast directly
            result = arg.astype('datetime64[ns]')
            if box:
                return DatetimeIndex(result,
                                     tz='utc' if utc else None,
                                     name=name)

            return result

        arg = com._ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = (('%Y-%m-%dT%H:%M:%S.%f'.startswith(format)
                                  or '%Y-%m-%d %H:%M:%S.%f'.startswith(format))
                                 and format != '%Y')
            if format_is_iso8601:
                # a user-supplied ISO format must still be enforced by the
                # general parser; an inferred one need not be
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except Exception:
                        # any ordinary failure is reported as a ValueError;
                        # KeyboardInterrupt/SystemExit are left to propagate
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg,
                                                      format,
                                                      exact=exact,
                                                      errors=errors)
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    freq=freq,
                    unit=unit,
                    require_iso8601=require_iso8601)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result,
                                       tz='utc' if utc else None,
                                       name=name)
            return result

        except ValueError as e:
            # last resort: try the datetime64 conversion; if that fails too,
            # surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#54
0
    def _convert_to_array(self, values, name=None, other=None):
        """
        Convert one operand of a datetime/timedelta operation to array form.

        Parameters
        ----------
        values : scalar or list-like
            The operand; a non list-like value is wrapped in a one-element
            ndarray first.
        name : string, default None
            Name of the operation (e.g. ``'__mul__'``); used to decide
            whether plain integers are acceptable and in error messages.
        other : array-like, default None
            The already converted opposite operand.  Its dtype is used when
            ``values`` holds only nulls.

        Returns
        -------
        The converted values.  Depending on the branch taken this is an
        ndarray or a Series (e.g. for DatetimeIndex/PeriodIndex input).

        Raises
        ------
        TypeError
            For integers outside of division/multiplication, non-absolute
            DateOffsets, floats that are not all null, and any other
            inferred type.
        """
        from pandas.tseries.timedeltas import _possibly_cast_to_timedelta

        coerce = True
        if not is_list_like(values):
            values = np.array([values])
        inferred_type = lib.infer_dtype(values)

        if inferred_type in ('datetime64', 'datetime', 'date', 'time'):
            # if we have an other of timedelta, but use pd.NaT here
            # we are in the wrong path
            if (other is not None and other.dtype == 'timedelta64[ns]'
                    and all(isnull(v) for v in values)):
                # all-null values: build an array of other's dtype filled
                # with iNaT
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = tslib.iNaT

            # a datelike
            elif isinstance(values, pd.DatetimeIndex):
                values = values.to_series()
            elif not (isinstance(values, (pa.Array, pd.Series))
                      and com.is_datetime64_dtype(values)):
                # anything that is not already a datetime64 array/Series
                # goes through the tslib converter
                values = tslib.array_to_datetime(values)
        elif inferred_type in ('timedelta', 'timedelta64'):
            # have a timedelta, convert to ns here
            values = _possibly_cast_to_timedelta(values,
                                                 coerce=coerce,
                                                 dtype='timedelta64[ns]')
        elif inferred_type == 'integer':
            # py3 compat where dtype is 'm' but is an integer
            if values.dtype.kind == 'm':
                values = values.astype('timedelta64[ns]')
            elif isinstance(values, pd.PeriodIndex):
                values = values.to_timestamp().to_series()
            elif name not in ('__truediv__', '__div__', '__mul__'):
                # plain integers are only accepted for division and
                # multiplication
                raise TypeError("incompatible type for a datetime/timedelta "
                                "operation [{0}]".format(name))
        elif isinstance(values[0], pd.DateOffset):
            # handle DateOffsets
            # offsets without a ``delta`` attribute map to None and are
            # rejected below
            os = pa.array([getattr(v, 'delta', None) for v in values])
            mask = isnull(os)
            if mask.any():
                raise TypeError(
                    "cannot use a non-absolute DateOffset in "
                    "datetime/timedelta operations [{0}]".format(', '.join(
                        [com.pprint_thing(v) for v in values[mask]])))
            values = _possibly_cast_to_timedelta(os, coerce=coerce)
        elif inferred_type == 'floating':

            # all nan, so ok, use the other dtype (e.g. timedelta or datetime)
            # NOTE(review): ``other`` defaults to None and is dereferenced
            # here without a check -- confirm callers always pass it when
            # this branch can be reached.
            if isnull(values).all():
                values = np.empty(values.shape, dtype=other.dtype)
                values[:] = tslib.iNaT
            else:
                raise TypeError(
                    'incompatible type [{0}] for a datetime/timedelta '
                    'operation'.format(pa.array(values).dtype))
        else:
            raise TypeError("incompatible type [{0}] for a datetime/timedelta"
                            " operation".format(pa.array(values).dtype))

        return values
示例#55
0
文件: tools.py 项目: perrette/pandas
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result
        except ValueError, e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = arg.values
        if not com.is_datetime64_dtype(values):
            values = _convert_f(values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError, e:
                    try:
                        values, tz = tslib.datetime_to_datetime64(arg)
                        return DatetimeIndex._simple_new(values, None, tz=tz)
                    except (ValueError, TypeError):
示例#56
0
文件: tools.py 项目: tkiran/pandas
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = _convert_f(arg.values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError, e:
                    try:
                        values, tz = lib.datetime_to_datetime64(arg)
                        return DatetimeIndex._simple_new(values, None, tz=tz)
                    except (ValueError, TypeError):
                        raise e
            return arg

        try:
            return _convert_f(arg)
        except ValueError:
            raise
示例#57
0
 def test_compat(self):
     self.assertFalse(is_datetime64_ns_dtype(self.dtype))
     self.assertFalse(is_datetime64_ns_dtype('datetime64[ns, US/Eastern]'))
     self.assertFalse(is_datetime64_dtype(self.dtype))
     self.assertFalse(is_datetime64_dtype('datetime64[ns, US/Eastern]'))
示例#58
0
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : deprecated
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : hint to the hashtable sizer

    Returns
    -------
    labels : the indexer to the original array
    uniques : ndarray (1-d) or Index
        the unique values. Index is returned when passed values is Index or
        Series

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    if order is not None:
        msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926"
        warn(msg, FutureWarning, stacklevel=2)

    from pandas.core.index import Index
    from pandas.core.series import Series
    vals = np.asarray(values)

    # remember the original dtype: the hashing below may work on a
    # different representation of the data
    is_datetime = com.is_datetime64_dtype(vals)
    is_timedelta = com.is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except Exception:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for e in uniques if f(e)], dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        # reverse_indexer[old_position] == new_position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Remap the labels to the sorted positions.  Missing entries carry
        # ``na_sentinel``, which is not a valid position: look them up with
        # wrap-around so that any sentinel value is tolerated, then restore
        # the sentinel itself (not a hard-coded -1).
        mask = labels == na_sentinel
        labels = reverse_indexer.take(labels, mode='wrap')
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    # restore the original datetime-like dtype of the uniques
    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
示例#59
0
    def _convert_listlike(arg, box, format, name=None):
        """
        Convert a list-like of date-like values to datetimes.

        Besides its arguments this helper reads ``utc``, ``unit``,
        ``errors``, ``infer_datetime_format``, ``dayfirst``, ``yearfirst``,
        ``freq`` and ``exact`` from the enclosing scope.

        Parameters
        ----------
        arg : list, tuple or array-like
            Lists and tuples are first turned into an object ndarray.
        box : boolean
            If True, datetime64 results are wrapped in a DatetimeIndex.
        format : string or None
            strftime-style format.  When None and ``infer_datetime_format``
            is set, a format is guessed from the data.
        name : object, default None
            Name given to any DatetimeIndex that is created.

        Returns
        -------
        The converted values: a DatetimeIndex when the result is datetime64
        and ``box`` is True (a plain Index in the ``unit`` path when
        ``errors == 'ignore'``), otherwise whatever the parser produced.

        Raises
        ------
        ValueError
            If both ``format`` and ``unit`` are given, or if the values
            cannot be converted and the datetime64 fallback fails as well
            (the original parsing error is re-raised).
        TypeError
            If ``arg`` has more than one dimension.
        """

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg,
                                         tz='utc' if utc else None,
                                         name=name)
                except ValueError:
                    # boxing failed: hand the input back unboxed
                    pass

            return arg

        elif com.is_datetime64tz_dtype(arg):
            if not isinstance(arg, DatetimeIndex):
                # carry ``name`` through like every other boxing branch
                return DatetimeIndex(arg,
                                     tz='utc' if utc else None,
                                     name=name)
            if utc:
                arg = arg.tz_convert(None).tz_localize('UTC')
            return arg

        elif unit is not None:
            if format is not None:
                raise ValueError("cannot specify both format and unit")
            # unwrap to the underlying values when the input exposes them
            arg = getattr(arg, 'values', arg)
            result = tslib.array_with_unit_to_datetime(arg,
                                                       unit,
                                                       errors=errors)
            if box:
                if errors == 'ignore':
                    from pandas import Index
                    return Index(result)

                return DatetimeIndex(result,
                                     tz='utc' if utc else None,
                                     name=name)
            return result
        elif getattr(arg, 'ndim', 1) > 1:
            raise TypeError('arg must be a string, datetime, list, tuple, '
                            '1-d array, or Series')

        arg = com._ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = _format_is_iso(format)
            if format_is_iso8601:
                # a user-supplied ISO format must still be enforced by the
                # general parser; an inferred one need not be
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except Exception:
                        # any ordinary failure is reported as a ValueError;
                        # KeyboardInterrupt/SystemExit are left to propagate
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg,
                                                      format,
                                                      exact=exact,
                                                      errors=errors)
                    except tslib.OutOfBoundsDatetime:
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    freq=freq,
                    require_iso8601=require_iso8601)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result,
                                       tz='utc' if utc else None,
                                       name=name)
            return result

        except ValueError as e:
            # last resort: try the datetime64 conversion; if that fails too,
            # surface the original parsing error
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e
示例#60
0
def factorize(values, sort=False, order=None, na_sentinel=-1):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order :
        Not used by this function.
    na_sentinel: int, default -1
        Value to mark "not found"

    Returns
    -------
    labels : the indexer to the original array
    uniques : the unique values

    note: an array of Periods will ignore sort as it returns an always sorted
    PeriodIndex
    """
    from pandas.tseries.period import PeriodIndex
    vals = np.asarray(values)
    # remember the original dtype: the hashing below may work on a
    # different representation of the data
    is_datetime = com.is_datetime64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except Exception:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(np.array([e for e in uniques if f(e)], dtype=object))
                for f in [lambda x: not isinstance(x, string_types),
                          lambda x: isinstance(x, string_types)]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        # reverse_indexer[old_position] == new_position after sorting
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        # Remap the labels to the sorted positions.  Missing entries carry
        # ``na_sentinel``, which is not a valid position: look them up with
        # wrap-around so that any sentinel value is tolerated, then restore
        # the sentinel itself (not a hard-coded -1).
        mask = labels == na_sentinel
        labels = reverse_indexer.take(labels, mode='wrap')
        np.putmask(labels, mask, na_sentinel)

        uniques = uniques.take(sorter)

    # restore the original dtype / index type of the uniques
    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    if isinstance(values, PeriodIndex):
        uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

    return labels, uniques