示例#1
0
    def equals(self, other):
        """
        Report whether ``other`` holds the same elements as this Index.

        Identity (``self.is_(other)``) short-circuits to True; otherwise the
        extracted values of both objects are handed to
        ``com.array_equivalent``.
        """
        if not self.is_(other):
            own_values = com._values_from_object(self)
            other_values = com._values_from_object(other)
            return com.array_equivalent(own_values, other_values)

        return True
示例#2
0
文件: ops.py 项目: AlexRobson/pandas
    def wrapper(self, other, axis=None):
        """
        Compare this Series against ``other`` element-wise.

        ``op``, ``na_op`` and ``masker`` are free variables that are not
        defined in this block; presumably they are supplied by the enclosing
        factory function -- TODO confirm.

        Parameters
        ----------
        other : Series, DataFrame, ndarray, Index, Categorical or scalar
        axis : optional
            Only validated through ``self._get_axis_number``; it does not
            otherwise influence the comparison.

        Returns
        -------
        The object built by ``self._constructor`` (Series / ndarray / Index
        operands), ``NotImplemented`` (DataFrame operand), or a bool
        ``pd.Series`` (all remaining operands).

        Raises
        ------
        ValueError
            If ``other`` is a Series, ndarray or Index whose length differs
            from ``len(self)``.
        TypeError
            If ``other`` is a Categorical while ``self`` is not of
            categorical dtype, or if ``na_op`` returns a scalar.
        """
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, pd.Series):
            # values are compared directly; only the length of ``other`` is
            # checked here, and the result keeps this Series' index
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            # a Categorical operand is only accepted when this Series is
            # categorical too; in that case control falls through to the
            # shared code below
            if not com.is_categorical_dtype(self):
                msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                      "If you want to compare values, use 'series <op> np.asarray(other)'."
                raise TypeError(msg.format(op=op,typ=self.dtype))


        # remember which positions are null so they can be overwritten with
        # ``masker`` once the comparison is done
        mask = isnull(self)

        if com.is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray, which would then
            # not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and dispatch to it.
            res = op(self.values, other)
        else:
            values = self.get_values()
            other = _index.convert_scalar(values,_values_from_object(other))

            # datetime64 / timedelta64 values are compared through their
            # int64 view
            if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series'
                                % type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name,
                        dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker

        return res
示例#3
0
    def get_value(self, series, key):
        """
        Locate ``key`` in this index and return the matching entry of the
        values extracted from ``series``.

        Raises InvalidIndexError when ``key`` is not a scalar.
        """
        if is_scalar(key):
            position = self.get_loc(_values_from_object(key))
            return _values_from_object(series)[position]

        raise InvalidIndexError
示例#4
0
    def equals(self, other):
        """
        Report whether ``other`` holds the same elements as this Index.

        Returns True immediately when ``self.is_(other)``; otherwise compares
        the extracted values with ``com.array_equivalent``. A TypeError
        raised during that comparison is treated as "not equal".
        """
        if self.is_(other):
            return True

        try:
            same = com.array_equivalent(com._values_from_object(self),
                                        com._values_from_object(other))
        except TypeError:
            # e.g. fails in numpy 1.6 with DatetimeIndex #1681
            return False
        return same
示例#5
0
文件: common.py 项目: 5i7788/pandas
def maybe_to_datetimelike(data, copy=False):
    """
    Return a DelegatedClass of a Series that is datetimelike (e.g.
    datetime64[ns] dtype or a Series of Periods).

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass
        ``DatetimeProperties`` for datetime64 data, ``PeriodProperties`` for
        period data.

    Raises
    ------
    TypeError
        If ``data`` is not a Series, or is neither datetime64 nor inferred
        as period data.
    """
    msg = "cannot convert an object of type {0} to a datetimelike index"

    if not isinstance(data, Series):
        raise TypeError(msg.format(type(data)))

    index = data.index
    if issubclass(data.dtype.type, np.datetime64):
        return DatetimeProperties(DatetimeIndex(data, copy=copy), index)

    if isinstance(data, PeriodIndex):
        return PeriodProperties(PeriodIndex(data, copy=copy), index)

    # Keep ``data`` bound to the caller's object so the error below reports
    # the type that was actually passed in, not the type of the extracted
    # values.
    values = com._values_from_object(data)
    if lib.infer_dtype(values) == 'period':
        return PeriodProperties(PeriodIndex(values), index)

    raise TypeError(msg.format(type(data)))
示例#6
0
文件: strings.py 项目: APWaldo/pandas
def str_repeat(arr, repeats):
    """
    Repeat every string in the array the requested number of times.

    Parameters
    ----------
    repeats : int or array
        A single count applied to every element (int) or one count per
        element (array)

    Returns
    -------
    repeated : array
    """
    def _multiply(text, count):
        # try the binary type first, fall back to the text type
        try:
            return compat.binary_type.__mul__(text, count)
        except TypeError:
            return compat.text_type.__mul__(text, count)

    if not np.isscalar(repeats):
        counts = np.asarray(repeats, dtype=object)
        return lib.vec_binop(_values_from_object(arr), counts, _multiply)

    def _repeat_fixed(text):
        return _multiply(text, repeats)

    return _na_map(_repeat_fixed, arr)
示例#7
0
    def wrapper(self, other):
        """
        Compare this TimedeltaIndex with ``other``.

        ``opname`` and ``nat_result`` are free variables not defined in this
        block; presumably they come from the enclosing factory -- TODO
        confirm.

        Returns
        -------
        The raw result when it has bool dtype, otherwise ``Index(result)``.

        Raises
        ------
        TypeError
            If ``other`` looks timedelta-convertible but ``_to_m8`` raises
            ValueError, or if ``other`` is neither convertible nor list-like.
        """
        msg = "cannot compare a TimedeltaIndex with type {0}"
        # the comparison itself is delegated to the parent class' method
        func = getattr(super(TimedeltaIndex, self), opname)
        if _is_convertible_to_td(other) or other is NaT:
            try:
                other = _to_m8(other)
            except ValueError:
                # failed to parse as timedelta
                raise TypeError(msg.format(type(other)))
            result = func(other)
            # comparing against a missing scalar: every position gets the
            # NaT result
            if isna(other):
                result.fill(nat_result)
        else:
            if not is_list_like(other):
                raise TypeError(msg.format(type(other)))

            other = TimedeltaIndex(other).values
            result = func(other)
            result = _values_from_object(result)

            # positions where ``other`` holds iNaT (viewed as int64) get the
            # NaT result
            # NOTE(review): ``other`` was just rebound to ``.values``, so the
            # Index branch below looks unreachable -- confirm
            if isinstance(other, Index):
                o_mask = other.values.view('i8') == iNaT
            else:
                o_mask = other.view('i8') == iNaT

            if o_mask.any():
                result[o_mask] = nat_result

        # positions where this index itself is missing also get the NaT
        # result
        if self.hasnans:
            result[self._isnan] = nat_result

        # support of bool dtype indexers
        if is_bool_dtype(result):
            return result
        return Index(result)
示例#8
0
def _maybe_to_sparse(array):
    """
    Normalise ``array``: a sparse series becomes a copied SparseArray, an
    existing SparseArray is returned as-is, and anything else is reduced
    with ``com._values_from_object``.
    """
    candidate = array
    if isinstance(candidate, com.ABCSparseSeries):
        candidate = SparseArray(
            candidate.values,
            sparse_index=candidate.sp_index,
            fill_value=candidate.fill_value,
            copy=True,
        )

    if isinstance(candidate, SparseArray):
        return candidate
    return com._values_from_object(candidate)
示例#9
0
    def wrapper(self, other):
        """
        Compare this TimedeltaIndex with ``other``.

        Scalar-like operands are converted with ``_to_m8``; list-likes are
        converted through ``TimedeltaIndex``. Positions that are missing in
        either operand are set to ``nat_result``. A bool-dtype result is
        returned directly, anything else is wrapped in ``Index``.

        Raises TypeError when ``other`` cannot be interpreted as a
        timedelta or a list-like.
        """
        def _bad_type(obj):
            # build the TypeError for an operand that cannot be compared
            template = "cannot compare a {cls} with type {typ}"
            return TypeError(template.format(cls=type(self).__name__,
                                             typ=type(obj).__name__))

        func = getattr(super(TimedeltaIndex, self), opname)

        if _is_convertible_to_td(other) or other is NaT:
            try:
                other = _to_m8(other)
            except ValueError:
                # failed to parse as timedelta
                raise _bad_type(other)
            result = func(other)
            if isna(other):
                result.fill(nat_result)
        elif is_list_like(other):
            other = TimedeltaIndex(other).values
            result = com._values_from_object(func(other))

            other_missing = np.array(isna(other))
            if other_missing.any():
                result[other_missing] = nat_result
        else:
            raise _bad_type(other)

        if self.hasnans:
            result[self._isnan] = nat_result

        # support of bool dtype indexers
        return result if is_bool_dtype(result) else Index(result)
示例#10
0
文件: tdi.py 项目: AbnerZheng/pandas
    def wrapper(self, other):
        """
        Compare this TimedeltaIndex with ``other``.

        ``opname`` and ``nat_result`` are free variables not defined in this
        block; presumably they come from the enclosing factory -- TODO
        confirm.

        Returns
        -------
        The raw result when it has bool dtype, otherwise ``Index(result)``.

        Raises
        ------
        TypeError
            If ``other`` is neither timedelta-convertible nor list-like.
        """
        # the comparison itself is delegated to the parent class' method
        func = getattr(super(TimedeltaIndex, self), opname)
        if _is_convertible_to_td(other):
            other = _to_m8(other)
            result = func(other)
            # comparing against a null scalar: every position gets the NaT
            # result
            if com.isnull(other):
                result.fill(nat_result)
        else:
            if not com.is_list_like(other):
                raise TypeError("cannot compare a TimedeltaIndex with type "
                                "{0}".format(type(other)))

            other = TimedeltaIndex(other).values
            result = func(other)
            result = _values_from_object(result)

            # positions where ``other`` holds iNaT (viewed as int64) get the
            # NaT result
            # NOTE(review): ``other`` was just rebound to ``.values``, so the
            # Index branch below looks unreachable -- confirm
            if isinstance(other, Index):
                o_mask = other.values.view('i8') == tslib.iNaT
            else:
                o_mask = other.view('i8') == tslib.iNaT

            if o_mask.any():
                result[o_mask] = nat_result

        # positions where this index itself is missing also get the NaT
        # result
        if self.hasnans:
            result[self._isnan] = nat_result

        # support of bool dtype indexers
        if com.is_bool_dtype(result):
            return result
        return Index(result)
示例#11
0
文件: nanops.py 项目: X1mengYu/pandas
def _get_values(values, skipna, fill_value=None, fill_value_typ=None, isfinite=False, copy=True):
    """ Return ``(values, mask, dtype)`` for a reduction.

        ``mask`` comes from ``_isfinite`` when ``isfinite`` is set, otherwise
        from ``isnull``. With ``skipna`` the masked positions are overwritten
        with the fill value (upcasting when the dtype cannot hold it).
        ``copy=True`` makes the function work on a copy of the values.
        ``dtype`` is the dtype observed before any filling. """
    values = _values_from_object(values)
    mask = _isfinite(values) if isfinite else isnull(values)

    dtype = values.dtype
    dtype_ok = _na_ok_dtype(dtype)

    # resolve the fill value (it may need a dtype other than ``dtype``)
    fill_value = _get_fill_value(dtype, fill_value=fill_value, fill_value_typ=fill_value_typ)

    if copy:
        values = values.copy()

    if skipna:
        if dtype_ok:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = com._maybe_upcast_putmask(values, mask, fill_value)

    return _view_if_needed(values), mask, dtype
示例#12
0
    def wrapper(self, other):
        """
        Compare this timedelta array-like with ``other``.

        ``opname`` and ``nat_result`` are free variables not defined in this
        block; presumably they come from the enclosing factory -- TODO
        confirm.

        Returns
        -------
        The comparison result, with ``nat_result`` written at every position
        that is missing in either operand.

        Raises
        ------
        TypeError
            If ``other`` looks timedelta-convertible but ``_to_m8`` raises
            ValueError, or if ``other`` is neither convertible nor list-like.
        """
        msg = "cannot compare a {cls} with type {typ}"
        # the comparison is delegated to the unbound mixin method, called
        # with ``self`` explicitly
        meth = getattr(dtl.DatetimeLikeArrayMixin, opname)
        if _is_convertible_to_td(other) or other is NaT:
            try:
                other = _to_m8(other)
            except ValueError:
                # failed to parse as timedelta
                raise TypeError(msg.format(cls=type(self).__name__,
                                           typ=type(other).__name__))
            result = meth(self, other)
            # comparing against a missing scalar: every position gets the
            # NaT result
            if isna(other):
                result.fill(nat_result)

        elif not is_list_like(other):
            raise TypeError(msg.format(cls=type(self).__name__,
                                       typ=type(other).__name__))
        else:
            # coerce the list-like through this object's own type first
            other = type(self)(other).values
            result = meth(self, other)
            result = com._values_from_object(result)

            # positions missing in ``other`` get the NaT result
            o_mask = np.array(isna(other))
            if o_mask.any():
                result[o_mask] = nat_result

        # positions missing in ``self`` get the NaT result as well
        if self.hasnans:
            result[self._isnan] = nat_result

        return result
示例#13
0
文件: base.py 项目: cldy/pandas
 def duplicated(self, keep="first"):
     """
     Flag duplicated entries of ``self.values``.

     The flags come from ``lib.duplicated`` and are wrapped with
     ``self._constructor`` on ``self.index``; if that wrapping raises
     AttributeError a plain boolean ndarray is returned instead.
     """
     object_values = com._ensure_object(self.values)
     flags = lib.duplicated(com._values_from_object(object_values), keep=keep)
     try:
         wrapped = self._constructor(flags, index=self.index)
         return wrapped.__finalize__(self)
     except AttributeError:
         return np.array(flags, dtype=bool)
示例#14
0
def _possibly_cast_to_timedelta(value, coerce=True, dtype=None):
    """ Try to cast ``value`` to timedelta64.

        If ``value`` is already timedelta-like, make sure it is [ns] (numpy
        1.6.2 is very buggy in this regard). Do not force the conversion
        unless ``coerce`` is True.

        If ``dtype`` is passed then this is the target dtype.

        Parameters
        ----------
        value : Series, ndarray or other object
        coerce : boolean, default True
            attempt a conversion through ``tslib.array_to_timedelta64``
        dtype : optional
            only consulted for int64 input: when it is 'timedelta64[ns]'
            the integers are cast directly

        Returns
        -------
        The converted value, or ``value`` unchanged when no conversion
        applied.
        """

    # deal with numpy not being able to handle certain timedelta operations
    if isinstance(value, (ABCSeries, np.ndarray)):

        # i8 conversions
        if value.dtype == 'int64' and np.dtype(dtype) == 'timedelta64[ns]':
            value = value.astype('timedelta64[ns]')
            return value
        elif value.dtype.kind == 'm':
            # already a timedelta dtype: only normalise the unit to [ns]
            if value.dtype != 'timedelta64[ns]':
                value = value.astype('timedelta64[ns]')
            return value

    # we don't have a timedelta, but we want to try to convert to one (but
    # don't force it)
    if coerce:
        new_value = tslib.array_to_timedelta64(
            _values_from_object(value).astype(object), coerce=False)
        # only adopt the converted array when it came back as int64
        if new_value.dtype == 'i8':
            value = np.array(new_value, dtype='timedelta64[ns]')

    return value
示例#15
0
def _possibly_cast_to_timedelta(value, coerce=True):
    """ Try to cast ``value`` to timedelta64.

        If ``value`` is already timedelta-like, make sure it is [ns] (numpy
        1.6.2 is very buggy in this regard). Do not force the conversion
        unless ``coerce`` is True.

        If ``coerce='compat'``, force a compatibility coercion (to
        timedeltas) if needed; this path is only taken when
        ``_np_version_under1p7`` is true.
        """

    # coercion compatibility
    if coerce == 'compat' and _np_version_under1p7:

        def convert(td, dtype):
            # turn a single element into its integer nanosecond value

            # we have an array with a non-object dtype
            if hasattr(td,'item'):
                td = td.astype(np.int64).item()
                if td == tslib.iNaT:
                    return td
                # microsecond-resolution input is scaled up to nanoseconds
                if dtype == 'm8[us]':
                    td *= 1000
                return td

            if td == tslib.compat_NaT:
                return tslib.iNaT

            # convert td value to a nanosecond value
            d = td.days
            s = td.seconds
            us = td.microseconds

            if dtype == 'object' or dtype == 'm8[ns]':
                td = 1000*us + (s + d * 24 * 3600) * 10 ** 9
            else:
                raise ValueError("invalid conversion of dtype in np < 1.7 [%s]" % dtype)

            return td

        # < 1.7 coercion
        if not is_list_like(value):
            value = np.array([ value ])

        dtype = value.dtype
        return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]')

    # deal with numpy not being able to handle certain timedelta operations
    if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm':
        # already a timedelta dtype: only normalise the unit to [ns]
        if value.dtype != 'timedelta64[ns]':
            value = value.astype('timedelta64[ns]')
        return value

    # we don't have a timedelta, but we want to try to convert to one (but
    # don't force it)
    if coerce:
        new_value = tslib.array_to_timedelta64(
            _values_from_object(value).astype(object), coerce=False)
        # only adopt the converted array when it came back as int64
        if new_value.dtype == 'i8':
            value = np.array(new_value, dtype='timedelta64[ns]')

    return value
示例#16
0
    def wrapper(self, other):
        """
        Compare this Series against ``other`` element-wise.

        ``op``, ``na_op`` and ``masker`` are free variables that are not
        defined in this block; presumably they are supplied by the enclosing
        factory function -- TODO confirm.

        Returns
        -------
        The object built by ``self._constructor`` (Series / array / Index
        operands), ``NotImplemented`` (DataFrame operand), or a bool
        ``pd.Series`` (Categorical and scalar operands).

        Raises
        ------
        ValueError
            If ``other`` is a Series, array or Index whose length differs
            from ``len(self)``.
        TypeError
            If ``other`` is a Categorical while ``self`` is not of
            categorical dtype, or if ``na_op`` returns a scalar.
        """
        if isinstance(other, pd.Series):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (pa.Array, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            if not com.is_categorical_dtype(self):
                msg = "Cannot compare a Categorical for op {op} with Series of dtype {typ}.\n"\
                      "If you want to compare values, use 'series <op> np.asarray(other)'."
                raise TypeError(msg.format(op=op,typ=self.dtype))

            # Both operands are categorical. Previously this case fell off
            # the end of the function and returned None; dispatch straight
            # to ``op`` so the categories' ordering is honoured.
            mask = isnull(self)
            res = op(self.values, other)
        else:

            mask = isnull(self)

            values = self.get_values()
            other = _index.convert_scalar(values,_values_from_object(other))

            # datetime64 / timedelta64 values are compared through their
            # int64 view
            if issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series'
                                % type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name,
                        dtype='bool')

        # mask out the invalids
        if mask.any():
            res[mask] = masker

        return res
示例#17
0
文件: strings.py 项目: 5i7788/pandas
def _get_array_list(arr, others):
    if len(others) and isinstance(_values_from_object(others)[0],
                                  (list, np.ndarray, Series)):
        arrays = [arr] + list(others)
    else:
        arrays = [arr, others]

    return [np.asarray(x, dtype=object) for x in arrays]
示例#18
0
文件: ops.py 项目: Alias4bb/pandas
    def wrapper(self, other, axis=None):
        """
        Compare this Series against ``other`` element-wise.

        ``op`` and ``na_op`` are free variables that are not defined in this
        block; presumably they are supplied by the enclosing factory
        function -- TODO confirm.

        Parameters
        ----------
        other : Series, DataFrame, ndarray, Index, Categorical, list or
            scalar
        axis : optional
            Only validated through ``self._get_axis_number``; it does not
            otherwise influence the comparison.

        Returns
        -------
        The object built by ``self._constructor`` (Series / ndarray / Index
        operands), ``NotImplemented`` (DataFrame operand), or a bool
        ``pd.Series`` (all remaining operands).

        Raises
        ------
        ValueError
            If ``other`` is a Series, or a non-zero-dim ndarray / Index,
            whose length differs from ``len(self)``.
        TypeError
            If ``other`` is a Categorical while ``self`` is not of
            categorical dtype, or if ``na_op`` returns a scalar.
        """
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, ABCSeries):
            # values are compared directly; only the length of ``other`` is
            # checked here, and the result keeps this Series' index
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            # do not check length of zerodim array
            # as it will broadcast
            if (not lib.isscalar(lib.item_from_zerodim(other)) and
                    len(self) != len(other)):
                raise ValueError('Lengths must match to compare')

            if isinstance(other, ABCPeriodIndex):
                # temp workaround until fixing GH 13637
                # tested in test_nat_comparisons
                # (pandas.tests.series.test_operators.TestSeriesOperators)
                return self._constructor(na_op(self.values,
                                               other.asobject.values),
                                         index=self.index)

            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        elif isinstance(other, pd.Categorical):
            # a Categorical operand is only accepted when this Series is
            # categorical too; in that case control falls through to the
            # shared code below
            if not is_categorical_dtype(self):
                msg = ("Cannot compare a Categorical for op {op} with Series "
                       "of dtype {typ}.\nIf you want to compare values, use "
                       "'series <op> np.asarray(other)'.")
                raise TypeError(msg.format(op=op, typ=self.dtype))

        if is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray,
            # which would then not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and
            # dispatch to it.
            res = op(self.values, other)
        else:
            values = self.get_values()
            # lists are turned into ndarrays before the comparison
            if isinstance(other, (list, np.ndarray)):
                other = np.asarray(other)

            res = na_op(values, other)
            if isscalar(res):
                raise TypeError('Could not compare %s type with Series' %
                                type(other))

            # always return a full value series here
            res = _values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
        return res
示例#19
0
    def convert_value(self, v):
        """ convert the expression that is in the term to something that is
        accepted by pytables

        The conversion is chosen from ``self.kind`` / ``self.meta`` (both
        decoded first) and from the runtime type of ``v``. Every branch
        returns a ``TermValue``; values that match no specific kind end up
        as a ``'string'`` TermValue.
        """

        def stringify(value):
            # render ``value`` as text, honouring ``self.encoding`` if set
            if self.encoding is not None:
                encoder = partial(pprint_thing_encoded,
                                  encoding=self.encoding)
            else:
                encoder = pprint_thing
            return encoder(value)

        kind = _ensure_decoded(self.kind)
        meta = _ensure_decoded(self.meta)
        if kind == u('datetime64') or kind == u('datetime'):
            # numbers are stringified before being parsed as a Timestamp
            if isinstance(v, (int, float)):
                v = stringify(v)
            v = _ensure_decoded(v)
            v = pd.Timestamp(v)
            # tz-aware timestamps are converted to UTC
            if v.tz is not None:
                v = v.tz_convert('UTC')
            return TermValue(v, v.value, kind)
        elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or
                kind == u('date')):
            # anything exposing ``timetuple`` goes through time.mktime
            v = time.mktime(v.timetuple())
            return TermValue(v, pd.Timestamp(v), kind)
        elif kind == u('timedelta64') or kind == u('timedelta'):
            v = _coerce_scalar_to_timedelta_type(v, unit='s').value
            return TermValue(int(v), v, kind)
        elif meta == u('category'):
            # categories are represented by their position in the metadata
            metadata = com._values_from_object(self.metadata)
            result = metadata.searchsorted(v, side='left')

            # result returns 0 if v is first element or if v is not in metadata
            # check that metadata contains v
            if not result and v not in metadata:
                result = -1
            return TermValue(result, result, u('integer'))
        elif kind == u('integer'):
            # go through float first, then truncate to int
            v = int(float(v))
            return TermValue(v, v, kind)
        elif kind == u('float'):
            v = float(v)
            return TermValue(v, v, kind)
        elif kind == u('bool'):
            # strings are True unless they spell one of the listed
            # "false-like" tokens (compared stripped and lower-cased)
            if isinstance(v, string_types):
                v = not v.strip().lower() in [u('false'), u('f'), u('no'),
                                              u('n'), u('none'), u('0'),
                                              u('[]'), u('{}'), u('')]
            else:
                v = bool(v)
            return TermValue(v, v, kind)
        elif not isinstance(v, string_types):
            # non-string values of any other kind are rendered as text
            v = stringify(v)
            return TermValue(v, stringify(v), u('string'))

        # string quoting
        return TermValue(v, stringify(v), u('string'))
示例#20
0
    def get_value(self, series, key):
        """
        Look ``key`` up in this index and return the matching part of
        ``series``.

        A scalar (or None) hit is returned directly; any other hit is
        wrapped in a Series indexed by the matching slice of this index with
        the levels for ``key`` dropped. Raises InvalidIndexError when ``key``
        is not a scalar.
        """
        if not lib.isscalar(key):
            raise InvalidIndexError

        from pandas.core.indexing import maybe_droplevels
        from pandas.core.series import Series

        label = com._values_from_object(key)
        position = self.get_loc(label)
        picked = com._values_from_object(series)[position]

        is_single = lib.isscalar(picked) or picked is None
        if not is_single:
            sub_index = maybe_droplevels(self[position], label)
            return Series(picked, index=sub_index, name=series.name)

        return picked
示例#21
0
文件: ops.py 项目: giang12/pandas
 def indices(self):
     """ dict {group name -> group indices}

     With a single grouping its own ``indices`` are returned; otherwise the
     mapping is built by ``get_indexer_dict`` from every grouping's labels
     and group-index values.
     """
     if len(self.groupings) != 1:
         label_list = [grouping.labels for grouping in self.groupings]
         keys = [com._values_from_object(grouping.group_index)
                 for grouping in self.groupings]
         return get_indexer_dict(label_list, keys)

     return self.groupings[0].indices
示例#22
0
文件: nanops.py 项目: mficek/pandas
def nanskew(values, axis=None, skipna=True):
    """ Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : array-like
        reduced with ``_values_from_object``; non-float input is cast to
        'f8'
    axis : int, optional
        axis along which the sums are taken
    skipna : boolean, default True
        when True, masked (NA) entries are replaced with 0 before summing

    Returns
    -------
    ndarray or scalar
        NaN where fewer than 3 values were counted, 0 where the second
        central moment is 0.
    """

    values = _values_from_object(values)
    mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        # work on a copy so the caller's data is not modified by putmask
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        # restore the reduced axis so ``mean`` broadcasts against ``values``
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        # masked entries must not contribute to the central moments
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted ** 2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_skew follow this behavior
    # to fix the fperr to treat m2 <1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    # division by zero / invalid values are handled explicitly below
    with np.errstate(invalid='ignore', divide='ignore'):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2 ** 1.5)

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
        return result
    else:
        result = 0 if m2 == 0 else result
        if count < 3:
            return np.nan
        return result
示例#23
0
文件: ops.py 项目: Brajen259/pandas
    def wrapper(self, other):
        """
        Compare this Series against ``other`` element-wise.

        ``na_op`` and ``masker`` are free variables that are not defined in
        this block; presumably they are supplied by the enclosing factory
        function -- TODO confirm.

        Returns
        -------
        The object built by ``self._constructor`` (Series / array / Index
        operands), ``NotImplemented`` (DataFrame operand), or a bool
        ``pd.Series`` (all remaining operands).

        Raises
        ------
        ValueError
            If ``other`` is a Series, array or Index whose length differs
            from ``len(self)``.
        TypeError
            If ``na_op`` returns a scalar.
        """
        if isinstance(other, pd.Series):
            # values are compared directly; only the length of ``other`` is
            # checked here, and the result keeps this Series' index
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index, name=name)
        elif isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented
        # NOTE(review): pd.Series instances are already handled by the first
        # branch, so listing pd.Series here looks redundant -- confirm
        elif isinstance(other, (pa.Array, pd.Series, pd.Index)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)
        else:

            # remember which positions are null so they can be overwritten
            # with ``masker`` once the comparison is done
            mask = isnull(self)

            values = self.get_values()
            other = _index.convert_scalar(values,_values_from_object(other))

            # datetime64 values are compared through their int64 view
            if issubclass(values.dtype.type, np.datetime64):
                values = values.view('i8')

            # scalars
            res = na_op(values, other)
            if np.isscalar(res):
                raise TypeError('Could not compare %s type with Series'
                                % type(other))

            # always return a full value series here
            res = _values_from_object(res)

            res = pd.Series(res, index=self.index, name=self.name,
                            dtype='bool')

            # mask out the invalids
            if mask.any():
                res[mask] = masker

            return res
示例#24
0
文件: ops.py 项目: jess010/pandas
    def na_op(x, y):
        """
        Apply the comparison to ``x`` and ``y`` with NA-aware handling.

        ``op``, ``name`` and ``masker`` are free variables that are not
        defined in this block; presumably they are supplied by the enclosing
        factory function -- TODO confirm.

        Raises
        ------
        TypeError
            For a datetime-like vs numeric comparison, or when the
            comparison method returns ``NotImplemented``.
        """

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x, y)
        elif is_categorical_dtype(y) and not is_scalar(y):
            return op(y, x)

        if is_object_dtype(x.dtype):
            result = _comp_method_OBJECT_ARRAY(op, x, y)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            # (a missing scalar is unequal to everything: all True for
            # '__ne__', all False for every other comparison)
            if is_scalar(y) and isna(y):
                if name == '__ne__':
                    return np.ones(len(x), dtype=bool)
                else:
                    return np.zeros(len(x), dtype=bool)

            # we have a datetime/timedelta and may need to convert
            mask = None
            if (needs_i8_conversion(x) or
                    (not is_scalar(y) and needs_i8_conversion(y))):

                if is_scalar(y):
                    mask = isna(x)
                    y = libindex.convert_scalar(x, com._values_from_object(y))
                else:
                    mask = isna(x) | isna(y)
                    y = y.view('i8')
                x = x.view('i8')

            try:
                # floating point warnings are suppressed for the comparison
                with np.errstate(all='ignore'):
                    result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                # ``x`` has no method called ``name``; use the operator
                result = op(x, y)

            # positions that were missing before the i8 view get ``masker``
            if mask is not None and mask.any():
                result[mask] = masker

        return result
示例#25
0
def _get_codes_for_values(values, categories):
    """
    Utility routine that turns ``values`` into codes for the given
    ``categories``.

    When the two dtypes differ both inputs are converted to object first.
    The categories are loaded into a hash table and ``values`` are looked up
    in it; the lookup result is returned as platform ints.
    """

    from pandas.core.algorithms import _get_data_algo, _hashtables

    dtypes_differ = values.dtype != categories.dtype
    if dtypes_differ:
        values = com._ensure_object(values)
        categories = com._ensure_object(categories)

    (table_cls, _unused_vec), _unused_vals = _get_data_algo(values,
                                                            _hashtables)
    table = table_cls(len(categories))
    table.map_locations(com._values_from_object(categories))
    codes = table.lookup(values)
    return com._ensure_platform_int(codes)
示例#26
0
文件: merge.py 项目: clamus/pandas
def _factorize_keys(lk, rk, sort=True):
    """
    Factorize the left and right keys with one shared factorizer.

    Parameters
    ----------
    lk, rk : array-like
        left and right key values
    sort : boolean, default True
        when True the labels are re-mapped through ``_sort_labels`` using
        the factorizer's uniques

    Returns
    -------
    llab, rlab : labels for the left and right keys
    count : number of groups, including one extra NA group when either side
        contains a -1 label
    """
    # tz-aware datetimes on both sides: work on their ``.values``
    if com.is_datetime64tz_dtype(lk) and com.is_datetime64tz_dtype(rk):
        lk = lk.values
        rk = rk.values
    # int / datetime keys use the int64 factorizer, everything else falls
    # back to the object-based one
    if com.is_int_or_datetime_dtype(lk) and com.is_int_or_datetime_dtype(rk):
        klass = _hash.Int64Factorizer
        lk = com._ensure_int64(com._values_from_object(lk))
        rk = com._ensure_int64(com._values_from_object(rk))
    else:
        klass = _hash.Factorizer
        lk = com._ensure_object(lk)
        rk = com._ensure_object(rk)

    # the same factorizer instance is used for both sides
    rizer = klass(max(len(lk), len(rk)))

    llab = rizer.factorize(lk)
    rlab = rizer.factorize(rk)

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    # (labels equal to -1 are moved into one extra group numbered ``count``)
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    return llab, rlab, count
示例#27
0
文件: util.py 项目: MattRijk/pandas
def cartesian_product(X):
    """
    Numpy version of itertools.product or pandas.compat.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : list-like of list-likes

    Returns
    -------
    product : list of ndarrays

    Raises
    ------
    TypeError
        If ``X`` is not list-like or any of its elements is not list-like.

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
    array([1, 2, 1, 2, 1, 2])]

    See also
    --------
    itertools.product : Cartesian product of input iterables.  Equivalent to
        nested for-loops.
    pandas.compat.product : An alias for itertools.product.
    """
    msg = "Input must be a list-like of list-likes"
    if not is_list_like(X):
        raise TypeError(msg)
    for x in X:
        if not is_list_like(x):
            raise TypeError(msg)

    if len(X) == 0:
        return []

    lenX = np.fromiter((len(x) for x in X), dtype=int)
    # np.cumprod / np.prod are used instead of the np.cumproduct /
    # np.product aliases, which no longer exist in NumPy 2.0
    cumprodX = np.cumprod(lenX)

    # a[i]: how many times the repeated i-th factor is tiled
    # (product of the lengths of all preceding factors)
    a = np.roll(cumprodX, 1)
    a[0] = 1

    # b[i]: how many times each element of the i-th factor is repeated
    # (product of the lengths of all following factors)
    if cumprodX[-1] != 0:
        b = cumprodX[-1] / cumprodX
    else:
        # if any factor is empty, the cartesian product is empty
        b = np.zeros_like(cumprodX)

    return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]),
                    np.prod(a[i]))
            for i, x in enumerate(X)]
示例#28
0
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Tally how often each distinct non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True, divide the counts by the number of input values

    Returns
    -------
    value_counts : Series
        Counts indexed by the distinct values.
    """
    from pandas.core.series import Series

    values = np.asarray(values)
    value_dtype = values.dtype

    if com.is_integer_dtype(value_dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(value_dtype.type, (np.datetime64, np.timedelta64)):
        # count on the int64 view, then restore the incoming dtype on the keys
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)
        keys = Series(keys, dtype=value_dtype)
    else:
        null_mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, null_mask)

    result = Series(counts, index=com._values_from_object(keys))

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
示例#29
0
    def get_value(self, series, key):
        """
        Fast lookup of value from 1-dimensional ndarray. Only use this if you
        know what you're doing

        The key is first resolved as a label through ``get_loc``; when that
        raises ``KeyError`` or ``TypeError`` the lookup is delegated to the
        parent class implementation.
        """
        try:
            label = self._convert_scalar_indexer(
                com._values_from_object(key), kind='getitem')
            position = self.get_loc(label)
            return series.iloc[position]
        except (KeyError, TypeError):
            # label lookup failed; fall through to the parent implementation
            pass

        # we might be a positional indexer
        return super(CategoricalIndex, self).get_value(series, key)
示例#30
0
文件: nanops.py 项目: B-Rich/pandas
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True):
    """ utility to get the values view, mask, dtype
        if necessary copy and mask using the specified fill_value
        copy = True will force the copy

        Returns a 4-tuple ``(values, mask, dtype, dtype_max)`` where
        ``dtype`` is the dtype of the unwrapped input and ``dtype_max`` is
        a platform independent accumulator dtype (``np.int64`` for integer
        and boolean input, ``np.float64`` for float input, otherwise
        ``dtype`` itself). """
    values = _values_from_object(values)
    if isfinite:
        mask = _isfinite(values)
    else:
        mask = isnull(values)

    dtype = values.dtype
    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    if skipna:
        if copy:
            values = values.copy()
        if dtype_ok:
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, _ = com._maybe_upcast_putmask(values, mask, fill_value)

    elif copy:
        values = values.copy()

    values = _view_if_needed(values)

    # return a platform independent precision dtype
    #
    # The dtype kind alone identifies integer ('i') and boolean ('b') data,
    # so no issubclass checks against np.bool are needed; that alias was
    # removed in NumPy 1.24 and made this function raise AttributeError.
    dtype_max = dtype
    if dtype.kind in ('i', 'b'):
        dtype_max = np.int64
    elif dtype.kind == 'f':
        dtype_max = np.float64

    return values, mask, dtype, dtype_max
示例#31
0
    def get_value(self, series, key):
        """
        Fast lookup of value from 1-dimensional ndarray. Only use this if you
        know what you're doing

        A direct lookup through the parent class is attempted first.  If it
        raises ``KeyError`` or ``IndexError``, ``key`` is parsed as a time
        string and resolved either to a slice of ``series`` (key resolution
        coarser than the index frequency) or to a single period ordinal
        (same resolution).  If parsing raises ``TypeError``, ``key`` is
        converted with ``Period(key, self.freq)`` instead.

        Raises ``KeyError`` when the parsed key lies outside the range of
        the index values or has a finer resolution than the index.
        """
        s = com._values_from_object(series)
        try:
            return com._maybe_box(self,
                                  super(PeriodIndex, self).get_value(s, key),
                                  series, key)
        except (KeyError, IndexError):
            try:
                asdt, parsed, reso = parse_time_string(key, self.freq)
                # frequency-group codes of the key and of this index; they
                # are compared below to decide between slice and scalar
                grp = frequencies.Resolution.get_freq_group(reso)
                freqn = frequencies.get_freq_group(self.freq)

                vals = self._values

                # if our data is higher resolution than requested key, slice
                if grp < freqn:
                    iv = Period(asdt, freq=(grp, 1))
                    # first and last ordinal covered by the key, expressed
                    # at the frequency of this index
                    ord1 = iv.asfreq(self.freq, how='S').ordinal
                    ord2 = iv.asfreq(self.freq, how='E').ordinal

                    # NOTE(review): comparing against vals[0] / vals[-1]
                    # presumably assumes the index is sorted -- confirm
                    if ord2 < vals[0] or ord1 > vals[-1]:
                        raise KeyError(key)

                    pos = np.searchsorted(self._values, [ord1, ord2])
                    key = slice(pos[0], pos[1] + 1)
                    return series[key]
                elif grp == freqn:
                    key = Period(asdt, freq=self.freq).ordinal
                    return com._maybe_box(self, self._engine.get_value(s, key),
                                          series, key)
                else:
                    raise KeyError(key)
            except TypeError:
                # only TypeError is swallowed here; the KeyErrors raised
                # above propagate to the caller
                pass

            key = Period(key, self.freq).ordinal
            return com._maybe_box(self, self._engine.get_value(s, key), series,
                                  key)
示例#32
0
def _get_values(values, skipna, fill_value=None, fill_value_typ=None,
                isfinite=False, copy=True):
    """ unwrap ``values`` and build the mask used by the nan-aware reductions

        when ``skipna`` is set the masked positions are overwritten with the
        resolved fill value (upcasting if the dtype cannot hold it)
        copy = True will force the copy

        returns the tuple (values, mask, dtype, dtype_max) """
    values = _values_from_object(values)
    mask = _isfinite(values) if isfinite else isnull(values)

    dtype = values.dtype
    na_ok = _na_ok_dtype(dtype)

    # resolve the fill value (an alternative dtype may be needed for it)
    fill_value = _get_fill_value(dtype, fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    # the copy is taken whenever requested, with or without skipna
    if copy:
        values = values.copy()

    if skipna:
        if na_ok:
            np.putmask(values, mask, fill_value)
        else:
            # promote if needed
            values, _ = _maybe_upcast_putmask(values, mask, fill_value)

    values = _view_if_needed(values)

    # platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max
示例#33
0
    def wrapper(self, other):
        # Comparison method built around the DatetimeLikeArrayMixin method
        # named by the enclosing scope's ``opname``.  Positions that are
        # missing in either operand are set to the enclosing scope's
        # ``nat_result``.
        meth = getattr(dtl.DatetimeLikeArrayMixin, opname)

        if isinstance(other, (datetime, np.datetime64, compat.string_types)):
            if isinstance(other, datetime):
                # GH#18435 strings get a pass from tzawareness compat
                self._assert_tzawareness_compat(other)

            other = _to_m8(other, tz=self.tz)
            result = meth(self, other)
            # a missing scalar makes every position compare as nat_result
            if isna(other):
                result.fill(nat_result)
        else:
            if isinstance(other, list):
                # wrap lists in the same type as self before comparing
                other = type(self)(other)
            elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
                # Following Timestamp convention, __eq__ is all-False
                # and __ne__ is all True, others raise TypeError.
                if opname == '__eq__':
                    return np.zeros(shape=self.shape, dtype=bool)
                elif opname == '__ne__':
                    return np.ones(shape=self.shape, dtype=bool)
                raise TypeError('%s type object %s' %
                                (type(other), str(other)))

            if is_datetimelike(other):
                self._assert_tzawareness_compat(other)

            result = meth(self, np.asarray(other))
            result = com._values_from_object(result)

            # Make sure to pass an array to result[...]; indexing with
            # Series breaks with older version of numpy
            o_mask = np.array(isna(other))
            if o_mask.any():
                result[o_mask] = nat_result

        # missing values in self override whatever was computed above
        if self.hasnans:
            result[self._isnan] = nat_result

        return result
示例#34
0
    def wrapper(self, other):
        """
        Compare this Series with ``other`` using the enclosing ``na_op``.

        Series and array operands must have the same length as ``self``
        (``ValueError`` otherwise); DataFrame operands return
        ``NotImplemented``.  Anything else is treated as a scalar, and the
        positions that are null in ``self`` are set to the enclosing
        ``masker`` value in the boolean result.
        """
        if isinstance(other, pd.Series):
            name = _maybe_match_name(self, other)
            if len(self) != len(other):
                raise ValueError('Series lengths must match to compare')
            compared = na_op(self.values, other.values)
            return self._constructor(compared, index=self.index, name=name)

        if isinstance(other, pd.DataFrame):  # pragma: no cover
            return NotImplemented

        if isinstance(other, (pa.Array, pd.Series)):
            if len(self) != len(other):
                raise ValueError('Lengths must match to compare')
            compared = na_op(self.values, np.asarray(other))
            return self._constructor(compared,
                                     index=self.index).__finalize__(self)

        # everything below handles a scalar operand
        null_mask = isnull(self)

        values = self.values
        other = _index.convert_scalar(values, other)

        if issubclass(values.dtype.type, np.datetime64):
            values = values.view('i8')

        res = na_op(values, other)
        if np.isscalar(res):
            raise TypeError('Could not compare %s type with Series'
                            % type(other))

        # always return a full value series here
        res = pd.Series(_values_from_object(res), index=self.index,
                        name=self.name, dtype='bool')

        # mask out the invalids
        if null_mask.any():
            res[null_mask] = masker

        return res
示例#35
0
    def na_op(x, y):
        """
        Apply the enclosing ``op`` to ``x`` and ``y``.

        The expression evaluator is tried first; if it raises ``TypeError``
        the op is applied only where the operands are not null and the
        remaining positions are filled with NaN (upcasting if needed).
        Raises ``TypeError`` when neither operand is array-like.
        """
        try:
            result = expressions.evaluate(op, str_rep, x, y,
                                          raise_on_error=True, **eval_kwargs)
        except TypeError:
            y_is_arraylike = isinstance(y, (np.ndarray, pd.Series, pd.Index))
            if not y_is_arraylike and not isinstance(x, np.ndarray):
                raise TypeError(
                    "{typ} cannot perform the operation {op}".format(
                        typ=type(x).__name__, op=str_rep))

            if y_is_arraylike:
                common = np.find_common_type([x.dtype, y.dtype], [])
                result = np.empty(x.size, dtype=common)
                valid = notnull(x) & notnull(y)
                result[valid] = op(x[valid], _values_from_object(y[valid]))
            else:
                result = np.empty(len(x), dtype=x.dtype)
                valid = notnull(x)
                result[valid] = op(x[valid], y)

            # positions that were not computed become NaN
            result, _ = com._maybe_upcast_putmask(result, ~valid, np.nan)

        return com._fill_zeros(result, x, y, name, fill_zeros)
示例#36
0
    def work(self, fig=None, ax=None):
        """Draw a histogram on matplotlib figure or axis

        Nothing is drawn when neither a figure nor an axis is supplied.

        Parameters:
        -----------
        fig: matplotlib figure
        ax: matplotlib axis

        Returns:
        --------
        a tuple with figure and axis objects
        """
        if ax is None and fig is None:
            # nothing to draw on; hand the inputs back unchanged
            return fig, ax
        if ax is None:
            ax = fig.gca()

        column = self.aes['x']
        ax.hist(_values_from_object(self.data[column]), self.bins,
                facecolor=self.colour)
        ax.set_xlabel(self.aes['x'])
        return fig, ax
示例#37
0
    def na_op(x, y):
        """
        Apply the enclosing ``op`` to ``x`` and ``y``.

        The expression evaluator is tried first; on ``TypeError`` the op is
        applied only where the operands are not NA, the remaining positions
        are filled with NaN, and the result is upcast if needed.
        """
        import pandas.core.computation.expressions as expressions

        try:
            result = expressions.evaluate(op, str_rep, x, y, **eval_kwargs)
        except TypeError:
            if isinstance(y, (np.ndarray, ABCSeries, pd.Index)):
                common = find_common_type([x.dtype, y.dtype])
                result = np.empty(x.size, dtype=common)
                valid = notna(x) & notna(y)
                result[valid] = op(x[valid],
                                   com._values_from_object(y[valid]))
            else:
                assert isinstance(x, np.ndarray)
                result = np.empty(len(x), dtype=x.dtype)
                valid = notna(x)
                result[valid] = op(x[valid], y)

            # positions that were not computed become NaN
            result, _ = maybe_upcast_putmask(result, ~valid, np.nan)

        return missing.fill_zeros(result, x, y, name, fill_zeros)
示例#38
0
    def __getitem__(self, key):
        """
        Return the value for a single label, or a new object of the same
        kind for an array-like key.

        A label missing from the index is retried as an integer position;
        ``Ellipsis`` returns ``self``; any other missing label raises
        ``Exception``.
        """
        try:
            location = self.index.get_loc(key)
            return self._get_val_at(location)

        except KeyError:
            if isinstance(key, (int, np.integer)):
                return self._get_val_at(key)
            if key is Ellipsis:
                return self
            raise Exception('Requested index not in this series!')

        except TypeError:
            # Could not hash item, must be array-like?
            pass

        # is there a case where this would NOT be an ndarray?
        # need to find an example, I took out the case for now

        key = _values_from_object(key)
        selected = self.values[key]
        selected_index = Index(self.index.view(ndarray)[key])
        sliced = self._constructor(selected, index=selected_index)
        return sliced.__finalize__(self)
示例#39
0
def maybe_to_datetimelike(data, copy=False):
    """
    Wrap a datetimelike Series (datetime64 dtype, or values inferred as
    periods) in the matching properties delegate.

    Parameters
    ----------
    data : Series
    copy : boolean, default False
           copy the input data

    Returns
    -------
    DelegatedClass
        ``DatetimeProperties`` or ``PeriodProperties``

    Raises
    ------
    TypeError
        If ``data`` is not a Series or is not datetimelike.

    """
    template = "cannot convert an object of type {0} to a datetimelike index"

    if not isinstance(data, Series):
        raise TypeError(template.format(type(data)))

    index = data.index

    if issubclass(data.dtype.type, np.datetime64):
        return DatetimeProperties(DatetimeIndex(data, copy=copy), index)

    if isinstance(data, PeriodIndex):
        return PeriodProperties(PeriodIndex(data, copy=copy), index)

    # fall back to inspecting the unwrapped values
    data = com._values_from_object(data)
    if lib.infer_dtype(data) == 'period':
        return PeriodProperties(PeriodIndex(data), index)

    # note: ``data`` is the unwrapped values object at this point
    raise TypeError(template.format(type(data)))
示例#40
0
    def __getitem__(self, key):
        """
        Return the value for a single label, or a new object of the same
        kind for an array-like (or MultiIndex tuple) key.

        A label missing from the index is retried as an integer position;
        ``Ellipsis`` returns ``self``; any other missing label raises
        ``Exception``.
        """
        try:
            return self.index.get_value(self, key)

        except InvalidIndexError:
            pass
        except KeyError:
            if isinstance(key, (int, np.integer)):
                return self._get_val_at(key)
            if key is Ellipsis:
                return self
            raise Exception('Requested index not in this series!')

        except TypeError:
            # Could not hash item, must be array-like?
            pass

        key = _values_from_object(key)
        is_multi_label = self.index.nlevels > 1 and isinstance(key, tuple)
        if is_multi_label:
            # to handle MultiIndex labels
            key = self.index.get_loc(key)

        selected = self.values[key]
        selected_index = self.index[key]
        subset = self._constructor(selected, index=selected_index)
        return subset.__finalize__(self)
示例#41
0
def nanvar(values, axis=None, skipna=True, ddof=1):
    """
    Compute the variance of ``values`` along ``axis`` with a two-pass
    algorithm, treating NA positions as missing.

    Parameters
    ----------
    values : array-like
        Unwrapped with ``_values_from_object`` before use.
    axis : int, optional
        Axis to reduce over; passed to the ``sum`` calls.
    skipna : boolean, default True
        If True, NA positions are replaced by 0 before summing.
    ddof : int, default 1
        Passed to ``_get_counts_nanvar``.
        NOTE(review): presumably the delta degrees of freedom -- confirm.

    Returns
    -------
    The result of ``_wrap_results``; for float input the variance is cast
    back to the input dtype first.
    """

    values = _values_from_object(values)
    # remember the incoming dtype; ``values`` may be cast to float below
    dtype = values.dtype
    mask = isna(values)
    if is_any_int_dtype(values):
        values = values.astype('f8')
        values[mask] = np.nan

    if is_float_dtype(values):
        count, d = _get_counts_nanvar(mask, axis, ddof, values.dtype)
    else:
        count, d = _get_counts_nanvar(mask, axis, ddof)

    if skipna:
        # work on a copy so the caller's array is not modified
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
    if axis is not None:
        # restore the reduced axis so ``avg`` broadcasts against ``values``
        avg = np.expand_dims(avg, axis)
    sqr = _ensure_numeric((avg - values)**2)
    # masked positions must not contribute to the sum of squares
    np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if is_float_dtype(dtype):
        result = result.astype(dtype)
    return _wrap_results(result, values.dtype)
示例#42
0
def cartesian_product(X):
    '''
    Numpy version of itertools.product or pandas.compat.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : list-like of list-likes

    Returns
    -------
    product : list of ndarrays
        One array per input factor.  An empty ``X`` yields an empty list;
        if any factor is empty, every returned array is empty.

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
    array([1, 2, 1, 2, 1, 2])]

    '''

    lenX = np.fromiter((len(x) for x in X), dtype=int)
    if lenX.size == 0:
        # no factors at all: indexing cumprodX[-1] below would fail
        return []

    # np.cumprod is the canonical name; the np.cumproduct and np.product
    # aliases were removed in NumPy 2.0
    cumprodX = np.cumprod(lenX)

    # a[i]: how many times the (repeated) i-th factor is tiled
    a = np.roll(cumprodX, 1)
    a[0] = 1

    if cumprodX[-1] != 0:
        # b[i]: how many times each element of the i-th factor is repeated;
        # floor division keeps the (exact) quotients as integers
        b = cumprodX[-1] // cumprodX
    else:
        # if any factor is empty, the cartesian product is empty; this also
        # avoids dividing by the zeros in cumprodX
        b = np.zeros_like(cumprodX)

    return [np.tile(np.repeat(np.asarray(com._values_from_object(x)), b[i]),
                    a[i])
            for i, x in enumerate(X)]
示例#43
0
文件: pytables.py 项目: zmyer/pandas
    def convert_value(self, v):
        """ convert the expression that is in the term to something that is
        accepted by pytables

        The conversion is chosen from ``self.kind`` and ``self.meta``.
        Returns a ``TermValue``.  Raises ``TypeError`` when ``v`` is not a
        string and none of the known kinds matches.
        """
        def stringify(value):
            if self.encoding is not None:
                encoder = partial(pprint_thing_encoded, encoding=self.encoding)
            else:
                encoder = pprint_thing
            return encoder(value)

        kind = _ensure_decoded(self.kind)
        meta = _ensure_decoded(self.meta)
        if kind == u('datetime64') or kind == u('datetime'):
            if isinstance(v, (int, float)):
                v = stringify(v)
            v = _ensure_decoded(v)
            v = pd.Timestamp(v)
            if v.tz is not None:
                v = v.tz_convert('UTC')
            return TermValue(v, v.value, kind)
        elif kind == u('timedelta64') or kind == u('timedelta'):
            v = _coerce_scalar_to_timedelta_type(v, unit='s').value
            return TermValue(int(v), v, kind)
        elif meta == u('category'):
            metadata = com._values_from_object(self.metadata)

            # searchsorted returns an insertion position even when v is not
            # present, so membership has to be checked first; otherwise a
            # missing value would be mapped onto the code of another
            # category
            if v not in metadata:
                result = -1
            else:
                result = metadata.searchsorted(v, side='left')
            return TermValue(result, result, u('integer'))
        elif kind == u('integer'):
            # go through float so that strings such as '1.0' are accepted
            v = int(float(v))
            return TermValue(v, v, kind)
        elif kind == u('float'):
            v = float(v)
            return TermValue(v, v, kind)
        elif kind == u('bool'):
            if isinstance(v, string_types):
                # any string outside this list counts as True
                v = v.strip().lower() not in [
                    u('false'),
                    u('f'),
                    u('no'),
                    u('n'),
                    u('none'),
                    u('0'),
                    u('[]'),
                    u('{}'),
                    u('')
                ]
            else:
                v = bool(v)
            return TermValue(v, v, kind)
        elif isinstance(v, string_types):
            # string quoting
            return TermValue(v, stringify(v), u('string'))
        else:
            raise TypeError(("Cannot compare {v} of type {typ}"
                             " to {kind} column").format(v=v,
                                                         typ=type(v),
                                                         kind=kind))
示例#44
0
def _where_standard(cond, a, b, raise_on_error=True):
    """Select from ``a`` where ``cond`` holds and from ``b`` elsewhere,
    using ``np.where`` on the unwrapped values.  ``raise_on_error`` is
    accepted but not used here."""
    condition = _values_from_object(cond)
    when_true = _values_from_object(a)
    when_false = _values_from_object(b)
    return np.where(condition, when_true, when_false)
示例#45
0
    def wrapper(self, other, axis=None):
        """
        Compare this Series with ``other`` and return a boolean Series.

        Series operands must be identically labelled and array/Index
        operands must match in length (``ValueError`` otherwise); DataFrame
        operands return ``NotImplemented``.  A Categorical operand is only
        accepted when ``self`` has categorical dtype (``TypeError``
        otherwise).  ``axis`` is only validated, not otherwise used.
        """
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        if isinstance(other, ABCSeries):
            name = com._maybe_match_name(self, other)
            if not self._indexed_same(other):
                msg = 'Can only compare identically-labeled Series objects'
                raise ValueError(msg)
            return self._constructor(na_op(self.values, other.values),
                                     index=self.index,
                                     name=name)
        elif isinstance(other, ABCDataFrame):  # pragma: no cover
            return NotImplemented
        elif isinstance(other, (np.ndarray, pd.Index)):
            # do not check length of zerodim array
            # as it will broadcast
            if (not is_scalar(lib.item_from_zerodim(other))
                    and len(self) != len(other)):
                raise ValueError('Lengths must match to compare')

            if isinstance(other, ABCPeriodIndex):
                # temp workaround until fixing GH 13637
                # tested in test_nat_comparisons
                # (pandas.tests.series.test_operators.TestSeriesOperators)
                return self._constructor(na_op(self.values,
                                               other.astype(object).values),
                                         index=self.index)

            return self._constructor(na_op(self.values, np.asarray(other)),
                                     index=self.index).__finalize__(self)

        elif isinstance(other, pd.Categorical):
            # this branch only validates; the comparison itself happens in
            # the code below the if/elif chain
            if not is_categorical_dtype(self):
                msg = ("Cannot compare a Categorical for op {op} with Series "
                       "of dtype {typ}.\nIf you want to compare values, use "
                       "'series <op> np.asarray(other)'.")
                raise TypeError(msg.format(op=op, typ=self.dtype))

        if is_categorical_dtype(self):
            # cats are a special case as get_values() would return an ndarray,
            # which would then not take categories ordering into account
            # we can go directly to op, as the na_op would just test again and
            # dispatch to it.
            with np.errstate(all='ignore'):
                res = op(self.values, other)
        else:
            values = self.get_values()
            if isinstance(other, (list, np.ndarray)):
                other = np.asarray(other)

            with np.errstate(all='ignore'):
                res = na_op(values, other)
            # a scalar result means the operands could not be compared
            # elementwise
            if is_scalar(res):
                raise TypeError(
                    'Could not compare {typ} type with Series'.format(
                        typ=type(other)))

            # always return a full value series here
            res = com._values_from_object(res)

        res = pd.Series(res, index=self.index, name=self.name, dtype='bool')
        return res
示例#46
0
 def get_value_maybe_box(self, series, key):
     """Look up ``key`` (coerced to ``Timedelta`` if needed) in ``series``
     through the index engine and box the result with ``_maybe_box``."""
     lookup = key if isinstance(key, Timedelta) else Timedelta(key)
     engine = self._engine
     found = engine.get_value(_values_from_object(series), lookup)
     return _maybe_box(self, found, series, lookup)
def _where_standard(cond, a, b):
    """Select from ``a`` where ``cond`` holds and from ``b`` elsewhere,
    using ``np.where`` on the unwrapped values."""
    condition = com._values_from_object(cond)
    when_true = com._values_from_object(a)
    when_false = com._values_from_object(b)
    return np.where(condition, when_true, when_false)
示例#48
0
def _possibly_cast_to_timedelta(value, coerce=True, dtype=None):
    """ try to cast to timedelta64; if already timedelta-like, make sure
        that we are [ns] (numpy 1.6.2 is very buggy in this regard), and
        don't force the conversion unless coerce is True

        if coerce='compat', force a compatibility coercion (to timedeltas)
        if needed; this path is only taken on numpy < 1.7
        if dtype is passed then this is the target dtype

        returns the converted value, or ``value`` unchanged when no
        conversion applies
        """

    # coercion compatibility
    if coerce == 'compat' and _np_version_under1p7:

        def convert(td, dtype):
            # convert a single element to an integer nanosecond count

            # we have an array with a non-object dtype
            if hasattr(td, 'item'):
                td = td.astype(np.int64).item()
                if td == tslib.iNaT:
                    return td
                # microsecond values are scaled up to nanoseconds
                if dtype == 'm8[us]':
                    td *= 1000
                return td

            if isnull(td) or td == tslib.compat_NaT or td == tslib.iNaT:
                return tslib.iNaT

            # convert td value to a nanosecond value
            d = td.days
            s = td.seconds
            us = td.microseconds

            if dtype == 'object' or dtype == 'm8[ns]':
                td = 1000 * us + (s + d * 24 * 3600) * 10**9
            else:
                raise ValueError(
                    "invalid conversion of dtype in np < 1.7 [%s]" % dtype)

            return td

        # < 1.7 coercion
        if not is_list_like(value):
            value = np.array([value])

        # NOTE(review): this overwrites the ``dtype`` argument and assumes
        # ``value`` exposes ``.dtype`` (a plain list would not) -- confirm
        dtype = value.dtype
        return np.array([convert(v, dtype) for v in value], dtype='m8[ns]')

    # deal with numpy not being able to handle certain timedelta operations
    if isinstance(value, (ABCSeries, np.ndarray)):

        # i8 conversions
        if value.dtype == 'int64' and np.dtype(dtype) == 'timedelta64[ns]':
            value = value.astype('timedelta64[ns]')
            return value
        elif value.dtype.kind == 'm':
            # already a timedelta; only normalise the unit to [ns]
            if value.dtype != 'timedelta64[ns]':
                value = value.astype('timedelta64[ns]')
            return value

    # we don't have a timedelta, but we want to try to convert to one (but
    # don't force it)
    if coerce:
        new_value = tslib.array_to_timedelta64(
            _values_from_object(value).astype(object), coerce=False)
        # the converted values are only adopted when they come back as i8
        if new_value.dtype == 'i8':
            value = np.array(new_value, dtype='timedelta64[ns]')

    return value
示例#49
0
 def get_median(x):
     """Median of the non-null entries of ``x``; NaN when the enclosing
     ``skipna`` is false and ``x`` contains a null."""
     valid = notnull(x)
     if skipna or valid.all():
         return algos.median(_values_from_object(x[valid]))
     return np.nan
示例#50
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None):
    """
    Tally how often each distinct non-null value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True, divide the counts by the number of input values
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and binning the data raises ``TypeError``.

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values
    use_bins = bins is not None

    if use_bins:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the bin labels instead of the raw values
        values = cat.labels

    value_dtype = values.dtype

    if com.is_integer_dtype(value_dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    elif issubclass(value_dtype.type, (np.datetime64, np.timedelta64)):
        # count on the int64 view, then restore the incoming dtype on the keys
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)
        keys = Series(keys, dtype=value_dtype)

    else:
        null_mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, null_mask)

    result = Series(counts, index=com._values_from_object(keys))

    if use_bins:
        # TODO: This next line should be more efficient
        all_labels = np.arange(len(cat.levels))
        result = result.reindex(all_labels, fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
示例#51
0
def nankurt(values, axis=None, skipna=True):
    """ Compute the sample excess kurtosis.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : array-like
        Unwrapped with ``_values_from_object``; non-float input is cast to
        float64.
    axis : int, optional
        Axis to reduce over.
    skipna : boolean, default True
        If True, NA positions are replaced by 0 before the moments are
        summed.

    Returns
    -------
    A scalar or an ndarray.  The result is NaN where fewer than four
    values were counted and 0 where the denominator is zero.

    """
    values = _values_from_object(values)
    mask = isna(values)
    if not is_float_dtype(values.dtype):
        values = values.astype('f8')
        count = _get_counts(mask, axis)
    else:
        count = _get_counts(mask, axis, dtype=values.dtype)

    if skipna:
        # work on a copy so the caller's array is not modified
        values = values.copy()
        np.putmask(values, mask, 0)

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        # restore the reduced axis so ``mean`` broadcasts against ``values``
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna:
        # masked positions must not contribute to the moments
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted**2
    adjusted4 = adjusted2**2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid='ignore', divide='ignore'):
        adj = 3 * (count - 1)**2 / ((count - 2) * (count - 3))
        numer = count * (count + 1) * (count - 1) * m4
        denom = (count - 2) * (count - 3) * m2**2

    # floating point error
    numer = _zero_out_fperr(numer)
    denom = _zero_out_fperr(denom)

    if not isinstance(denom, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denom == 0:
            return 0

    # the division is done only once, after the floating point clean-up
    with np.errstate(invalid='ignore', divide='ignore'):
        result = numer / denom - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype)

    if isinstance(result, np.ndarray):
        result = np.where(denom == 0, 0, result)
        result[count < 4] = np.nan

    return result
示例#52
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Count how often each distinct value occurs.

    Parameters
    ----------
    values : ndarray (1-d)
        Data whose values are tallied.
    sort : boolean, default True
        Order the result by count.
    ascending : boolean, default False
        When sorting, put the smallest counts first.
    normalize : boolean, default False
        Divide every count by ``values.size`` to get relative frequencies.
    bins : integer, optional
        Bin the data with ``cut`` first and count per bin instead of per
        value; numeric data only.
    dropna : boolean, default True
        Leave missing values (NaN / NaT) out of the result.

    Returns
    -------
    value_counts : Series
        Counts (or frequencies) indexed by the distinct values, the bin
        edges, or the categories.

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` rejects the data.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas.tseries.period import PeriodIndex

    is_period = com.is_period_arraylike(values)
    values = Series(values).values
    is_category = com.is_categorical_dtype(values.dtype)

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes
    elif is_category:
        # treat the categories as ready-made bins and count the codes
        cat = values
        bins = cat.categories
        values = cat.codes

    dtype = values.dtype
    is_datetimelike = issubclass(dtype.type,
                                 (np.datetime64, np.timedelta64))

    if is_datetimelike or is_period:
        if is_period:
            values = PeriodIndex(values)

        # count on the int64 representation
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        if dropna:
            from pandas.tslib import iNaT
            not_nat = keys != iNaT
            keys = keys[not_nat]
            counts = counts[not_nat]

        # hand the keys back in the dtype recorded before the int64 view
        keys = keys.astype(dtype)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    else:
        values = com._ensure_object(values)
        null_mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, null_mask)
        if not dropna:
            # prepend a NaN entry carrying the number of nulls
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, null_mask.sum())

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This reindex should be more efficient
        all_codes = np.arange(len(cat.categories))
        result = result.reindex(all_codes, fill_value=0)
        result.index = cat.categories if is_category else bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
示例#53
0
    def wrapper(self, other, axis=None):
        """
        Compare this Series with ``other`` and return a boolean Series.

        Parameters
        ----------
        other : Series, ndarray, Index, Categorical, list or scalar
            Right-hand operand of the comparison.
        axis : optional
            Only validated through ``self._get_axis_number``; it does not
            otherwise affect the result.

        Returns
        -------
        Series or NotImplemented
            ``NotImplemented`` when ``other`` is a DataFrame, otherwise the
            comparison result indexed like ``self``.

        Raises
        ------
        ValueError
            If ``other`` is a Series that is not identically labelled, or an
            ndarray/Index of a different length.
        TypeError
            If ``other`` is a Categorical while ``self`` is not categorical,
            or if the comparison collapses to a scalar.
        """
        # Validate the axis parameter
        if axis is not None:
            self._get_axis_number(axis)

        # Every special case below returns or raises, so each is written as
        # an independent guard.
        if isinstance(other, ABCDataFrame):  # pragma: no cover
            # Defer to DataFrame implementation; fail early
            return NotImplemented

        if isinstance(other, ABCSeries):
            name = com._maybe_match_name(self, other)
            if not self._indexed_same(other):
                raise ValueError(
                    'Can only compare identically-labeled Series objects')
            compared = na_op(self.values, other.values)
            return self._constructor(compared, index=self.index, name=name)

        if isinstance(other, (np.ndarray, pd.Index)):
            # a zero-dim array broadcasts, so its length is not checked
            is_zerodim = is_scalar(lib.item_from_zerodim(other))
            if not is_zerodim and len(self) != len(other):
                raise ValueError('Lengths must match to compare')

            compared = na_op(self.values, np.asarray(other))
            out = self._constructor(compared, index=self.index)
            return out.__finalize__(self)

        if isinstance(other, pd.Categorical) and not is_categorical_dtype(self):
            msg = ("Cannot compare a Categorical for op {op} with "
                   "Series of dtype {typ}.\nIf you want to compare "
                   "values, use 'series <op> np.asarray(other)'.")
            raise TypeError(msg.format(op=op, typ=self.dtype))

        if is_scalar(other) and isna(other):
            # numpy does not like comparisons vs None: the answer is all
            # True for ``!=`` and all False for every other operator
            fill = np.ones if op is operator.ne else np.zeros
            return self._constructor(fill(len(self), dtype=bool),
                                     index=self.index,
                                     name=self.name,
                                     dtype='bool')

        if is_categorical_dtype(self):
            # get_values() would return a plain ndarray and lose the
            # categories ordering, so apply op to the categorical values
            # directly; na_op would only re-test and dispatch to it anyway.
            with np.errstate(all='ignore'):
                res = op(self.values, other)
        else:
            lhs = self.get_values()
            if isinstance(other, (list, np.ndarray)):
                other = np.asarray(other)

            with np.errstate(all='ignore'):
                res = na_op(lhs, other)
            if is_scalar(res):
                raise TypeError(
                    'Could not compare {typ} type with Series'.format(
                        typ=type(other)))

            # always return a full value series here
            res = com._values_from_object(res)

        return pd.Series(res, index=self.index, name=self.name, dtype='bool')
示例#54
0
    def na_op(x, y):
        """
        Apply the comparison ``op`` to ``x`` and ``y`` with special handling
        for categorical, object and datetime-like operands.

        ``op`` and ``name`` are not defined in this function; presumably
        they are the operator callable and its dunder-method name supplied
        by the enclosing scope (not visible here).

        Raises
        ------
        TypeError
            If ``is_datetimelike_v_numeric(x, y)`` is true, or if the
            comparison method looked up on ``x`` returns ``NotImplemented``.
        """

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x,y)
        elif is_categorical_dtype(y) and not isscalar(y):
            # NOTE(review): the operands are swapped here; for
            # non-symmetric operators confirm the callee accounts for it.
            return op(y,x)

        if is_object_dtype(x.dtype):
            # object arrays are compared through the ``lib`` helpers
            if isinstance(y, list):
                y = lib.list_to_object_array(y)

            if isinstance(y, (np.ndarray, pd.Series)):
                if not is_object_dtype(y.dtype):
                    # cast y to object so both sides share a dtype
                    result = lib.vec_compare(x, y.astype(np.object_), op)
                else:
                    result = lib.vec_compare(x, y, op)
            else:
                result = lib.scalar_compare(x, y, op)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            if isscalar(y) and isnull(y):
                y = np.nan

            # we have a datetime/timedelta and may need to convert
            mask = None
            if needs_i8_conversion(x) or (not isscalar(y) and needs_i8_conversion(y)):

                # y is converted first because the scalar conversion is
                # given x itself, before x is replaced by its i8 view below
                if isscalar(y):
                    y = _index.convert_scalar(x,_values_from_object(y))
                else:
                    y = y.view('i8')

                # the mask is taken from x before it is viewed as i8
                # NOTE(review): for '__ne__' the mask selects the non-null
                # entries of x, which are then set to False further down;
                # confirm this is the intended result.
                if name == '__ne__':
                    mask = notnull(x)
                else:
                    mask = isnull(x)

                x = x.view('i8')

            # prefer the comparison method looked up on x by name; fall
            # back to calling op directly when x has no such attribute
            try:
                result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

            # masked positions are overwritten with False in place
            if mask is not None and mask.any():
                result[mask] = False

        return result