Example #1
File: ops.py Project: jess010/pandas
    def na_op(x, y):

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x, y)
        elif is_categorical_dtype(y) and not is_scalar(y):
            return op(y, x)

        if is_object_dtype(x.dtype):
            result = _comp_method_OBJECT_ARRAY(op, x, y)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            if is_scalar(y) and isna(y):
                if name == '__ne__':
                    return np.ones(len(x), dtype=bool)
                else:
                    return np.zeros(len(x), dtype=bool)

            # we have a datetime/timedelta and may need to convert
            mask = None
            if (needs_i8_conversion(x) or
                    (not is_scalar(y) and needs_i8_conversion(y))):

                if is_scalar(y):
                    mask = isna(x)
                    y = libindex.convert_scalar(x, com._values_from_object(y))
                else:
                    mask = isna(x) | isna(y)
                    y = y.view('i8')
                x = x.view('i8')

            try:
                with np.errstate(all='ignore'):
                    result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

            if mask is not None and mask.any():
                result[mask] = masker

        return result
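Note: the i8 trick above can be shown in isolation. A minimal NumPy-only sketch (hypothetical data; `masker` in the snippet plays the role of the False fill here):

import numpy as np

x = np.array(['2020-01-01', 'NaT'], dtype='datetime64[ns]')
y = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')

# compute the NaT mask first, then compare the raw int64 views
mask = np.isnat(x) | np.isnat(y)
result = x.view('i8') == y.view('i8')
result[mask] = False  # NaT never compares equal
print(result)  # [ True False]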
Example #2
File: numeric.py Project: dalejung/pandas
 def astype(self, dtype, copy=True):
     dtype = pandas_dtype(dtype)
     if needs_i8_conversion(dtype):
         msg = ('Cannot convert Float64Index to dtype {dtype}; integer '
                'values are required for conversion').format(dtype=dtype)
         raise TypeError(msg)
     elif is_integer_dtype(dtype) and self.hasnans:
         # GH 13149
         raise ValueError('Cannot convert NA to integer')
     return super(Float64Index, self).astype(dtype, copy=copy)
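Note: the hasnans guard exists because a plain NumPy cast would not raise. A minimal sketch of the hazard being guarded against (output observed on typical platforms; newer NumPy may also emit a RuntimeWarning):

import numpy as np

np.array([1.0, np.nan]).astype(np.int64)
# array([1, -9223372036854775808])  -- NaN silently becomes INT64_MIN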
Example #3
    def cmp_method(self, other):
        if isinstance(other, ABCDataFrame):
            return NotImplemented

        if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
            if other.ndim > 0 and len(self) != len(other):
                raise ValueError('Lengths must match to compare')

        if needs_i8_conversion(self) and needs_i8_conversion(other):
            # we may need to directly compare underlying
            # representations
            return self._evaluate_compare(other, op)

        # numpy will show a DeprecationWarning on invalid elementwise
        # comparisons; this will raise in the future
        with warnings.catch_warnings(record=True):
            with np.errstate(all='ignore'):
                result = op(self.values, np.asarray(other))

        return result
Example #4
File: numeric.py Project: pydata/pandas
 def astype(self, dtype, copy=True):
     dtype = pandas_dtype(dtype)
     if needs_i8_conversion(dtype):
         msg = ('Cannot convert Float64Index to dtype {dtype}; integer '
                'values are required for conversion').format(dtype=dtype)
         raise TypeError(msg)
     elif (is_integer_dtype(dtype) and
           not is_extension_array_dtype(dtype)) and self.hasnans:
         # TODO(jreback); this can change once we have an EA Index type
         # GH 13149
         raise ValueError('Cannot convert NA to integer')
     return super().astype(dtype, copy=copy)
Example #5
def test_needs_i8_conversion():
    assert not com.needs_i8_conversion(str)
    assert not com.needs_i8_conversion(np.int64)
    assert not com.needs_i8_conversion(pd.Series([1, 2]))
    assert not com.needs_i8_conversion(np.array(['a', 'b']))

    assert com.needs_i8_conversion(np.datetime64)
    assert com.needs_i8_conversion(pd.Series([], dtype="timedelta64[ns]"))
    assert com.needs_i8_conversion(pd.DatetimeIndex(
        [1, 2, 3], tz="US/Eastern"))
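Note: "i8 conversion" means reinterpreting a datetimelike array as its underlying int64 nanosecond representation. A minimal sketch:

import numpy as np

arr = np.array(['2020-01-01', 'NaT'], dtype='datetime64[ns]')
print(arr.view('i8'))
# [ 1577836800000000000 -9223372036854775808]
# NaT maps to the sentinel iNaT == np.iinfo(np.int64).min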
Example #6
    def test_fillna(self):
        # GH 11343
        # though Index.fillna and Series.fillna have separate impls,
        # test here to confirm they work the same

        for orig in self.objs:

            o = orig.copy()
            values = o.values

            # values will not be changed
            result = o.fillna(o.astype(object).values[0])
            if isinstance(o, Index):
                tm.assert_index_equal(o, result)
            else:
                tm.assert_series_equal(o, result)
            # check shallow_copied
            assert o is not result

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)

                if not self._allow_na_ops(o):
                    continue

                if needs_i8_conversion(o):

                    values = o.astype(object).values
                    fill_value = values[0]
                    values[0:2] = pd.NaT
                else:
                    values = o.values.copy()
                    fill_value = o.values[0]
                    values[0:2] = null_obj

                expected = [fill_value] * 2 + list(values[2:])

                expected = klass(expected)
                o = klass(values)

                # check values has the same dtype as the original
                assert o.dtype == orig.dtype

                result = o.fillna(fill_value)
                if isinstance(o, Index):
                    tm.assert_index_equal(result, expected)
                else:
                    tm.assert_series_equal(result, expected)
                # check shallow_copied
                assert o is not result
Example #7
File: common.py Project: adamklein/pandas
    def test_get_unique_index(self):
        for ind in self.indices.values():

            # MultiIndex tested separately
            if not len(ind) or isinstance(ind, MultiIndex):
                continue

            idx = ind[[0] * 5]
            idx_unique = ind[[0]]

            # We test against `idx_unique`, so first we make sure it's unique
            # and doesn't contain nans.
            assert idx_unique.is_unique
            try:
                assert not idx_unique.hasnans
            except NotImplementedError:
                pass

            for dropna in [False, True]:
                result = idx._get_unique_index(dropna=dropna)
                tm.assert_index_equal(result, idx_unique)

            # nans:
            if not ind._can_hold_na:
                continue

            if needs_i8_conversion(ind):
                vals = ind.asi8[[0] * 5]
                vals[0] = iNaT
            else:
                vals = ind.values[[0] * 5]
                vals[0] = np.nan

            vals_unique = vals[:2]
            idx_nan = ind._shallow_copy(vals)
            idx_unique_nan = ind._shallow_copy(vals_unique)
            assert idx_unique_nan.is_unique

            assert idx_nan.dtype == ind.dtype
            assert idx_unique_nan.dtype == ind.dtype

            for dropna, expected in zip([False, True],
                                        [idx_unique_nan, idx_unique]):
                for i in [idx_nan, idx_unique_nan]:
                    result = i._get_unique_index(dropna=dropna)
                    tm.assert_index_equal(result, expected)
Example #8
File: packers.py Project: ankostis/pandas
def convert(values):
    """ convert the numpy values to a list """

    dtype = values.dtype

    if is_categorical_dtype(values):
        return values

    elif is_object_dtype(dtype):
        return values.ravel().tolist()

    if needs_i8_conversion(dtype):
        values = values.view('i8')
    v = values.ravel()

    if compressor == 'zlib':
        _check_zlib()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, zlib.compress(v))

    elif compressor == 'blosc':
        _check_blosc()

        # return string arrays like they are
        if dtype == np.object_:
            return v.tolist()

        # convert to a bytes array
        v = v.tostring()
        return ExtType(0, blosc.compress(v, typesize=dtype.itemsize))

    # ndarray (on original dtype)
    return ExtType(0, v.tostring())
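Note: the datetimelike branch above round-trips cleanly because the i8 view preserves the raw bytes. A minimal sketch (tostring() in the source is the legacy alias of tobytes()):

import numpy as np

orig = np.array(['2020-01-01', '2020-06-01'], dtype='datetime64[ns]')
payload = orig.view('i8').tobytes()
restored = np.frombuffer(payload, dtype='i8').view('datetime64[ns]')
assert (restored == orig).all()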
Example #9
File: test_ops.py Project: ymerkli/pandas
    def test_value_counts_unique_nunique_null(self, null_obj):

        for orig in self.objs:
            o = orig.copy()
            klass = type(o)
            values = o._ndarray_values

            if not self._allow_na_ops(o):
                continue

            # special assign to the numpy array
            if is_datetime64tz_dtype(o):
                if isinstance(o, DatetimeIndex):
                    v = o.asi8
                    v[0:2] = iNaT
                    values = o._shallow_copy(v)
                else:
                    o = o.copy()
                    o[0:2] = pd.NaT
                    values = o._values

            elif needs_i8_conversion(o):
                values[0:2] = iNaT
                values = o._shallow_copy(values)
            else:
                values[0:2] = null_obj

            # check values has the same dtype as the original
            assert values.dtype == o.dtype

            # create repeated values, 'n'th element is repeated by n+1
            # times
            if isinstance(o, (DatetimeIndex, PeriodIndex)):
                expected_index = o.copy()
                expected_index.name = None

                # attach name to klass
                o = klass(values.repeat(range(1, len(o) + 1)))
                o.name = "a"
            else:
                if isinstance(o, DatetimeIndex):
                    expected_index = orig._values._shallow_copy(values)
                else:
                    expected_index = Index(values)
                expected_index.name = None
                o = o.repeat(range(1, len(o) + 1))
                o.name = "a"

            # check values has the same dtype as the original
            assert o.dtype == orig.dtype
            # check values correctly have NaN
            nanloc = np.zeros(len(o), dtype=bool)  # np.bool alias removed in NumPy >= 1.24
            nanloc[:3] = True
            if isinstance(o, Index):
                tm.assert_numpy_array_equal(pd.isna(o), nanloc)
            else:
                exp = Series(nanloc, o.index, name="a")
                tm.assert_series_equal(pd.isna(o), exp)

            expected_s_na = Series(
                list(range(10, 2, -1)) + [3],
                index=expected_index[9:0:-1],
                dtype="int64",
                name="a",
            )
            expected_s = Series(
                list(range(10, 2, -1)),
                index=expected_index[9:1:-1],
                dtype="int64",
                name="a",
            )

            result_s_na = o.value_counts(dropna=False)
            tm.assert_series_equal(result_s_na, expected_s_na)
            assert result_s_na.index.name is None
            assert result_s_na.name == "a"
            result_s = o.value_counts()
            tm.assert_series_equal(o.value_counts(), expected_s)
            assert result_s.index.name is None
            assert result_s.name == "a"

            result = o.unique()
            if isinstance(o, Index):
                tm.assert_index_equal(result, Index(values[1:], name="a"))
            elif is_datetime64tz_dtype(o):
                # unable to compare NaT / nan
                tm.assert_extension_array_equal(result[1:], values[2:])
                assert result[0] is pd.NaT
            else:
                tm.assert_numpy_array_equal(result[1:], values[2:])

                assert pd.isna(result[0])
                assert result.dtype == orig.dtype

            assert o.nunique() == 8
            assert o.nunique(dropna=False) == 9
Example #10
    def _cython_operation(
        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
    ):
        assert kind in ["transform", "aggregate"]
        orig_values = values

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{dtype} dtype not supported".format(dtype=values.dtype)
            )
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {how} operations".format(how=how)
                )
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {how} operations".format(how=how)
                )

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups,) + values.shape[1:]

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                try:
                    values = ensure_float64(values)
                except TypeError:
                    if lib.infer_dtype(values, skipna=False) == "complex":
                        values = values.astype(complex)
                    else:
                        raise
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize
                )
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(
                np.empty(out_shape, dtype=out_dtype), fill_value=np.nan
            )
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, codes, func, is_datetimelike, min_count
            )
        elif kind == "transform":
            result = _maybe_fill(
                np.empty_like(values, dtype=out_dtype), fill_value=np.nan
            )

            # TODO: min_count
            result = self._transform(
                result, values, codes, func, is_datetimelike, **kwargs
            )

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all():
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            names = self._name_functions[how]()  # type: Optional[List[str]]
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
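Note: the integer pre-conversion above guards an iNaT collision. A minimal sketch of the hazard (hypothetical input):

import numpy as np

iNaT = np.iinfo(np.int64).min  # sentinel used for missing i8 values

# a genuine integer equal to iNaT would look like NaT after the
# groupby kernels run, so such input is upcast to float64 first
values = np.array([1, iNaT], dtype='int64')
if (values == iNaT).any():
    values = values.astype('float64')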
Example #11
    def test_value_counts_unique_nunique_null(self, null_obj,
                                              index_or_series_obj):
        orig = index_or_series_obj
        obj = orig.copy()
        klass = type(obj)
        values = obj._ndarray_values
        num_values = len(orig)

        if not allow_na_ops(obj):
            pytest.skip("type doesn't allow for NA operations")
        elif isinstance(orig, (pd.CategoricalIndex, pd.IntervalIndex)):
            pytest.skip(f"values of {klass} cannot be changed")
        elif isinstance(orig, pd.MultiIndex):
            pytest.skip("MultiIndex doesn't support isna")
        elif orig.duplicated().any():
            pytest.xfail(
                "The test implementation isn't flexible enough to deal"
                " with duplicated values. This isn't a bug in the"
                " application code, but in the test code.")

        # special assign to the numpy array
        if is_datetime64tz_dtype(obj):
            if isinstance(obj, DatetimeIndex):
                v = obj.asi8
                v[0:2] = iNaT
                values = obj._shallow_copy(v)
            else:
                obj = obj.copy()
                obj[0:2] = pd.NaT
                values = obj._values

        elif needs_i8_conversion(obj):
            values[0:2] = iNaT
            values = obj._shallow_copy(values)
        else:
            values[0:2] = null_obj

        # check values has the same dtype as the original
        assert values.dtype == obj.dtype

        # create repeated values, 'n'th element is repeated by n+1
        # times
        if isinstance(obj, (DatetimeIndex, PeriodIndex)):
            expected_index = obj.copy()
            expected_index.name = None

            # attach name to klass
            obj = klass(values.repeat(range(1, len(obj) + 1)))
            obj.name = "a"
        else:
            if isinstance(obj, DatetimeIndex):
                expected_index = orig._values._shallow_copy(values)
            else:
                expected_index = Index(values)
            expected_index.name = None
            obj = obj.repeat(range(1, len(obj) + 1))
            obj.name = "a"

        # check values has the same dtype as the original
        assert obj.dtype == orig.dtype

        # check values correctly have NaN
        nanloc = np.zeros(len(obj), dtype=bool)  # np.bool alias removed in NumPy >= 1.24
        nanloc[:3] = True
        if isinstance(obj, Index):
            tm.assert_numpy_array_equal(pd.isna(obj), nanloc)
        else:
            exp = Series(nanloc, obj.index, name="a")
            tm.assert_series_equal(pd.isna(obj), exp)

        expected_data = list(range(num_values, 2, -1))
        expected_data_na = expected_data.copy()
        if expected_data_na:
            expected_data_na.append(3)
        expected_s_na = Series(
            expected_data_na,
            index=expected_index[num_values - 1:0:-1],
            dtype="int64",
            name="a",
        )
        expected_s = Series(
            expected_data,
            index=expected_index[num_values - 1:1:-1],
            dtype="int64",
            name="a",
        )

        result_s_na = obj.value_counts(dropna=False)
        tm.assert_series_equal(result_s_na, expected_s_na)
        assert result_s_na.index.name is None
        assert result_s_na.name == "a"
        result_s = obj.value_counts()
        tm.assert_series_equal(obj.value_counts(), expected_s)
        assert result_s.index.name is None
        assert result_s.name == "a"

        result = obj.unique()
        if isinstance(obj, Index):
            tm.assert_index_equal(result, Index(values[1:], name="a"))
        elif is_datetime64tz_dtype(obj):
            # unable to compare NaT / nan
            tm.assert_extension_array_equal(result[1:], values[2:])
            assert result[0] is pd.NaT
        elif len(obj) > 0:
            tm.assert_numpy_array_equal(result[1:], values[2:])

            assert pd.isna(result[0])
            assert result.dtype == orig.dtype

        assert obj.nunique() == max(0, num_values - 2)
        assert obj.nunique(dropna=False) == max(0, num_values - 1)
Example #12
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', limit_area=None, fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(msg.format(valid=valid_limit_directions,
                                    invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ['inside', 'outside']
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError('Invalid limit_area: expecting one of {}, got '
                             '{}.'.format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    from pandas import Series
    ys = Series(yvalues)

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == 'forward':
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == 'backward':
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == 'inside':
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == 'outside':
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[preserve_nans] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[preserve_nans] = np.nan
        return result
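Note: this function backs the public interpolate API. A minimal usage sketch showing the limit handling it implements:

import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0])
print(s.interpolate(method='linear', limit=1))
# 0    1.0
# 1    2.0
# 2    NaN   <- beyond limit=1, preserved as NaN
# 3    4.0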
Example #13
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ["transform", "aggregate"]

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(
                "{} are not support in cython ops".format(values.dtype))
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    "datetime64 type does not support {} operations".format(
                        how))
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    "timedelta64 type does not support {} operations".format(
                        how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = "{kind}{itemsize}".format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = "object"

        labels, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result,
                counts,
                values,
                labels,
                func,
                is_numeric,
                is_datetimelike,
                min_count,
            )
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
Example #14
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: Optional[str] = None,
    mask: Optional[np.ndarray] = None,
) -> Tuple[np.ndarray, Optional[np.ndarray], np.dtype, np.dtype, Any]:
    """
    Utility to get the values view, mask, dtype, dtype_max, and fill_value.

    If both mask and fill_value/fill_value_typ are not None and skipna is True,
    the values array will be copied.

    For input arrays of boolean or integer dtypes, copies will only occur if a
    precomputed mask, a fill_value/fill_value_typ, and skipna=True are
    provided.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : dtype
        dtype for values
    dtype_max : dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # _get_values is only called from within nanops, and in all cases
    #  with scalar fill_value.  This guarantee is important for the
    #  maybe_upcast_putmask call below
    assert is_scalar(fill_value)
    values = extract_array(values, extract_numpy=True)

    mask = _maybe_get_mask(values, skipna, mask)

    dtype = values.dtype

    if needs_i8_conversion(values):
        # changing timedelta64/datetime64 to int64 needs to happen after
        #  finding `mask` above
        values = np.asarray(values.view("i8"))

    dtype_ok = _na_ok_dtype(dtype)

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(dtype,
                                 fill_value=fill_value,
                                 fill_value_typ=fill_value_typ)

    copy = (mask is not None) and (fill_value is not None)

    if skipna and copy:
        values = values.copy()
        assert mask is not None  # for mypy
        if dtype_ok and mask.any():
            np.putmask(values, mask, fill_value)

        # promote if needed
        else:
            values, _ = maybe_upcast_putmask(values, mask, fill_value)

    # return a platform independent precision dtype
    dtype_max = dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.int64
    elif is_float_dtype(dtype):
        dtype_max = np.float64

    return values, mask, dtype, dtype_max, fill_value
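Note: the skipna copy-and-fill mechanics can be shown in isolation. A minimal sketch (hypothetical float input; '-inf' is the kind of fill a max-style op would request):

import numpy as np

values = np.array([1.0, np.nan, 3.0])
mask = np.isnan(values)
out = values.copy()
np.putmask(out, mask, -np.inf)  # as in the dtype_ok branch above
print(out)  # [  1. -inf   3.]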
Example #15
def array_equivalent(left,
                     right,
                     strict_nan: bool = False,
                     dtype_equal: bool = False) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.
    dtype_equal : bool, default False
        Whether `left` and `right` are known to have the same dtype
        according to `is_dtype_equal`. Some methods, like `BlockManager.equals`,
        require that the dtypes match. Setting this to ``True`` can improve
        performance, but will give different results for arrays that are
        equal but different dtypes.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    if dtype_equal:
        # fastpath when we require that the dtypes match (Block.equals)
        if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
            return _array_equivalent_float(left, right)
        elif is_datetimelike_v_numeric(left.dtype, right.dtype):
            return False
        elif needs_i8_conversion(left.dtype):
            return _array_equivalent_datetimelike(left, right)
        elif is_string_dtype(left.dtype):
            # TODO: fastpath for pandas' StringDtype
            return _array_equivalent_object(left, right, strict_nan)
        else:
            return np.array_equal(left, right)

    # Slow path when we allow comparing different dtypes.
    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come to this path for NumPy 1.7.1 compat
    if is_string_dtype(left.dtype) or is_string_dtype(right.dtype):
        return _array_equivalent_object(left, right, strict_nan)

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype):
        if not (np.prod(left.shape) and np.prod(right.shape)):
            return True
        return ((left == right) | (isna(left) & isna(right))).all()

    elif is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if left.dtype.type is np.void or right.dtype.type is np.void:
        if left.dtype != right.dtype:
            return False

    return np.array_equal(left, right)
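Note: the i8 path matters because NaT, like NaN, is not equal to itself. A minimal sketch:

import numpy as np

left = np.array(['2020-01-01', 'NaT'], dtype='datetime64[ns]')
right = left.copy()

print(np.array_equal(left, right))                        # False: NaT != NaT
print(np.array_equal(left.view('i8'), right.view('i8')))  # True: iNaT == iNaT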
Example #16
File: ops.py Project: iansheng/pandas
    def _call_cython_op(
        self,
        values: np.ndarray,  # np.ndarray[ndim=2]
        *,
        min_count: int,
        ngroups: int,
        comp_ids: np.ndarray,
        mask: np.ndarray | None,
        result_mask: np.ndarray | None,
        **kwargs,
    ) -> np.ndarray:  # np.ndarray[ndim=2]
        orig_values = values

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype_str = dtype.kind + "8"
            values = values.astype(dtype_str, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        values = values.T
        if mask is not None:
            mask = mask.T
            if result_mask is not None:
                result_mask = result_mask.T

        out_shape = self._get_output_shape(ngroups, values)
        func, values = self.get_cython_func_and_vals(values, is_numeric)
        out_dtype = self.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if self.kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if self.how in ["min", "max", "mean"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    mask=mask,
                    result_mask=result_mask,
                    is_datetimelike=is_datetimelike,
                )
            elif self.how in ["add"]:
                # We support datetimelike
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        else:
            # TODO: min_count
            if self.uses_mask():
                func(
                    result,
                    values,
                    comp_ids,
                    ngroups,
                    is_datetimelike,
                    mask=mask,
                    **kwargs,
                )
            else:
                func(result, values, comp_ids, ngroups, is_datetimelike,
                     **kwargs)

        if self.kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

        result = result.T

        if self.how not in self.cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cast_blocklist we get here
            res_dtype = self._get_result_dtype(orig_values.dtype)
            op_result = maybe_downcast_to_dtype(result, res_dtype)
        else:
            op_result = result

        # error: Incompatible return value type (got "Union[ExtensionArray, ndarray]",
        # expected "ndarray")
        return op_result  # type: ignore[return-value]
Example #17
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        cy_op = WrappedCythonOp(kind=kind, how=how)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        cy_op.disallow_invalid_ops(dtype, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        elif values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            res = self._cython_operation(
                kind=kind,
                values=values2d,
                how=how,
                axis=1,
                min_count=min_count,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = values.astype("int64")
        elif is_integer_dtype(dtype):
            # e.g. uint8 -> uint64, int16 -> int64
            dtype = dtype.kind + "8"
            values = values.astype(dtype, copy=False)
        elif is_numeric:
            if not is_complex_dtype(dtype):
                values = ensure_float64(values)

        ngroups = self.ngroups
        comp_ids, _, _ = self.group_info

        assert axis == 1
        values = values.T

        out_shape = cy_op.get_output_shape(ngroups, values)
        func, values = cy_op.get_cython_func_and_vals(values, is_numeric)
        out_dtype = cy_op.get_out_dtype(values.dtype)

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if kind == "aggregate":
            counts = np.zeros(ngroups, dtype=np.int64)
            if how in ["min", "max"]:
                func(
                    result,
                    counts,
                    values,
                    comp_ids,
                    min_count,
                    is_datetimelike=is_datetimelike,
                )
            else:
                func(result, counts, values, comp_ids, min_count)
        elif kind == "transform":
            # TODO: min_count
            func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

        if kind == "aggregate":
            # i.e. counts is defined.  Locations where count<min_count
            # need to have the result set to np.nan, which may require casting,
            # see GH#40767
            if is_integer_dtype(result.dtype) and not is_datetimelike:
                cutoff = max(1, min_count)
                empty_groups = counts < cutoff
                if empty_groups.any():
                    # Note: this conversion could be lossy, see GH#40767
                    result = result.astype("float64")
                    result[empty_groups] = np.nan

            if self._filter_empty_groups and not counts.all():
                assert result.ndim != 2
                result = result[counts > 0]

        result = result.T

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
Example #18
File: missing.py Project: willxue/pandas
def interpolate_1d(
    xvalues,
    yvalues,
    method="linear",
    limit=None,
    limit_direction="forward",
    limit_area=None,
    fill_value=None,
    bounds_error=False,
    order=None,
    **kwargs,
):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == "time":
        if not getattr(xvalues, "is_all_dates", None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError("time-weighted interpolation only works "
                             "on Series or DataFrames with a "
                             "DatetimeIndex")
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = "Invalid limit_direction: expecting one of {valid!r}, got {invalid!r}."
        raise ValueError(
            msg.format(valid=valid_limit_directions, invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError("Invalid limit_area: expecting one of {}, got "
                             "{}.".format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    limit = algos._validate_limit(nobs=None, limit=limit)

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(find_valid_index(yvalues, "first")))
    end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    xvalues = getattr(xvalues, "values", xvalues)
    yvalues = getattr(yvalues, "values", yvalues)
    result = yvalues.copy()

    if method in ["linear", "time", "index", "values"]:
        if method in ("values", "index"):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[preserve_nans] = np.nan
        return result

    sp_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "barycentric",
        "krogh",
        "spline",
        "polynomial",
        "from_derivatives",
        "piecewise_polynomial",
        "pchip",
        "akima",
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid],
            yvalues[valid],
            inds[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )
        result[preserve_nans] = np.nan
        return result
Example #19
def interpolate_1d(xvalues,
                   yvalues,
                   method='linear',
                   limit=None,
                   limit_direction='forward',
                   fill_value=None,
                   bounds_error=False,
                   order=None,
                   **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(
            msg.format(valid=valid_limit_directions, invalid=limit_direction))

    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # violate_limit is a list of the indexes in the series whose yvalue is
    # currently NaN, and should still be NaN after the interpolation.
    # Specifically:
    #
    # If limit_direction='forward' or None then the list will contain NaNs at
    # the beginning of the series, and NaNs that are more than 'limit' away
    # from the prior non-NaN.
    #
    # If limit_direction='backward' then the list will contain NaNs at
    # the end of the series, and NaNs that are more than 'limit' away
    # from the subsequent non-NaN.
    #
    # If limit_direction='both' then the list will contain NaNs that
    # are more than 'limit' away from any non-NaN.
    #
    # If limit=None, then use default behavior of filling an unlimited number
    # of NaNs in the direction specified by limit_direction

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    # each possible limit_direction
    # TODO: do we need sorted?
    if limit_direction == 'forward' and limit is not None:
        violate_limit = sorted(start_nans
                               | set(_interp_limit(invalid, limit, 0)))
    elif limit_direction == 'forward':
        violate_limit = sorted(start_nans)
    elif limit_direction == 'backward' and limit is not None:
        violate_limit = sorted(end_nans
                               | set(_interp_limit(invalid, 0, limit)))
    elif limit_direction == 'backward':
        violate_limit = sorted(end_nans)
    elif limit_direction == 'both' and limit is not None:
        violate_limit = sorted(_interp_limit(invalid, limit, limit))
    else:
        violate_limit = []

    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = [
        'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric',
        'krogh', 'spline', 'polynomial', 'from_derivatives',
        'piecewise_polynomial', 'pchip', 'akima'
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order,
                                                     **kwargs)
        result[violate_limit] = np.nan
        return result
Example #20
    def _cython_operation(self,
                          kind,
                          values,
                          how,
                          axis,
                          min_count=-1,
                          **kwargs):
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not set up for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not supported in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups, ) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, labels, func,
                                     is_numeric, is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, labels, func, is_numeric,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        if (kind == 'aggregate' and self._filter_empty_groups
                and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(result,
                                                 (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        ensure_object(result), (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
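
The int64 view that _cython_operation relies on round-trips losslessly, with NaT represented by the iNaT sentinel. A small sketch (iNaT lives in a private module, so the import path below may vary between pandas versions):

import numpy as np
from pandas._libs.tslibs import iNaT  # private; location may vary

dt = np.array(['2021-01-01', 'NaT', '2021-01-03'], dtype='datetime64[ns]')

# view as int64 so the numeric group kernels can operate on datetimes;
# NaT becomes the iNaT integer sentinel rather than a float NaN
i8 = dt.view('int64')
print(i8[1] == iNaT)      # True
print(i8.view(dt.dtype))  # views back to datetime64[ns] losslessly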
Example #21
def to_numeric(arg, errors="raise", downcast=None):
    """
    Convert argument to a numeric type.

    The default return dtype is `float64` or `int64`
    depending on the data supplied. Use the `downcast` parameter
    to obtain other dtypes.

    Please note that precision loss may occur if really large numbers
    are passed in. Due to the internal limitations of `ndarray`, if
    numbers smaller than `-9223372036854775808` (np.iinfo(np.int64).min)
    or larger than `18446744073709551615` (np.iinfo(np.uint64).max) are
    passed in, it is very likely they will be converted to float so that
    they can be stored in an `ndarray`. These warnings apply similarly to
    `Series` since it internally leverages `ndarray`.

    Parameters
    ----------
    arg : scalar, list, tuple, 1-d array, or Series
        Argument to be converted.
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception.
        - If 'coerce', then invalid parsing will be set as NaN.
        - If 'ignore', then invalid parsing will return the input.
    downcast : str, default None
        Can be 'integer', 'signed', 'unsigned', or 'float'.
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

    Returns
    -------
    ret
        Numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray.

    See Also
    --------
    DataFrame.astype : Cast argument to a specified dtype.
    to_datetime : Convert argument to datetime.
    to_timedelta : Convert argument to timedelta.
    numpy.ndarray.astype : Cast a numpy array to a specified type.
    DataFrame.convert_dtypes : Convert dtypes.

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64

    Downcasting of nullable integer and floating dtypes is supported:

    >>> s = pd.Series([1, 2, 3], dtype="Int64")
    >>> pd.to_numeric(s, downcast="integer")
    0    1
    1    2
    2    3
    dtype: Int8
    >>> s = pd.Series([1.0, 2.1, 3.0], dtype="Float64")
    >>> pd.to_numeric(s, downcast="float")
    0    1.0
    1    2.1
    2    3.0
    dtype: Float32
    """
    if downcast not in (None, "integer", "signed", "unsigned", "float"):
        raise ValueError("invalid downcasting method provided")

    if errors not in ("ignore", "raise", "coerce"):
        raise ValueError("invalid error value specified")

    is_series = False
    is_index = False
    is_scalars = False

    if isinstance(arg, ABCSeries):
        is_series = True
        values = arg.values
    elif isinstance(arg, ABCIndex):
        is_index = True
        if needs_i8_conversion(arg.dtype):
            values = arg.asi8
        else:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype="O")
    elif is_scalar(arg):
        if is_decimal(arg):
            return float(arg)
        if is_number(arg):
            return arg
        is_scalars = True
        values = np.array([arg], dtype="O")
    elif getattr(arg, "ndim", 1) > 1:
        raise TypeError("arg must be a list, tuple, 1-d array, or Series")
    else:
        values = arg

    # GH33013: for IntegerArray & FloatingArray extract non-null values for casting
    # save mask to reconstruct the full array after casting
    mask: npt.NDArray[np.bool_] | None = None
    if isinstance(values, NumericArray):
        mask = values._mask
        values = values._data[~mask]

    values_dtype = getattr(values, "dtype", None)
    if is_numeric_dtype(values_dtype):
        pass
    elif is_datetime_or_timedelta_dtype(values_dtype):
        values = values.view(np.int64)
    else:
        values = ensure_object(values)
        coerce_numeric = errors not in ("ignore", "raise")
        try:
            values, _ = lib.maybe_convert_numeric(
                values, set(), coerce_numeric=coerce_numeric)
        except (ValueError, TypeError):
            if errors == "raise":
                raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values.dtype):
        typecodes: str | None = None

        if downcast in ("integer", "signed"):
            typecodes = np.typecodes["Integer"]
        elif downcast == "unsigned" and (not len(values)
                                         or np.min(values) >= 0):
            typecodes = np.typecodes["UnsignedInteger"]
        elif downcast == "float":
            typecodes = np.typecodes["Float"]

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for typecode in typecodes:
                dtype = np.dtype(typecode)
                if dtype.itemsize <= values.dtype.itemsize:
                    values = maybe_downcast_numeric(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    # GH33013: for IntegerArray & FloatingArray need to reconstruct masked array
    if mask is not None:
        data = np.zeros(mask.shape, dtype=values.dtype)
        data[~mask] = values

        from pandas.core.arrays import (
            FloatingArray,
            IntegerArray,
        )

        klass = IntegerArray if is_integer_dtype(data.dtype) else FloatingArray
        values = klass(data, mask.copy())

    if is_series:
        return arg._constructor(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy
        return pd.Index(values, name=arg.name)
    elif is_scalars:
        return values[0]
    else:
        return values
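
The downcast loop above works because np.typecodes lists dtype characters in non-decreasing itemsize order, so the first dtype that round-trips the data wins. A rough standalone sketch, using a plain astype round-trip in place of pandas' internal maybe_downcast_numeric:

import numpy as np

values = np.array([1, 2, 3], dtype=np.int64)

for typecode in np.typecodes["Integer"]:  # smallest to largest
    dtype = np.dtype(typecode)
    if (dtype.itemsize <= values.dtype.itemsize
            and np.array_equal(values.astype(dtype), values)):
        values = values.astype(dtype)
        break

print(values.dtype)  # int8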
Example #22
    def get_new_values(self, values, fill_value=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values?  When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (sorted_values.reshape(
                length, width, stride).swapaxes(1, 2).reshape(result_shape))
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        elif is_bool_dtype(values.dtype):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
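
In the masked branch, maybe_promote does the key dtype work: it widens the dtype just enough to hold the fill value. A minimal sketch (maybe_promote is a private pandas helper, so its import path may shift between releases):

import numpy as np
from pandas.core.dtypes.cast import maybe_promote  # private helper

# int64 cannot hold NaN, so the dtype is promoted to float64 before the
# result buffer is allocated and filled
dtype, fill_value = maybe_promote(np.dtype("int64"), np.nan)
print(dtype, fill_value)  # float64 nan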
Example #23
def array_equivalent(left, right, strict_nan: bool = False) -> bool:
    """
    True if two arrays, left and right, have equal non-NaN elements, and NaNs
    in corresponding locations.  False otherwise. It is assumed that left and
    right are NumPy arrays of the same dtype. The behavior of this function
    (particularly with respect to NaNs) is not defined if the dtypes are
    different.

    Parameters
    ----------
    left, right : ndarrays
    strict_nan : bool, default False
        If True, consider NaN and None to be different.

    Returns
    -------
    b : bool
        Returns True if the arrays are equivalent.

    Examples
    --------
    >>> array_equivalent(
    ...     np.array([1, 2, np.nan]),
    ...     np.array([1, 2, np.nan]))
    True
    >>> array_equivalent(
    ...     np.array([1, np.nan, 2]),
    ...     np.array([1, 2, np.nan]))
    False
    """
    left, right = np.asarray(left), np.asarray(right)

    # shape compat
    if left.shape != right.shape:
        return False

    # Object arrays can contain None, NaN and NaT.
    # string dtypes must come through this path for NumPy 1.7.1 compat
    if is_string_dtype(left) or is_string_dtype(right):

        if not strict_nan:
            # isna considers NaN and None to be equivalent.
            return lib.array_equivalent_object(ensure_object(left.ravel()),
                                               ensure_object(right.ravel()))

        for left_value, right_value in zip(left, right):
            if left_value is NaT and right_value is not NaT:
                return False

            elif left_value is libmissing.NA and right_value is not libmissing.NA:
                return False

            elif isinstance(left_value, float) and np.isnan(left_value):
                if not isinstance(right_value,
                                  float) or not np.isnan(right_value):
                    return False
            else:
                try:
                    if np.any(np.asarray(left_value != right_value)):
                        return False
                except TypeError as err:
                    if "Cannot compare tz-naive" in str(err):
                        # tzawareness compat failure, see GH#28507
                        return False
                    elif "boolean value of NA is ambiguous" in str(err):
                        return False
                    raise
        return True

    # NaNs can occur in float and complex arrays.
    if is_float_dtype(left) or is_complex_dtype(left):

        # empty
        if not (np.prod(left.shape) and np.prod(right.shape)):
            return True
        return ((left == right) | (isna(left) & isna(right))).all()

    elif is_datetimelike_v_numeric(left, right):
        # GH#29553 avoid numpy deprecation warning
        return False

    elif needs_i8_conversion(left) or needs_i8_conversion(right):
        # datetime64, timedelta64, Period
        if not is_dtype_equal(left.dtype, right.dtype):
            return False

        left = left.view("i8")
        right = right.view("i8")

    # if we have structured dtypes, compare first
    if left.dtype.type is np.void or right.dtype.type is np.void:
        if left.dtype != right.dtype:
            return False

    return np.array_equal(left, right)
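
The i8 branch matters because NaT never compares equal to itself under ==; viewing as int64 turns NaT into an ordinary integer sentinel. A quick illustration:

import numpy as np

left = np.array(['2021-01-01', 'NaT'], dtype='datetime64[ns]')
right = left.copy()

print((left == right).all())  # False: NaT != NaT elementwise

# comparing the i8 views matches the sentinel directly, which is what
# the needs_i8_conversion branch above does
print(np.array_equal(left.view('i8'), right.view('i8')))  # True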
Example #24
def _na_ok_dtype(dtype) -> bool:
    if needs_i8_conversion(dtype):
        return False
    return not issubclass(dtype.type, np.integer)
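
This helper answers one question: can the dtype hold NaN without a cast? Integer dtypes cannot (and i8-based datetimelike dtypes use NaT instead), so callers typically promote first, e.g.:

import numpy as np

arr = np.array([1, 2, 3], dtype=np.int64)
try:
    arr[0] = np.nan               # int64 cannot represent NaN
except ValueError:
    arr = arr.astype(np.float64)  # promote, then insert the NaN
    arr[0] = np.nan
print(arr)  # [nan  2.  3.]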
Example #25
    def get_new_values(self):
        values = self.values

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            new_values = (self.sorted_values.reshape(
                length, width, stride).swapaxes(1, 2).reshape(result_shape))
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, self.fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name
        sorted_values = self.sorted_values

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
            name = "int64"
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype("object")
            new_values = new_values.astype("object")
            name = "object"
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
Example #26
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # violate_limit is a list of the indexes in the series whose yvalue is
    # currently NaN, and should still be NaN after the interpolation.
    # Specifically:
    #
    # If limit_direction='forward' or None then the list will contain NaNs at
    # the beginning of the series, and NaNs that are more than 'limit' away
    # from the prior non-NaN.
    #
    # If limit_direction='backward' then the list will contain NaNs at
    # the end of the series, and NaNs that are more than 'limit' away
    # from the subsequent non-NaN.
    #
    # If limit_direction='both' then the list will contain NaNs that
    # are more than 'limit' away from any non-NaN.
    #
    # If limit=None, then use default behavior of filling an unlimited number
    # of NaNs in the direction specified by limit_direction

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    # each possible limit_direction
    # TODO: do we need sorted?
    if limit_direction == 'forward' and limit is not None:
        violate_limit = sorted(start_nans |
                               set(_interp_limit(invalid, limit, 0)))
    elif limit_direction == 'forward':
        violate_limit = sorted(start_nans)
    elif limit_direction == 'backward' and limit is not None:
        violate_limit = sorted(end_nans |
                               set(_interp_limit(invalid, 0, limit)))
    elif limit_direction == 'backward':
        violate_limit = sorted(end_nans)
    elif limit_direction == 'both' and limit is not None:
        violate_limit = sorted(_interp_limit(invalid, limit, limit))
    else:
        violate_limit = []

    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[violate_limit] = np.nan
        return result
Example #27
File: ops.py Project: zeneli/pandas
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """

        assert kind in ["transform", "aggregate"]
        orig_values = values

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not set up for dim transforming
        if is_categorical_dtype(values) or is_sparse(values):
            raise NotImplementedError(f"{values.dtype} dtype not supported")
        elif is_datetime64_any_dtype(values):
            if how in ["add", "prod", "cumsum", "cumprod"]:
                raise NotImplementedError(
                    f"datetime64 type does not support {how} operations")
        elif is_timedelta64_dtype(values):
            if how in ["prod", "cumprod"]:
                raise NotImplementedError(
                    f"timedelta64 type does not support {how} operations")

        if is_datetime64tz_dtype(values.dtype):
            # Cast to naive; we'll cast back at the end of the function
            # TODO: possible need to reshape?  kludge can be avoided when
            #  2D EA is allowed.
            values = values.view("M8[ns]")

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     is_datetimelike, min_count)
        elif kind == "transform":
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if is_datetime64tz_dtype(orig_values.dtype):
            result = type(orig_values)(result.astype(np.int64),
                                       dtype=orig_values.dtype)
        elif is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)

        return result, names
Example #28
File: missing.py Project: ankostis/pandas
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                   limit_direction='forward', fill_value=None,
                   bounds_error=False, order=None, **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isnull(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    def _interp_limit(invalid, fw_limit, bw_limit):
        "Get idx of values that won't be filled b/c they exceed the limits."
        for x in np.where(invalid)[0]:
            if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                yield x

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError('Invalid limit_direction: expecting one of %r, got '
                         '%r.' % (valid_limit_directions, limit_direction))

    from pandas import Series
    ys = Series(yvalues)
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))

    # This is a list of the indexes in the series whose yvalue is currently
    # NaN, but whose interpolated yvalue will be overwritten with NaN after
    # computing the interpolation. For each index in this list, one of these
    # conditions is true of the corresponding NaN in the yvalues:
    #
    # a) It is one of a chain of NaNs at the beginning of the series, and
    #    either limit is not specified or limit_direction is 'forward'.
    # b) It is one of a chain of NaNs at the end of the series, and limit is
    #    specified and limit_direction is 'backward' or 'both'.
    # c) Limit is nonzero and it is further than limit from the nearest non-NaN
    #    value (with respect to the limit_direction setting).
    #
    # The default behavior is to fill forward with no limit, ignoring NaNs at
    # the beginning (see issues #9218 and #10420)
    violate_limit = sorted(start_nans)

    if limit is not None:
        if not is_integer(limit):
            raise ValueError('Limit must be an integer')
        if limit < 1:
            raise ValueError('Limit must be greater than 0')
        if limit_direction == 'forward':
            violate_limit = sorted(start_nans | set(_interp_limit(invalid,
                                                                  limit, 0)))
        if limit_direction == 'backward':
            violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0,
                                                                limit)))
        if limit_direction == 'both':
            violate_limit = sorted(_interp_limit(invalid, limit, limit))

    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[violate_limit] = np.nan
        return result

    sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                  'barycentric', 'krogh', 'spline', 'polynomial',
                  'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima']

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order, **kwargs)
        result[violate_limit] = np.nan
        return result
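
The nested _interp_limit generator above is small enough to exercise on its own. Copying it out (purely as an illustration of this older implementation) shows which positions a forward limit refuses to fill:

import numpy as np

def _interp_limit(invalid, fw_limit, bw_limit):
    # yield indices of NaNs that sit beyond the fill limits
    for x in np.where(invalid)[0]:
        if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
            yield x

invalid = np.array([False, True, True, True, False])

# with a forward limit of 1, only the first NaN after a valid value gets
# filled; indices 2 and 3 are reported as limit violations
print(list(_interp_limit(invalid, 1, 0)))  # [2, 3]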
Example #29
    def test_value_counts_unique_nunique_null(self):

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._ndarray_values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    values[0:2] = null_obj
                # check that values has the same dtype as the original
                assert values.dtype == o.dtype

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(range(1, len(o) + 1)))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = Index(values)
                    expected_index.name = None
                    o = o.repeat(range(1, len(o) + 1))
                    o.name = 'a'

                # check values has the same dtype as the original
                assert o.dtype == orig.dtype
                # check values correctly have NaN
                nanloc = np.zeros(len(o), dtype=bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    tm.assert_numpy_array_equal(pd.isna(o), nanloc)
                else:
                    exp = Series(nanloc, o.index, name='a')
                    tm.assert_series_equal(pd.isna(o), exp)

                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64', name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64', name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                assert result_s_na.index.name is None
                assert result_s_na.name == 'a'
                result_s = o.value_counts()
                tm.assert_series_equal(o.value_counts(), expected_s)
                assert result_s.index.name is None
                assert result_s.name == 'a'

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result,
                                          Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    vals = values[2:].astype(object).values
                    tm.assert_numpy_array_equal(result[1:], vals)
                    assert result[0] is pd.NaT
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    assert pd.isna(result[0])
                    assert result.dtype == orig.dtype

                assert o.nunique() == 8
                assert o.nunique(dropna=False) == 9
Example #30
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> Tuple[np.ndarray, Optional[List[str]]]:
        """
        Returns the values of a cython operation as a Tuple of [data, names].

        Names is only useful when dealing with 2D results, like ohlc
        (see self._name_functions).
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(values, how)

        if is_extension_array_dtype(values.dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(ensure_float(values))
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                assert axis == 1, axis
                values = values.T
            if arity > 1:
                raise NotImplementedError(
                    "arity of more than 1 is not supported for the 'how' argument"
                )
            out_shape = (self.ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        if kind == "aggregate":
            result = maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            result = maybe_fill(np.empty_like(values, dtype=out_dtype),
                                fill_value=np.nan)

            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        names: Optional[List[str]] = self._name_functions.get(how, None)

        if swapped:
            result = result.swapaxes(0, axis)

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            result = maybe_downcast_to_dtype(result, dtype)

        return result, names
Example #31
def _interpolate_2d_with_fill(
    data: np.ndarray,  # floating dtype
    index: Index,
    axis: int,
    method: str = "linear",
    limit: int | None = None,
    limit_direction: str = "forward",
    limit_area: str | None = None,
    fill_value: Any | None = None,
    **kwargs,
) -> np.ndarray:
    """
    Column-wise application of _interpolate_1d.

    Notes
    -----
    The signature differs from _interpolate_1d because it only
    includes what is needed for Block.interpolate.
    """
    # validate the interp method
    clean_interp_method(method, index, **kwargs)

    if is_valid_na_for_dtype(fill_value, data.dtype):
        fill_value = na_value_for_dtype(data.dtype, compat=False)

    if method == "time":
        if not needs_i8_conversion(index.dtype):
            raise ValueError(
                "time-weighted interpolation only works "
                "on Series or DataFrames with a "
                "DatetimeIndex"
            )
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError(
            "Invalid limit_direction: expecting one of "
            f"{valid_limit_directions}, got '{limit_direction}'."
        )

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}."
            )

    # default limit is unlimited GH #16282
    limit = algos.validate_limit(nobs=None, limit=limit)

    indices = _index_to_interp_indices(index, method)

    def func(yvalues: np.ndarray) -> np.ndarray:
        # process 1-d slices in the axis direction, returning it

        # should the axis argument be handled below in apply_along_axis?
        # i.e. not an arg to _interpolate_1d
        return _interpolate_1d(
            indices=indices,
            yvalues=yvalues,
            method=method,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            fill_value=fill_value,
            bounds_error=False,
            **kwargs,
        )

    # interp each column independently
    return np.apply_along_axis(func, axis, data)
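
np.apply_along_axis is what turns the 1-d routine into a column-wise (or row-wise) operation. A toy version, with a mean-fill stand-in for _interpolate_1d:

import numpy as np

data = np.array([[1.0, np.nan, 3.0],
                 [3.0, 2.0, np.nan]])

def fill_mean(col):
    # stand-in for _interpolate_1d: fill NaNs with the column mean
    out = col.copy()
    out[np.isnan(out)] = np.nanmean(out)
    return out

# axis=0 hands each column to the function independently
print(np.apply_along_axis(fill_mean, 0, data))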
Example #32
def interpolate_1d(
    xvalues: np.ndarray,
    yvalues: np.ndarray,
    method: Optional[str] = "linear",
    limit: Optional[int] = None,
    limit_direction: str = "forward",
    limit_area: Optional[str] = None,
    fill_value: Optional[Any] = None,
    bounds_error: bool = False,
    order: Optional[int] = None,
    **kwargs,
):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == "time":
        if not getattr(xvalues, "_is_all_dates", None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError("time-weighted interpolation only works "
                             "on Series or DataFrames with a "
                             "DatetimeIndex")
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError("Invalid limit_direction: expecting one of "
                         f"{valid_limit_directions}, got '{limit_direction}'.")

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}.")

    # default limit is unlimited GH #16282
    limit = algos.validate_limit(nobs=None, limit=limit)

    # These are sets of index pointers to invalid values, i.e. {0, 1, ...}
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(find_valid_index(yvalues, "first")))
    end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    preserve_nans: Union[List, Set]
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to a list
    preserve_nans = sorted(preserve_nans)

    yvalues = getattr(yvalues, "values", yvalues)
    result = yvalues.copy()

    # xvalues to pass to NumPy/SciPy

    xvalues = getattr(xvalues, "values", xvalues)
    if method == "linear":
        inds = xvalues
    else:
        inds = np.asarray(xvalues)

        # hack for DatetimeIndex, #1646
        if needs_i8_conversion(inds.dtype):
            inds = inds.view(np.int64)

        if method in ("values", "index"):
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)

    if method in NP_METHODS:
        # np.interp requires sorted X values, #21037
        indexer = np.argsort(inds[valid])
        result[invalid] = np.interp(inds[invalid], inds[valid][indexer],
                                    yvalues[valid][indexer])
    else:
        result[invalid] = _interpolate_scipy_wrapper(
            inds[valid],
            yvalues[valid],
            inds[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )

    result[preserve_nans] = np.nan
    return result
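
The argsort step before np.interp (GH#21037) is essential: np.interp silently returns wrong values when its x-coordinates are not increasing, so the valid points are sorted and the y values reindexed to match:

import numpy as np

x = np.array([3.0, 2.0, 1.0])        # decreasing, e.g. a reversed index
y = np.array([30.0, np.nan, 10.0])
valid = ~np.isnan(y)

indexer = np.argsort(x[valid])       # sort the known points by x
filled = np.interp(x[~valid], x[valid][indexer], y[valid][indexer])
print(filled)  # [20.] -- midway between (1.0, 10.0) and (3.0, 30.0)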
Example #33
def interpolate_1d(xvalues,
                   yvalues,
                   method='linear',
                   limit=None,
                   limit_direction='forward',
                   limit_area=None,
                   fill_value=None,
                   bounds_error=False,
                   order=None,
                   **kwargs):
    """
    Logic for the 1-d interpolation.  The result should be 1-d, inputs
    xvalues and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.
    """
    # Treat the original, non-scipy methods first.

    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # have to call np.asarray(xvalues) since xvalues could be an Index
        # which can't be mutated
        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
        result.fill(np.nan)
        return result

    if valid.all():
        return yvalues

    if method == 'time':
        if not getattr(xvalues, 'is_all_dates', None):
            # if not issubclass(xvalues.dtype.type, np.datetime64):
            raise ValueError('time-weighted interpolation only works '
                             'on Series or DataFrames with a '
                             'DatetimeIndex')
        method = 'values'

    valid_limit_directions = ['forward', 'backward', 'both']
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        msg = ('Invalid limit_direction: expecting one of {valid!r}, '
               'got {invalid!r}.')
        raise ValueError(
            msg.format(valid=valid_limit_directions, invalid=limit_direction))

    if limit_area is not None:
        valid_limit_areas = ['inside', 'outside']
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError('Invalid limit_area: expecting one of {}, got '
                             '{}.'.format(valid_limit_areas, limit_area))

    # default limit is unlimited GH #16282
    if limit is None:
        # limit = len(xvalues)
        pass
    elif not is_integer(limit):
        raise ValueError('Limit must be an integer')
    elif limit < 1:
        raise ValueError('Limit must be greater than 0')

    from pandas import Series
    ys = Series(yvalues)

    # These are sets of index pointers to invalid values, i.e. {0, 1, ...}
    all_nans = set(np.flatnonzero(invalid))
    start_nans = set(range(ys.first_valid_index()))
    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
    mid_nans = all_nans - start_nans - end_nans

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    if limit_direction == 'forward':
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == 'backward':
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == 'inside':
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == 'outside':
        # preserve NaNs on the inside
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to a list
    preserve_nans = sorted(preserve_nans)

    xvalues = getattr(xvalues, 'values', xvalues)
    yvalues = getattr(yvalues, 'values', yvalues)
    result = yvalues.copy()

    if method in ['linear', 'time', 'index', 'values']:
        if method in ('values', 'index'):
            inds = np.asarray(xvalues)
            # hack for DatetimeIndex, #1646
            if needs_i8_conversion(inds.dtype.type):
                inds = inds.view(np.int64)
            if inds.dtype == np.object_:
                inds = lib.maybe_convert_objects(inds)
        else:
            inds = xvalues
        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
        result[preserve_nans] = np.nan
        return result

    sp_methods = [
        'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'barycentric',
        'krogh', 'spline', 'polynomial', 'from_derivatives',
        'piecewise_polynomial', 'pchip', 'akima'
    ]

    if method in sp_methods:
        inds = np.asarray(xvalues)
        # hack for DatetimeIndex, #1646
        if issubclass(inds.dtype.type, np.datetime64):
            inds = inds.view(np.int64)
        result[invalid] = _interpolate_scipy_wrapper(inds[valid],
                                                     yvalues[valid],
                                                     inds[invalid],
                                                     method=method,
                                                     fill_value=fill_value,
                                                     bounds_error=bounds_error,
                                                     order=order,
                                                     **kwargs)
        result[preserve_nans] = np.nan
        return result
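
These limit/limit_direction/limit_area checks back the user-facing keywords of Series.interpolate. A minimal usage sketch on toy data:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 1.0, np.nan, np.nan, 4.0, np.nan])

# fill at most one NaN walking forward; the leading NaN stays NaN
print(s.interpolate(method='linear', limit=1, limit_direction='forward'))

# only fill NaNs that are surrounded by valid values (GH #16284)
print(s.interpolate(method='linear', limit_area='inside'))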
Example #34
File: ops.py Project: bashtage/pandas
    def _cython_operation(self, kind, values, how, axis, min_count=-1,
                          **kwargs):
        assert kind in ['transform', 'aggregate']

        # can we do this operation with our cython functions
        # if not raise NotImplementedError

        # we raise NotImplementedError if this is an invalid operation
        # entirely, e.g. adding datetimes

        # categoricals are only 1d, so we
        # are not setup for dim transforming
        if is_categorical_dtype(values):
            raise NotImplementedError(
                "categoricals are not support in cython ops ATM")
        elif is_datetime64_any_dtype(values):
            if how in ['add', 'prod', 'cumsum', 'cumprod']:
                raise NotImplementedError(
                    "datetime64 type does not support {} "
                    "operations".format(how))
        elif is_timedelta64_dtype(values):
            if how in ['prod', 'cumprod']:
                raise NotImplementedError(
                    "timedelta64 type does not support {} "
                    "operations".format(how))

        arity = self._cython_arity.get(how, 1)

        vdim = values.ndim
        swapped = False
        if vdim == 1:
            values = values[:, None]
            out_shape = (self.ngroups, arity)
        else:
            if axis > 0:
                swapped = True
                values = values.swapaxes(0, axis)
            if arity > 1:
                raise NotImplementedError("arity of more than 1 is not "
                                          "supported for the 'how' argument")
            out_shape = (self.ngroups,) + values.shape[1:]

        is_datetimelike = needs_i8_conversion(values.dtype)
        is_numeric = is_numeric_dtype(values.dtype)

        if is_datetimelike:
            values = values.view('int64')
            is_numeric = True
        elif is_bool_dtype(values.dtype):
            values = ensure_float64(values)
        elif is_integer_dtype(values):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int64_or_float64(values)
        elif is_numeric and not is_complex_dtype(values):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        try:
            func = self._get_cython_function(
                kind, how, values, is_numeric)
        except NotImplementedError:
            if is_numeric:
                values = ensure_float64(values)
                func = self._get_cython_function(
                    kind, how, values, is_numeric)
            else:
                raise

        if how == 'rank':
            out_dtype = 'float'
        else:
            if is_numeric:
                out_dtype = '{kind}{itemsize}'.format(
                    kind=values.dtype.kind, itemsize=values.dtype.itemsize)
            else:
                out_dtype = 'object'

        labels, _, _ = self.group_info

        if kind == 'aggregate':
            result = _maybe_fill(np.empty(out_shape, dtype=out_dtype),
                                 fill_value=np.nan)
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(
                result, counts, values, labels, func, is_numeric,
                is_datetimelike, min_count)
        elif kind == 'transform':
            result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                 fill_value=np.nan)

            # TODO: min_count
            result = self._transform(
                result, values, labels, func, is_numeric, is_datetimelike,
                **kwargs)

        if is_integer_dtype(result) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype('float64')
                result[mask] = np.nan

        if (kind == 'aggregate' and
                self._filter_empty_groups and not counts.all()):
            if result.ndim == 2:
                try:
                    result = lib.row_bool_subset(
                        result, (counts > 0).view(np.uint8))
                except ValueError:
                    result = lib.row_bool_subset_object(
                        ensure_object(result),
                        (counts > 0).view(np.uint8))
            else:
                result = result[counts > 0]

        if vdim == 1 and arity == 1:
            result = result[:, 0]

        if how in self._name_functions:
            # TODO
            names = self._name_functions[how]()
        else:
            names = None

        if swapped:
            result = result.swapaxes(0, axis)

        return result, names
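
To see the datetime64/timedelta64 guards above from user code: min/max on a datetime column go through the cython path, while 'add'-style reductions are rejected. A sketch on toy data; the exact exception type surfaced to the user varies across pandas versions:

import pandas as pd

df = pd.DataFrame({
    'key': ['a', 'a', 'b'],
    'ts': pd.to_datetime(['2021-01-01', '2021-01-03', '2021-01-02']),
})

# min/max are well-defined for datetime64 and use the cython machinery
print(df.groupby('key')['ts'].min())

# summing datetimes is disallowed by the check above
try:
    print(df.groupby('key')['ts'].sum())
except Exception as err:
    print(type(err).__name__, err)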
Example #35
File: reshape.py Project: tnir/pandas
    def get_new_values(self, values, fill_value=None):

        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values?  When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (sorted_values.reshape(
                length, width, stride).swapaxes(1, 2).reshape(result_shape))
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                # We are assuming that fill_value can be held by this dtype,
                #  unlike the non-EA case that promotes.
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            #  DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
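
The i8 view/round-trip above is what lets unstack preserve datetime dtypes; the missing cell comes back as NaT. A small sketch on toy data:

import pandas as pd

s = pd.Series(
    pd.to_datetime(['2021-01-01', '2021-01-02', '2021-01-03']),
    index=pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'x')]),
)
# the ('b', 'y') cell is missing; datetime64[ns] survives the i8 round-trip
print(s.unstack())
print(s.unstack().dtypes)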
Example #36
File: reshape.py Project: pydata/pandas
    def get_new_values(self):
        values = self.values

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = mask.all()

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            new_values = (self.sorted_values
                              .reshape(length, width, stride)
                              .swapaxes(1, 2)
                              .reshape(result_shape)
                          )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            dtype, fill_value = maybe_promote(values.dtype, self.fill_value)
            new_values = np.empty(result_shape, dtype=dtype)
            new_values.fill(fill_value)

        new_mask = np.zeros(result_shape, dtype=bool)

        name = np.dtype(dtype).name
        sorted_values = self.sorted_values

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values):
            sorted_values = sorted_values.view('i8')
            new_values = new_values.view('i8')
            name = 'int64'
        elif is_bool_dtype(values):
            sorted_values = sorted_values.astype('object')
            new_values = new_values.astype('object')
            name = 'object'
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        f = getattr(_reshape, "unstack_{name}".format(name=name))
        f(sorted_values,
          mask.view('u1'),
          stride,
          length,
          width,
          new_values,
          new_mask.view('u1'))

        # reconstruct dtype if needed
        if needs_i8_conversion(values):
            new_values = new_values.view(values.dtype)

        return new_values, new_mask
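
The maybe_promote call above is also why unstacking an integer Series with a missing cell comes back as float64: NaN is not representable in int64. A small sketch on toy data:

import pandas as pd

s = pd.Series(
    [1, 2, 3],
    index=pd.MultiIndex.from_tuples([('a', 'x'), ('a', 'y'), ('b', 'x')]),
)
# the missing ('b', 'y') cell forces promotion from int64 to float64
print(s.unstack())
print(s.unstack().dtypes)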
Example #37
    def _cython_operation(self,
                          kind: str,
                          values,
                          how: str,
                          axis: int,
                          min_count: int = -1,
                          **kwargs) -> ArrayLike:
        """
        Returns the values of a cython operation.
        """
        orig_values = values
        assert kind in ["transform", "aggregate"]

        if values.ndim > 2:
            raise NotImplementedError(
                "number of dimensions is currently limited to 2")
        elif values.ndim == 2:
            # Note: it is *not* the case that axis is always 0 for 1-dim values,
            #  as we can have 1D ExtensionArrays that we need to treat as 2D
            assert axis == 1, axis

        dtype = values.dtype
        is_numeric = is_numeric_dtype(dtype)

        # can we do this operation with our cython functions
        # if not raise NotImplementedError
        self._disallow_invalid_ops(dtype, how, is_numeric)

        if is_extension_array_dtype(dtype):
            return self._ea_wrap_cython_operation(kind, values, how, axis,
                                                  min_count, **kwargs)

        elif values.ndim == 1:
            # expand to 2d, dispatch, then squeeze if appropriate
            values2d = values[None, :]
            res = self._cython_operation(
                kind=kind,
                values=values2d,
                how=how,
                axis=1,
                min_count=min_count,
                **kwargs,
            )
            if res.shape[0] == 1:
                return res[0]

            # otherwise we have OHLC
            return res.T

        is_datetimelike = needs_i8_conversion(dtype)

        if is_datetimelike:
            values = values.view("int64")
            is_numeric = True
        elif is_bool_dtype(dtype):
            values = ensure_int_or_float(values)
        elif is_integer_dtype(dtype):
            # we use iNaT for the missing value on ints
            # so pre-convert to guard this condition
            if (values == iNaT).any():
                values = ensure_float64(values)
            else:
                values = ensure_int_or_float(values)
        elif is_numeric and not is_complex_dtype(dtype):
            values = ensure_float64(values)
        else:
            values = values.astype(object)

        arity = self._cython_arity.get(how, 1)
        ngroups = self.ngroups

        assert axis == 1
        values = values.T
        if how == "ohlc":
            out_shape = (ngroups, 4)
        elif arity > 1:
            raise NotImplementedError(
                "arity of more than 1 is not supported for the 'how' argument")
        elif kind == "transform":
            out_shape = values.shape
        else:
            out_shape = (ngroups, ) + values.shape[1:]

        func, values = self._get_cython_func_and_vals(kind, how, values,
                                                      is_numeric)

        if how == "rank":
            out_dtype = "float"
        else:
            if is_numeric:
                out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}"
            else:
                out_dtype = "object"

        codes, _, _ = self.group_info

        result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
        if kind == "aggregate":
            counts = np.zeros(self.ngroups, dtype=np.int64)
            result = self._aggregate(result, counts, values, codes, func,
                                     min_count)
        elif kind == "transform":
            # TODO: min_count
            result = self._transform(result, values, codes, func,
                                     is_datetimelike, **kwargs)

        if is_integer_dtype(result.dtype) and not is_datetimelike:
            mask = result == iNaT
            if mask.any():
                result = result.astype("float64")
                result[mask] = np.nan

        if kind == "aggregate" and self._filter_empty_groups and not counts.all(
        ):
            assert result.ndim != 2
            result = result[counts > 0]

        result = result.T

        if how not in base.cython_cast_blocklist:
            # e.g. if we are int64 and need to restore to datetime64/timedelta64
            # "rank" is the only member of cython_cast_blocklist we get here
            dtype = maybe_cast_result_dtype(orig_values.dtype, how)
            op_result = maybe_downcast_to_dtype(result, dtype)
        else:
            op_result = result

        return op_result
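
The cython_cast_blocklist handling at the end is observable from user code: groupby rank on a datetime column is not cast back to datetime64, so ranks come out numeric, while 'min' is restored to the original dtype. A sketch on toy data:

import pandas as pd

df = pd.DataFrame({
    'key': ['a', 'a', 'b'],
    'ts': pd.to_datetime(['2021-01-02', '2021-01-01', '2021-01-03']),
})
# 'rank' stays float64; 'min' round-trips back to datetime64[ns]
print(df.groupby('key')['ts'].rank())
print(df.groupby('key')['ts'].min())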
Example #38
def assert_series_equal(
    left,
    right,
    check_dtype=True,
    check_index_type="equiv",
    check_series_type=True,
    check_less_precise=no_default,
    check_names=True,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    check_freq=True,
    check_flags=True,
    rtol=1.0e-5,
    atol=1.0e-8,
    obj="Series",
    *,
    check_index=True,
):
    """
    Check that left and right Series are equal.

    Parameters
    ----------
    left : Series
    right : Series
    check_dtype : bool, default True
        Whether to check the Series dtype is identical.
    check_index_type : bool or {'equiv'}, default 'equiv'
        Whether to check the Index class, dtype and inferred_type
        are identical.
    check_series_type : bool, default True
         Whether to check the Series class is identical.
    check_less_precise : bool or int, default False
        Specify comparison precision. Only used when check_exact is False.
        5 digits (False) or 3 digits (True) after decimal points are compared.
        If int, then specify the digits to compare.

        When comparing two numbers, if the first number has magnitude less
        than 1e-5, we compare the two numbers directly and check whether
        they are equivalent within the specified precision. Otherwise, we
        compare the **ratio** of the second number to the first number and
        check whether it is equivalent to 1 within the specified precision.

        .. deprecated:: 1.1.0
           Use `rtol` and `atol` instead to define relative/absolute
           tolerance, respectively. Similar to :func:`math.isclose`.
    check_names : bool, default True
        Whether to check the Series and Index names attribute.
    check_exact : bool, default False
        Whether to compare numbers exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like values which are comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals.

        .. versionadded:: 1.0.2
    check_freq : bool, default True
        Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex.

        .. versionadded:: 1.1.0
    check_flags : bool, default True
        Whether to check the `flags` attribute.

        .. versionadded:: 1.2.0

    rtol : float, default 1e-5
        Relative tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    atol : float, default 1e-8
        Absolute tolerance. Only used when check_exact is False.

        .. versionadded:: 1.1.0
    obj : str, default 'Series'
        Specify object name being compared, internally used to show appropriate
        assertion message.
    check_index : bool, default True
        Whether to check index equivalence. If False, then compare only values.

        .. versionadded:: 1.3.0

    Examples
    --------
    >>> from pandas.testing import assert_series_equal
    >>> a = pd.Series([1, 2, 3, 4])
    >>> b = pd.Series([1, 2, 3, 4])
    >>> assert_series_equal(a, b)
    """
    __tracebackhide__ = True

    if check_less_precise is not no_default:
        warnings.warn(
            "The 'check_less_precise' keyword in testing.assert_*_equal "
            "is deprecated and will be removed in a future version. "
            "You can stop passing 'check_less_precise' to silence this warning.",
            FutureWarning,
            stacklevel=2,
        )
        rtol = atol = _get_tol_from_less_precise(check_less_precise)

    # instance validation
    _check_isinstance(left, right, Series)

    if check_series_type:
        assert_class_equal(left, right, obj=obj)

    # length comparison
    if len(left) != len(right):
        msg1 = f"{len(left)}, {left.index}"
        msg2 = f"{len(right)}, {right.index}"
        raise_assert_detail(obj, "Series lengths are different", msg1, msg2)

    if check_flags:
        assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}"

    if check_index:
        # GH #38183
        assert_index_equal(
            left.index,
            right.index,
            exact=check_index_type,
            check_names=check_names,
            check_exact=check_exact,
            check_categorical=check_categorical,
            rtol=rtol,
            atol=atol,
            obj=f"{obj}.index",
        )

    if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)):
        lidx = left.index
        ridx = right.index
        assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq)

    if check_dtype:
        # We want to skip exact dtype checking when `check_categorical`
        # is False. We'll still raise if only one is a `Categorical`,
        # regardless of `check_categorical`
        if (
            is_categorical_dtype(left.dtype)
            and is_categorical_dtype(right.dtype)
            and not check_categorical
        ):
            pass
        else:
            assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}")

    if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype):
        left_values = left._values
        right_values = right._values
        # Only check exact if dtype is numeric
        if isinstance(left_values, ExtensionArray) and isinstance(
            right_values, ExtensionArray
        ):
            assert_extension_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                index_values=np.asarray(left.index),
            )
        else:
            assert_numpy_array_equal(
                left_values,
                right_values,
                check_dtype=check_dtype,
                obj=str(obj),
                index_values=np.asarray(left.index),
            )
    elif check_datetimelike_compat and (
        needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)
    ):
        # we want to check only if we have compat dtypes
        # e.g. integer and M|m are NOT compat, but we can simply check
        # the values in that case

        # datetimelike may have different objects (e.g. datetime.datetime
        # vs Timestamp) but will compare equal
        if not Index(left._values).equals(Index(right._values)):
            msg = (
                f"[datetimelike_compat=True] {left._values} "
                f"is not equal to {right._values}."
            )
            raise AssertionError(msg)
    elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype):
        assert_interval_array_equal(left.array, right.array)
    elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif is_extension_array_dtype_and_needs_i8_conversion(
        left.dtype, right.dtype
    ) or is_extension_array_dtype_and_needs_i8_conversion(right.dtype, left.dtype):
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype):
        # DatetimeArray or TimedeltaArray
        assert_extension_array_equal(
            left._values,
            right._values,
            check_dtype=check_dtype,
            index_values=np.asarray(left.index),
        )
    else:
        _testing.assert_almost_equal(
            left._values,
            right._values,
            rtol=rtol,
            atol=atol,
            check_dtype=check_dtype,
            obj=str(obj),
            index_values=np.asarray(left.index),
        )

    # metadata comparison
    if check_names:
        assert_attr_equal("name", left, right, obj=obj)

    if check_categorical:
        if is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype):
            assert_categorical_equal(
                left._values,
                right._values,
                obj=f"{obj} category",
                check_category_order=check_category_order,
            )
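
A short sketch of the datetimelike-compat branch above: a datetime64 Series and an object Series of Timestamps hold equal values but different dtypes, so the comparison only passes once dtype checking is relaxed:

import pandas as pd
from pandas.testing import assert_series_equal

left = pd.Series(pd.to_datetime(['2021-01-01', '2021-01-02']))
right = pd.Series([pd.Timestamp('2021-01-01'),
                   pd.Timestamp('2021-01-02')], dtype=object)

# strict checking would raise: datetime64[ns] vs object
# assert_series_equal(left, right)

# values are compared via Index.equals, ignoring the dtype mismatch
assert_series_equal(left, right, check_dtype=False,
                    check_datetimelike_compat=True)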
Example #39
    def test_value_counts_unique_nunique_null(self):

        for null_obj in [np.nan, None]:
            for orig in self.objs:
                o = orig.copy()
                klass = type(o)
                values = o._values

                if not self._allow_na_ops(o):
                    continue

                # special assign to the numpy array
                if is_datetimetz(o):
                    if isinstance(o, DatetimeIndex):
                        v = o.asi8
                        v[0:2] = iNaT
                        values = o._shallow_copy(v)
                    else:
                        o = o.copy()
                        o[0:2] = iNaT
                        values = o._values

                elif needs_i8_conversion(o):
                    values[0:2] = iNaT
                    values = o._shallow_copy(values)
                else:
                    values[0:2] = null_obj
                # check that values has the same dtype as the original
                self.assertEqual(values.dtype, o.dtype)

                # create repeated values, 'n'th element is repeated by n+1
                # times
                if isinstance(o, (DatetimeIndex, PeriodIndex)):
                    expected_index = o.copy()
                    expected_index.name = None

                    # attach name to klass
                    o = klass(values.repeat(range(1, len(o) + 1)))
                    o.name = 'a'
                else:
                    if is_datetimetz(o):
                        expected_index = orig._values._shallow_copy(values)
                    else:
                        expected_index = pd.Index(values)
                    expected_index.name = None
                    o = o.repeat(range(1, len(o) + 1))
                    o.name = 'a'

                # check that o has the same dtype as the original
                self.assertEqual(o.dtype, orig.dtype)
                # check values correctly have NaN
                nanloc = np.zeros(len(o), dtype=bool)
                nanloc[:3] = True
                if isinstance(o, Index):
                    self.assert_numpy_array_equal(pd.isnull(o), nanloc)
                else:
                    exp = pd.Series(nanloc, o.index, name='a')
                    self.assert_series_equal(pd.isnull(o), exp)

                expected_s_na = Series(list(range(10, 2, -1)) + [3],
                                       index=expected_index[9:0:-1],
                                       dtype='int64',
                                       name='a')
                expected_s = Series(list(range(10, 2, -1)),
                                    index=expected_index[9:1:-1],
                                    dtype='int64',
                                    name='a')

                result_s_na = o.value_counts(dropna=False)
                tm.assert_series_equal(result_s_na, expected_s_na)
                self.assertTrue(result_s_na.index.name is None)
                self.assertEqual(result_s_na.name, 'a')
                result_s = o.value_counts()
                tm.assert_series_equal(o.value_counts(), expected_s)
                self.assertTrue(result_s.index.name is None)
                self.assertEqual(result_s.name, 'a')

                result = o.unique()
                if isinstance(o, Index):
                    tm.assert_index_equal(result, Index(values[1:], name='a'))
                elif is_datetimetz(o):
                    # unable to compare NaT / nan
                    tm.assert_numpy_array_equal(result[1:],
                                                values[2:].asobject.values)
                    self.assertIs(result[0], pd.NaT)
                else:
                    tm.assert_numpy_array_equal(result[1:], values[2:])

                    self.assertTrue(pd.isnull(result[0]))
                    self.assertEqual(result.dtype, orig.dtype)

                self.assertEqual(o.nunique(), 8)
                self.assertEqual(o.nunique(dropna=False), 9)
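
The dropna behaviour this test pins down is easy to reproduce directly; a minimal sketch:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 1.0, 2.0, np.nan, np.nan])

print(s.value_counts())              # NaN excluded by default
print(s.value_counts(dropna=False))  # NaN counted as its own entry
print(s.nunique(), s.nunique(dropna=False))  # 2 vs 3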