Example #1
def test_is_object():
    assert com.is_object_dtype(object)
    assert com.is_object_dtype(np.array([], dtype=object))

    assert not com.is_object_dtype(int)
    assert not com.is_object_dtype(np.array([], dtype=int))
    assert not com.is_object_dtype([1, 2, 3])
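
For context, the predicate exercised by this test is also exposed publicly as pandas.api.types.is_object_dtype. A minimal usage sketch, assuming a recent pandas:

import numpy as np
import pandas as pd
from pandas.api.types import is_object_dtype

# the check accepts dtypes, arrays, Series and Index objects alike
assert is_object_dtype(np.dtype(object))
assert is_object_dtype(pd.Series(['a', 1]))       # mixed values -> object dtype
assert not is_object_dtype(pd.Series([1, 2]))     # int64
assert not is_object_dtype(pd.Index([1.5, 2.5]))  # float64
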
Example #2
def safe_na_op(lvalues, rvalues):
    try:
        with np.errstate(all='ignore'):
            return na_op(lvalues, rvalues)
    except Exception:
        if isinstance(rvalues, ABCSeries):
            if is_object_dtype(rvalues):
                # if dtype is object, try elementwise op
                return libalgos.arrmap_object(rvalues,
                                              lambda x: op(lvalues, x))
        else:
            if is_object_dtype(lvalues):
                return libalgos.arrmap_object(lvalues,
                                              lambda x: op(x, rvalues))
        raise
Example #3
    def __mul__(self, other):
        other = lib.item_from_zerodim(other)

        if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
            return NotImplemented

        if is_scalar(other):
            # numpy will accept float and int, raise TypeError for others
            result = self._data * other
            freq = None
            if self.freq is not None and not isna(other):
                freq = self.freq * other
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # list, tuple
            other = np.array(other)
        if len(other) != len(self) and not is_timedelta64_dtype(other):
            # Exclude timedelta64 here so we correctly raise TypeError
            #  for that instead of ValueError
            raise ValueError("Cannot multiply with unequal lengths")

        if is_object_dtype(other):
            # this multiplication will succeed only if all elements of other
            #  are int or float scalars, so we will end up with
            #  timedelta64[ns]-dtyped result
            result = [self[n] * other[n] for n in range(len(self))]
            result = np.array(result)
            return type(self)(result)

        # numpy will accept float or int dtype, raise TypeError for others
        result = self._data * other
        return type(self)(result)
Example #4
        def func(self, other, sort=True):
            other = self._as_like_interval_index(other)

            # GH 19016: ensure set op will not return a prohibited dtype
            subtypes = [self.dtype.subtype, other.dtype.subtype]
            common_subtype = find_common_type(subtypes)
            if is_object_dtype(common_subtype):
                msg = ('can only do {op} between two IntervalIndex '
                       'objects that have compatible dtypes')
                raise TypeError(msg.format(op=op_name))

            if op_name == 'difference':
                result = getattr(self._multiindex, op_name)(other._multiindex,
                                                            sort)
            else:
                result = getattr(self._multiindex, op_name)(other._multiindex)
            result_name = get_op_result_name(self, other)

            # GH 19101: ensure empty results have correct dtype
            if result.empty:
                result = result.values.astype(self.dtype.subtype)
            else:
                result = result.values

            return type(self).from_tuples(result, closed=self.closed,
                                          name=result_name)
Example #5
    def memory_usage(self, deep=False):
        """
        Memory usage of the values

        Parameters
        ----------
        deep : bool
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption

        Returns
        -------
        bytes used

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy

        See Also
        --------
        numpy.ndarray.nbytes
        """
        if hasattr(self.values, 'memory_usage'):
            return self.values.memory_usage(deep=deep)

        v = self.values.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            v += lib.memory_usage_of_objects(self.values)
        return v
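
At the user level this is what makes deep=True matter for object columns: the shallow figure counts only the 8-byte pointers, while deep=True also interrogates the Python objects they reference. A small illustration (exact byte counts vary by platform; on PyPy the two figures coincide, as the Notes above say):

import pandas as pd

s = pd.Series(['some', 'python', 'strings'])
shallow = s.memory_usage(deep=False)  # pointer array plus index only
deep = s.memory_usage(deep=True)      # also counts the str objects
assert deep > shallow                 # holds on CPython
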
Example #6
        def func(self, other, sort=sort):
            self._assert_can_do_setop(other)
            other = ensure_index(other)
            if not isinstance(other, IntervalIndex):
                result = getattr(self.astype(object), op_name)(other)
                if op_name in ('difference',):
                    result = result.astype(self.dtype)
                return result
            elif self.closed != other.closed:
                msg = ('can only do set operations between two IntervalIndex '
                       'objects that are closed on the same side')
                raise ValueError(msg)

            # GH 19016: ensure set op will not return a prohibited dtype
            subtypes = [self.dtype.subtype, other.dtype.subtype]
            common_subtype = find_common_type(subtypes)
            if is_object_dtype(common_subtype):
                msg = ('can only do {op} between two IntervalIndex '
                       'objects that have compatible dtypes')
                raise TypeError(msg.format(op=op_name))

            result = getattr(self._multiindex, op_name)(other._multiindex,
                                                        sort=sort)
            result_name = get_op_result_name(self, other)

            # GH 19101: ensure empty results have correct dtype
            if result.empty:
                result = result.values.astype(self.dtype.subtype)
            else:
                result = result.values

            return type(self).from_tuples(result, closed=self.closed,
                                          name=result_name)
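
The guard works because find_common_type falls back to object when the two subtypes share no common numpy type. A sketch using the internal helper (its module path has moved between pandas versions, so treat the import as illustrative):

import numpy as np
from pandas.core.dtypes.cast import find_common_type  # internal helper

print(find_common_type([np.dtype('int64'), np.dtype('float64')]))  # float64
# int64 vs datetime64 has no common subtype and degrades to object,
# which is exactly the combination the set op above rejects:
print(find_common_type([np.dtype('int64'), np.dtype('M8[ns]')]))   # object
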
Example #7
    def memory_usage(self, deep=False):
        values = self.sp_values

        v = values.nbytes

        if deep and is_object_dtype(self) and not PYPY:
            v += lib.memory_usage_of_objects(values)

        return v
Example #8
    def __array__(self, dtype=None):
        # TODO(https://github.com/pandas-dev/pandas/pull/23593)
        # Maybe push to parent once datetimetz __array__ is figured out.
        if is_object_dtype(dtype):
            return np.array(list(self), dtype=object)
        elif is_int64_dtype(dtype):
            return self.asi8

        return self._data
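
Given the hook above, the dtype requested by the caller selects the representation. A sketch with a PeriodIndex, illustrating the three branches (recent pandas versions route these conversions differently, so take this as illustrative, not current behavior):

import numpy as np
import pandas as pd

pi = pd.period_range('2020-01', periods=3, freq='M')
np.array(pi, dtype=object)   # list(self) path: array of Period objects
np.array(pi, dtype='int64')  # asi8 path: the underlying ordinals
np.asarray(pi)               # default path: self._data as-is
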
Example #9
def safe_na_op(lvalues, rvalues):
    try:
        with np.errstate(all='ignore'):
            return na_op(lvalues, rvalues)
    except Exception:
        if is_object_dtype(lvalues):
            return libalgos.arrmap_object(lvalues,
                                          lambda x: op(x, rvalues))
        raise
Example #10
def unconvert(values, dtype, compress=None):

    as_is_ext = isinstance(values, ExtType) and values.code == 0

    if as_is_ext:
        values = values.data

    if is_categorical_dtype(dtype):
        return values

    elif is_object_dtype(dtype):
        return np.array(values, dtype=object)

    dtype = pandas_dtype(dtype).base

    if not as_is_ext:
        values = values.encode('latin1')

    if compress:
        if compress == u'zlib':
            _check_zlib()
            decompress = zlib.decompress
        elif compress == u'blosc':
            _check_blosc()
            decompress = blosc.decompress
        else:
            raise ValueError("compress must be one of 'zlib' or 'blosc'")

        try:
            return np.frombuffer(
                _move_into_mutable_buffer(decompress(values)),
                dtype=dtype,
            )
        except _BadMove as e:
            # Pull the decompressed data off of the `_BadMove` exception.
            # We don't just store this in the locals because we want to
            # minimize the risk of giving users access to a `bytes` object
            # whose data is also given to a mutable buffer.
            values = e.args[0]
            if len(values) > 1:
                # The empty string and single characters are memoized in many
                # string creating functions in the capi. This case should not
                # warn even though we need to make a copy because we are only
                # copying at most 1 byte.
                warnings.warn(
                    'copying data after decompressing; this may mean that'
                    ' decompress is caching its result',
                    PerformanceWarning,
                )
                # fall through to copying `np.fromstring`

    # Copy the bytes into a numpy array.
    buf = np.frombuffer(values, dtype=dtype)
    buf = buf.copy()  # required to not mutate the original data
    buf.flags.writeable = True
    return buf
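
The final copy is necessary because np.frombuffer returns a read-only view over the original buffer when that buffer is immutable (as bytes are). A numpy-only sketch of why:

import numpy as np

raw = b'\x01\x00\x02\x00'
view = np.frombuffer(raw, dtype=np.uint16)  # array([1, 2], dtype=uint16)
assert not view.flags.writeable             # view over immutable bytes

buf = view.copy()           # detaches from the source buffer
buf.flags.writeable = True  # already True after copy(); kept for parity
buf[0] = 7                  # now safe: the original bytes stay untouched
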
Example #11
def make_sparse(arr, kind='block', fill_value=None, dtype=None, copy=False):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional
    copy : bool, default False

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """

    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # For str arrays in NumPy 1.12.0, the != operator below isn't
        # element-wise but just returns False if fill_value is not str,
        # so cast to object for a safe comparison
        if is_string_dtype(arr):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # numpy's element-wise equality does not distinguish element
            # types, e.g. 0, 0.0, and False all compare equal, so we
            # have to check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = astype_nansafe(sparsified_values, dtype=dtype)
    # TODO: copy
    return sparsified_values, index, fill_value
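
The object-dtype special case exists because element-wise == on an object array uses Python equality, under which 0, 0.0 and False all match, losing the type distinction that sparse fill values need:

import numpy as np

arr = np.array([0, 0.0, False, 1], dtype=object)
print(arr == 0)  # [ True  True  True False] -- value equality, type ignored

Hence make_mask_object_ndarray compares both the type and the value of each element.
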
Example #12
def astype(self, dtype, copy=True):
    if is_interval_dtype(dtype):
        if copy:
            self = self.copy()
        return self
    elif is_object_dtype(dtype):
        return Index(self.values, dtype=object)
    elif is_categorical_dtype(dtype):
        from pandas import Categorical
        return Categorical(self, ordered=True)
    raise ValueError('Cannot cast IntervalIndex to dtype %s' % dtype)
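
At the user level the branches correspond to the following conversions (a sketch; in recent pandas the categorical branch returns a CategoricalIndex and unsupported targets raise TypeError rather than ValueError):

import pandas as pd

ii = pd.interval_range(0, 3)
ii.astype(object)      # object-dtype Index of Interval objects
ii.astype('category')  # interval-valued categories
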
Example #13
    def na_op(x, y):

        # dispatch to the categorical if we have a categorical
        # in either operand
        if is_categorical_dtype(x):
            return op(x, y)
        elif is_categorical_dtype(y) and not is_scalar(y):
            return op(y, x)

        if is_object_dtype(x.dtype):
            result = _comp_method_OBJECT_ARRAY(op, x, y)
        else:

            # we want to compare like types
            # we only want to convert to integer like if
            # we are not NotImplemented, otherwise
            # we would allow datetime64 (but viewed as i8) against
            # integer comparisons
            if is_datetimelike_v_numeric(x, y):
                raise TypeError("invalid type comparison")

            # numpy does not like comparisons vs None
            if is_scalar(y) and isna(y):
                if name == '__ne__':
                    return np.ones(len(x), dtype=bool)
                else:
                    return np.zeros(len(x), dtype=bool)

            # we have a datetime/timedelta and may need to convert
            mask = None
            if (needs_i8_conversion(x) or
                    (not is_scalar(y) and needs_i8_conversion(y))):

                if is_scalar(y):
                    mask = isna(x)
                    y = libindex.convert_scalar(x, com._values_from_object(y))
                else:
                    mask = isna(x) | isna(y)
                    y = y.view('i8')
                x = x.view('i8')

            try:
                with np.errstate(all='ignore'):
                    result = getattr(x, name)(y)
                if result is NotImplemented:
                    raise TypeError("invalid type comparison")
            except AttributeError:
                result = op(x, y)

            if mask is not None and mask.any():
                result[mask] = masker

        return result
Example #14
def _bn_ok_dtype(dt, name):
    # Bottleneck chokes on datetime64
    if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):

        # bottleneck does not properly upcast during the sum
        # so can overflow
        if name == 'nansum':
            if dt.itemsize < 8:
                return False

        return True
    return False
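
The itemsize guard is an overflow defense: bottleneck accumulates nansum in the input dtype, whereas numpy upcasts integer sums to at least the platform's native int. A numpy-only sketch of the failure mode (assuming a typical 64-bit build):

import numpy as np

arr = np.full(4, 2**30, dtype=np.int32)
print(arr.sum())                # 4294967296 -- numpy accumulates in int64
print(arr.sum(dtype=np.int32))  # 0 -- a forced 4-byte accumulator wraps around
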
Example #15
    def test_memory_usage(self):
        for o in self.objs:
            res = o.memory_usage()
            res_deep = o.memory_usage(deep=True)

            if (is_object_dtype(o) or (isinstance(o, Series) and
                                       is_object_dtype(o.index))):
                # if there are objects, only deep will pick them up
                assert res_deep > res
            else:
                assert res == res_deep

            if isinstance(o, Series):
                assert ((o.memory_usage(index=False) +
                         o.index.memory_usage()) ==
                        o.memory_usage(index=True))

            # sys.getsizeof will call the .memory_usage with
            # deep=True, and add on some GC overhead
            diff = res_deep - sys.getsizeof(o)
            assert abs(diff) < 100
Example #16
def interval_range(start=None, end=None, freq=None, periods=None,
                   name=None, closed='right', **kwargs):
    """
    Return a fixed frequency IntervalIndex

    Parameters
    ----------
    start : string or datetime-like, default None
        Left bound for generating data
    end : string or datetime-like, default None
        Right bound for generating data
    freq : integer, string or DateOffset, default 1
    periods : integer, default None
    name : str, default None
        Name of the resulting index
    closed : string, default 'right'
        options are: 'left', 'right', 'both', 'neither'

    Notes
    -----
    2 of start, end, or periods must be specified

    Returns
    -------
    rng : IntervalIndex
    """

    if freq is None:
        freq = 1

    if start is None:
        if periods is None or end is None:
            raise ValueError("must specify 2 of start, end, periods")
        start = end - periods * freq
    if end is None:
        if periods is None or start is None:
            raise ValueError("must specify 2 of start, end, periods")
        end = start + periods * freq
    if periods is None:
        if start is None or end is None:
            raise ValueError("must specify 2 of start, end, periods")

    # must all be same units or None
    arr = np.array([start, end, freq])
    if is_object_dtype(arr):
        raise ValueError("start, end, freq need to be the same type")

    return IntervalIndex.from_breaks(np.arange(start, end, freq),
                                     name=name,
                                     closed=closed)
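
Typical calls to the public function; the object-dtype check above is what turns mixed start/end/freq types into an error rather than a garbled result:

import pandas as pd

pd.interval_range(start=0, end=5)              # (0,1], (1,2], ..., (4,5]
pd.interval_range(start=0, periods=4, freq=2)  # (0,2], (2,4], (4,6], (6,8]
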
Example #17
def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype):
    # GH 4343
    tm.skip_if_no_package('scipy')

    # Make one ndarray and from it one sparse matrix, both to be used for
    # constructing frames and comparing results
    arr = np.eye(3, dtype=dtype)
    # GH 16179
    arr[0, 1] = dtype(2)
    try:
        spm = spmatrix(arr)
        assert spm.dtype == arr.dtype
    except (TypeError, AssertionError):
        # If conversion to sparse fails for this spmatrix type and arr.dtype,
        # then the combination is not currently supported in NumPy, so we
        # can just skip testing it thoroughly
        return

    sdf = pd.SparseDataFrame(spm, index=index, columns=columns,
                             default_fill_value=fill_value)

    # Expected result construction is kind of tricky for all
    # dtype-fill_value combinations; easiest to cast to something generic
    # and except later on
    rarr = arr.astype(object)
    rarr[arr == 0] = np.nan
    expected = pd.SparseDataFrame(rarr, index=index, columns=columns).fillna(
        fill_value if fill_value is not None else np.nan)

    # Assert frame is as expected
    sdf_obj = sdf.astype(object)
    tm.assert_sp_frame_equal(sdf_obj, expected)
    tm.assert_frame_equal(sdf_obj.to_dense(), expected.to_dense())

    # Assert spmatrices equal
    assert dict(sdf.to_coo().todok()) == dict(spm.todok())

    # Ensure dtype is preserved if possible
    was_upcast = ((fill_value is None or is_float(fill_value)) and
                  not is_object_dtype(dtype) and
                  not is_float_dtype(dtype))
    res_dtype = (bool if is_bool_dtype(dtype) else
                 float if was_upcast else
                 dtype)
    tm.assert_contains_all(sdf.dtypes, {np.dtype(res_dtype)})
    assert sdf.to_coo().dtype == res_dtype

    # However, adding a str column results in an upcast to object
    sdf['strings'] = np.arange(len(sdf)).astype(str)
    assert sdf.to_coo().dtype == np.object_
Example #18
def _comp_method_OBJECT_ARRAY(op, x, y):
    if isinstance(y, list):
        y = construct_1d_object_array_from_listlike(y)
    if isinstance(y, (np.ndarray, ABCSeries, ABCIndex)):
        if not is_object_dtype(y.dtype):
            y = y.astype(np.object_)

        if isinstance(y, (ABCSeries, ABCIndex)):
            y = y.values

        result = lib.vec_compare(x, y, op)
    else:
        result = lib.scalar_compare(x, y, op)
    return result
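
This is the path a Series comparison takes when either operand is object dtype: elements are compared one by one in Python instead of by a vectorized numpy kernel. For example:

import pandas as pd

s = pd.Series(['a', 1, 2.0])                    # object dtype
print(s == 'a')                                 # [True, False, False] (scalar_compare)
print(s == pd.Series([1, 1, 1], dtype=object))  # [False, True, False] (vec_compare)
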
Example #19
    def wrapper(self, other):
        meth = getattr(dtl.DatetimeLikeArrayMixin, opname)

        if isinstance(other, (datetime, np.datetime64, compat.string_types)):
            if isinstance(other, (datetime, np.datetime64)):
                # GH#18435 strings get a pass from tzawareness compat
                self._assert_tzawareness_compat(other)

            try:
                other = _to_m8(other, tz=self.tz)
            except ValueError:
                # string that cannot be parsed to Timestamp
                return ops.invalid_comparison(self, other, op)

            result = meth(self, other)
            if isna(other):
                result.fill(nat_result)
        elif lib.is_scalar(other):
            return ops.invalid_comparison(self, other, op)
        else:
            if isinstance(other, list):
                # FIXME: This can break for object-dtype with mixed types
                other = type(self)(other)
            elif not isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)):
                # Following Timestamp convention, __eq__ is all-False
                # and __ne__ is all True, others raise TypeError.
                return ops.invalid_comparison(self, other, op)

            if is_object_dtype(other):
                result = op(self.astype('O'), np.array(other))
            elif not (is_datetime64_dtype(other) or
                      is_datetime64tz_dtype(other)):
                # e.g. is_timedelta64_dtype(other)
                return ops.invalid_comparison(self, other, op)
            else:
                self._assert_tzawareness_compat(other)
                result = meth(self, np.asarray(other))

            result = com.values_from_object(result)

            # Make sure to pass an array to result[...]; indexing with
            # Series breaks with older version of numpy
            o_mask = np.array(isna(other))
            if o_mask.any():
                result[o_mask] = nat_result

        if self.hasnans:
            result[self._isnan] = nat_result

        return result
Example #20
    def test_setitem(self):

        df = DataFrame({'A': range(10)})
        s = pd.cut(df.A, 5)
        assert isinstance(s.cat.categories, IntervalIndex)

        # B & D end up as Categoricals
        # the remainder are converted to in-line objects
        # containing an IntervalIndex.values
        df['B'] = s
        df['C'] = np.array(s)
        df['D'] = s.values
        df['E'] = np.array(s.values)

        assert is_categorical_dtype(df['B'])
        assert is_interval_dtype(df['B'].cat.categories)
        assert is_categorical_dtype(df['D'])
        assert is_interval_dtype(df['D'].cat.categories)

        assert is_object_dtype(df['C'])
        assert is_object_dtype(df['E'])

        # they compare equal as Index
        # when converted to numpy objects
        c = lambda x: Index(np.array(x))
        tm.assert_index_equal(c(df.B), c(df.B), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
        tm.assert_index_equal(c(df.B), c(df.E), check_names=False)

        # B & D are the same Series
        tm.assert_series_equal(df['B'], df['B'], check_names=False)
        tm.assert_series_equal(df['B'], df['D'], check_names=False)

        # C & E are the same Series
        tm.assert_series_equal(df['C'], df['C'], check_names=False)
        tm.assert_series_equal(df['C'], df['E'], check_names=False)
Example #21
    def __rfloordiv__(self, other):
        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        other = lib.item_from_zerodim(other)
        if is_scalar(other):
            if isinstance(other, (timedelta, np.timedelta64, Tick)):
                other = Timedelta(other)
                if other is NaT:
                    # treat this specifically as timedelta-NaT
                    result = np.empty(self.shape, dtype=np.float64)
                    result.fill(np.nan)
                    return result

                # dispatch to Timedelta implementation
                result = other.__floordiv__(self._data)
                return result

            raise TypeError("Cannot divide {typ} by {cls}"
                            .format(typ=type(other).__name__,
                                    cls=type(self).__name__))

        if not hasattr(other, "dtype"):
            # list, tuple
            other = np.array(other)
        if len(other) != len(self):
            raise ValueError("Cannot divide with unequal lengths")

        elif is_timedelta64_dtype(other):
            other = type(self)(other)

            # numpy timedelta64 does not natively support floordiv, so operate
            #  on the i8 values
            result = other.asi8 // self.asi8
            mask = self._isnan | other._isnan
            if mask.any():
                result = result.astype(np.float64)
                result[mask] = np.nan
            return result

        elif is_object_dtype(other):
            result = [other[n] // self[n] for n in range(len(self))]
            result = np.array(result)
            return result

        else:
            dtype = getattr(other, "dtype", type(other).__name__)
            raise TypeError("Cannot divide {typ} by {cls}"
                            .format(typ=dtype, cls=type(self).__name__))
Example #22
    def __truediv__(self, other):
        # timedelta / X is well-defined for timedelta-like or numeric X
        other = lib.item_from_zerodim(other)

        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                result = np.empty(self.shape, dtype=np.float64)
                result.fill(np.nan)
                return result

            # otherwise, dispatch to Timedelta implementation
            return self._data / other

        elif lib.is_scalar(other):
            # assume it is numeric
            result = self._data / other
            freq = None
            if self.freq is not None:
                # Tick division is not implemented, so operate on Timedelta
                freq = self.freq.delta / other
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other):
            # let numpy handle it
            return self._data / other

        elif is_object_dtype(other):
            # Note: we do not do type inference on the result, so either
            #  an object array or numeric-dtyped (if numpy does inference)
            #  will be returned.  GH#23829
            result = [self[n] / other[n] for n in range(len(self))]
            result = np.array(result)
            return result

        else:
            result = self._data / other
            return type(self)(result)
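
From user code the branches map onto familiar TimedeltaIndex operations (a sketch; exact result types depend on the pandas version):

import numpy as np
import pandas as pd

tdi = pd.to_timedelta(['1 day', '2 days', '3 days'])
tdi / pd.Timedelta('1 day')  # timedelta / timedelta-like -> float ratios
tdi / 2                      # timedelta / numeric scalar  -> timedelta64
tdi / np.array([1, 2, 3])    # elementwise numeric divisor -> timedelta64
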
Example #23
def astype(self, dtype, copy=True):
    if is_object_dtype(dtype):
        return self._box_values_as_index()
    elif is_string_dtype(dtype) and not is_categorical_dtype(dtype):
        return Index(self.format(), name=self.name, dtype=object)
    elif is_integer_dtype(dtype):
        return Index(self.values.astype('i8', copy=copy), name=self.name,
                     dtype='i8')
    elif (is_datetime_or_timedelta_dtype(dtype) and
          not is_dtype_equal(self.dtype, dtype)) or is_float_dtype(dtype):
        # disallow conversion between datetime/timedelta,
        # and conversions for any datetimelike to float
        msg = 'Cannot cast {name} to dtype {dtype}'
        raise TypeError(msg.format(name=type(self).__name__, dtype=dtype))
    return super(DatetimeIndexOpsMixin, self).astype(dtype, copy=copy)
Example #24
def astype(self, dtype, copy=True):
    dtype = pandas_dtype(dtype)
    if is_float_dtype(dtype):
        values = self._values.astype(dtype, copy=copy)
    elif is_integer_dtype(dtype):
        if self.hasnans:
            raise ValueError('cannot convert float NaN to integer')
        values = self._values.astype(dtype, copy=copy)
    elif is_object_dtype(dtype):
        values = self._values.astype('object', copy=copy)
    else:
        raise TypeError('Setting %s dtype to anything other than '
                        'float64 or object is not supported' %
                        self.__class__)
    return Index(values, name=self.name, dtype=dtype)
Example #25
def astype(self, dtype, copy=True, how='start'):
    dtype = pandas_dtype(dtype)
    if is_object_dtype(dtype):
        return self.asobject
    elif is_integer_dtype(dtype):
        if copy:
            return self._int64index.copy()
        else:
            return self._int64index
    elif is_datetime64_dtype(dtype):
        return self.to_timestamp(how=how)
    elif is_datetime64tz_dtype(dtype):
        return self.to_timestamp(how=how).tz_localize(dtype.tz)
    elif is_period_dtype(dtype):
        return self.asfreq(freq=dtype.freq)
    raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype)
Example #26
def _f(*args, **kwargs):
    obj_iter = itertools.chain(args, compat.itervalues(kwargs))
    if any(self.check(obj) for obj in obj_iter):
        msg = 'reduction operation {name!r} not allowed for this dtype'
        raise TypeError(msg.format(name=f.__name__.replace('nan', '')))
    try:
        with np.errstate(invalid='ignore'):
            return f(*args, **kwargs)
    except ValueError as e:
        # we want to transform an object array
        # ValueError message to the more typical TypeError
        # e.g. this is normally a disallowed function on
        # object arrays that contain strings
        if is_object_dtype(args[0]):
            raise TypeError(e)
        raise
Example #27
def _bn_ok_dtype(dt, name):
    # Bottleneck chokes on datetime64
    if (not is_object_dtype(dt) and not is_datetime_or_timedelta_dtype(dt)):

        # GH 15507
        # bottleneck does not properly upcast during the sum
        # so can overflow

        # GH 9422
        # further we also want to preserve NaN when all elements
        # are NaN, unlike bottleneck/numpy which consider this
        # to be 0
        if name in ['nansum', 'nanprod']:
            return False

        return True
    return False
Example #28
        def f(values, axis=None, skipna=True, **kwds):
            if len(self.kwargs) > 0:
                for k, v in compat.iteritems(self.kwargs):
                    if k not in kwds:
                        kwds[k] = v
            try:
                if values.size == 0:

                    # we either return np.nan or pd.NaT
                    if is_numeric_dtype(values):
                        values = values.astype('float64')
                    fill_value = na_value_for_dtype(values.dtype)

                    if values.ndim == 1:
                        return fill_value
                    else:
                        result_shape = (values.shape[:axis] +
                                        values.shape[axis + 1:])
                        result = np.empty(result_shape, dtype=values.dtype)
                        result.fill(fill_value)
                        return result

                if (_USE_BOTTLENECK and skipna and
                        _bn_ok_dtype(values.dtype, bn_name)):
                    result = bn_func(values, axis=axis, **kwds)

                    # prefer to treat inf/-inf as NA, but must compute the func
                    # twice :(
                    if _has_infs(result):
                        result = alt(values, axis=axis, skipna=skipna, **kwds)
                else:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
            except Exception:
                try:
                    result = alt(values, axis=axis, skipna=skipna, **kwds)
                except ValueError as e:
                    # we want to transform an object array
                    # ValueError message to the more typical TypeError
                    # e.g. this is normally a disallowed function on
                    # object arrays that contain strings

                    if is_object_dtype(values):
                        raise TypeError(e)
                    raise

            return result
Example #29
    def __rtruediv__(self, other):
        # X / timedelta is defined only for timedelta-like X
        other = lib.item_from_zerodim(other)

        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        if isinstance(other, (timedelta, np.timedelta64, Tick)):
            other = Timedelta(other)
            if other is NaT:
                # specifically timedelta64-NaT
                result = np.empty(self.shape, dtype=np.float64)
                result.fill(np.nan)
                return result

            # otherwise, dispatch to Timedelta implementation
            return other / self._data

        elif lib.is_scalar(other):
            raise TypeError("Cannot divide {typ} by {cls}"
                            .format(typ=type(other).__name__,
                                    cls=type(self).__name__))

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other):
            # let numpy handle it
            return other / self._data

        elif is_object_dtype(other):
            # Note: unlike in __truediv__, we do not _need_ to do type
            #  inference on the result.  It does not raise, a numeric array
            #  is returned.  GH#23829
            result = [other[n] / self[n] for n in range(len(self))]
            return np.array(result)

        else:
            raise TypeError("Cannot divide {dtype} data by {cls}"
                            .format(dtype=other.dtype,
                                    cls=type(self).__name__))
Example #30
def astype(self, dtype, copy=True):
    dtype = pandas_dtype(dtype)
    if is_float_dtype(dtype):
        values = self._values.astype(dtype, copy=copy)
    elif is_integer_dtype(dtype):
        if self.hasnans:
            raise ValueError('cannot convert float NaN to integer')
        values = self._values.astype(dtype, copy=copy)
    elif is_object_dtype(dtype):
        values = self._values.astype('object', copy=copy)
    elif is_categorical_dtype(dtype):
        return CategoricalIndex(self, name=self.name, dtype=dtype,
                                copy=copy)
    else:
        raise TypeError('Setting {cls} dtype to anything other than '
                        'float64, object, or category is not supported'
                        .format(cls=self.__class__))
    return Index(values, name=self.name, dtype=dtype)
Example #31
    def __floordiv__(self, other):

        if is_scalar(other):
            if isinstance(other, self._recognized_scalars):
                other = Timedelta(other)
                # mypy assumes that __new__ returns an instance of the class
                # github.com/python/mypy/issues/1020
                if cast("Timedelta | NaTType", other) is NaT:
                    # treat this specifically as timedelta-NaT
                    result = np.empty(self.shape, dtype=np.float64)
                    result.fill(np.nan)
                    return result

                # dispatch to Timedelta implementation
                return other.__rfloordiv__(self._ndarray)

            # at this point we should only have numeric scalars; anything
            #  else will raise
            result = self._ndarray // other
            freq = None
            if self.freq is not None:
                # Note: freq gets division, not floor-division
                freq = self.freq / other
                if freq.nanos == 0 and self.freq.nanos != 0:
                    # e.g. if self.freq is Nano(1) then dividing by 2
                    #  rounds down to zero
                    freq = None
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # list, tuple
            other = np.array(other)
        if len(other) != len(self):
            raise ValueError("Cannot divide with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            other = type(self)(other)

            # numpy timedelta64 does not natively support floordiv, so operate
            #  on the i8 values
            result = self.asi8 // other.asi8
            mask = self._isnan | other._isnan
            if mask.any():
                result = result.astype(np.float64)
                np.putmask(result, mask, np.nan)
            return result

        elif is_object_dtype(other.dtype):
            # error: Incompatible types in assignment (expression has type
            # "List[Any]", variable has type "ndarray")
            srav = self.ravel()
            orav = other.ravel()
            res_list = [srav[n] // orav[n] for n in range(len(srav))]
            result_flat = np.asarray(res_list)
            inferred = lib.infer_dtype(result_flat, skipna=False)

            result = result_flat.reshape(self.shape)

            if inferred == "timedelta":
                result, _ = sequence_to_td64ns(result)
                return type(self)(result)
            if inferred == "datetime":
                # GH#39750 occurs when result is all-NaT, which in this
                #  case should be interpreted as td64nat. This can only
                #  occur when self is all-td64nat
                return self * np.nan
            return result

        elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
            result = self._ndarray // other
            return type(self)(result)

        else:
            dtype = getattr(other, "dtype", type(other).__name__)
            raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
Example #32
def coerce_to_array(values, dtype, mask=None, copy=False):
    """
    Coerce the input values array to numpy arrays with a mask

    Parameters
    ----------
    values : 1D list-like
    dtype : integer dtype
    mask : boolean 1D array, optional
    copy : boolean, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    # if values is an integer numpy array, preserve its dtype
    if dtype is None and hasattr(values, 'dtype'):
        if is_integer_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if (isinstance(dtype, string_types)
                and (dtype.startswith("Int") or dtype.startswith("UInt"))):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), _IntegerDtype):
            try:
                dtype = _dtypes[str(np.dtype(dtype))]
            except KeyError:
                raise ValueError("invalid dtype specified {}".format(dtype))

    if isinstance(values, IntegerArray):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == 'empty':
            values = np.empty(len(values))
            values.fill(np.nan)
        elif inferred_type not in [
                'floating', 'integer', 'mixed-integer', 'mixed-integer-float'
        ]:
            raise TypeError("{} cannot be converted to an IntegerDtype".format(
                values.dtype))

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError("{} cannot be converted to an IntegerDtype".format(
            values.dtype))

    if mask is None:
        mask = isna(values)
    else:
        assert len(mask) == len(values)

    if not values.ndim == 1:
        raise TypeError("values must be a 1D list-like")
    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype('int64')
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = 1
        values = safe_cast(values, dtype, copy=False)
    else:
        values = safe_cast(values, dtype, copy=False)

    return values, mask
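
The public entry point for this coercion is the nullable 'Int64' dtype: missing values go into the mask while the data array stays integer. A sketch, assuming pandas >= 0.24:

import pandas as pd

arr = pd.array([1, 2, None], dtype='Int64')
print(arr)        # [1, 2, <NA>] (older versions print NaN)
print(arr._mask)  # [False False  True] -- internal mask, shown for illustration
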
Example #33
def ndarray_to_mgr(
    values, index, columns, dtype: Optional[DtypeObj], copy: bool, typ: str
) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    if is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 an EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))

        return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        if not is_integer_dtype(dtype):
            # TODO: skipping integer_dtype is needed to keep the tests passing,
            #  not clear it is correct
            # Note: we really only need _try_cast, but keeping to exposed funcs
            values = sanitize_array(
                flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
            )
        else:
            try:
                values = construct_1d_ndarray_preserving_na(
                    flat, dtype=dtype, copy=False
                )
            except Exception as err:
                # e.g. ValueError when trying to cast object dtype to float64
                msg = f"failed to cast to '{dtype}' (Exception was: {err})"
                raise ValueError(msg) from err
        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(
        values.shape[0], values.shape[1], index=index, columns=columns
    )
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
            dvals_list = [ensure_block_shape(dval, 2) for dval in dvals_list]

            # TODO: What about re-joining object columns?
            dvals_list = [maybe_squeeze_dt64tz(x) for x in dvals_list]
            block_values = [
                new_block(dvals_list[n], placement=n, ndim=2)
                for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            datelike_vals = maybe_squeeze_dt64tz(datelike_vals)
            block_values = [datelike_vals]
    else:
        block_values = [maybe_squeeze_dt64tz(values)]

    return create_block_manager_from_blocks(block_values, [columns, index])
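
The object-dtype inference step near the end is why a DataFrame built from an object ndarray of timestamps comes out datetime64[ns] rather than object:

import numpy as np
import pandas as pd

values = np.array([[pd.Timestamp('2020-01-01')],
                   [pd.Timestamp('2020-01-02')]], dtype=object)
print(pd.DataFrame(values).dtypes)  # 0    datetime64[ns]
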
Example #34
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Optional[Dtype] = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index(
            [f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Optional[Index]
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: Union[bool, float, int]
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: List[List] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        # TODO: overload concat with Literal for axis
        out = cast(DataFrame, out)
        return out

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
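
From user code the dtype parameter controls the indicator columns' dtype, and object is rejected up front exactly as the guard above shows:

import numpy as np
import pandas as pd

s = pd.Series(['a', 'b', 'a'])
pd.get_dummies(s, dtype=np.float64)  # float indicator columns
try:
    pd.get_dummies(s, dtype=object)
except ValueError as err:
    print(err)  # dtype=object is not a valid dtype for get_dummies
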
Example #35
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [
            _make_col_name(prefix, prefix_sep, level) for level in levels
        ]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #36
def _convert_listlike_datetimes(arg, box, format, name=None, tz=None,
                                unit=None, errors=None,
                                infer_datetime_format=None, dayfirst=None,
                                yearfirst=None, exact=None):
    """
    Helper function for to_datetime. Performs the conversion of a 1D listlike
    of dates

    Parameters
    ----------
    arg : list, tuple, ndarray, Series, Index
        dates to be parsed
    box : boolean
        True boxes result as an Index-like, False returns an ndarray
    name : object
        None or string for the Index name
    tz : object
        None or 'utc'
    unit : string
        None or string of the frequency of the passed data
    errors : string
        error handling behaviors from to_datetime, 'raise', 'coerce', 'ignore'
    infer_datetime_format : boolean
        inferring format behavior from to_datetime
    dayfirst : boolean
        dayfirst parsing behavior from to_datetime
    yearfirst : boolean
        yearfirst parsing behavior from to_datetime
    exact : boolean
        exact format matching behavior from to_datetime

    Returns
    -------
    ndarray of parsed dates
        Returns:

        - Index-like if box=True
        - ndarray of Timestamps if box=False
    """
    from pandas import DatetimeIndex
    from pandas.core.arrays import DatetimeArray
    from pandas.core.arrays.datetimes import (
        maybe_convert_dtype, objects_to_datetime64ns)

    if isinstance(arg, (list, tuple)):
        arg = np.array(arg, dtype='O')

    # these are shortcutable
    if is_datetime64tz_dtype(arg):
        if not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            return DatetimeIndex(arg, tz=tz, name=name)
        if tz == 'utc':
            arg = arg.tz_convert(None).tz_localize(tz)
        return arg

    elif is_datetime64_ns_dtype(arg):
        if box and not isinstance(arg, (DatetimeArray, DatetimeIndex)):
            try:
                return DatetimeIndex(arg, tz=tz, name=name)
            except ValueError:
                pass

        return arg

    elif unit is not None:
        if format is not None:
            raise ValueError("cannot specify both format and unit")
        arg = getattr(arg, 'values', arg)
        result, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit,
                                                              errors=errors)
        if box:
            if errors == 'ignore':
                from pandas import Index
                result = Index(result, name=name)
            else:
                result = DatetimeIndex(result, name=name)
            # GH 23758: We may still need to localize the result with tz
            # GH 25546: Apply tz_parsed first (from arg), then tz (from caller)
            # result will be naive but in UTC
            try:
                result = result.tz_localize('UTC').tz_convert(tz_parsed)
            except AttributeError:
                # Regular Index from 'ignore' path
                return result
            if tz is not None:
                if result.tz is None:
                    result = result.tz_localize(tz)
                else:
                    result = result.tz_convert(tz)
        return result
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a string, datetime, list, tuple, '
                        '1-d array, or Series')

    # warn if passing timedelta64, raise for PeriodDtype
    # NB: this must come after unit transformation
    orig_arg = arg
    arg, _ = maybe_convert_dtype(arg, copy=False)

    arg = ensure_object(arg)
    require_iso8601 = False

    if infer_datetime_format and format is None:
        format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

    if format is not None:
        # There is a special fast-path for iso8601 formatted
        # datetime strings, so in those cases don't use the inferred
        # format because this path makes the process slower in this
        # special case
        format_is_iso8601 = _format_is_iso(format)
        if format_is_iso8601:
            require_iso8601 = not infer_datetime_format
            format = None

    tz_parsed = None
    result = None

    if format is not None:
        try:
            # shortcut formatting here
            if format == '%Y%m%d':
                try:
                    # pass orig_arg as float-dtype may have been converted to
                    # datetime64[ns]
                    orig_arg = ensure_object(orig_arg)
                    result = _attempt_YYYYMMDD(orig_arg, errors=errors)
                except (ValueError, TypeError, tslibs.OutOfBoundsDatetime):
                    raise ValueError("cannot convert the input to "
                                     "'%Y%m%d' date format")

            # fallback
            if result is None:
                try:
                    result, timezones = array_strptime(
                        arg, format, exact=exact, errors=errors)
                    if '%Z' in format or '%z' in format:
                        return _return_parsed_timezone_results(
                            result, timezones, box, tz, name)
                except tslibs.OutOfBoundsDatetime:
                    if errors == 'raise':
                        raise
                    elif errors == 'coerce':
                        result = np.empty(arg.shape, dtype='M8[ns]')
                        iresult = result.view('i8')
                        iresult.fill(tslibs.iNaT)
                    else:
                        result = arg
                except ValueError:
                    # if format was inferred, try falling back
                    # to array_to_datetime - terminate here
                    # for specified formats
                    if not infer_datetime_format:
                        if errors == 'raise':
                            raise
                        elif errors == 'coerce':
                            result = np.empty(arg.shape, dtype='M8[ns]')
                            iresult = result.view('i8')
                            iresult.fill(tslibs.iNaT)
                        else:
                            result = arg
        except ValueError as e:
            # Fallback to try to convert datetime objects if timezone-aware
            #  datetime objects are found without passing `utc=True`
            try:
                values, tz = conversion.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if result is None:
        assert format is None or infer_datetime_format
        utc = tz == 'utc'
        result, tz_parsed = objects_to_datetime64ns(
            arg, dayfirst=dayfirst, yearfirst=yearfirst,
            utc=utc, errors=errors, require_iso8601=require_iso8601,
            allow_object=True)

    if tz_parsed is not None:
        if box:
            # We can take a shortcut since the datetime64 numpy array
            # is in UTC
            return DatetimeIndex._simple_new(result, name=name,
                                             tz=tz_parsed)
        else:
            # Convert the datetime64 numpy array to an numpy array
            # of datetime objects
            result = [Timestamp(ts, tz=tz_parsed).to_pydatetime()
                      for ts in result]
            return np.array(result, dtype=object)

    if box:
        # Ensure we return an Index in all cases where box=True
        if is_datetime64_dtype(result):
            return DatetimeIndex(result, tz=tz, name=name)
        elif is_object_dtype(result):
            # e.g. an Index of datetime objects
            from pandas import Index
            return Index(result, name=name)
    return result
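A quick usage sketch of the fast path noted above (assumes a pandas version
that still accepts ``infer_datetime_format``; it is deprecated in newer
releases). ISO-8601 strings take the fast path, so the inferred format is
deliberately discarded for them:

import pandas as pd

idx = pd.to_datetime(["2019-01-01", "2019-01-02"], infer_datetime_format=True)
print(idx)  # DatetimeIndex(['2019-01-01', '2019-01-02'], dtype='datetime64[ns]', ...)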
Example #37
0
def sanitize_array(
    data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False
):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    # extract ndarray or ExtensionArray, ensure we have no PandasArray
    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype):
            # possibility of nan -> garbage
            try:
                subarr = _try_cast(data, dtype, copy, True)
            except ValueError:
                if copy:
                    subarr = data.copy()
                else:
                    subarr = np.array(data, copy=False)
        else:
            # we will try to copy by definition here
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    elif isinstance(data, ABCExtensionArray):
        # it is already ensured above this is not a PandasArray
        subarr = data

        if dtype is not None:
            subarr = subarr.astype(dtype, copy=copy)
        elif copy:
            subarr = subarr.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            subarr = _try_cast(data, dtype, copy, raise_cast_failure)
        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype="int64")
        subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, "ndim", 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype
                )

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception("Data must be 1-dimensional")
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)):
        # This is to prevent mixed-type Series getting all casted to
        # NumPy string type, e.g. NaN --> '-1#IND'.
        if issubclass(subarr.dtype.type, str):
            # GH#16605
            # If not empty convert the data to dtype
            # GH#19853: If data is a scalar, subarr has already the result
            if not lib.is_scalar(data):
                if not np.all(isna(data)):
                    data = np.array(data, dtype=dtype, copy=False)
                subarr = np.array(data, dtype=object, copy=copy)

        if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype):
            inferred = lib.infer_dtype(subarr, skipna=False)
            if inferred == "period":
                from pandas.core.arrays import period_array

                try:
                    subarr = period_array(subarr)
                except IncompatibleFrequency:
                    pass

    return subarr
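Two of the branches above, seen through the public Series constructor (a
sketch using plain pandas, not the internal call itself):

import pandas as pd

print(pd.Series(range(3)).dtype)        # int64: the range branch builds an int64 arange
print(pd.Series(7, index=list("abc")))  # scalar broadcast to len(index) via
                                        # construct_1d_arraylike_from_scalar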
Example #38
0
    def _is_numeric(self) -> bool:
        return not is_object_dtype(self.subtype)
Example #39
0
    def _cast_types(self, values, cast_type, column):
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray
        cast_type : string or np.dtype
           dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray
        """
        if is_categorical_dtype(cast_type):
            known_cats = (isinstance(cast_type, CategoricalDtype)
                          and cast_type.categories is not None)

            if not is_object_dtype(values) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings

                values = astype_nansafe(values, np.dtype(str))

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats,
                cats.get_indexer(values),
                cast_type,
                true_values=self.true_values)

        # use the EA's implementation of casting
        elif is_extension_array_dtype(cast_type):
            # ensure cast_type is an actual dtype and not a string
            cast_type = pandas_dtype(cast_type)
            array_type = cast_type.construct_array_type()
            try:
                if is_bool_dtype(cast_type):
                    return array_type._from_sequence_of_strings(
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(
                        values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        else:
            try:
                values = astype_nansafe(values,
                                        cast_type,
                                        copy=True,
                                        skipna=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values
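A public-API sketch of the categorical branch above; the CSV content and
column name are invented for illustration:

import io
import pandas as pd

csv = io.StringIO("col\nb\na\nb\n")
df = pd.read_csv(csv, dtype={"col": "category"})
print(df["col"].dtype)  # category: the parsed strings become the categories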
Example #40
0
def _try_cast(
    arr: list | np.ndarray,
    dtype: DtypeObj | None,
    copy: bool,
    raise_cast_failure: bool,
) -> ArrayLike:
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray or list
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    is_ndarray = isinstance(arr, np.ndarray)

    if dtype is None:
        # perf shortcut as this is the most common case
        if is_ndarray:
            arr = cast(np.ndarray, arr)
            if arr.dtype != object:
                return sanitize_to_nanoseconds(arr, copy=copy)

            out = maybe_infer_to_datetimelike(arr)
            if out is arr and copy:
                out = out.copy()
            return out

        else:
            # i.e. list
            varr = np.array(arr, copy=False)
            # filter out cases that we _dont_ want to go through
            #  maybe_infer_to_datetimelike
            if varr.dtype != object or varr.size == 0:
                return varr
            return maybe_infer_to_datetimelike(varr)

    elif isinstance(dtype, ExtensionDtype):
        # create an extension array from its dtype
        if isinstance(dtype, DatetimeTZDtype):
            # We can't go through _from_sequence because it handles dt64naive
            #  data differently; _from_sequence treats naive as wall times,
            #  while maybe_cast_to_datetime treats it as UTC
            #  see test_maybe_promote_any_numpy_dtype_with_datetimetz

            return maybe_cast_to_datetime(arr, dtype)
            # TODO: copy?

        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    elif is_object_dtype(dtype):
        if not is_ndarray:
            subarr = construct_1d_object_array_from_listlike(arr)
            return subarr
        return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy)

    elif dtype.kind == "U":
        # TODO: test cases with arr.dtype.kind in ["m", "M"]
        return lib.ensure_string_array(arr, convert_na_value=False, copy=copy)

    elif dtype.kind in ["m", "M"]:
        return maybe_cast_to_datetime(arr, dtype)

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats

            subarr = maybe_cast_to_integer_array(arr, dtype)
        else:
            # 4 tests fail if we move this to a try/except/else; see
            #  test_constructor_compound_dtypes, test_constructor_cast_failure
            #  test_constructor_dict_cast2, test_loc_setitem_dtype
            subarr = np.array(arr, dtype=dtype, copy=copy)

    except (ValueError, TypeError):
        if raise_cast_failure:
            raise
        else:
            # we only get here with raise_cast_failure False, which means
            #  called via the DataFrame constructor
            # GH#24435
            warnings.warn(
                f"Could not cast to {dtype}, falling back to object. This "
                "behavior is deprecated. In a future version, when a dtype is "
                "passed to 'DataFrame', either all columns will be cast to that "
                "dtype, or a TypeError will be raised.",
                FutureWarning,
                stacklevel=7,
            )
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
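A minimal sketch of the integer-cast guard, calling the private helper used
above directly (its import path and exact signature vary across pandas
versions, so treat this as illustrative only):

import numpy as np
from pandas.core.dtypes.cast import maybe_cast_to_integer_array

maybe_cast_to_integer_array(np.array([1.0, 2.0]), np.dtype("int64"))  # lossless: passes
try:
    maybe_cast_to_integer_array(np.array([1.5]), np.dtype("int64"))   # lossy: raises
except ValueError as err:
    print("rejected:", err)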
Example #41
0
    def wrapper(self, other):

        res_name = get_op_result_name(self, other)

        # TODO: shouldn't we be applying finalize whenever
        #  not isinstance(other, ABCSeries)?
        finalizer = (
            lambda x: x.__finalize__(self)
            if isinstance(other, (np.ndarray, ABCIndexClass))
            else x
        )

        if isinstance(other, ABCDataFrame):  # pragma: no cover
            # Defer to DataFrame implementation; fail early
            return NotImplemented

        if isinstance(other, ABCSeries) and not self._indexed_same(other):
            raise ValueError("Can only compare identically-labeled Series objects")

        other = lib.item_from_zerodim(other)
        if isinstance(other, list):
            # TODO: same for tuples?
            other = np.asarray(other)

        if isinstance(other, (np.ndarray, ABCExtensionArray, ABCIndexClass)):
            # TODO: make this treatment consistent across ops and classes.
            #  We are not catching all listlikes here (e.g. frozenset, tuple)
            #  The ambiguous case is object-dtype.  See GH#27803
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")

        lvalues = extract_array(self, extract_numpy=True)
        rvalues = extract_array(other, extract_numpy=True)

        if should_extension_dispatch(lvalues, rvalues):
            res_values = dispatch_to_extension_op(op, lvalues, rvalues)

        elif is_scalar(rvalues) and isna(rvalues):
            # numpy does not like comparisons vs None
            if op is operator.ne:
                res_values = np.ones(len(lvalues), dtype=bool)
            else:
                res_values = np.zeros(len(lvalues), dtype=bool)

        elif is_object_dtype(lvalues.dtype):
            res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

        else:
            op_name = "__{op}__".format(op=op.__name__)
            method = getattr(lvalues, op_name)
            with np.errstate(all="ignore"):
                res_values = method(rvalues)

            if res_values is NotImplemented:
                res_values = invalid_comparison(lvalues, rvalues, op)
            if is_scalar(res_values):
                raise TypeError(
                    "Could not compare {typ} type with Series".format(typ=type(rvalues))
                )

        result = self._constructor(res_values, index=self.index)
        result = finalizer(result)

        # Set the result's name after finalizer is called because finalizer
        #  would set it back to self.name
        result.name = res_name
        return result
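The scalar-NA branch above is observable from the public API: comparing a
Series against None yields all-False, except for ``!=`` (pandas assumed):

import pandas as pd

s = pd.Series([1.0, 2.0, 3.0])
print((s == None).tolist())  # noqa: E711  [False, False, False]
print((s != None).tolist())  # noqa: E711  [True, True, True]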
Example #42
0
def coerce_to_array(
    values, dtype=None, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Coerce the input values array to numpy arrays with a mask.

    Parameters
    ----------
    values : 1D list-like
    dtype : float dtype
    mask : bool 1D array, optional
    copy : bool, default False
        if True, copy the input

    Returns
    -------
    tuple of (values, mask)
    """
    # if values is floating numpy array, preserve its dtype
    if dtype is None and hasattr(values, "dtype"):
        if is_float_dtype(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        if isinstance(dtype, str) and dtype.startswith("Float"):
            # Avoid DeprecationWarning from NumPy about np.dtype("Float64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if not issubclass(type(dtype), FloatingDtype):
            try:
                dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
            except KeyError as err:
                raise ValueError(f"invalid dtype specified {dtype}") from err

    if isinstance(values, FloatingArray):
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask

    values = np.array(values, copy=copy)
    if is_object_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "empty":
            pass
        elif inferred_type not in [
            "floating",
            "integer",
            "mixed-integer",
            "integer-na",
            "mixed-integer-float",
        ]:
            raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    elif is_bool_dtype(values) and is_float_dtype(dtype):
        values = np.array(values, dtype=float, copy=copy)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        mask = libmissing.is_numeric_na(values)

    else:
        assert len(mask) == len(values)

    if not mask.ndim == 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = np.dtype("float64")
    else:
        dtype = dtype.type

    # if we are float, let's make sure that we can
    # safely cast

    # we copy as need to coerce here
    # TODO should this be a safe cast?
    if mask.any():
        values = values.copy()
        values[mask] = np.nan
    values = values.astype(dtype, copy=False)  # , casting="safe")

    return values, mask
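A usage sketch (pandas >= 1.2 assumed; ``_mask`` is internal and is shown
only to mirror the (values, mask) tuple this function returns):

import pandas as pd

arr = pd.array([0.1, None, 0.3], dtype="Float64")
print(arr)        # <FloatingArray> [0.1, <NA>, 0.3]
print(arr._mask)  # [False  True False]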
Example #43
0
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False,
                    dtype=None):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=dtype),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=dtype)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=dtype)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
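A public-API sketch of the same logic (modern pandas assumed; the dummy
columns hold uint8 here, bool in newer releases):

import pandas as pd

s = pd.Series(["a", "b", None, "a"])
print(pd.get_dummies(s, prefix="col", dummy_na=True))
# col_a / col_b / col_nan columns; the None row gets col_nan = 1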
Example #44
0
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, "dtype"):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        m, p = precision_from_unit(unit)
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != TD64NS_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(TD64NS_DTYPE)
            copy = False

    else:
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(
            f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

    data = np.array(data, copy=copy)

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq
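A standalone sketch of the float branch above, with the unit fixed to
seconds so that m (nanoseconds per unit) and p (decimal precision) are
concrete (numpy only):

import numpy as np

data = np.array([1.5, 2.25])
m, p = 1_000_000_000, 9       # ns per second, digits kept in the fraction
base = data.astype(np.int64)  # whole units
frac = np.round(data - base, p)
td = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
print(td)  # [1500000000 2250000000] nanoseconds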
Example #45
0
def init_ndarray(values, index, columns, dtype=None, copy=False):
    # input must be a ndarray, list, Series, index

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = [values.name]
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    # we could have a categorical type passed or coerced to 'category'
    # recast this to an arrays_to_mgr
    if is_categorical_dtype(getattr(values, "dtype",
                                    None)) or is_categorical_dtype(dtype):

        if not hasattr(values, "dtype"):
            values = _prep_ndarray(values, copy=copy)
            values = values.ravel()
        elif copy:
            values = values.copy()

        index, columns = _get_axes(len(values), 1, index, columns)
        return arrays_to_mgr([values], columns, index, columns, dtype=dtype)
    elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype):
        # GH#19157

        if isinstance(values, np.ndarray) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            values = [values[:, n] for n in range(values.shape[1])]
        else:
            values = [values]

        if columns is None:
            columns = list(range(len(values)))
        return arrays_to_mgr(values, columns, index, columns, dtype=dtype)

    # by definition an array here
    # the dtypes will be coerced to a single dtype
    values = _prep_ndarray(values, copy=copy)

    if dtype is not None:
        if not is_dtype_equal(values.dtype, dtype):
            try:
                values = values.astype(dtype)
            except Exception as orig:
                # e.g. ValueError when trying to cast object dtype to float64
                raise ValueError(
                    f"failed to cast to '{dtype}' (Exception was: {orig})"
                ) from orig

    index, columns = _get_axes(*values.shape, index=index, columns=columns)
    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values):

        if values.ndim == 2 and values.shape[0] != 1:
            # transpose and separate blocks

            dvals_list = [maybe_infer_to_datetimelike(row) for row in values]
            for n in range(len(dvals_list)):
                if isinstance(dvals_list[n], np.ndarray):
                    dvals_list[n] = dvals_list[n].reshape(1, -1)

            from pandas.core.internals.blocks import make_block

            # TODO: What about re-joining object columns?
            block_values = [
                make_block(dvals_list[n], placement=[n])
                for n in range(len(dvals_list))
            ]

        else:
            datelike_vals = maybe_infer_to_datetimelike(values)
            block_values = [datelike_vals]
    else:
        block_values = [values]

    return create_block_manager_from_blocks(block_values, [columns, index])
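A public-API sketch of the object-to-datetimelike inference at the end of
this function (numpy and pandas assumed):

import numpy as np
import pandas as pd

vals = np.array([[pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]],
                dtype=object).T
print(pd.DataFrame(vals).dtypes)  # datetime64[ns], inferred from the object column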
Example #46
0
    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        # TODO: de-duplicate with StringArray method. This method is more or
        # less a copy-and-paste of that one.

        from pandas.arrays import (
            BooleanArray,
            IntegerArray,
        )

        if dtype is None:
            dtype = self.dtype
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
                # "object"
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
            return type(self)(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
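A usage sketch of the integer-result path via the public string dtype (the
Arrow-backed variant above behaves the same way; pandas with string dtype
support assumed):

import pandas as pd

s = pd.Series(["a", None, "abc"], dtype="string")
print(s.str.len())  # Int64 values 1, <NA>, 3, built through the IntegerArray constructor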
Example #47
0
def _try_cast(
    arr,
    dtype: Optional[DtypeObj],
    copy: bool,
    raise_cast_failure: bool,
):
    """
    Convert input to numpy ndarray and optionally cast to a given dtype.

    Parameters
    ----------
    arr : ndarray, scalar, list, tuple, iterator (catchall)
        Excludes: ExtensionArray, Series, Index.
    dtype : np.dtype, ExtensionDtype or None
    copy : bool
        If False, don't copy the data if not needed.
    raise_cast_failure : bool
        If True, and if a dtype is specified, raise errors during casting.
        Otherwise an object array is returned.
    """
    # perf shortcut as this is the most common case
    if isinstance(arr, np.ndarray):
        if maybe_castable(arr) and not copy and dtype is None:
            return arr

    if isinstance(dtype, ExtensionDtype) and dtype.kind != "M":
        # create an extension array from its dtype
        # DatetimeTZ case needs to go through maybe_cast_to_datetime
        array_type = dtype.construct_array_type()._from_sequence
        subarr = array_type(arr, dtype=dtype, copy=copy)
        return subarr

    try:
        # GH#15832: Check if we are requesting a numeric dtype and
        # that we can convert the data to the requested dtype.
        if is_integer_dtype(dtype):
            # this will raise if we have e.g. floats
            maybe_cast_to_integer_array(arr, dtype)
            subarr = arr
        else:
            subarr = maybe_cast_to_datetime(arr, dtype)

        # Take care in creating object arrays (but iterators are not
        # supported):
        if is_object_dtype(dtype) and (
                is_list_like(subarr) and
                not (is_iterator(subarr) or isinstance(subarr, np.ndarray))):
            subarr = construct_1d_object_array_from_listlike(subarr)
        elif not is_extension_array_dtype(subarr):
            subarr = construct_1d_ndarray_preserving_na(subarr,
                                                        dtype,
                                                        copy=copy)
    except OutOfBoundsDatetime:
        # in case of out of bound datetime64 -> always raise
        raise
    except (ValueError, TypeError):
        if dtype is not None and raise_cast_failure:
            raise
        else:
            subarr = np.array(arr, dtype=object, copy=copy)
    return subarr
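A sketch of why construct_1d_object_array_from_listlike exists: np.array
broadcasts nested lists into a 2D array, while an elementwise fill of a
pre-allocated object array keeps one list per slot (numpy only):

import numpy as np

data = [[1, 2], [3, 4]]
print(np.array(data).shape)  # (2, 2): not what a 1D object Series wants

arr = np.empty(len(data), dtype=object)
arr[:] = data
print(arr.shape)             # (2,): each element is a list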
Example #48
0
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: Scalar = lib.no_default,
    ) -> np.ndarray:
        """
        Convert to a NumPy Array.

        By default converts to an object-dtype NumPy array. Specify the `dtype` and
        `na_value` keywords to customize the conversion.

        Parameters
        ----------
        dtype : dtype, default object
            The numpy dtype to convert to.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            the array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary. This is typically
            only possible when no missing values are present and `dtype`
            is the equivalent numpy dtype.
        na_value : scalar, optional
             Scalar missing value indicator to use in numpy array. Defaults
             to the native missing value indicator of this array (pd.NA).

        Returns
        -------
        numpy.ndarray

        Examples
        --------
        An object-dtype is the default result

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a.to_numpy()
        array([True, False, <NA>], dtype=object)

        When no missing values are present, an equivalent dtype can be used.

        >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
        array([ True, False])
        >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
        array([1, 2])

        However, requesting such dtype will raise a ValueError if
        missing values are present and the default missing value :attr:`NA`
        is used.

        >>> a = pd.array([True, False, pd.NA], dtype="boolean")
        >>> a
        <BooleanArray>
        [True, False, <NA>]
        Length: 3, dtype: boolean

        >>> a.to_numpy(dtype="bool")
        Traceback (most recent call last):
        ...
        ValueError: cannot convert to bool numpy array in presence of missing values

        Specify a valid `na_value` instead

        >>> a.to_numpy(dtype="bool", na_value=False)
        array([ True, False, False])
        """
        if na_value is lib.no_default:
            na_value = libmissing.NA
        if dtype is None:
            dtype = object
        if self._hasna:
            if (not is_object_dtype(dtype) and not is_string_dtype(dtype)
                    and na_value is libmissing.NA):
                raise ValueError(
                    f"cannot convert to '{dtype}'-dtype NumPy array "
                    "with missing values. Specify an appropriate 'na_value' "
                    "for this dtype.")
            # don't pass copy to astype -> always need a copy since we are mutating
            data = self._data.astype(dtype)
            data[self._mask] = na_value
        else:
            data = self._data.astype(dtype, copy=copy)
        return data
Example #49
0
    def _cmp_method(self, other, op):
        # ensure pandas array for list-like and eliminate non-interval scalars
        if is_list_like(other):
            if len(self) != len(other):
                raise ValueError("Lengths must match to compare")
            other = array(other)
        elif not isinstance(other, Interval):
            # non-interval scalar -> no matches
            return invalid_comparison(self, other, op)

        # determine the dtype of the elements we want to compare
        if isinstance(other, Interval):
            other_dtype = pandas_dtype("interval")
        elif not is_categorical_dtype(other.dtype):
            other_dtype = other.dtype
        else:
            # for categorical defer to categories for dtype
            other_dtype = other.categories.dtype

            # extract intervals if we have interval categories with matching closed
            if is_interval_dtype(other_dtype):
                if self.closed != other.categories.closed:
                    return invalid_comparison(self, other, op)

                other = other.categories.take(
                    other.codes, allow_fill=True, fill_value=other.categories._na_value
                )

        # interval-like -> need same closed and matching endpoints
        if is_interval_dtype(other_dtype):
            if self.closed != other.closed:
                return invalid_comparison(self, other, op)
            elif not isinstance(other, Interval):
                other = type(self)(other)

            if op is operator.eq:
                return (self._left == other.left) & (self._right == other.right)
            elif op is operator.ne:
                return (self._left != other.left) | (self._right != other.right)
            elif op is operator.gt:
                return (self._left > other.left) | (
                    (self._left == other.left) & (self._right > other.right)
                )
            elif op is operator.ge:
                return (self == other) | (self > other)
            elif op is operator.lt:
                return (self._left < other.left) | (
                    (self._left == other.left) & (self._right < other.right)
                )
            else:
                # operator.le
                return (self == other) | (self < other)

        # non-interval/non-object dtype -> no matches
        if not is_object_dtype(other_dtype):
            return invalid_comparison(self, other, op)

        # object dtype -> iteratively check for intervals
        result = np.zeros(len(self), dtype=bool)
        for i, obj in enumerate(other):
            try:
                result[i] = op(self[i], obj)
            except TypeError:
                if obj is NA:
                    # comparison with np.nan returns NA
                    # github.com/pandas-dev/pandas/pull/37124#discussion_r509095092
                    result[i] = op is operator.ne
                else:
                    raise
        return result
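A usage sketch of the interval comparison (pandas assumed; both sides use
the default closed='right'):

import pandas as pd

arr = pd.arrays.IntervalArray.from_breaks([0, 1, 2])  # (0, 1], (1, 2]
print(arr == pd.Interval(0, 1))  # [ True False]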
Example #50
0
def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike:
    """
    Evaluate a comparison operation `==`, `!=`, `>=`, `>`, `<=`, or `<`.

    Note: the caller is responsible for ensuring that numpy warnings are
    suppressed (with np.errstate(all="ignore")) if needed.

    Parameters
    ----------
    left : np.ndarray or ExtensionArray
    right : object
        Cannot be a DataFrame, Series, or Index.
    op : {operator.eq, operator.ne, operator.gt, operator.ge, operator.lt, operator.le}

    Returns
    -------
    ndarray or ExtensionArray
    """
    # NB: We assume extract_array has already been called on left and right
    lvalues = ensure_wrapped_if_datetimelike(left)
    rvalues = ensure_wrapped_if_datetimelike(right)

    rvalues = lib.item_from_zerodim(rvalues)
    if isinstance(rvalues, list):
        # We don't catch tuple here bc we may be comparing e.g. MultiIndex
        #  to a tuple that represents a single entry, see test_compare_tuple_strs
        rvalues = np.asarray(rvalues)

    if isinstance(rvalues, (np.ndarray, ABCExtensionArray)):
        # TODO: make this treatment consistent across ops and classes.
        #  We are not catching all listlikes here (e.g. frozenset, tuple)
        #  The ambiguous case is object-dtype.  See GH#27803
        if len(lvalues) != len(rvalues):
            raise ValueError(
                "Lengths must match to compare", lvalues.shape, rvalues.shape
            )

    if should_extension_dispatch(lvalues, rvalues) or (
        (isinstance(rvalues, (Timedelta, BaseOffset, Timestamp)) or right is NaT)
        and not is_object_dtype(lvalues.dtype)
    ):
        # Call the method on lvalues
        res_values = op(lvalues, rvalues)

    elif is_scalar(rvalues) and isna(rvalues):  # TODO: but not pd.NA?
        # numpy does not like comparisons vs None
        if op is operator.ne:
            res_values = np.ones(lvalues.shape, dtype=bool)
        else:
            res_values = np.zeros(lvalues.shape, dtype=bool)

    elif is_numeric_v_string_like(lvalues, rvalues):
        # GH#36377 going through the numexpr path would incorrectly raise
        return invalid_comparison(lvalues, rvalues, op)

    elif is_object_dtype(lvalues.dtype) or isinstance(rvalues, str):
        res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues)

    else:
        res_values = _na_arithmetic_op(lvalues, rvalues, op, is_cmp=True)

    return res_values
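The numeric-vs-string short circuit above (GH#36377) is visible from the
public API: equality returns all-False instead of raising (pandas assumed):

import pandas as pd

s = pd.Series([1, 2, 3])
print((s == "a").tolist())  # [False, False, False]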
Example #51
0
def sanitize_array(data,
                   index,
                   dtype=None,
                   copy=False,
                   raise_cast_failure=False):
    """
    Sanitize input data to an ndarray, copy if specified, coerce to the
    dtype if specified.
    """
    if dtype is not None:
        dtype = pandas_dtype(dtype)

    if isinstance(data, ma.MaskedArray):
        mask = ma.getmaskarray(data)
        if mask.any():
            data, fill_value = maybe_upcast(data, copy=True)
            data.soften_mask()  # set hardmask False if it was True
            data[mask] = fill_value
        else:
            data = data.copy()

    data = extract_array(data, extract_numpy=True)

    # GH#846
    if isinstance(data, np.ndarray):

        if dtype is not None:
            subarr = np.array(data, copy=False)

            # possibility of nan -> garbage
            if is_float_dtype(data.dtype) and is_integer_dtype(dtype):
                try:
                    subarr = _try_cast(data, True, dtype, copy, True)
                except ValueError:
                    if copy:
                        subarr = data.copy()
            else:
                subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)
        elif isinstance(data, Index):
            # don't coerce Index types
            # e.g. indexes can have different conversions (so don't fast path
            # them)
            # GH#6140
            subarr = sanitize_index(data, index, copy=copy)
        else:

            # we will try to copy by definition here
            subarr = _try_cast(data, True, dtype, copy, raise_cast_failure)

    elif isinstance(data, ExtensionArray):
        if isinstance(data, ABCPandasArray):
            # We don't want to let people put our PandasArray wrapper
            # (the output of Series/Index.array), into a Series. So
            # we explicitly unwrap it here.
            subarr = data.to_numpy()
        else:
            subarr = data

        # everything else in this block must also handle ndarray's,
        # because we've unwrapped PandasArray into an ndarray.

        if dtype is not None:
            subarr = data.astype(dtype)

        if copy:
            subarr = data.copy()
        return subarr

    elif isinstance(data, (list, tuple)) and len(data) > 0:
        if dtype is not None:
            try:
                subarr = _try_cast(data, False, dtype, copy,
                                   raise_cast_failure)
            except Exception:
                if raise_cast_failure:  # pragma: no cover
                    raise
                subarr = np.array(data, dtype=object, copy=copy)
                subarr = lib.maybe_convert_objects(subarr)

        else:
            subarr = maybe_convert_platform(data)

        subarr = maybe_cast_to_datetime(subarr, dtype)

    elif isinstance(data, range):
        # GH#16804
        arr = np.arange(data.start, data.stop, data.step, dtype='int64')
        subarr = _try_cast(arr, False, dtype, copy, raise_cast_failure)
    else:
        subarr = _try_cast(data, False, dtype, copy, raise_cast_failure)

    # scalar like, GH
    if getattr(subarr, 'ndim', 0) == 0:
        if isinstance(data, list):  # pragma: no cover
            subarr = np.array(data, dtype=object)
        elif index is not None:
            value = data

            # figure out the dtype from the value (upcast if necessary)
            if dtype is None:
                dtype, value = infer_dtype_from_scalar(value)
            else:
                # need to possibly convert the value here
                value = maybe_cast_to_datetime(value, dtype)

            subarr = construct_1d_arraylike_from_scalar(
                value, len(index), dtype)

        else:
            return subarr.item()

    # the result that we want
    elif subarr.ndim == 1:
        if index is not None:

            # a 1-element ndarray
            if len(subarr) != len(index) and len(subarr) == 1:
                subarr = construct_1d_arraylike_from_scalar(
                    subarr[0], len(index), subarr.dtype)

    elif subarr.ndim > 1:
        if isinstance(data, np.ndarray):
            raise Exception('Data must be 1-dimensional')
        else:
            subarr = com.asarray_tuplesafe(data, dtype=dtype)

    # This is to prevent mixed-type Series getting all casted to
    # NumPy string type, e.g. NaN --> '-1#IND'.
    if issubclass(subarr.dtype.type, str):
        # GH#16605
        # If not empty convert the data to dtype
        # GH#19853: If data is a scalar, subarr has already the result
        if not lib.is_scalar(data):
            if not np.all(isna(data)):
                data = np.array(data, dtype=dtype, copy=False)
            subarr = np.array(data, dtype=object, copy=copy)

    if is_object_dtype(subarr.dtype) and dtype != 'object':
        inferred = lib.infer_dtype(subarr, skipna=False)
        if inferred == 'period':
            try:
                subarr = period_array(subarr)
            except IncompatibleFrequency:
                pass

    return subarr
Example #52
0
    def astype(self, dtype, copy=True):
        if is_object_dtype(dtype):
            return self._box_values(self.asi8)
        return super(DatetimeLikeArrayMixin, self).astype(dtype, copy)
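A usage sketch of this boxing path (pandas assumed): casting a datetimelike
index to object yields Timestamp objects rather than raw i8 values.

import pandas as pd

dti = pd.date_range("2020-01-01", periods=2)
print(dti.astype(object))
# Index([2020-01-01 00:00:00, 2020-01-02 00:00:00], dtype='object')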
Example #53
0
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool,
                   typ: str) -> Manager:
    # used in DataFrame.__init__
    # input must be a ndarray, list, Series, Index, ExtensionArray

    if isinstance(values, ABCSeries):
        if columns is None:
            if values.name is not None:
                columns = Index([values.name])
        if index is None:
            index = values.index
        else:
            values = values.reindex(index)

        # zero len case (GH #2234)
        if not len(values) and columns is not None and len(columns):
            values = np.empty((0, 1), dtype=object)

    vdtype = getattr(values, "dtype", None)
    if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype):
        # GH#19157

        if isinstance(values,
                      (np.ndarray, ExtensionArray)) and values.ndim > 1:
            # GH#12513 a EA dtype passed with a 2D array, split into
            #  multiple EAs that view the values
            # error: No overload variant of "__getitem__" of "ExtensionArray"
            # matches argument type "Tuple[slice, int]"
            values = [
                values[:, n]  # type: ignore[call-overload]
                for n in range(values.shape[1])
            ]
        else:
            values = [values]

        if columns is None:
            columns = Index(range(len(values)))
        else:
            columns = ensure_index(columns)

        return arrays_to_mgr(values, columns, index, dtype=dtype, typ=typ)

    elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype):
        # i.e. Datetime64TZ
        values = extract_array(values, extract_numpy=True)
        if copy:
            values = values.copy()
        if values.ndim == 1:
            values = values.reshape(-1, 1)

    else:
        # by definition an array here
        # the dtypes will be coerced to a single dtype
        values = _prep_ndarray(values, copy=copy)

    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
        shape = values.shape
        flat = values.ravel()

        # GH#40110 see similar check inside sanitize_array
        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

        values = sanitize_array(flat,
                                None,
                                dtype=dtype,
                                copy=copy,
                                raise_cast_failure=rcf)

        values = values.reshape(shape)

    # _prep_ndarray ensures that values.ndim == 2 at this point
    index, columns = _get_axes(values.shape[0],
                               values.shape[1],
                               index=index,
                               columns=columns)

    _check_values_indices_shape_match(values, index, columns)

    if typ == "array":

        if issubclass(values.dtype.type, str):
            values = np.array(values, dtype=object)

        if dtype is None and is_object_dtype(values.dtype):
            arrays = [
                ensure_wrapped_if_datetimelike(
                    maybe_infer_to_datetimelike(values[:, i]))
                for i in range(values.shape[1])
            ]
        else:
            if is_datetime_or_timedelta_dtype(values.dtype):
                values = ensure_wrapped_if_datetimelike(values)
            arrays = [values[:, i] for i in range(values.shape[1])]

        return ArrayManager(arrays, [index, columns], verify_integrity=False)

    values = values.T

    # if we don't have a dtype specified, then try to convert objects
    # on the entire block; this is to convert if we have datetimelike's
    # embedded in an object type
    if dtype is None and is_object_dtype(values.dtype):
        obj_columns = list(values)
        maybe_datetime = [maybe_infer_to_datetimelike(x) for x in obj_columns]
        # don't convert (and copy) the objects if no type inference occurs
        if any(x is not y for x, y in zip(obj_columns, maybe_datetime)):
            dvals_list = [
                ensure_block_shape(dval, 2) for dval in maybe_datetime
            ]
            block_values = [
                new_block_2d(dvals_list[n], placement=BlockPlacement(n))
                for n in range(len(dvals_list))
            ]
        else:
            bp = BlockPlacement(slice(len(columns)))
            nb = new_block_2d(values, placement=bp)
            block_values = [nb]
    else:
        bp = BlockPlacement(slice(len(columns)))
        nb = new_block_2d(values, placement=bp)
        block_values = [nb]

    if len(columns) == 0:
        block_values = []

    return create_block_manager_from_blocks(block_values, [columns, index],
                                            verify_integrity=False)
Example #54
0
def sequence_to_td64ns(data,
                       copy: bool = False,
                       unit=None,
                       errors="raise") -> tuple[np.ndarray, Tick | None]:
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, optional
        The timedelta unit to treat integers as multiples of. For numeric
        data this defaults to ``'ns'``.
        Must be un-specified if the data contains a str and ``errors=="raise"``.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    if unit is not None:
        unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, "dtype"):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCMultiIndex):
        raise TypeError("Cannot create a DatetimeArray from a MultiIndex.")
    else:
        data = extract_array(data, extract_numpy=True)

    if isinstance(data, IntegerArray):
        data = data.to_numpy("int64", na_value=iNaT)
    elif not isinstance(data, (np.ndarray, ExtensionArray)):
        # GH#24539 e.g. xarray, dask object
        data = np.asarray(data)
    elif isinstance(data, ABCCategorical):
        data = data.categories.take(data.codes, fill_value=NaT)._values
        copy = False

    if isinstance(data, TimedeltaArray):
        inferred_freq = data.freq

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data.dtype) or is_string_dtype(data.dtype):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data.dtype):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data.dtype):
        # cast the unit, multiply base/frac separately
        # to avoid precision issues from float -> int
        mask = np.isnan(data)
        # The next few lines are effectively a vectorized 'cast_from_unit'
        m, p = precision_from_unit(unit or "ns")
        base = data.astype(np.int64)
        frac = data - base
        if p:
            frac = np.round(frac, p)
        data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data.dtype):
        if data.dtype != TD64NS_DTYPE:
            # non-nano unit
            data = ensure_timedelta64ns(data)
            copy = False

    else:
        # This includes datetime64-dtype, see GH#23539, GH#29794
        raise TypeError(
            f"dtype {data.dtype} cannot be converted to timedelta64[ns]")

    data = np.array(data, copy=copy)

    assert data.dtype == "m8[ns]", data
    return data, inferred_freq
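A public-API sketch of the string path (objects_to_td64ns) above:

import pandas as pd

print(pd.to_timedelta(["1 days", "00:00:01"]))
# TimedeltaIndex(['1 days', '0 days 00:00:01'], dtype='timedelta64[ns]', freq=None)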
Example #55
0
    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
        if not isinstance(dtype, IntervalDtype):
            return False
        common_subtype = find_common_type([self.dtype.subtype, dtype.subtype])
        return not is_object_dtype(common_subtype)
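A sketch of the common-subtype test this method relies on (find_common_type
is assumed importable from pandas.core.dtypes.cast, as in the versions
shown here):

import numpy as np
from pandas.core.dtypes.cast import find_common_type

print(find_common_type([np.dtype("int64"), np.dtype("float64")]))  # float64: comparable
print(find_common_type([np.dtype("int64"), np.dtype("object")]))   # object: not comparable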
Example #56
0
def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"):
    """
    Parameters
    ----------
    data : list-like
    copy : bool, default False
    unit : str, default "ns"
        The timedelta unit to treat integers as multiples of.
    errors : {"raise", "coerce", "ignore"}, default "raise"
        How to handle elements that cannot be converted to timedelta64[ns].
        See ``pandas.to_timedelta`` for details.

    Returns
    -------
    converted : numpy.ndarray
        The sequence converted to a numpy array with dtype ``timedelta64[ns]``.
    inferred_freq : Tick or None
        The inferred frequency of the sequence.

    Raises
    ------
    ValueError : Data cannot be converted to timedelta64[ns].

    Notes
    -----
    Unlike `pandas.to_timedelta`, setting ``errors=ignore`` will not cause
    errors to be ignored; they are caught and subsequently ignored at a
    higher level.
    """
    inferred_freq = None
    unit = parse_timedelta_unit(unit)

    # Unwrap whatever we have into a np.ndarray
    if not hasattr(data, 'dtype'):
        # e.g. list, tuple
        if np.ndim(data) == 0:
            # i.e. generator
            data = list(data)
        data = np.array(data, copy=False)
    elif isinstance(data, ABCSeries):
        data = data._values
    elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArrayMixin)):
        inferred_freq = data.freq
        data = data._data

    # Convert whatever we have into timedelta64[ns] dtype
    if is_object_dtype(data) or is_string_dtype(data):
        # no need to make a copy, need to convert if string-dtyped
        data = objects_to_td64ns(data, unit=unit, errors=errors)
        copy = False

    elif is_integer_dtype(data):
        # treat as multiples of the given unit
        data, copy_made = ints_to_td64ns(data, unit=unit)
        copy = copy and not copy_made

    elif is_float_dtype(data):
        # treat as multiples of the given unit.  If after converting to nanos,
        #  there are fractional components left, these are truncated
        #  (i.e. NOT rounded)
        mask = np.isnan(data)
        coeff = np.timedelta64(1, unit) / np.timedelta64(1, 'ns')
        data = (coeff * data).astype(np.int64).view('timedelta64[ns]')
        data[mask] = iNaT
        copy = False

    elif is_timedelta64_dtype(data):
        if data.dtype != _TD_DTYPE:
            # non-nano unit
            # TODO: watch out for overflows
            data = data.astype(_TD_DTYPE)
            copy = False

    elif is_datetime64_dtype(data):
        # GH#23539
        warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is "
                      "deprecated, will raise a TypeError in a future "
                      "version",
                      FutureWarning, stacklevel=4)
        data = ensure_int64(data).view(_TD_DTYPE)

    else:
        raise TypeError("dtype {dtype} cannot be converted to timedelta64[ns]"
                        .format(dtype=data.dtype))

    data = np.array(data, copy=copy)
    assert data.dtype == 'm8[ns]', data
    return data, inferred_freq
Example #57
0
    def __floordiv__(self, other):
        if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)):
            return NotImplemented

        other = lib.item_from_zerodim(other)
        if is_scalar(other):
            if isinstance(other, (timedelta, np.timedelta64, Tick)):
                other = Timedelta(other)
                if other is NaT:
                    # treat this specifically as timedelta-NaT
                    result = np.empty(self.shape, dtype=np.float64)
                    result.fill(np.nan)
                    return result

                # dispatch to Timedelta implementation
                result = other.__rfloordiv__(self._data)
                return result

            # at this point we should only have numeric scalars; anything
            #  else will raise
            result = self.asi8 // other
            result[self._isnan] = iNaT
            freq = None
            if self.freq is not None:
                # Note: freq gets division, not floor-division
                freq = self.freq / other
            return type(self)(result.view('m8[ns]'), freq=freq)

        if not hasattr(other, "dtype"):
            # list, tuple
            other = np.array(other)
        if len(other) != len(self):
            raise ValueError("Cannot divide with unequal lengths")

        elif is_timedelta64_dtype(other):
            other = type(self)(other)

            # numpy timedelta64 does not natively support floordiv, so operate
            #  on the i8 values
            result = self.asi8 // other.asi8
            mask = self._isnan | other._isnan
            if mask.any():
                # upcast to float64 so the masked positions can hold NaN;
                #  assigning NaN into an int64 array would raise
                result = result.astype(np.float64)
                result[mask] = np.nan
            return result

        elif is_object_dtype(other):
            result = [self[n] // other[n] for n in range(len(self))]
            result = np.array(result)
            if lib.infer_dtype(result) == 'timedelta':
                result, _ = sequence_to_td64ns(result)
                return type(self)(result)
            return result

        elif is_integer_dtype(other) or is_float_dtype(other):
            result = self._data // other
            return type(self)(result)

        else:
            dtype = getattr(other, "dtype", type(other).__name__)
            raise TypeError("Cannot divide {typ} by {cls}"
                            .format(typ=dtype, cls=type(self).__name__))
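
A sketch of the scalar and array paths above using the public API; the exact Index subclass of the integer result varies by pandas version.

import pandas as pd

tdi = pd.timedelta_range("1 day", periods=3, freq="D")

# numeric scalar: floor-divide the i8 values; freq gets true division (D / 2 -> 12H)
print(tdi // 2)                                       # ['12:00:00', '1 days', '1 days 12:00:00']

# timedelta-like array: floordiv on the i8 values yields plain integers
print(tdi // pd.to_timedelta(["12h", "12h", "12h"]))  # [2, 4, 6]
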
Example #58
    def __floordiv__(self, other):

        if is_scalar(other):
            if isinstance(other, self._recognized_scalars):
                other = Timedelta(other)
                if other is NaT:
                    # treat this specifically as timedelta-NaT
                    result = np.empty(self.shape, dtype=np.float64)
                    result.fill(np.nan)
                    return result

                # dispatch to Timedelta implementation
                result = other.__rfloordiv__(self._data)
                return result

            # at this point we should only have numeric scalars; anything
            #  else will raise
            result = self.asi8 // other
            np.putmask(result, self._isnan, iNaT)
            freq = None
            if self.freq is not None:
                # Note: freq gets division, not floor-division
                freq = self.freq / other
                if freq.nanos == 0 and self.freq.nanos != 0:
                    # e.g. if self.freq is Nano(1) then dividing by 2
                    #  rounds down to zero
                    freq = None
            return type(self)(result.view("m8[ns]"), freq=freq)

        if not hasattr(other, "dtype"):
            # list, tuple
            other = np.array(other)
        if len(other) != len(self):
            raise ValueError("Cannot divide with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            other = type(self)(other)

            # numpy timedelta64 does not natively support floordiv, so operate
            #  on the i8 values
            result = self.asi8 // other.asi8
            mask = self._isnan | other._isnan
            if mask.any():
                result = result.astype(np.float64)
                np.putmask(result, mask, np.nan)
            return result

        elif is_object_dtype(other.dtype):
            result = [self[n] // other[n] for n in range(len(self))]
            result = np.array(result)
            if lib.infer_dtype(result, skipna=False) == "timedelta":
                result, _ = sequence_to_td64ns(result)
                return type(self)(result)
            return result

        elif is_integer_dtype(other.dtype) or is_float_dtype(other.dtype):
            result = self._data // other
            return type(self)(result)

        else:
            dtype = getattr(other, "dtype", type(other).__name__)
            raise TypeError(f"Cannot divide {dtype} by {type(self).__name__}")
Example #59
 def test_astype_object_series(self, all_data):
     # Unlike the base class, we do not expect the resulting Block
     #  to be ObjectBlock
     ser = pd.Series(all_data, name="A")
     result = ser.astype(object)
     assert is_object_dtype(result._data.blocks[0].dtype)
Example #60
    def __truediv__(self, other):
        # timedelta / X is well-defined for timedelta-like or numeric X

        if isinstance(other, self._recognized_scalars):
            other = Timedelta(other)
            # mypy assumes that __new__ returns an instance of the class
            # github.com/python/mypy/issues/1020
            if cast("Timedelta | NaTType", other) is NaT:
                # specifically timedelta64-NaT
                result = np.empty(self.shape, dtype=np.float64)
                result.fill(np.nan)
                return result

            # otherwise, dispatch to Timedelta implementation
            return self._ndarray / other

        elif lib.is_scalar(other):
            # assume it is numeric
            result = self._ndarray / other
            freq = None
            if self.freq is not None:
                # Tick division is not implemented, so operate on Timedelta
                freq = self.freq.delta / other
            return type(self)(result, freq=freq)

        if not hasattr(other, "dtype"):
            # e.g. list, tuple
            other = np.array(other)

        if len(other) != len(self):
            raise ValueError("Cannot divide vectors with unequal lengths")

        elif is_timedelta64_dtype(other.dtype):
            # let numpy handle it
            return self._ndarray / other

        elif is_object_dtype(other.dtype):
            # We operate on raveled arrays to avoid problems in inference
            #  on NaT
            srav = self.ravel()
            orav = other.ravel()
            result_list = [srav[n] / orav[n] for n in range(len(srav))]
            result = np.array(result_list).reshape(self.shape)

            # We need to do dtype inference in order to keep DataFrame ops
            #  behavior consistent with Series behavior
            inferred = lib.infer_dtype(result, skipna=False)
            if inferred == "timedelta":
                flat = result.ravel()
                result = type(self)._from_sequence(flat).reshape(result.shape)
            elif inferred == "floating":
                result = result.astype(float)
            elif inferred == "datetime":
                # GH#39750 this occurs when result is all-NaT, in which case
                #  we want to interpret these NaTs as td64.
                #  We construct an all-td64NaT result.
                # error: Incompatible types in assignment (expression has type
                # "TimedeltaArray", variable has type "ndarray[Any,
                # dtype[floating[_64Bit]]]")
                result = self * np.nan  # type: ignore[assignment]

            return result

        else:
            result = self._ndarray / other
            return type(self)(result)
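
Illustrating the three main paths above with the public API: timedelta / timedelta yields floats, timedelta / numeric stays timedelta, and an object-dtype divisor is handled elementwise with the result dtype inferred afterwards.

import numpy as np
import pandas as pd

tdi = pd.timedelta_range("1 day", periods=2)

print(tdi / pd.Timedelta("12h"))              # [2.0, 4.0] -- float result
print(tdi / 2)                                # ['12:00:00', '1 days'] -- still timedelta
print(tdi / np.array([2, 4], dtype=object))   # elementwise, inferred back to timedelta64[ns]
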