def is_bool_indexer(key): # type: (Any) -> bool """ Check whether `key` is a valid boolean indexer. Parameters ---------- key : Any Only list-likes may be considered boolean indexers. All other types are not considered a boolean indexer. For array-like input, boolean ndarrays or ExtensionArrays with ``_is_boolean`` set are considered boolean indexers. Returns ------- bool Raises ------ ValueError When the array is an object-dtype ndarray or ExtensionArray and contains missing values. """ na_msg = 'cannot index with vector containing NA / NaN values' if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (is_array_like(key) and is_extension_array_dtype(key.dtype))): if key.dtype == np.object_: key = np.asarray(values_from_object(key)) if not lib.is_bool_array(key): if isna(key).any(): raise ValueError(na_msg) return False return True elif is_bool_dtype(key.dtype): # an ndarray with bool-dtype by definition has no missing values. # So we only need to check for NAs in ExtensionArrays if is_extension_array_dtype(key.dtype): if np.any(key.isna()): raise ValueError(na_msg) return True elif isinstance(key, list): try: arr = np.asarray(key) return arr.dtype == np.bool_ and len(arr) == len(key) except TypeError: # pragma: no cover return False return False
def __sub__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(-other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): result = self._sub_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(-other) elif isinstance(other, Period): result = self._sub_period(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(-other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.sub) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] result = self._sub_datetime_arraylike(other) elif is_period_dtype(other): # PeriodIndex result = self._sub_period_array(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.sub) elif isinstance(other, ABCIndexClass): raise TypeError("cannot subtract {cls} and {typ}" .format(cls=type(self).__name__, typ=type(other).__name__)) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot subtract {dtype}-dtype from {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def to_numpy(self): """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index. Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) """ if (is_extension_array_dtype(self.dtype) or is_datetime64tz_dtype(self.dtype)): # TODO(DatetimeArray): remove the second clause. return np.asarray(self._values) return self._values
def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datetimelike_scalar(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these maybe_integer_op_deprecated(self) result = self._time_shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datetime_arraylike(other) elif is_integer_dtype(other): maybe_integer_op_deprecated(self) result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot add {dtype}-dtype to {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_period_dtype(other): # if self is a TimedeltaArray and other is a PeriodArray with # a timedelta-like (i.e. Tick) freq, this operation is valid. # Defer to the PeriodArray implementation. # In remaining cases, this will end up raising TypeError. return NotImplemented elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented if is_timedelta64_dtype(result) and isinstance(result, np.ndarray): from pandas.core.arrays import TimedeltaArrayMixin # TODO: infer freq? return TimedeltaArrayMixin(result) return result
def test_isna_extension_array(self, data_missing): # If your `isna` returns an ExtensionArray, you must also implement # _reduce. At the *very* least, you must implement any and all na = data_missing.isna() if is_extension_array_dtype(na): assert na._reduce('any') assert na.any() assert not na._reduce('all') assert not na.all() assert na.dtype._is_boolean
def astype(self, dtype, copy=True): dtype = pandas_dtype(dtype) if needs_i8_conversion(dtype): msg = ('Cannot convert Float64Index to dtype {dtype}; integer ' 'values are required for conversion').format(dtype=dtype) raise TypeError(msg) elif (is_integer_dtype(dtype) and not is_extension_array_dtype(dtype)) and self.hasnans: # TODO(jreback); this can change once we have an EA Index type # GH 13149 raise ValueError('Cannot convert NA to integer') return super().astype(dtype, copy=copy)
def _ndarray_values(self): """The data as an ndarray, possibly losing information. The expectation is that this is cheap to compute, and is primarily used for interacting with our indexers. - categorical -> codes """ # type: () -> np.ndarray if is_extension_array_dtype(self): return self.values._ndarray_values return self.values
def _ndarray_values(self) -> np.ndarray: """ The data as an ndarray, possibly losing information. The expectation is that this is cheap to compute, and is primarily used for interacting with our indexers. - categorical -> codes """ if is_extension_array_dtype(self): return self.array._ndarray_values # As a mixin, we depend on the mixing class having values. # Special mixin syntax may be developed in the future: # https://github.com/python/typing/issues/246 return self.values # type: ignore
def __iter__(self): """ Return an iterator of the values. These are each a scalar type, which is a Python scalar (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) """ # We are explicity making element iterators. if is_datetimelike(self._values): return map(com.maybe_box_datetimelike, self._values) elif is_extension_array_dtype(self._values): return iter(self._values) else: return map(self._values.item, range(self._values.size))
def tolist(self): """ Return a list of the values. These are each a scalar type, which is a Python scalar (for str, int, float) or a pandas scalar (for Timestamp/Timedelta/Interval/Period) See Also -------- numpy.ndarray.tolist """ if is_datetimelike(self._values): return [com._maybe_box_datetimelike(x) for x in self._values] elif is_extension_array_dtype(self._values): return list(self._values) else: return self._values.tolist()
def __add__(self, other): other = lib.item_from_zerodim(other) if isinstance(other, (ABCSeries, ABCDataFrame)): return NotImplemented # scalar others elif other is NaT: result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_delta(other) elif isinstance(other, DateOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): result = self._add_datelike(other) elif lib.is_integer(other): # This check must come after the check for np.timedelta64 # as is_integer returns True for these result = self.shift(other) # array-like others elif is_timedelta64_dtype(other): # TimedeltaIndex, ndarray[timedelta64] result = self._add_delta(other) elif is_offsetlike(other): # Array/Index of DateOffset objects result = self._addsub_offset_array(other, operator.add) elif is_datetime64_dtype(other) or is_datetime64tz_dtype(other): # DatetimeIndex, ndarray[datetime64] return self._add_datelike(other) elif is_integer_dtype(other): result = self._addsub_int_array(other, operator.add) elif is_float_dtype(other) or is_period_dtype(other): # Explicitly catch invalid dtypes raise TypeError("cannot add {dtype}-dtype to {cls}" .format(dtype=other.dtype, cls=type(self).__name__)) elif is_extension_array_dtype(other): # Categorical op will raise; defer explicitly return NotImplemented else: # pragma: no cover return NotImplemented return result
def _check_op(self, s, op, other, op_name, exc=NotImplementedError): if exc is None: if s.dtype.is_unsigned_integer and (op_name == '__rsub__'): # TODO see https://github.com/pandas-dev/pandas/issues/22023 pytest.skip("unsigned subtraction gives negative values") if (hasattr(other, 'dtype') and not is_extension_array_dtype(other.dtype) and pd.api.types.is_integer_dtype(other.dtype)): # other is np.int64 and would therefore always result in # upcasting, so keeping other as same numpy_dtype other = other.astype(s.dtype.numpy_dtype) result = op(s, other) expected = s.combine(other, op) if op_name == '__rdiv__': # combine is not giving the correct result for this case pytest.skip("skipping reverse div in python 2") elif op_name in ('__rtruediv__', '__truediv__', '__div__'): expected = expected.astype(float) if op_name == '__rtruediv__': # TODO reverse operators result in object dtype result = result.astype(float) elif op_name.startswith('__r'): # TODO reverse operators result in object dtype # see https://github.com/pandas-dev/pandas/issues/22024 expected = expected.astype(s.dtype) result = result.astype(s.dtype) else: # combine method result in 'biggest' (int64) dtype expected = expected.astype(s.dtype) pass if (op_name == '__rpow__') and isinstance(other, pd.Series): # TODO pow on Int arrays gives different result with NA # see https://github.com/pandas-dev/pandas/issues/22022 result = result.fillna(1) self.assert_series_equal(result, expected) else: with pytest.raises(exc): op(s, other)
def unstack(obj, level, fill_value=None): if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, # and isn't needed for a single level return _unstack_multiple(obj, level, fill_value=fill_value) else: level = level[0] if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) else: if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker(obj.values, obj.index, level=level, fill_value=fill_value, constructor=obj._constructor_expanddim) return unstacker.get_result()
def get_dtype_kinds(l): """ Parameters ---------- l : list of arrays Returns ------- a set of kinds that exist in this list of arrays """ typs = set() for arr in l: dtype = arr.dtype if is_categorical_dtype(dtype): typ = 'category' elif is_sparse(arr): typ = 'sparse' elif isinstance(arr, ABCRangeIndex): typ = 'range' elif is_datetime64tz_dtype(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) elif is_datetime64_dtype(dtype): typ = 'datetime' elif is_timedelta64_dtype(dtype): typ = 'timedelta' elif is_object_dtype(dtype): typ = 'object' elif is_bool_dtype(dtype): typ = 'bool' elif is_extension_array_dtype(dtype): typ = str(arr.dtype) else: typ = dtype.kind typs.add(typ) return typs
def _try_cast(arr, take_fast_path, dtype, copy, raise_cast_failure): # perf shortcut as this is the most common case if take_fast_path: if maybe_castable(arr) and not copy and dtype is None: return arr try: # GH#15832: Check if we are requesting a numeric dype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): subarr = maybe_cast_to_integer_array(arr, dtype) subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): if is_object_dtype(dtype) and (is_list_like(subarr) and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_type(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. subarr = Categorical(arr, dtype.categories, ordered=dtype.ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: raise else: subarr = np.array(arr, dtype=object, copy=copy) return subarr
def test_is_extension_array_dtype(self, data): assert is_extension_array_dtype(data) assert is_extension_array_dtype(data.dtype) assert is_extension_array_dtype(pd.Series(data)) assert isinstance(data.dtype, ExtensionDtype)
def test_is_not_extension_array_dtype(self, values): assert not is_extension_array_dtype(values)
def array(self): # type: () -> ExtensionArray """ The ExtensionArray of the data backing this Series or Index. .. versionadded:: 0.24.0 Returns ------- ExtensionArray An ExtensionArray of the values stored within. For extension types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. ``.array`` differs ``.values`` which may require converting the data to a different form. See Also -------- Index.to_numpy : Similar method that always returns a NumPy array. Series.to_numpy : Similar method that always returns a NumPy array. Notes ----- This table lays out the different array types for each extension dtype within pandas. ================== ============================= dtype array type ================== ============================= category Categorical period PeriodArray interval IntervalArray IntegerNA IntegerArray datetime64[ns, tz] DatetimeArray ================== ============================= For any 3rd-party extension types, the array type will be an ExtensionArray. For all remaining dtypes ``.array`` will be a :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray stored within. If you absolutely need a NumPy array (possibly with copying / coercing data), then use :meth:`Series.to_numpy` instead. Examples -------- For regular NumPy types like int, and float, a PandasArray is returned. >>> pd.Series([1, 2, 3]).array <PandasArray> [1, 2, 3] Length: 3, dtype: int64 For extension types, like Categorical, the actual ExtensionArray is returned >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array [a, b, a] Categories (2, object): [a, b] """ result = self._values if is_datetime64_ns_dtype(result.dtype): from pandas.arrays import DatetimeArray result = DatetimeArray(result) elif is_timedelta64_ns_dtype(result.dtype): from pandas.arrays import TimedeltaArray result = TimedeltaArray(result) elif not is_extension_array_dtype(result.dtype): from pandas.core.arrays.numpy_ import PandasArray result = PandasArray(result) return result
def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if is_uniform_reindex(join_units): # XXX: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' elif is_datetime64tz_dtype(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' elif issubclass(dtype.type, np.object_): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' elif is_timedelta64_dtype(dtype): upcast_cls = 'timedelta' elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = 'object' elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = 'float' # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif 'category' in upcast_classes: return np.dtype(np.object_), np.nan elif 'datetimetz' in upcast_classes: dtype = upcast_classes['datetimetz'] return dtype[0], tslibs.iNaT elif 'datetime' in upcast_classes: return np.dtype('M8[ns]'), tslibs.iNaT elif 'timedelta' in upcast_classes: return np.dtype('m8[ns]'), tslibs.iNaT else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def array(self) -> ExtensionArray: """ The ExtensionArray of the data backing this Series or Index. .. versionadded:: 0.24.0 Returns ------- ExtensionArray An ExtensionArray of the values stored within. For extension types, this is the actual array. For NumPy native types, this is a thin (no copy) wrapper around :class:`numpy.ndarray`. ``.array`` differs ``.values`` which may require converting the data to a different form. See Also -------- Index.to_numpy : Similar method that always returns a NumPy array. Series.to_numpy : Similar method that always returns a NumPy array. Notes ----- This table lays out the different array types for each extension dtype within pandas. ================== ============================= dtype array type ================== ============================= category Categorical period PeriodArray interval IntervalArray IntegerNA IntegerArray datetime64[ns, tz] DatetimeArray ================== ============================= For any 3rd-party extension types, the array type will be an ExtensionArray. For all remaining dtypes ``.array`` will be a :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray stored within. If you absolutely need a NumPy array (possibly with copying / coercing data), then use :meth:`Series.to_numpy` instead. Examples -------- For regular NumPy types like int, and float, a PandasArray is returned. >>> pd.Series([1, 2, 3]).array <PandasArray> [1, 2, 3] Length: 3, dtype: int64 For extension types, like Categorical, the actual ExtensionArray is returned >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array [a, b, a] Categories (2, object): [a, b] """ # As a mixin, we depend on the mixing class having _values. # Special mixin syntax may be developed in the future: # https://github.com/python/typing/issues/246 result = self._values # type: ignore if is_datetime64_ns_dtype(result.dtype): from pandas.arrays import DatetimeArray result = DatetimeArray(result) elif is_timedelta64_ns_dtype(result.dtype): from pandas.arrays import TimedeltaArray result = TimedeltaArray(result) elif not is_extension_array_dtype(result.dtype): from pandas.core.arrays.numpy_ import PandasArray result = PandasArray(result) return result
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: """ Check if `indexer` is a valid array indexer for `array`. For a boolean mask, `array` and `indexer` are checked to have the same length. The dtype is validated, and if it is an integer or boolean ExtensionArray, it is checked if there are missing values present, and it is converted to the appropriate numpy array. Other dtypes will raise an error. Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed through as is. .. versionadded:: 1.0.0 Parameters ---------- array : array-like The array that is being indexed (only used for the length). indexer : array-like or list-like The array-like that's used to index. List-like input that is not yet a numpy array or an ExtensionArray is converted to one. Other input types are passed through as is. Returns ------- numpy.ndarray The validated indexer as a numpy array that can be used to index. Raises ------ IndexError When the lengths don't match. ValueError When `indexer` cannot be converted to a numpy ndarray to index (e.g. presence of missing values). See Also -------- api.types.is_bool_dtype : Check if `key` is of boolean dtype. Examples -------- When checking a boolean mask, a boolean ndarray is returned when the arguments are all valid. >>> mask = pd.array([True, False]) >>> arr = pd.array([1, 2]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) An IndexError is raised when the lengths don't match. >>> mask = pd.array([True, False, True]) >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... IndexError: Boolean index has wrong length: 3 instead of 2. NA values in a boolean array are treated as False. >>> mask = pd.array([True, pd.NA]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) A numpy boolean mask will get passed through (if the length is correct): >>> mask = np.array([True, False]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) Similarly for integer indexers, an integer ndarray is returned when it is a valid indexer, otherwise an error is (for integer indexers, a matching length is not required): >>> indexer = pd.array([0, 2], dtype="Int64") >>> arr = pd.array([1, 2, 3]) >>> pd.api.indexers.check_array_indexer(arr, indexer) array([0, 2]) >>> indexer = pd.array([0, pd.NA], dtype="Int64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... ValueError: Cannot index with an integer indexer containing NA values For non-integer/boolean dtypes, an appropriate error is raised: >>> indexer = np.array([0., 2.], dtype="float64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... IndexError: arrays used as indices must be of integer or boolean type """ from pandas.core.construction import array as pd_array # whathever is not an array-like is returned as-is (possible valid array # indexers that are not array-like: integer, slice, Ellipsis, None) # In this context, tuples are not considered as array-like, as they have # a specific meaning in indexing (multi-dimensional indexing) if is_list_like(indexer): if isinstance(indexer, tuple): return indexer else: return indexer # convert list-likes to array if not is_array_like(indexer): indexer = pd_array(indexer) if len(indexer) == 0: # empty list is converted to float array by pd.array indexer = np.array([], dtype=np.intp) dtype = indexer.dtype if is_bool_dtype(dtype): if is_extension_array_dtype(dtype): indexer = indexer.to_numpy(dtype=bool, na_value=False) else: indexer = np.asarray(indexer, dtype=bool) # GH26658 if len(indexer) != len(array): raise IndexError(f"Boolean index has wrong length: " f"{len(indexer)} instead of {len(array)}") elif is_integer_dtype(dtype): try: indexer = np.asarray(indexer, dtype=np.intp) except ValueError as err: raise ValueError( "Cannot index with an integer indexer containing NA values" ) from err else: raise IndexError( "arrays used as indices must be of integer or boolean type") return indexer
def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a non-datetimelike and provide a combined dtype for the resulting array that preserves the overall dtype if possible) Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation Returns ------- a single array, preserving the combined dtypes """ # filter empty arrays # 1-d dtypes always are included here def is_nonempty(x): try: return x.shape[axis] > 0 except Exception: return True nonempty = [x for x in to_concat if is_nonempty(x)] # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. # # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith('datetime') for typ in typs) _contains_period = any(typ.startswith('period') for typ in typs) if 'category' in typs: # this must be priort to _concat_datetime, # to support Categorical + datetime-like return _concat_categorical(to_concat, axis=axis) elif _contains_datetime or 'timedelta' in typs or _contains_period: return _concat_datetime(to_concat, axis=axis, typs=typs) # these are mandated to handle empties as well elif 'sparse' in typs: return _concat_sparse(to_concat, axis=axis, typs=typs) extensions = [is_extension_array_dtype(x) for x in to_concat] if any(extensions) and axis == 1: to_concat = [np.atleast_2d(x.astype('object')) for x in to_concat] if not nonempty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) typs = get_dtype_kinds(to_concat) if len(typs) != 1: if (not len(typs - {'i', 'u', 'f'}) or not len(typs - {'bool', 'i', 'u'})): # let numpy coerce pass else: # coerce to object to_concat = [x.astype('object') for x in to_concat] return np.concatenate(to_concat, axis=axis)
def array( data: Sequence[object], dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, copy: bool = True, ) -> ABCExtensionArray: """ Create an array. .. versionadded:: 0.24.0 Parameters ---------- data : Sequence of objects The scalars inside `data` should be instances of the scalar type for `dtype`. It's expected that `data` represents a 1-dimensional array of data. When `data` is an Index or Series, the underlying array will be extracted from `data`. dtype : str, np.dtype, or ExtensionDtype, optional The dtype to use for the array. This may be a NumPy dtype or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. If not specified, there are two possibilities: 1. When `data` is a :class:`Series`, :class:`Index`, or :class:`ExtensionArray`, the `dtype` will be taken from the data. 2. Otherwise, pandas will attempt to infer the `dtype` from the data. Note that when `data` is a NumPy array, ``data.dtype`` is *not* used for inferring the array type. This is because NumPy cannot represent all the types of data that can be held in extension arrays. Currently, pandas will infer an extension dtype for sequences of ============================== ===================================== Scalar Type Array Type ============================== ===================================== :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require copying data, even if ``copy=False``. Returns ------- ExtensionArray The newly created array. Raises ------ ValueError When `data` is not 1-dimensional. See Also -------- numpy.array : Construct a NumPy array. Series : Construct a pandas Series. Index : Construct a pandas Index. arrays.PandasArray : ExtensionArray wrapping a NumPy array. Series.array : Extract the array stored within a Series. Notes ----- Omitting the `dtype` argument means pandas will attempt to infer the best array type from the values in the data. As new array types are added by pandas and 3rd party libraries, the "best" array type may change. We recommend specifying `dtype` to ensure that 1. the correct array type for the data is returned 2. the returned array type doesn't change as new extension types are added by pandas and third-party libraries Additionally, if the underlying memory representation of the returned array matters, we recommend specifying the `dtype` as a concrete object rather than a string alias or allowing it to be inferred. For example, a future version of pandas or a 3rd-party library may include a dedicated ExtensionArray for string data. In this event, the following would no longer return a :class:`arrays.PandasArray` backed by a NumPy array. >>> pd.array(['a', 'b'], dtype=str) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 This would instead return the new ExtensionArray dedicated for string data. If you really need the new array to be backed by a NumPy array, specify that in the dtype. >>> pd.array(['a', 'b'], dtype=np.dtype("<U1")) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 Or use the dedicated constructor for the array you're expecting, and wrap that in a PandasArray >>> pd.array(np.array(['a', 'b'], dtype='<U1')) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` * :class:`arrays.TimedeltaArray` When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` rather than a ``PandasArray``. This is for symmetry with the case of timezone-aware data, which NumPy does not natively support. >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') <DatetimeArray> ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') <TimedeltaArray> ['01:00:00', '02:00:00'] Length: 2, dtype: timedelta64[ns] Examples -------- If a dtype is not specified, `data` is passed through to :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. >>> pd.array([1, 2]) <PandasArray> [1, 2] Length: 2, dtype: int64 Or the NumPy dtype can be specified >>> pd.array([1, 2], dtype=np.dtype("int32")) <PandasArray> [1, 2] Length: 2, dtype: int32 You can use the string alias for `dtype` >>> pd.array(['a', 'b', 'a'], dtype='category') [a, b, a] Categories (2, object): [a, b] Or specify the actual dtype >>> pd.array(['a', 'b', 'a'], ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) [a, b, a] Categories (3, object): [a < b < c] Because omitting the `dtype` passes the data through to NumPy, a mixture of valid integers and NA will return a floating-point NumPy array. >>> pd.array([1, 2, np.nan]) <PandasArray> [1.0, 2.0, nan] Length: 3, dtype: float64 To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify the dtype: >>> pd.array([1, 2, np.nan], dtype='Int64') <IntegerArray> [1, 2, NaN] Length: 3, dtype: Int64 Pandas will infer an ExtensionArray for some types of data: >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) <PeriodArray> ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. >>> pd.array(1) Traceback (most recent call last): ... ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( period_array, ExtensionArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, ) from pandas.core.internals.arrays import extract_array if lib.is_scalar(data): msg = ("Cannot pass scalar '{}' to 'pandas.array'.") raise ValueError(msg.format(data)) data = extract_array(data, extract_numpy=True) if dtype is None and isinstance(data, ExtensionArray): dtype = data.dtype # this returns None for not-found dtypes. if isinstance(dtype, str): dtype = registry.find(dtype) or dtype if is_extension_array_dtype(dtype): cls = cast(ExtensionDtype, dtype).construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=False) if inferred_dtype == 'period': try: return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: # We may have a mixture of frequencies. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype == 'interval': try: return IntervalArray(data, copy=copy) except ValueError: # We may have a mixture of `closed` here. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype.startswith('datetime'): # datetime, datetime64 try: return DatetimeArray._from_sequence(data, copy=copy) except ValueError: # Mixture of timezones, fall back to PandasArray pass elif inferred_dtype.startswith('timedelta'): # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) # TODO(BooleanArray): handle this type # Pandas overrides NumPy for # 1. datetime64[ns] # 2. timedelta64[ns] # so that a DatetimeArray is returned. if is_datetime64_ns_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) elif is_timedelta64_ns_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) return result
def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = [values.name] if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # we could have a categorical type passed or coerced to 'category' # recast this to an arrays_to_mgr if is_categorical_dtype(getattr(values, "dtype", None)) or is_categorical_dtype(dtype): if not hasattr(values, "dtype"): values = _prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) return arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif is_extension_array_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) return arrays_to_mgr(values, columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None: if not is_dtype_equal(values.dtype, dtype): try: values = values.astype(dtype) except Exception as orig: # e.g. ValueError when trying to cast object dtype to float64 raise ValueError( f"failed to cast to '{dtype}' (Exception was: {orig})" ) from orig # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dvals_list = [maybe_infer_to_datetimelike(row) for row in values] for n in range(len(dvals_list)): if isinstance(dvals_list[n], np.ndarray): dvals_list[n] = dvals_list[n].reshape(1, -1) from pandas.core.internals.blocks import make_block # TODO: What about re-joining object columns? block_values = [ make_block(dvals_list[n], placement=[n]) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) block_values = [datelike_vals] else: block_values = [values] return create_block_manager_from_blocks(block_values, [columns, index])
def _try_cast( arr, dtype: Optional[Union[np.dtype, "ExtensionDtype"]], copy: bool, raise_cast_failure: bool, ): """ Convert input to numpy ndarray and optionally cast to a given dtype. Parameters ---------- arr : ndarray, list, tuple, iterator (catchall) Excludes: ExtensionArray, Series, Index. dtype : np.dtype, ExtensionDtype or None copy : bool If False, don't copy the data if not needed. raise_cast_failure : bool If True, and if a dtype is specified, raise errors during casting. Otherwise an object array is returned. """ # perf shortcut as this is the most common case if isinstance(arr, np.ndarray): if maybe_castable(arr) and not copy and dtype is None: return arr try: # GH#15832: Check if we are requesting a numeric dype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): subarr = maybe_cast_to_integer_array(arr, dtype) subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): if is_object_dtype(dtype) and ( is_list_like(subarr) and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_array_dtype(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. dtype = cast(CategoricalDtype, dtype) subarr = dtype.construct_array_type()(arr, dtype.categories, ordered=dtype.ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype dtype = cast(ExtensionDtype, dtype) array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: raise else: subarr = np.array(arr, dtype=object, copy=copy) return subarr
def sanitize_array(data, index, dtype=None, copy: bool = False, raise_cast_failure: bool = False): """ Sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified. """ if dtype is not None: dtype = pandas_dtype(dtype) if isinstance(data, ma.MaskedArray): mask = ma.getmaskarray(data) if mask.any(): data, fill_value = maybe_upcast(data, copy=True) data.soften_mask() # set hardmask False if it was True data[mask] = fill_value else: data = data.copy() # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = np.array(data, copy=False) else: # we will try to copy be-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif isinstance(data, abc.Set): raise TypeError("Set type is unordered") else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) # scalar like, GH if getattr(subarr, "ndim", 0) == 0: if isinstance(data, list): # pragma: no cover subarr = np.array(data, dtype=object) elif index is not None: value = data # figure out the dtype from the value (upcast if necessary) if dtype is None: dtype, value = infer_dtype_from_scalar(value) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) subarr = construct_1d_arraylike_from_scalar( value, len(index), dtype) else: return subarr.item() # the result that we want elif subarr.ndim == 1: if index is not None: # a 1-element ndarray if len(subarr) != len(index) and len(subarr) == 1: subarr = construct_1d_arraylike_from_scalar( subarr[0], len(index), subarr.dtype) elif subarr.ndim > 1: if isinstance(data, np.ndarray): raise Exception("Data must be 1-dimensional") else: subarr = com.asarray_tuplesafe(data, dtype=dtype) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): # This is to prevent mixed-type Series getting all casted to # NumPy string type, e.g. NaN --> '-1#IND'. if issubclass(subarr.dtype.type, str): # GH#16605 # If not empty convert the data to dtype # GH#19853: If data is a scalar, subarr has already the result if not lib.is_scalar(data): if not np.all(isna(data)): data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def hash_array( vals, encoding: str = "utf8", hash_key: str = _default_hash_key, categorize: bool = True, ): """ Given a 1d array, return an array of deterministic integers. Parameters ---------- vals : ndarray, Categorical encoding : str, default 'utf8' Encoding for data & key when strings. hash_key : str, default _default_hash_key Hash_key for string key to encode. categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, "dtype"): raise TypeError("must pass a ndarray-like") dtype = vals.dtype # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) elif is_extension_array_dtype(dtype): vals, _ = vals._values_for_factorize() dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals)) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif isinstance(dtype, np.bool): vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view("u{}".format(vals.dtype.itemsize)).astype("u8") else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: from pandas import factorize, Categorical, Index codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = hashing.hash_object_array( vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xBF58476D1CE4E5B9) vals ^= vals >> 27 vals *= np.uint64(0x94D049BB133111EB) vals ^= vals >> 31 return vals
def array(data: Sequence[object], dtype: Optional[Union[str, np.dtype, ExtensionDtype]] = None, copy: bool = True, ) -> ABCExtensionArray: """ Create an array. .. versionadded:: 0.24.0 Parameters ---------- data : Sequence of objects The scalars inside `data` should be instances of the scalar type for `dtype`. It's expected that `data` represents a 1-dimensional array of data. When `data` is an Index or Series, the underlying array will be extracted from `data`. dtype : str, np.dtype, or ExtensionDtype, optional The dtype to use for the array. This may be a NumPy dtype or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. If not specified, there are two possibilities: 1. When `data` is a :class:`Series`, :class:`Index`, or :class:`ExtensionArray`, the `dtype` will be taken from the data. 2. Otherwise, pandas will attempt to infer the `dtype` from the data. Note that when `data` is a NumPy array, ``data.dtype`` is *not* used for inferring the array type. This is because NumPy cannot represent all the types of data that can be held in extension arrays. Currently, pandas will infer an extension dtype for sequences of ============================== ===================================== Scalar Type Array Type ============================== ===================================== :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require copying data, even if ``copy=False``. Returns ------- ExtensionArray The newly created array. Raises ------ ValueError When `data` is not 1-dimensional. See Also -------- numpy.array : Construct a NumPy array. Series : Construct a pandas Series. Index : Construct a pandas Index. arrays.PandasArray : ExtensionArray wrapping a NumPy array. Series.array : Extract the array stored within a Series. Notes ----- Omitting the `dtype` argument means pandas will attempt to infer the best array type from the values in the data. As new array types are added by pandas and 3rd party libraries, the "best" array type may change. We recommend specifying `dtype` to ensure that 1. the correct array type for the data is returned 2. the returned array type doesn't change as new extension types are added by pandas and third-party libraries Additionally, if the underlying memory representation of the returned array matters, we recommend specifying the `dtype` as a concrete object rather than a string alias or allowing it to be inferred. For example, a future version of pandas or a 3rd-party library may include a dedicated ExtensionArray for string data. In this event, the following would no longer return a :class:`arrays.PandasArray` backed by a NumPy array. >>> pd.array(['a', 'b'], dtype=str) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 This would instead return the new ExtensionArray dedicated for string data. If you really need the new array to be backed by a NumPy array, specify that in the dtype. >>> pd.array(['a', 'b'], dtype=np.dtype("<U1")) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 Or use the dedicated constructor for the array you're expecting, and wrap that in a PandasArray >>> pd.array(np.array(['a', 'b'], dtype='<U1')) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` * :class:`arrays.TimedeltaArray` When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` rather than a ``PandasArray``. This is for symmetry with the case of timezone-aware data, which NumPy does not natively support. >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') <DatetimeArray> ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') <TimedeltaArray> ['01:00:00', '02:00:00'] Length: 2, dtype: timedelta64[ns] Examples -------- If a dtype is not specified, `data` is passed through to :meth:`numpy.array`, and a :class:`arrays.PandasArray` is returned. >>> pd.array([1, 2]) <PandasArray> [1, 2] Length: 2, dtype: int64 Or the NumPy dtype can be specified >>> pd.array([1, 2], dtype=np.dtype("int32")) <PandasArray> [1, 2] Length: 2, dtype: int32 You can use the string alias for `dtype` >>> pd.array(['a', 'b', 'a'], dtype='category') [a, b, a] Categories (2, object): [a, b] Or specify the actual dtype >>> pd.array(['a', 'b', 'a'], ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) [a, b, a] Categories (3, object): [a < b < c] Because omitting the `dtype` passes the data through to NumPy, a mixture of valid integers and NA will return a floating-point NumPy array. >>> pd.array([1, 2, np.nan]) <PandasArray> [1.0, 2.0, nan] Length: 3, dtype: float64 To use pandas' nullable :class:`pandas.arrays.IntegerArray`, specify the dtype: >>> pd.array([1, 2, np.nan], dtype='Int64') <IntegerArray> [1, 2, NaN] Length: 3, dtype: Int64 Pandas will infer an ExtensionArray for some types of data: >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) <PeriodArray> ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. >>> pd.array(1) Traceback (most recent call last): ... ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( period_array, ExtensionArray, IntervalArray, PandasArray, DatetimeArray, TimedeltaArray, ) from pandas.core.internals.arrays import extract_array if lib.is_scalar(data): msg = ( "Cannot pass scalar '{}' to 'pandas.array'." ) raise ValueError(msg.format(data)) data = extract_array(data, extract_numpy=True) if dtype is None and isinstance(data, ExtensionArray): dtype = data.dtype # this returns None for not-found dtypes. if isinstance(dtype, str): dtype = registry.find(dtype) or dtype if is_extension_array_dtype(dtype): cls = dtype.construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=False) if inferred_dtype == 'period': try: return period_array(data, copy=copy) except tslibs.IncompatibleFrequency: # We may have a mixture of frequencies. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype == 'interval': try: return IntervalArray(data, copy=copy) except ValueError: # We may have a mixture of `closed` here. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype.startswith('datetime'): # datetime, datetime64 try: return DatetimeArray._from_sequence(data, copy=copy) except ValueError: # Mixture of timezones, fall back to PandasArray pass elif inferred_dtype.startswith('timedelta'): # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) # TODO(BooleanArray): handle this type # Pandas overrides NumPy for # 1. datetime64[ns] # 2. timedelta64[ns] # so that a DatetimeArray is returned. if is_datetime64_ns_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) elif is_timedelta64_ns_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) return result
def _convert_to_ndarrays( self, dct: dict, na_values, na_fvalues, verbose: bool = False, converters=None, dtypes=None, ): result = {} for c, values in dct.items(): conv_f = None if converters is None else converters.get(c, None) if isinstance(dtypes, dict): cast_type = dtypes.get(c, None) else: # single dtype or None cast_type = dtypes if self.na_filter: col_na_values, col_na_fvalues = _get_na_values( c, na_values, na_fvalues, self.keep_default_na) else: col_na_values, col_na_fvalues = set(), set() if conv_f is not None: # conv_f applied to data before inference if cast_type is not None: warnings.warn( ("Both a converter and dtype were specified " f"for column {c} - only the converter will be used."), ParserWarning, stacklevel=7, ) try: values = lib.map_infer(values, conv_f) except ValueError: # error: Argument 2 to "isin" has incompatible type "List[Any]"; # expected "Union[Union[ExtensionArray, ndarray], Index, Series]" mask = algorithms.isin( values, list(na_values) # type: ignore[arg-type] ).view(np.uint8) values = lib.map_infer_mask(values, conv_f, mask) cvals, na_count = self._infer_types(values, set(col_na_values) | col_na_fvalues, try_num_bool=False) else: is_ea = is_extension_array_dtype(cast_type) is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type) # skip inference if specified dtype is object # or casting to an EA try_num_bool = not (cast_type and is_str_or_ea_dtype) # general type inference and conversion cvals, na_count = self._infer_types( values, set(col_na_values) | col_na_fvalues, try_num_bool) # type specified in dtype param or cast_type is an EA if cast_type and (not is_dtype_equal(cvals, cast_type) or is_extension_array_dtype(cast_type)): if not is_ea and na_count > 0: try: if is_bool_dtype(cast_type): raise ValueError( f"Bool column has NA values in column {c}") except (AttributeError, TypeError): # invalid input to is_bool_dtype pass cast_type = pandas_dtype(cast_type) cvals = self._cast_types(cvals, cast_type, c) result[c] = cvals if verbose and na_count: print(f"Filled {na_count} NA values in column {c!s}") return result
def _cast_types(self, values, cast_type, column): """ Cast values to specified type Parameters ---------- values : ndarray cast_type : string or np.dtype dtype to cast values to column : string column name - used only for error reporting Returns ------- converted : ndarray """ if is_categorical_dtype(cast_type): known_cats = (isinstance(cast_type, CategoricalDtype) and cast_type.categories is not None) if not is_object_dtype(values) and not known_cats: # TODO: this is for consistency with # c-parser which parses all categories # as strings values = astype_nansafe(values, np.dtype(str)) cats = Index(values).unique().dropna() values = Categorical._from_inferred_categories( cats, cats.get_indexer(values), cast_type, true_values=self.true_values) # use the EA's implementation of casting elif is_extension_array_dtype(cast_type): # ensure cast_type is an actual dtype and not a string cast_type = pandas_dtype(cast_type) array_type = cast_type.construct_array_type() try: if is_bool_dtype(cast_type): return array_type._from_sequence_of_strings( values, dtype=cast_type, true_values=self.true_values, false_values=self.false_values, ) else: return array_type._from_sequence_of_strings( values, dtype=cast_type) except NotImplementedError as err: raise NotImplementedError( f"Extension Array: {array_type} must implement " "_from_sequence_of_strings in order to be used in parser methods" ) from err else: try: values = astype_nansafe(values, cast_type, copy=True, skipna=True) except ValueError as err: raise ValueError( f"Unable to convert column {column} to type {cast_type}" ) from err return values
def ndarray_to_mgr(values, index, columns, dtype: DtypeObj | None, copy: bool, typ: str) -> Manager: # used in DataFrame.__init__ # input must be a ndarray, list, Series, Index, ExtensionArray if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = Index([values.name]) if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) vdtype = getattr(values, "dtype", None) if is_1d_only_ea_dtype(vdtype) or isinstance(dtype, ExtensionDtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: # GH#12513 a EA dtype passed with a 2D array, split into # multiple EAs that view the values values = [values[:, n] for n in range(values.shape[1])] else: values = [values] if columns is None: columns = Index(range(len(values))) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: values = values.copy() if values.ndim == 1: values = values.reshape(-1, 1) else: # by definition an array here # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): shape = values.shape flat = values.ravel() if not is_integer_dtype(dtype): # TODO: skipping integer_dtype is needed to keep the tests passing, # not clear it is correct # Note: we really only need _try_cast, but keeping to exposed funcs values = sanitize_array(flat, None, dtype=dtype, copy=copy, raise_cast_failure=True) else: try: values = construct_1d_ndarray_preserving_na(flat, dtype=dtype, copy=False) except Exception as err: # e.g. ValueError when trying to cast object dtype to float64 msg = f"failed to cast to '{dtype}' (Exception was: {err})" raise ValueError(msg) from err values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes(values.shape[0], values.shape[1], index=index, columns=columns) _check_values_indices_shape_match(values, index, columns) if typ == "array": if issubclass(values.dtype.type, str): values = np.array(values, dtype=object) if dtype is None and is_object_dtype(values.dtype): arrays = [ ensure_wrapped_if_datetimelike( maybe_infer_to_datetimelike(values[:, i].copy())) for i in range(values.shape[1]) ] else: if is_datetime_or_timedelta_dtype(values.dtype): values = ensure_wrapped_if_datetimelike(values) arrays = [values[:, i].copy() for i in range(values.shape[1])] return ArrayManager(arrays, [index, columns], verify_integrity=False) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values.dtype): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks dvals_list = [maybe_infer_to_datetimelike(row) for row in values] dvals_list = [ensure_block_shape(dval, 2) for dval in dvals_list] # TODO: What about re-joining object columns? block_values = [ new_block(dvals_list[n], placement=n, ndim=2) for n in range(len(dvals_list)) ] else: datelike_vals = maybe_infer_to_datetimelike(values) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: block_values = [] return create_block_manager_from_blocks(block_values, [columns, index])
def _cython_operation(self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs) -> ArrayLike: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError( "number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis dtype = values.dtype is_numeric = is_numeric_dtype(dtype) cy_op = WrappedCythonOp(kind=kind, how=how) # can we do this operation with our cython functions # if not raise NotImplementedError cy_op.disallow_invalid_ops(dtype, is_numeric) if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation(kind, values, how, axis, min_count, **kwargs) elif values.ndim == 1: # expand to 2d, dispatch, then squeeze if appropriate values2d = values[None, :] res = self._cython_operation( kind=kind, values=values2d, how=how, axis=1, min_count=min_count, **kwargs, ) if res.shape[0] == 1: return res[0] # otherwise we have OHLC return res.T is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(dtype): values = ensure_int_or_float(values) elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric: if not is_complex_dtype(dtype): values = ensure_float64(values) ngroups = self.ngroups comp_ids, _, _ = self.group_info assert axis == 1 values = values.T out_shape = cy_op.get_output_shape(ngroups, values) func, values = cy_op.get_cython_func_and_vals(values, is_numeric) out_dtype = cy_op.get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if kind == "aggregate": counts = np.zeros(ngroups, dtype=np.int64) func(result, counts, values, comp_ids, min_count) elif kind == "transform": # TODO: min_count func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs) if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all( ): assert result.ndim != 2 result = result[counts > 0] result = result.T if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) op_result = maybe_downcast_to_dtype(result, dtype) else: op_result = result return op_result
def _get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype na """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.float64, np.nan if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype upcasted_na = join_units[0].block.fill_value return empty_dtype, upcasted_na has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): if unit.block is None: has_none_blocks = True else: dtypes[i] = unit.dtype upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): upcast_cls = "object" elif is_datetime64_dtype(dtype): upcast_cls = "datetime" elif is_timedelta64_dtype(dtype): upcast_cls = "timedelta" elif is_sparse(dtype): upcast_cls = dtype.subtype.name elif is_extension_array_dtype(dtype): upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: upcast_cls = "float" # Null blocks should not influence upcast class selection, unless there # are only null blocks, when same upcasting rules must be applied to # null upcast classes. if unit.is_na: null_upcast_classes[upcast_cls].append(dtype) else: upcast_classes[upcast_cls].append(dtype) if not upcast_classes: upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? # create the result if "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan else: return np.dtype(np.bool_), None elif "category" in upcast_classes: return np.dtype(np.object_), np.nan elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0], tslibs.NaT elif "datetime" in upcast_classes: return np.dtype("M8[ns]"), tslibs.iNaT elif "timedelta" in upcast_classes: return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: g = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: if is_float_dtype(g): return g, g.type(np.nan) elif is_numeric_dtype(g): if has_none_blocks: return np.float64, np.nan else: return g, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def melt( frame: "DataFrame", id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, ignore_index: bool = True, ) -> "DataFrame": # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` if isinstance(frame.columns, MultiIndex): cols = [x for c in frame.columns for x in c] else: cols = list(frame.columns) if value_name in frame.columns: warnings.warn( "This dataframe has a column name that matches the 'value_name' column " "name of the resultiing Dataframe. " "In the future this will raise an error, please set the 'value_name' " "parameter of DataFrame.melt to a unique name.", FutureWarning, stacklevel=3, ) if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list): raise ValueError( "id_vars must be a list of tuples when columns are a MultiIndex" ) else: # Check that `id_vars` are in frame id_vars = list(id_vars) missing = Index(com.flatten(id_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'id_vars' are not present " f"in the DataFrame: {list(missing)}") else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list): raise ValueError( "value_vars must be a list of tuples when columns are a MultiIndex" ) else: value_vars = list(value_vars) # Check that `value_vars` are in frame missing = Index(com.flatten(value_vars)).difference(cols) if not missing.empty: raise KeyError("The following 'value_vars' are not present in " f"the DataFrame: {list(missing)}") if col_level is not None: idx = frame.columns.get_level_values(col_level).get_indexer( id_vars + value_vars) else: idx = frame.columns.get_indexer(id_vars + value_vars) frame = frame.iloc[:, idx] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, MultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ f"variable_{i}" for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else "variable" ] if isinstance(var_name, str): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: id_data = frame.pop(col) if is_extension_array_dtype(id_data): id_data = cast("Series", concat([id_data] * K, ignore_index=True)) else: id_data = np.tile(id_data._values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame._values.ravel("F") for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: result.index = _tile_compat(frame.index, K) return result
def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if isinstance(mapper, dict): if hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples from pandas import Series mapper = Series(mapper) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self._values): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values return self._values.map(mapper) if is_extension_array_dtype(self.dtype): values = self._values else: values = self.values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_1d(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self.astype(object) values = getattr(values, "values", values) if na_action == "ignore": def map_f(values, f): return lib.map_infer_mask(values, f, isna(values).view(np.uint8)) else: map_f = lib.map_infer # mapper is a function new_values = map_f(values, mapper) return new_values
def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: if upcasted_na is None: # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() else: fill_value = upcasted_na if self.is_na: blk_dtype = getattr(self.block, "dtype", None) if blk_dtype == np.dtype(object): # we want to avoid filling with np.nan if we are # using None; we already know that we are all # nulls values = self.block.values.ravel(order="K") if len(values) and values[0] is None: fill_value = None if is_datetime64tz_dtype(blk_dtype) or is_datetime64tz_dtype( empty_dtype): if self.block is None: # TODO(EA2D): special case unneeded with 2D EAs i8values = np.full(self.shape[1], fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_categorical_dtype(blk_dtype): pass elif is_extension_array_dtype(blk_dtype): pass elif is_extension_array_dtype(empty_dtype): missing_arr = empty_dtype.construct_array_type( )._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape assert ncols == 1, ncols empty_arr = -1 * np.ones((nrows, ), dtype=np.intp) return missing_arr.take(empty_arr, allow_fill=True, fill_value=fill_value) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) return missing_arr if (not self.indexers) and (not self.block._can_consolidate): # preserve these for validation in concat_compat return self.block.values if self.block.is_bool and not self.block.is_categorical: # External code requested filling/upcasting, bool values must # be upcasted to object to avoid being upcasted to numeric. values = self.block.astype(np.object_).values elif self.block.is_extension: values = self.block.values else: # No dtype upcasting is done here, it will be performed during # concatenation itself. values = self.block.values if not self.indexers: # If there's no indexing to be done, we want to signal outside # code that this array must be copied explicitly. This is done # by returning a view and checking `retval.base`. values = values.view() else: for ax, indexer in self.indexers.items(): values = algos.take_nd(values, indexer, axis=ax) return values
def to_numpy(self, dtype=None, copy=False): """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 Parameters ---------- dtype : str or numpy.dtype, optional The dtype to pass to :meth:`numpy.asarray` copy : bool, default False Whether to ensure that the returned value is a not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming ``copy=False``). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) Specify the `dtype` to control how datetime-aware data is represented. Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], dtype=object) Or ``dtype='datetime64[ns]'`` to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") ... # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ if (is_extension_array_dtype(self.dtype) or is_datetime64tz_dtype(self.dtype)): # TODO(DatetimeArray): remove the second clause. # TODO(GH-24345): Avoid potential double copy result = np.asarray(self._values, dtype=dtype) else: result = self._values if copy: result = result.copy() return result
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> Tuple[np.ndarray, Optional[List[str]]]: """ Returns the values of a cython operation as a Tuple of [data, names]. Names is only useful when dealing with 2D results, like ohlc (see self._name_functions). """ assert kind in ["transform", "aggregate"] orig_values = values if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes # categoricals are only 1d, so we # are not setup for dim transforming if is_categorical_dtype(values.dtype) or is_sparse(values.dtype): raise NotImplementedError(f"{values.dtype} dtype not supported") elif is_datetime64_any_dtype(values.dtype): if how in ["add", "prod", "cumsum", "cumprod"]: raise NotImplementedError( f"datetime64 type does not support {how} operations" ) elif is_timedelta64_dtype(values.dtype): if how in ["prod", "cumprod"]: raise NotImplementedError( f"timedelta64 type does not support {how} operations" ) if is_datetime64tz_dtype(values.dtype): # Cast to naive; we'll cast back at the end of the function # TODO: possible need to reshape? # TODO(EA2D):kludge can be avoided when 2D EA is allowed. values = values.view("M8[ns]") is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(values) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] names: Optional[List[str]] = self._name_functions.get(how, None) if swapped: result = result.swapaxes(0, axis) if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype( orig_values.dtype ): # We need to use the constructors directly for these dtypes # since numpy won't recognize them # https://github.com/pandas-dev/pandas/issues/31471 result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype) elif is_datetimelike and kind == "aggregate": result = result.astype(orig_values.dtype) if is_extension_array_dtype(orig_values.dtype): result = maybe_cast_result(result=result, obj=orig_values, how=how) return result, names
def test_is_extension_array_dtype(dtype): assert isinstance(dtype, dtypes.ExtensionDtype) assert is_extension_array_dtype(dtype)
def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index Returns ------- stacked : Series """ def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = _factorize_from_iterable(index) return categories, codes N, K = frame.shape # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex(levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False) if frame._is_homogeneous_type: # For homogeneous EAs, frame.values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes.values) dtype = dtypes[0] if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type([ col._values for _, col in frame.iteritems() ]) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame.values.ravel() else: # non-homogeneous new_values = frame.values.ravel() if dropna: mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] return frame._constructor_sliced(new_values, index=new_index)
def assert_series_equal( left, right, check_dtype=True, check_index_type="equiv", check_series_type=True, check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", *, check_index=True, ): """ Check that left and right Series are equal. Parameters ---------- left : Series right : Series check_dtype : bool, default True Whether to check the Series dtype is identical. check_index_type : bool or {'equiv'}, default 'equiv' Whether to check the Index class, dtype and inferred_type are identical. check_series_type : bool, default True Whether to check the Series class is identical. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. When comparing two numbers, if the first number has magnitude less than 1e-5, we compare the two numbers directly and check whether they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. .. deprecated:: 1.1.0 Use `rtol` and `atol` instead to define relative/absolute tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False Whether to compare number exactly. check_datetimelike_compat : bool, default False Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals. .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. .. versionadded:: 1.1.0 check_flags : bool, default True Whether to check the `flags` attribute. .. versionadded:: 1.2.0 rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 atol : float, default 1e-8 Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. check_index : bool, default True Whether to check index equivalence. If False, then compare only values. .. versionadded:: 1.3.0 Examples -------- >>> from pandas.testing import assert_series_equal >>> a = pd.Series([1, 2, 3, 4]) >>> b = pd.Series([1, 2, 3, 4]) >>> assert_series_equal(a, b) """ __tracebackhide__ = True if check_less_precise is not no_default: warnings.warn( "The 'check_less_precise' keyword in testing.assert_*_equal " "is deprecated and will be removed in a future version. " "You can stop passing 'check_less_precise' to silence this warning.", FutureWarning, stacklevel=2, ) rtol = atol = _get_tol_from_less_precise(check_less_precise) # instance validation _check_isinstance(left, right, Series) if check_series_type: assert_class_equal(left, right, obj=obj) # length comparison if len(left) != len(right): msg1 = f"{len(left)}, {left.index}" msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) if check_flags: assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" if check_index: # GH #38183 assert_index_equal( left.index, right.index, exact=check_index_type, check_names=check_names, check_exact=check_exact, check_categorical=check_categorical, rtol=rtol, atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (DatetimeIndex, TimedeltaIndex)): lidx = left.index ridx = right.index assert lidx.freq == ridx.freq, (lidx.freq, ridx.freq) if check_dtype: # We want to skip exact dtype checking when `check_categorical` # is False. We'll still raise if only one is a `Categorical`, # regardless of `check_categorical` if (is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype) and not check_categorical): pass else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype( right.dtype): # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif check_datetimelike_compat and (needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype)): # we want to check only if we have compat dtypes # e.g. integer and M|m are NOT compat, but we can simply check # the values in that case # datetimelike may have different objects (e.g. datetime.datetime # vs Timestamp) but will compare equal if not Index(left._values).equals(Index(right._values)): msg = (f"[datetimelike_compat=True] {left._values} " f"is not equal to {right._values}.") raise AssertionError(msg) elif is_interval_dtype(left.dtype) and is_interval_dtype(right.dtype): assert_interval_array_equal(left.array, right.array) elif is_categorical_dtype(left.dtype) or is_categorical_dtype(right.dtype): _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype( right.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif is_extension_array_dtype_and_needs_i8_conversion( left.dtype, right.dtype) or is_extension_array_dtype_and_needs_i8_conversion( right.dtype, left.dtype): assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) and needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( left._values, right._values, check_dtype=check_dtype, index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( left._values, right._values, rtol=rtol, atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), ) # metadata comparison if check_names: assert_attr_equal("name", left, right, obj=obj) if check_categorical: if is_categorical_dtype(left.dtype) or is_categorical_dtype( right.dtype): assert_categorical_equal( left._values, right._values, obj=f"{obj} category", check_category_order=check_category_order, )
def _cython_operation( self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs ) -> np.ndarray: """ Returns the values of a cython operation. """ orig_values = values assert kind in ["transform", "aggregate"] if values.ndim > 2: raise NotImplementedError("number of dimensions is currently limited to 2") elif values.ndim == 2: # Note: it is *not* the case that axis is always 0 for 1-dim values, # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis # can we do this operation with our cython functions # if not raise NotImplementedError self._disallow_invalid_ops(values, how) if is_extension_array_dtype(values.dtype): return self._ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) is_datetimelike = needs_i8_conversion(values.dtype) is_numeric = is_numeric_dtype(values.dtype) if is_datetimelike: values = values.view("int64") is_numeric = True elif is_bool_dtype(values.dtype): values = ensure_int_or_float(values) elif is_integer_dtype(values): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): values = ensure_float64(ensure_float(values)) else: values = values.astype(object) arity = self._cython_arity.get(how, 1) vdim = values.ndim swapped = False if vdim == 1: values = values[:, None] out_shape = (self.ngroups, arity) else: if axis > 0: swapped = True assert axis == 1, axis values = values.T if arity > 1: raise NotImplementedError( "arity of more than 1 is not supported for the 'how' argument" ) out_shape = (self.ngroups,) + values.shape[1:] func, values = self._get_cython_func_and_vals(kind, how, values, is_numeric) if how == "rank": out_dtype = "float" else: if is_numeric: out_dtype = f"{values.dtype.kind}{values.dtype.itemsize}" else: out_dtype = "object" codes, _, _ = self.group_info if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) if is_integer_dtype(result) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") result[mask] = np.nan if kind == "aggregate" and self._filter_empty_groups and not counts.all(): assert result.ndim != 2 result = result[counts > 0] if vdim == 1 and arity == 1: result = result[:, 0] if swapped: result = result.swapaxes(0, axis) if how not in base.cython_cast_blocklist: # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) result = maybe_downcast_to_dtype(result, dtype) return result
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. verify : bool, default True Check if labels are out of bound for the values and put out of bound labels equal to na_sentinel. If ``verify=False``, it is assumed there are no out of bound labels. Ignored when ``labels`` is None. .. versionadded:: 0.25.0 Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if not isinstance(values, np.ndarray) and not is_extension_array_dtype(values): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if (not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == "mixed-integer"): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo(values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_labels = algorithms.take_1d(order2, labels, fill_value=-1) if verify: mask = (labels < -len(values)) | (labels >= len(values)) else: mask = None else: reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` new_labels = reverse_indexer.take(labels, mode="wrap") mask = labels == na_sentinel if verify: mask = mask | (labels < -len(values)) | (labels >= len(values)) if mask is not None: np.putmask(new_labels, mask, na_sentinel) return ordered, ensure_platform_int(new_labels)
def array( data: Union[Sequence[object], AnyArrayLike], dtype: Optional[Dtype] = None, copy: bool = True, ) -> ExtensionArray: """ Create an array. .. versionadded:: 0.24.0 Parameters ---------- data : Sequence of objects The scalars inside `data` should be instances of the scalar type for `dtype`. It's expected that `data` represents a 1-dimensional array of data. When `data` is an Index or Series, the underlying array will be extracted from `data`. dtype : str, np.dtype, or ExtensionDtype, optional The dtype to use for the array. This may be a NumPy dtype or an extension type registered with pandas using :meth:`pandas.api.extensions.register_extension_dtype`. If not specified, there are two possibilities: 1. When `data` is a :class:`Series`, :class:`Index`, or :class:`ExtensionArray`, the `dtype` will be taken from the data. 2. Otherwise, pandas will attempt to infer the `dtype` from the data. Note that when `data` is a NumPy array, ``data.dtype`` is *not* used for inferring the array type. This is because NumPy cannot represent all the types of data that can be held in extension arrays. Currently, pandas will infer an extension dtype for sequences of ============================== ===================================== Scalar Type Array Type ============================== ===================================== :class:`pandas.Interval` :class:`pandas.arrays.IntervalArray` :class:`pandas.Period` :class:`pandas.arrays.PeriodArray` :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` :class:`float` :class:`pandas.arrays.FloatingArray` :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== For all other cases, NumPy's usual inference rules will be used. .. versionchanged:: 1.0.0 Pandas infers nullable-integer dtype for integer data, string dtype for string data, and nullable-boolean dtype for boolean data. .. versionchanged:: 1.2.0 Pandas now also infers nullable-floating dtype for float-like input data copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require copying data, even if ``copy=False``. Returns ------- ExtensionArray The newly created array. Raises ------ ValueError When `data` is not 1-dimensional. See Also -------- numpy.array : Construct a NumPy array. Series : Construct a pandas Series. Index : Construct a pandas Index. arrays.PandasArray : ExtensionArray wrapping a NumPy array. Series.array : Extract the array stored within a Series. Notes ----- Omitting the `dtype` argument means pandas will attempt to infer the best array type from the values in the data. As new array types are added by pandas and 3rd party libraries, the "best" array type may change. We recommend specifying `dtype` to ensure that 1. the correct array type for the data is returned 2. the returned array type doesn't change as new extension types are added by pandas and third-party libraries Additionally, if the underlying memory representation of the returned array matters, we recommend specifying the `dtype` as a concrete object rather than a string alias or allowing it to be inferred. For example, a future version of pandas or a 3rd-party library may include a dedicated ExtensionArray for string data. In this event, the following would no longer return a :class:`arrays.PandasArray` backed by a NumPy array. >>> pd.array(['a', 'b'], dtype=str) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 This would instead return the new ExtensionArray dedicated for string data. If you really need the new array to be backed by a NumPy array, specify that in the dtype. >>> pd.array(['a', 'b'], dtype=np.dtype("<U1")) <PandasArray> ['a', 'b'] Length: 2, dtype: str32 Finally, Pandas has arrays that mostly overlap with NumPy * :class:`arrays.DatetimeArray` * :class:`arrays.TimedeltaArray` When data with a ``datetime64[ns]`` or ``timedelta64[ns]`` dtype is passed, pandas will always return a ``DatetimeArray`` or ``TimedeltaArray`` rather than a ``PandasArray``. This is for symmetry with the case of timezone-aware data, which NumPy does not natively support. >>> pd.array(['2015', '2016'], dtype='datetime64[ns]') <DatetimeArray> ['2015-01-01 00:00:00', '2016-01-01 00:00:00'] Length: 2, dtype: datetime64[ns] >>> pd.array(["1H", "2H"], dtype='timedelta64[ns]') <TimedeltaArray> ['0 days 01:00:00', '0 days 02:00:00'] Length: 2, dtype: timedelta64[ns] Examples -------- If a dtype is not specified, pandas will infer the best dtype from the values. See the description of `dtype` for the types pandas infers for. >>> pd.array([1, 2]) <IntegerArray> [1, 2] Length: 2, dtype: Int64 >>> pd.array([1, 2, np.nan]) <IntegerArray> [1, 2, <NA>] Length: 3, dtype: Int64 >>> pd.array([1.1, 2.2]) <FloatingArray> [1.1, 2.2] Length: 2, dtype: Float64 >>> pd.array(["a", None, "c"]) <StringArray> ['a', <NA>, 'c'] Length: 3, dtype: string >>> pd.array([pd.Period('2000', freq="D"), pd.Period("2000", freq="D")]) <PeriodArray> ['2000-01-01', '2000-01-01'] Length: 2, dtype: period[D] You can use the string alias for `dtype` >>> pd.array(['a', 'b', 'a'], dtype='category') ['a', 'b', 'a'] Categories (2, object): ['a', 'b'] Or specify the actual dtype >>> pd.array(['a', 'b', 'a'], ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) ['a', 'b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. >>> pd.array([1 + 1j, 3 + 2j]) <PandasArray> [(1+1j), (3+2j)] Length: 2, dtype: complex128 As mentioned in the "Notes" section, new extension types may be added in the future (by pandas or 3rd party libraries), causing the return value to no longer be a :class:`arrays.PandasArray`. Specify the `dtype` as a NumPy dtype if you need to ensure there's no future change in behavior. >>> pd.array([1, 2], dtype=np.dtype("int32")) <PandasArray> [1, 2] Length: 2, dtype: int32 `data` must be 1-dimensional. A ValueError is raised when the input has the wrong dimensionality. >>> pd.array(1) Traceback (most recent call last): ... ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( BooleanArray, DatetimeArray, FloatingArray, IntegerArray, IntervalArray, PandasArray, StringArray, TimedeltaArray, period_array, ) if lib.is_scalar(data): msg = f"Cannot pass scalar '{data}' to 'pandas.array'." raise ValueError(msg) if dtype is None and isinstance( data, (ABCSeries, ABCIndexClass, ABCExtensionArray)): dtype = data.dtype data = extract_array(data, extract_numpy=True) # this returns None for not-found dtypes. if isinstance(dtype, str): dtype = registry.find(dtype) or dtype if is_extension_array_dtype(dtype): cls = cast(ExtensionDtype, dtype).construct_array_type() return cls._from_sequence(data, dtype=dtype, copy=copy) if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": try: return period_array(data, copy=copy) except IncompatibleFrequency: # We may have a mixture of frequencies. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype == "interval": try: return IntervalArray(data, copy=copy) except ValueError: # We may have a mixture of `closed` here. # We choose to return an ndarray, rather than raising. pass elif inferred_dtype.startswith("datetime"): # datetime, datetime64 try: return DatetimeArray._from_sequence(data, copy=copy) except ValueError: # Mixture of timezones, fall back to PandasArray pass elif inferred_dtype.startswith("timedelta"): # timedelta, timedelta64 return TimedeltaArray._from_sequence(data, copy=copy) elif inferred_dtype == "string": return StringArray._from_sequence(data, copy=copy) elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) elif inferred_dtype in ("floating", "mixed-integer-float"): return FloatingArray._from_sequence(data, copy=copy) elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) # Pandas overrides NumPy for # 1. datetime64[ns] # 2. timedelta64[ns] # so that a DatetimeArray is returned. if is_datetime64_ns_dtype(dtype): return DatetimeArray._from_sequence(data, dtype=dtype, copy=copy) elif is_timedelta64_ns_dtype(dtype): return TimedeltaArray._from_sequence(data, dtype=dtype, copy=copy) result = PandasArray._from_sequence(data, dtype=dtype, copy=copy) return result
def init_ndarray(values, index, columns, dtype=None, copy=False): # input must be a ndarray, list, Series, index if isinstance(values, ABCSeries): if columns is None: if values.name is not None: columns = [values.name] if index is None: index = values.index else: values = values.reindex(index) # zero len case (GH #2234) if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) # we could have a categorical type passed or coerced to 'category' # recast this to an arrays_to_mgr if (is_categorical_dtype(getattr(values, 'dtype', None)) or is_categorical_dtype(dtype)): if not hasattr(values, 'dtype'): values = prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() index, columns = _get_axes(len(values), 1, index, columns) return arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif (is_datetime64tz_dtype(values) or is_extension_array_dtype(values)): # GH#19157 if columns is None: columns = [0] return arrays_to_mgr([values], columns, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype values = prep_ndarray(values, copy=copy) if dtype is not None: if not is_dtype_equal(values.dtype, dtype): try: values = values.astype(dtype) except Exception as orig: e = ValueError("failed to cast to '{dtype}' (Exception " "was: {orig})".format(dtype=dtype, orig=orig)) raise_with_traceback(e) index, columns = _get_axes(*values.shape, index=index, columns=columns) values = values.T # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type if dtype is None and is_object_dtype(values): values = maybe_infer_to_datetimelike(values) return create_block_manager_from_blocks([values], [columns, index])
def concat_compat(to_concat, axis: int = 0): """ provide concatenation of an array of arrays each of which is a single 'normalized' dtypes (in that for example, if it's object, then it is a non-datetimelike and provide a combined dtype for the resulting array that preserves the overall dtype if possible) Parameters ---------- to_concat : array of arrays axis : axis to provide concatenation Returns ------- a single array, preserving the combined dtypes """ # filter empty arrays # 1-d dtypes always are included here def is_nonempty(x) -> bool: if x.ndim <= axis: return True return x.shape[axis] > 0 # If all arrays are empty, there's nothing to convert, just short-cut to # the concatenation, #3121. # # Creating an empty array directly is tempting, but the winnings would be # marginal given that it would still require shape & dtype calculation and # np.concatenate which has them both implemented is compiled. non_empties = [x for x in to_concat if is_nonempty(x)] if non_empties and axis == 0: to_concat = non_empties typs = get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [ _cast_to_common_type(arr, target_dtype) for arr in to_concat ] if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) typs = get_dtype_kinds(to_concat) if len(typs) != 1: if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): # let numpy coerce pass else: # coerce to object to_concat = [x.astype("object") for x in to_concat] return np.concatenate(to_concat, axis=axis)
def sanitize_array( data, index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype( data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: # TODO: deque, array.array if isinstance(data, set): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError("Set type is unordered") data = list(data) if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif not is_list_like(data): if index is None: raise ValueError( "index must be specified when data is not list-like") subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) subarr = _sanitize_ndim(subarr, data, dtype, index) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype( dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def safe_sort(values, labels=None, na_sentinel=-1, assume_unique=False, verify=True): """ Sort ``values`` and reorder corresponding ``labels``. ``values`` should be unique if ``labels`` is not None. Safe for use with mixed types (int, str), orders ints before strs. .. versionadded:: 0.19.0 Parameters ---------- values : list-like Sequence; must be unique if ``labels`` is not None. labels : list_like Indices to ``values``. All out of bound indices are treated as "not found" and will be masked with ``na_sentinel``. na_sentinel : int, default -1 Value in ``labels`` to mark "not found". Ignored when ``labels`` is None. assume_unique : bool, default False When True, ``values`` are assumed to be unique, which can speed up the calculation. Ignored when ``labels`` is None. verify : bool, default True Check if labels are out of bound for the values and put out of bound labels equal to na_sentinel. If ``verify=False``, it is assumed there are no out of bound labels. Ignored when ``labels`` is None. .. versionadded:: 0.25.0 Returns ------- ordered : ndarray Sorted ``values`` new_labels : ndarray Reordered ``labels``; returned when ``labels`` is not None. Raises ------ TypeError * If ``values`` is not list-like or if ``labels`` is neither None nor list-like * If ``values`` cannot be sorted ValueError * If ``labels`` is not None and ``values`` contain duplicates. """ if not is_list_like(values): raise TypeError("Only list-like objects are allowed to be passed to" "safe_sort as values") if (not isinstance(values, np.ndarray) and not is_extension_array_dtype(values)): # don't convert to string types dtype, _ = infer_dtype_from_array(values) values = np.asarray(values, dtype=dtype) def sort_mixed(values): # order ints before strings, safe in py3 str_pos = np.array([isinstance(x, str) for x in values], dtype=bool) nums = np.sort(values[~str_pos]) strs = np.sort(values[str_pos]) return np.concatenate([nums, np.asarray(strs, dtype=object)]) sorter = None if (not is_extension_array_dtype(values) and lib.infer_dtype(values, skipna=False) == 'mixed-integer'): # unorderable in py3 if mixed str/int ordered = sort_mixed(values) else: try: sorter = values.argsort() ordered = values.take(sorter) except TypeError: # try this anyway ordered = sort_mixed(values) # labels: if labels is None: return ordered if not is_list_like(labels): raise TypeError("Only list-like objects or None are allowed to be" "passed to safe_sort as labels") labels = ensure_platform_int(np.asarray(labels)) from pandas import Index if not assume_unique and not Index(values).is_unique: raise ValueError("values should be unique if labels is not None") if sorter is None: # mixed types (hash_klass, _), values = algorithms._get_data_algo( values, algorithms._hashtables) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) if na_sentinel == -1: # take_1d is faster, but only works for na_sentinels of -1 order2 = sorter.argsort() new_labels = algorithms.take_1d(order2, labels, fill_value=-1) if verify: mask = (labels < -len(values)) | (labels >= len(values)) else: mask = None else: reverse_indexer = np.empty(len(sorter), dtype=np.int_) reverse_indexer.put(sorter, np.arange(len(sorter))) # Out of bound indices will be masked with `na_sentinel` next, so we # may deal with them here without performance loss using `mode='wrap'` new_labels = reverse_indexer.take(labels, mode='wrap') mask = labels == na_sentinel if verify: mask = mask | (labels < -len(values)) | (labels >= len(values)) if mask is not None: np.putmask(new_labels, mask, na_sentinel) return ordered, ensure_platform_int(new_labels)
def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ Return dtype and N/A values to use when concatenating specified units. Returned N/A value may be None which means there was no casting involved. Returns ------- dtype """ if len(join_units) == 1: blk = join_units[0].block if blk is None: return np.dtype(np.float64) if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype return empty_dtype has_none_blocks = any(unit.block is None for unit in join_units) dtypes = [ None if unit.block is None else unit.dtype for unit in join_units ] filtered_dtypes = [ unit.dtype for unit in join_units if unit.block is not None and not unit.is_na ] if not len(filtered_dtypes): filtered_dtypes = [ unit.dtype for unit in join_units if unit.block is not None ] dtype_alt = find_common_type(filtered_dtypes) upcast_classes = _get_upcast_classes(join_units, dtypes) if is_extension_array_dtype(dtype_alt): return dtype_alt elif dtype_alt == object: return dtype_alt # TODO: de-duplicate with maybe_promote? # create the result if "extension" in upcast_classes: return np.dtype("object") elif "bool" in upcast_classes: if has_none_blocks: return np.dtype(np.object_) else: return np.dtype(np.bool_) elif "datetimetz" in upcast_classes: # GH-25014. We use NaT instead of iNaT, since this eventually # ends up in DatetimeArray.take, which does not allow iNaT. dtype = upcast_classes["datetimetz"] return dtype[0] elif "datetime" in upcast_classes: return np.dtype("M8[ns]") elif "timedelta" in upcast_classes: return np.dtype("m8[ns]") else: try: common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_) else: if is_float_dtype(common_dtype): return common_dtype elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64) else: return common_dtype msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg)
def _map_values(self, mapper, na_action=None): """ An internal function that maps values using the input correspondence (which can be a dict, Series, or function). Parameters ---------- mapper : function, dict, or Series The input correspondence object na_action : {None, 'ignore'} If 'ignore', propagate NA values, without passing them to the mapping function Returns ------- Union[Index, MultiIndex], inferred The output of the mapping function applied to the index. If the function returns a tuple with more than one element a MultiIndex will be returned. """ # we can fastpath dict/Series to an efficient map # as we know that we are not going to have to yield # python types if is_dict_like(mapper): if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): # If a dictionary subclass defines a default value method, # convert mapper to a lookup function (GH #15999). dict_with_default = mapper mapper = lambda x: dict_with_default[x] else: # Dictionary does not have a default. Thus it's safe to # convert to an Series for efficiency. # we specify the keys here to handle the # possibility that they are tuples # The return value of mapping with an empty mapper is # expected to be pd.Series(np.nan, ...). As np.nan is # of dtype float64 the return value of this method should # be float64 as well mapper = create_series_with_explicit_dtype( mapper, dtype_if_empty=np.float64) if isinstance(mapper, ABCSeries): # Since values were input this means we came from either # a dict or a series and mapper should be an index if is_categorical_dtype(self.dtype): # use the built in categorical series mapper which saves # time by mapping the categories instead of all values cat = cast("Categorical", self._values) return cat.map(mapper) values = self._values indexer = mapper.index.get_indexer(values) new_values = algorithms.take_nd(mapper._values, indexer) return new_values # we must convert to python types if is_extension_array_dtype(self.dtype) and hasattr( self._values, "map"): # GH#23179 some EAs do not have `map` values = self._values if na_action is not None: raise NotImplementedError map_f = lambda values, f: values.map(f) else: values = self._values.astype(object) if na_action == "ignore": map_f = lambda values, f: lib.map_infer_mask( values, f, isna(values).view(np.uint8)) elif na_action is None: map_f = lib.map_infer else: msg = ("na_action must either be 'ignore' or None, " f"{na_action} was passed") raise ValueError(msg) # mapper is a function new_values = map_f(values, mapper) return new_values
def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index Returns ------- stacked : Series """ def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = factorize_from_iterable(index) return categories, codes N, K = frame.shape # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False, ) if frame._is_homogeneous_type: # For homogeneous EAs, frame.values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes.values) dtype = dtypes[0] if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type( [col._values for _, col in frame.items()] ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame.values.ravel() else: # non-homogeneous new_values = frame.values.ravel() if dropna: mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] return frame._constructor_sliced(new_values, index=new_index)
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip( *[ lev.take(level_codes) for lev, level_codes in zip( this.columns.levels[:-1], this.columns.codes[:-1] ) ] ) ) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = this.columns.levels[0]._shallow_copy(name=this.columns.names[0]) unique_groups = new_columns # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_used = level_vals[level_codes] levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( frame.dtypes.iloc[0] ): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.items()] ) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: old_codes, old_levels = factorize_from_iterable(this.index) new_levels = [old_levels] new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how="all") return result
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list(zip(*[lev.take(level_codes) for lev, level_codes in zip(this.columns.levels[:-1], this.columns.codes[:-1])])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_used = level_vals[level_codes] levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if (frame._is_homogeneous_type and is_extension_array_dtype(frame.dtypes.iloc[0])): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.iteritems()] ) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: new_levels = [this.index] new_codes = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def to_numpy( self, dtype: Dtype | None = None, copy: bool = False, na_value=lib.no_default, **kwargs, ) -> np.ndarray: """ A NumPy ndarray representing the values in this Series or Index. .. versionadded:: 0.24.0 Parameters ---------- dtype : str or numpy.dtype, optional The dtype to pass to :meth:`numpy.asarray`. copy : bool, default False Whether to ensure that the returned value is not a view on another array. Note that ``copy=False`` does not *ensure* that ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensure that a copy is made, even if not strictly necessary. na_value : Any, optional The value to use for missing values. The default value depends on `dtype` and the type of the array. .. versionadded:: 1.0.0 **kwargs Additional keywords passed through to the ``to_numpy`` method of the underlying array (for extension arrays). .. versionadded:: 1.0.0 Returns ------- numpy.ndarray See Also -------- Series.array : Get the actual data stored within. Index.array : Get the actual data stored within. DataFrame.to_numpy : Similar method for DataFrame. Notes ----- The returned array will be the same up to equality (values equal in `self` will be equal in the returned array; likewise for values that are not equal). When `self` contains an ExtensionArray, the dtype may be different. For example, for a category-dtype Series, ``to_numpy()`` will return a NumPy array and the categorical dtype will be lost. For NumPy dtypes, this will be a reference to the actual data stored in this Series or Index (assuming ``copy=False``). Modifying the result in place will modify the data stored in the Series or Index (not that we recommend doing that). For extension types, ``to_numpy()`` *may* require copying data and coercing the result to a NumPy type (possibly object), which may be expensive. When you need a no-copy reference to the underlying data, :attr:`Series.array` should be used instead. This table lays out the different dtypes and default return types of ``to_numpy()`` for various dtypes within pandas. ================== ================================ dtype array type ================== ================================ category[T] ndarray[T] (same dtype as input) period ndarray[object] (Periods) interval ndarray[object] (Intervals) IntegerNA ndarray[object] datetime64[ns] datetime64[ns] datetime64[ns, tz] ndarray[object] (Timestamps) ================== ================================ Examples -------- >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) Specify the `dtype` to control how datetime-aware data is represented. Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp` objects, each with the correct ``tz``. >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) >>> ser.to_numpy(dtype=object) array([Timestamp('2000-01-01 00:00:00+0100', tz='CET', freq='D'), Timestamp('2000-01-02 00:00:00+0100', tz='CET', freq='D')], dtype=object) Or ``dtype='datetime64[ns]'`` to return an ndarray of native datetime64 values. The values are converted to UTC and the timezone info is dropped. >>> ser.to_numpy(dtype="datetime64[ns]") ... # doctest: +ELLIPSIS array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ if is_extension_array_dtype(self.dtype): # error: Too many arguments for "to_numpy" of "ExtensionArray" return self.array.to_numpy( # type: ignore[call-arg] dtype, copy=copy, na_value=na_value, **kwargs) elif kwargs: bad_keys = list(kwargs.keys())[0] raise TypeError( f"to_numpy() got an unexpected keyword argument '{bad_keys}'") # error: Argument "dtype" to "asarray" has incompatible type # "Union[ExtensionDtype, str, dtype[Any], Type[str], Type[float], Type[int], # Type[complex], Type[bool], Type[object], None]"; expected "Union[dtype[Any], # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" result = np.asarray(self._values, dtype=dtype) # type: ignore[arg-type] # TODO(GH-24345): Avoid potential double copy if copy or na_value is not lib.no_default: result = result.copy() if na_value is not lib.no_default: result[self.isna()] = na_value return result
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): """ Given a 1d array, return an array of deterministic integers. .. versionadded:: 0.19.2 Parameters ---------- vals : ndarray, Categorical encoding : string, default 'utf8' encoding for data & key when strings hash_key : string key to encode, default to _default_hash_key categorize : bool, default True Whether to first categorize object arrays before hashing. This is more efficient when the array contains duplicate values. .. versionadded:: 0.20.0 Returns ------- 1d uint64 numpy array of hash values, same length as the vals """ if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") dtype = vals.dtype if hash_key is None: hash_key = _default_hash_key # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) elif is_extension_array_dtype(dtype): vals, _ = vals._values_for_factorize() dtype = vals.dtype # we'll be working with everything as 64-bit values, so handle this # 128-bit value early if np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. elif isinstance(dtype, np.bool): vals = vals.astype('u8') elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view('i8').astype('u8', copy=False) elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. if categorize: from pandas import factorize, Categorical, Index codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) return _hash_categorical(cat, encoding, hash_key) try: vals = hashing.hash_object_array(vals, hash_key, encoding) except TypeError: # we have mixed types vals = hashing.hash_object_array(vals.astype(str).astype(object), hash_key, encoding) # Then, redistribute these 64-bit ints within the space of 64-bit ints vals ^= vals >> 30 vals *= np.uint64(0xbf58476d1ce4e5b9) vals ^= vals >> 27 vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals