def is_inferred_bool_dtype(arr: ArrayLike) -> bool:
    """
    Report whether ``arr`` is a bool ndarray or an object ndarray of bools.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray

    Returns
    -------
    bool
        Always False for anything that is not an ``np.ndarray``.

    Notes
    -----
    This does not include the special treatment is_bool_dtype uses for
    Categorical.
    """
    if isinstance(arr, np.ndarray):
        arr_dtype = arr.dtype
        if arr_dtype == np.dtype(bool):
            return True
        if arr_dtype == np.dtype("object"):
            # object dtype only qualifies when every element is a bool
            return lib.is_bool_array(arr)
    return False
def is_bool_indexer(key: Any) -> bool:
    """
    Check whether `key` is a valid boolean indexer.

    Parameters
    ----------
    key : Any
        Only list-likes may be considered boolean indexers.
        All other types are not considered a boolean indexer.
        For array-like input, boolean ndarrays or ExtensionArrays
        with ``_is_boolean`` set are considered boolean indexers.
        A list is a boolean indexer when ``np.asarray`` turns it into a
        bool-dtype array of the same length; a list that NumPy cannot
        convert is not a boolean indexer.

    Returns
    -------
    bool
        Whether `key` is a valid boolean indexer.

    Raises
    ------
    ValueError
        When the array is an object-dtype ndarray or ExtensionArray
        and contains missing values.

    See Also
    --------
    check_bool_array_indexer : Check that `key` is a valid mask for an array,
        and convert to an ndarray.
    """
    na_msg = "cannot mask with array containing NA / NaN values"
    if isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
        is_array_like(key) and is_extension_array_dtype(key.dtype)
    ):
        if key.dtype == np.object_:
            key = np.asarray(values_from_object(key))

            if not lib.is_bool_array(key):
                if isna(key).any():
                    raise ValueError(na_msg)
                return False
            return True
        elif is_bool_dtype(key.dtype):
            # an ndarray with bool-dtype by definition has no missing values.
            # So we only need to check for NAs in ExtensionArrays
            if is_extension_array_dtype(key.dtype):
                if np.any(key.isna()):
                    raise ValueError(na_msg)
            return True
    elif isinstance(key, list):
        try:
            arr = np.asarray(key)
        except (TypeError, ValueError):
            # Recent NumPy raises ValueError (not TypeError) for ragged
            # nested lists; such a list can never be a boolean mask.
            return False
        return arr.dtype == np.bool_ and len(arr) == len(key)

    return False
def get_bool_data(self, copy: bool = False) -> ArrayManager:
    """
    Return the subset of columns that hold boolean data.

    A column is kept when its dtype is a bool dtype, or when it is
    object-dtype and ``lib.is_bool_array`` reports it as all-bool.

    Parameters
    ----------
    copy : bool, default False
        Whether to copy the blocks

    Notes
    -----
    ``copy`` is not referenced in this method's body; only the selection
    predicate is handed to ``_get_data_subset``.
    """

    def _holds_bools(arr):
        # bool dtype qualifies outright; object dtype must be all-bool
        arr_dtype = arr.dtype
        return is_bool_dtype(arr_dtype) or (
            is_object_dtype(arr_dtype) and lib.is_bool_array(arr)
        )

    return self._get_data_subset(_holds_bools)
def is_bool_indexer(key: Any) -> bool:
    """
    Decide whether `key` can be used as a boolean mask.

    Parameters
    ----------
    key : Any
        Only list-likes may be considered boolean indexers.
        All other types are not considered a boolean indexer.
        For array-like input, boolean ndarrays or ExtensionArrays
        with ``_is_boolean`` set are considered boolean indexers.

    Returns
    -------
    bool
        Whether `key` is a valid boolean indexer.

    Raises
    ------
    ValueError
        When the array is an object-dtype ndarray or ExtensionArray
        and contains missing values.

    See Also
    --------
    check_array_indexer : Check that `key` is a valid array to index,
        and convert to an ndarray.
    """
    has_array_dtype = isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or (
        is_array_like(key) and is_extension_array_dtype(key.dtype)
    )
    if has_array_dtype:
        if key.dtype == np.object_:
            key = np.asarray(key)
            if lib.is_bool_array(key):
                return True

            na_msg = "Cannot mask with non-boolean array containing NA / NaN values"
            if lib.infer_dtype(key) == "boolean" and isna(key).any():
                # Don't raise on e.g. ["A", "B", np.nan], see
                #  test_loc_getitem_list_of_labels_categoricalindex_with_na
                raise ValueError(na_msg)
            return False

        if is_bool_dtype(key.dtype):
            return True
        return False

    # check if np.array(key).dtype would be bool
    if isinstance(key, list) and len(key) > 0:
        if type(key) is not list:
            # GH#42461 cython will raise TypeError if we pass a subclass
            key = list(key)
        return lib.is_bool_list(key)

    return False
def is_bool_indexer(key):
    # type: (Any) -> bool
    """
    Check whether `key` is a valid boolean indexer.

    Parameters
    ----------
    key : Any
        Only list-likes may be considered boolean indexers.
        All other types are not considered a boolean indexer.
        For array-like input, boolean ndarrays or ExtensionArrays
        with ``_is_boolean`` set are considered boolean indexers.
        A list is a boolean indexer when ``np.asarray`` turns it into a
        bool-dtype array of the same length; a list that NumPy cannot
        convert is not a boolean indexer.

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        When the array is an object-dtype ndarray or ExtensionArray
        and contains missing values.
    """
    na_msg = 'cannot index with vector containing NA / NaN values'
    if (isinstance(key, (ABCSeries, np.ndarray, ABCIndex)) or
            (is_array_like(key) and is_extension_array_dtype(key.dtype))):
        if key.dtype == np.object_:
            key = np.asarray(values_from_object(key))

            if not lib.is_bool_array(key):
                if isna(key).any():
                    raise ValueError(na_msg)
                return False
            return True
        elif is_bool_dtype(key.dtype):
            # an ndarray with bool-dtype by definition has no missing values.
            # So we only need to check for NAs in ExtensionArrays
            if is_extension_array_dtype(key.dtype):
                if np.any(key.isna()):
                    raise ValueError(na_msg)
            return True
    elif isinstance(key, list):
        try:
            arr = np.asarray(key)
        except (TypeError, ValueError):
            # Recent NumPy raises ValueError (not TypeError) for ragged
            # nested lists; such a list can never be a boolean mask.
            return False
        return arr.dtype == np.bool_ and len(arr) == len(key)

    return False
def test_string_array(nullable_string_dtype, any_string_method, request):
    # Run each .str method on an object Series and on a nullable-string
    # Series built from the same values, then compare after normalising
    # the nullable result dtype back to what the object path produces.
    method_name, args, kwargs = any_string_method
    if method_name == "decode":
        pytest.skip("decode requires bytes.")

    if nullable_string_dtype == "arrow_string" and method_name in (
        "extract",
        "extractall",
    ):
        request.node.add_marker(
            pytest.mark.xfail(
                reason="extract/extractall does not yet dispatch to array"
            )
        )

    values = ["a", "bb", np.nan, "ccc"]
    object_ser = Series(values, dtype=object)
    string_ser = Series(values, dtype=nullable_string_dtype)

    expected = getattr(object_ser.str, method_name)(*args, **kwargs)
    result = getattr(string_ser.str, method_name)(*args, **kwargs)

    if isinstance(expected, Series):
        if expected.dtype == "object":
            if lib.is_string_array(expected.dropna().values):
                # string output keeps the nullable string dtype
                assert result.dtype == nullable_string_dtype
                result = result.astype(object)
            elif lib.is_bool_array(expected.values, skipna=True):
                # bools with missing values come back as nullable boolean
                assert result.dtype == "boolean"
                result = result.astype(object)
        elif expected.dtype == "bool":
            assert result.dtype == "boolean"
            result = result.astype("bool")
        elif expected.dtype == "float" and expected.isna().any():
            # integer output with missing values comes back as Int64
            assert result.dtype == "Int64"
            result = result.astype("float")
    elif isinstance(expected, DataFrame):
        object_cols = expected.select_dtypes(include="object").columns
        assert all(result[object_cols].dtypes == nullable_string_dtype)
        result[object_cols] = result[object_cols].astype(object)

    tm.assert_equal(result, expected)
def is_bool_indexer(key):
    """
    Check whether `key` is a valid boolean indexer.

    Parameters
    ----------
    key : Any
        A Series or ndarray qualifies when it is bool-dtype, or
        object-dtype with all-bool values. A list qualifies when
        ``np.asarray`` turns it into a bool-dtype array of the same
        length. Anything else is not a boolean indexer.

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        When an object-dtype Series/ndarray is not all-bool and contains
        missing values.
    """
    if isinstance(key, (ABCSeries, np.ndarray)):
        if key.dtype == np.object_:
            key = np.asarray(_values_from_object(key))

            if not lib.is_bool_array(key):
                if isnull(key).any():
                    raise ValueError('cannot index with vector containing '
                                     'NA / NaN values')
                return False
            return True
        elif key.dtype == np.bool_:
            return True
    elif isinstance(key, list):
        try:
            arr = np.asarray(key)
        except (TypeError, ValueError):
            # Recent NumPy raises ValueError (not TypeError) for ragged
            # nested lists; such a list can never be a boolean mask.
            return False
        return arr.dtype == np.bool_ and len(arr) == len(key)

    return False
def is_bool_indexer(key):
    """
    Check whether `key` is a valid boolean indexer.

    Parameters
    ----------
    key : Any
        A Series or ndarray qualifies when it is bool-dtype, or
        object-dtype with all-bool values. A list qualifies when
        ``np.asarray`` turns it into a bool-dtype array of the same
        length. Anything else is not a boolean indexer.

    Returns
    -------
    bool

    Raises
    ------
    ValueError
        When an object-dtype Series/ndarray is not all-bool and contains
        missing values.
    """
    if isinstance(key, (ABCSeries, np.ndarray)):
        if key.dtype == np.object_:
            key = np.asarray(_values_from_object(key))

            if not lib.is_bool_array(key):
                if isna(key).any():
                    raise ValueError('cannot index with vector containing '
                                     'NA / NaN values')
                return False
            return True
        elif key.dtype == np.bool_:
            return True
    elif isinstance(key, list):
        try:
            arr = np.asarray(key)
        except (TypeError, ValueError):
            # Recent NumPy raises ValueError (not TypeError) for ragged
            # nested lists; such a list can never be a boolean mask.
            return False
        return arr.dtype == np.bool_ and len(arr) == len(key)

    return False
def test_string_array(nullable_string_dtype, any_string_method):
    # Run each .str method on an object Series and on a nullable-string
    # Series built from the same values, then compare after normalising
    # the nullable result dtype back to what the object path produces.
    method_name, args, kwargs = any_string_method

    values = ["a", "bb", np.nan, "ccc"]
    object_ser = Series(values, dtype=object)
    string_ser = Series(values, dtype=nullable_string_dtype)

    if method_name == "decode":
        # decode only makes sense on bytes, so the string Series must reject it
        with pytest.raises(TypeError, match="a bytes-like object is required"):
            getattr(string_ser.str, method_name)(*args, **kwargs)
        return

    expected = getattr(object_ser.str, method_name)(*args, **kwargs)
    result = getattr(string_ser.str, method_name)(*args, **kwargs)

    if isinstance(expected, Series):
        if expected.dtype == "object":
            if lib.is_string_array(expected.dropna().values):
                # string output keeps the nullable string dtype
                assert result.dtype == nullable_string_dtype
                result = result.astype(object)
            elif lib.is_bool_array(expected.values, skipna=True):
                # bools with missing values come back as nullable boolean
                assert result.dtype == "boolean"
                result = result.astype(object)
        elif expected.dtype == "bool":
            assert result.dtype == "boolean"
            result = result.astype("bool")
        elif expected.dtype == "float" and expected.isna().any():
            # integer output with missing values comes back as Int64
            assert result.dtype == "Int64"
            result = result.astype("float")
    elif isinstance(expected, DataFrame):
        object_cols = expected.select_dtypes(include="object").columns
        assert all(result[object_cols].dtypes == nullable_string_dtype)
        result[object_cols] = result[object_cols].astype(object)

    tm.assert_equal(result, expected)
def test_string_array(any_string_method):
    # Run each .str method on an object Series and on a "string"-dtype
    # Series built from the same values, then compare after normalising
    # the nullable result dtype back to what the object path produces.
    method_name, args, kwargs = any_string_method
    if method_name == "decode":
        pytest.skip("decode requires bytes.")

    values = ["a", "bb", np.nan, "ccc"]
    object_ser = Series(values, dtype=object)
    string_ser = Series(values, dtype="string")

    expected = getattr(object_ser.str, method_name)(*args, **kwargs)
    result = getattr(string_ser.str, method_name)(*args, **kwargs)

    if isinstance(expected, Series):
        if expected.dtype == "object":
            if lib.is_string_array(expected.dropna().values):
                # string output keeps the "string" dtype
                assert result.dtype == "string"
                result = result.astype(object)
            elif lib.is_bool_array(expected.values, skipna=True):
                # bools with missing values come back as nullable boolean
                assert result.dtype == "boolean"
                result = result.astype(object)
        elif expected.dtype == "bool":
            assert result.dtype == "boolean"
            result = result.astype("bool")
        elif expected.dtype == "float" and expected.isna().any():
            # integer output with missing values comes back as Int64
            assert result.dtype == "Int64"
            result = result.astype("float")
    elif isinstance(expected, DataFrame):
        object_cols = expected.select_dtypes(include="object").columns
        assert all(result[object_cols].dtypes == "string")
        result[object_cols] = result[object_cols].astype(object)

    tm.assert_equal(result, expected)
def is_inferred_bool_dtype(arr: ArrayLike) -> bool:
    """
    Report whether ``arr`` is a bool ndarray or an object ndarray of bools.

    Parameters
    ----------
    arr : np.ndarray or ExtensionArray

    Returns
    -------
    bool
        Always False for anything that is not an ``np.ndarray``.

    Notes
    -----
    This does not include the special treatment is_bool_dtype uses for
    Categorical.

    A FutureWarning is emitted when an object-dtype array is found to be
    all-bool.
    """
    if isinstance(arr, np.ndarray):
        arr_dtype = arr.dtype
        if arr_dtype == np.dtype(bool):
            return True
        if arr_dtype == np.dtype("object"):
            all_bools = lib.is_bool_array(arr)
            if all_bools:
                # GH#46188
                warnings.warn(
                    "In a future version, object-dtype columns with all-bool values "
                    "will not be included in reductions with bool_only=True. "
                    "Explicitly cast to bool dtype instead.",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            return all_bools
    return False
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Map a 1d array to an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    Raises
    ------
    TypeError
        If `vals` has no ``dtype`` attribute.
    """
    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are handled before the complex check so that numpy is
    # never asked whether a categorical dtype is a subdtype of complex.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything below works on 64-bit values, so a 128-bit complex is
    # split into its two halves, which are hashed separately and combined.
    if np.issubdtype(vals.dtype, np.complex128):
        real_hash = hash_array(vals.real)
        imag_hash = hash_array(vals.imag)
        return real_hash + 23 * imag_hash

    # Turn the input into unsigned 64-bit ints where that is possible.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif is_datetime64_dtype(vals) or is_timedelta64_dtype(vals):
        vals = vals.view('i8').astype('u8', copy=False)
    elif is_numeric_dtype(vals) and vals.dtype.itemsize <= 8:
        same_width_uint = 'u{}'.format(vals.dtype.itemsize)
        vals = vals.view(same_width_uint).astype('u8')
    elif categorize:
        # With repeated values it is much faster to categorize first and
        # hash the categories; callers can skip this when the values are
        # known/likely to be unique.
        labels, uniques = factorize(vals, sort=False)
        categorical = Categorical(labels, Index(uniques),
                                  ordered=False, fastpath=True)
        return _hash_categorical(categorical, encoding, hash_key)
    else:
        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            stringified = vals.astype(str).astype(object)
            vals = _hash.hash_object_array(stringified, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Map a 1d array to an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    Raises
    ------
    TypeError
        If `vals` has no ``dtype`` attribute.
    """
    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # Categoricals are handled before the complex check so that numpy is
    # never asked whether a categorical dtype is a subdtype of complex.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # Everything below works on 64-bit values, so a 128-bit complex is
    # split into its two halves, which are hashed separately and combined.
    if np.issubdtype(vals.dtype, np.complex128):
        real_hash = hash_array(vals.real)
        imag_hash = hash_array(vals.imag)
        return real_hash + 23 * imag_hash

    # Turn the input into unsigned 64-bit ints where that is possible.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif is_datetime64_dtype(vals) or is_timedelta64_dtype(vals):
        vals = vals.view('i8').astype('u8', copy=False)
    elif is_numeric_dtype(vals) and vals.dtype.itemsize <= 8:
        same_width_uint = 'u{}'.format(vals.dtype.itemsize)
        vals = vals.view(same_width_uint).astype('u8')
    elif categorize:
        # With repeated values it is much faster to categorize first and
        # hash the categories; callers can skip this when the values are
        # known/likely to be unique.
        labels, uniques = factorize(vals, sort=False)
        categorical = Categorical(labels, Index(uniques),
                                  ordered=False, fastpath=True)
        return _hash_categorical(categorical, encoding, hash_key)
    else:
        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            stringified = vals.astype(str).astype(object)
            vals = _hash.hash_object_array(stringified, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals