def _hash_categories(categories, ordered=True):
    """
    Reduce a collection of categories to a single hash value.

    Parameters
    ----------
    categories : Index-like
        The category values. Must support ``len``, positional indexing and
        iteration; unless the first element is a tuple it must also expose
        a ``dtype``.
    ordered : bool, default True
        When true, the position of every category is stacked next to its
        hash before combining, so position contributes to the result.

    Returns
    -------
    int or NumPy integer scalar
        ``hash((tuple(categories), ordered))`` for object-dtype categories
        whose elements are not all of one Python type; otherwise the
        XOR-reduction of the combined per-category hashes.
    """
    from pandas.core.dtypes.dtypes import DatetimeTZDtype
    from pandas.core.util import hashing

    # The combiner is exposed as ``combine_hash_arrays`` in some pandas
    # releases and only under the private ``_combine_hash_arrays`` name in
    # others; accept either so the function works across versions.
    combine_hash_arrays = getattr(hashing, "combine_hash_arrays", None)
    if combine_hash_arrays is None:
        combine_hash_arrays = hashing._combine_hash_arrays

    if len(categories) and isinstance(categories[0], tuple):
        # Assumes that if the first category is a tuple then all of them
        # are; a mix of tuple and non-tuple categories is not supported.
        categories = list(categories)  # breaks if a np.array of categories
        cat_array = hashing.hash_tuples(categories)
    else:
        if categories.dtype == 'O' and len({type(x) for x in categories}) != 1:
            # TODO: hash_array doesn't handle mixed types. It casts
            # everything to a str first, which means we treat
            # {'1', '2'} the same as {'1', 2}
            # find a better solution
            return hash((tuple(categories), ordered))

        if isinstance(categories.dtype, DatetimeTZDtype):
            # Hash the underlying UTC instants. ``tz_convert(None)`` converts
            # to UTC and drops the timezone, which avoids casting tz-aware
            # data with ``astype`` (rejected by newer pandas).
            categories = categories.tz_convert(None)

        cat_array = hashing.hash_array(np.asarray(categories), categorize=False)

    if ordered:
        # Second row holds each category's position.
        cat_array = np.vstack(
            [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
        )
    else:
        # Single "row": only the value hashes are combined.
        cat_array = [cat_array]

    hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
    return np.bitwise_xor.reduce(hashed)
def _hash_categories(categories, ordered=True):
    """
    Reduce a collection of categories to a single hash value.

    Parameters
    ----------
    categories : Index-like
        The category values. Must support ``len``, positional indexing and
        iteration; unless the first element is a tuple it must also expose
        a ``dtype``.
    ordered : bool, default True
        When true, the position of every category is stacked next to its
        hash before combining, so position contributes to the result.

    Returns
    -------
    int or NumPy integer scalar
        ``hash((tuple(categories), ordered))`` for object-dtype categories
        whose elements are not all of one Python type; otherwise the
        XOR-reduction of the combined per-category hashes.
    """
    from pandas.core.util import hashing

    # The combiner is exposed as ``combine_hash_arrays`` in some pandas
    # releases and only under the private ``_combine_hash_arrays`` name in
    # others; accept either so the function works across versions.
    combine_hash_arrays = getattr(hashing, "combine_hash_arrays", None)
    if combine_hash_arrays is None:
        combine_hash_arrays = hashing._combine_hash_arrays

    if len(categories) and isinstance(categories[0], tuple):
        # Assumes that if the first category is a tuple then all of them
        # are; a mix of tuple and non-tuple categories is not supported.
        categories = list(categories)  # breaks if a np.array of categories
        cat_array = hashing.hash_tuples(categories)
    else:
        if categories.dtype == 'O' and len({type(x) for x in categories}) != 1:
            # TODO: hash_array doesn't handle mixed types. It casts
            # everything to a str first, which means we treat
            # {'1', '2'} the same as {'1', 2}
            # find a better solution
            return hash((tuple(categories), ordered))

        cat_array = hashing.hash_array(np.asarray(categories), categorize=False)

    if ordered:
        # Second row holds each category's position.
        cat_array = np.vstack(
            [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
        )
    else:
        # Single "row": only the value hashes are combined.
        cat_array = [cat_array]

    hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
    return np.bitwise_xor.reduce(hashed)
def _hash_categories(categories, ordered=True):
    """
    Reduce a collection of categories to a single hash value.

    Parameters
    ----------
    categories : Index-like
        The category values. Must support ``len``, positional indexing and
        iteration; unless the first element is a tuple it must also expose
        a ``dtype``.
    ordered : bool, default True
        When true, the position of every category is stacked next to its
        hash before combining, so position contributes to the result.

    Returns
    -------
    int or NumPy integer scalar
        ``hash((tuple(categories), ordered))`` for object-dtype categories
        whose elements are not all of one Python type; ``0`` when there is
        nothing to combine; otherwise the XOR-reduction of the combined
        per-category hashes.
    """
    from pandas.core.util import hashing

    # The combiner is exposed as ``combine_hash_arrays`` in some pandas
    # releases and only under the private ``_combine_hash_arrays`` name in
    # others; accept either so the function works across versions.
    combine_hash_arrays = getattr(hashing, "combine_hash_arrays", None)
    if combine_hash_arrays is None:
        combine_hash_arrays = hashing._combine_hash_arrays

    if len(categories) and isinstance(categories[0], tuple):
        # Assumes that if the first category is a tuple then all of them
        # are; a mix of tuple and non-tuple categories is not supported.
        categories = list(categories)  # breaks if a np.array of categories
        cat_array = hashing.hash_tuples(categories)
    else:
        if categories.dtype == 'O' and len({type(x) for x in categories}) != 1:
            # TODO: hash_array doesn't handle mixed types. It casts
            # everything to a str first, which means we treat
            # {'1', '2'} the same as {'1', 2}
            # find a better solution
            return hash((tuple(categories), ordered))

        cat_array = hashing.hash_array(np.asarray(categories), categorize=False)

    if ordered:
        # Second row holds each category's position.
        cat_array = np.vstack(
            [cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)]
        )
    else:
        # Single "row": only the value hashes are combined.
        cat_array = [cat_array]

    hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array))
    if len(hashed) == 0:
        # bug in Numpy<1.12 for length 0 arrays. Just return the correct
        # value of 0
        return 0
    return np.bitwise_xor.reduce(hashed)
def _hash_dataframe_rows_no_categorize(df):
    """
    Compute one hash per row of ``df``.

    Each column's values are hashed with ``hash_array(..., categorize=False)``
    and the per-column results are merged with ``combine_hash_arrays``.

    Per the original author's note, this is adapted from pandas'
    ``hash_pandas_object``: hashing the whole frame through that function was
    too slow because it uses ``categorize=True`` for each series.
    """

    def _column_hashes():
        # Lazily hash one column at a time; labels are not part of the hash.
        for _label, column in df.items():
            yield hash_array(column._values, categorize=False)

    per_column = _column_hashes()
    column_count = len(df.columns)
    return combine_hash_arrays(per_column, column_count)
def _hash_categories(self) -> int:
    """
    Reduce ``self.categories`` (together with ``self.ordered``) to one hash.

    Returns
    -------
    int
        ``hash((tuple(categories), ordered))`` for object-dtype categories
        whose elements are not all of one Python type; otherwise the
        XOR-reduction of the combined per-category hashes.
    """
    from pandas.core.util.hashing import (
        combine_hash_arrays,
        hash_array,
        hash_tuples,
    )

    cats = self.categories
    is_ordered = self.ordered

    if len(cats) and isinstance(cats[0], tuple):
        # Only the first element is inspected: if it is a tuple, all
        # categories are assumed to be tuples (mixtures are unsupported).
        # Materialise as a list; this breaks for a np.array of categories.
        value_hashes = hash_tuples(list(cats))
    else:
        if cats.dtype == "O" and len({type(item) for item in cats}) != 1:
            # TODO: hash_array doesn't handle mixed types. It casts
            # everything to a str first, which means we treat
            # {'1', '2'} the same as {'1', 2}
            # find a better solution
            return hash((tuple(cats), is_ordered))

        if DatetimeTZDtype.is_dtype(cats.dtype):
            # Avoid future warning.
            cats = cats.view("datetime64[ns]")

        value_hashes = hash_array(np.asarray(cats), categorize=False)

    if is_ordered:
        # Pair each value hash with its position so position is hashed too.
        positions = np.arange(len(value_hashes), dtype=value_hashes.dtype)
        layers = np.vstack([value_hashes, positions])
    else:
        layers = [value_hashes]

    combined = combine_hash_arrays(iter(layers), num_items=len(layers))
    return np.bitwise_xor.reduce(combined)