def __new__(cls, data, sparse_index=None, index=None, kind='integer',
            fill_value=None, dtype=np.float64, copy=False):

    if index is not None:
        if data is None:
            data = np.nan
        if not lib.isscalar(data):
            raise Exception("must only pass scalars with an index")
        values = np.empty(len(index), dtype='float64')
        values.fill(data)
        data = values

    if dtype is not None:
        dtype = np.dtype(dtype)
    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        sparse_index = data.sp_index
        values = np.asarray(data)
    else:
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = _sanitize_values(data)
            if len(values) != sparse_index.npoints:
                raise AssertionError("Non array-like type {0} must have"
                                     " the same length as the"
                                     " index".format(type(values)))

    # Create array, do *not* copy data by default
    if copy:
        try:
            # ToDo: Can remove this error handling when we actually
            # support other dtypes
            subarr = np.array(values, dtype=dtype, copy=True)
        except ValueError:
            subarr = np.array(values, copy=True)
    else:
        try:
            subarr = np.asarray(values, dtype=dtype)
        except ValueError:
            subarr = np.asarray(values)

    # if we have a bool type, make sure that we have a bool fill_value
    if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
            (data is not None and lib.is_bool_array(subarr))):
        if np.isnan(fill_value) or not fill_value:
            fill_value = False
        else:
            fill_value = bool(fill_value)

    # Change the class of the array to be the subclass type.
    return cls._simple_new(subarr, sparse_index, fill_value)
def __new__(cls, data, sparse_index=None, index=None, kind='integer',
            fill_value=None, dtype=np.float64, copy=False):

    if index is not None:
        if data is None:
            data = np.nan
        if not is_scalar(data):
            raise Exception("must only pass scalars with an index")
        values = np.empty(len(index), dtype='float64')
        values.fill(data)
        data = values

    if dtype is not None:
        dtype = np.dtype(dtype)
    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        sparse_index = data.sp_index
        values = np.asarray(data)
    else:
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = _sanitize_values(data)
            if len(values) != sparse_index.npoints:
                raise AssertionError("Non array-like type {0} must have"
                                     " the same length as the"
                                     " index".format(type(values)))

    # Create array, do *not* copy data by default
    if copy:
        try:
            # ToDo: Can remove this error handling when we actually
            # support other dtypes
            subarr = np.array(values, dtype=dtype, copy=True)
        except ValueError:
            subarr = np.array(values, copy=True)
    else:
        try:
            subarr = np.asarray(values, dtype=dtype)
        except ValueError:
            subarr = np.asarray(values)

    # if we have a bool type, make sure that we have a bool fill_value
    if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
            (data is not None and lib.is_bool_array(subarr))):
        if np.isnan(fill_value) or not fill_value:
            fill_value = False
        else:
            fill_value = bool(fill_value)

    # Change the class of the array to be the subclass type.
    return cls._simple_new(subarr, sparse_index, fill_value)
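Both constructors above reduce to the same contract: dense array-like input is split into the stored values plus a sparse index, and a scalar passed together with index= is broadcast to len(index) before sparsifying. A minimal usage sketch, assuming an older pandas (roughly the 0.19/0.20 era) where SparseArray is still the ndarray-backed class these constructors belong to; the index= keyword is gone from modern pandas:

import numpy as np
import pandas as pd

# Dense input: non-fill values land in sp_values, their positions in sp_index.
arr = pd.SparseArray([1.0, np.nan, 2.0], fill_value=np.nan)
print(arr.sp_values)   # the two non-NaN values
print(arr.fill_value)  # nan

# Scalar data with index=: the scalar is broadcast to len(index) first.
arr2 = pd.SparseArray(0.0, index=pd.Index(['a', 'b', 'c']))
print(len(arr2))       # 3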
def _is_bool_indexer(key):
    if isinstance(key, np.ndarray) and key.dtype == np.object_:
        if not lib.is_bool_array(key):
            if isnull(key).any():
                raise ValueError("cannot index with vector containing "
                                 "NA / NaN values")
            return False
        return True
    elif isinstance(key, np.ndarray) and key.dtype == np.bool_:
        return True
    elif isinstance(key, list):
        try:
            return np.asarray(key).dtype == np.bool_
        except TypeError:  # pragma: no cover
            return False

    return False
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't
    # ask numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we
    # can manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or
           is_timedelta64_dtype(vals) or
           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object
        # dtypes, then hash and rename categories. We allow skipping the
        # categorization when the values are known/likely to be unique.
        if categorize:
            codes, categories = pd.factorize(vals, sort=False)
            cat = pd.Categorical(codes, pd.Index(categories),
                                 ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        vals = hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
def is_bool_indexer(key):
    if isinstance(key, (ABCSeries, np.ndarray)):
        if key.dtype == np.object_:
            key = np.asarray(_values_from_object(key))

            if not lib.is_bool_array(key):
                if isnull(key).any():
                    raise ValueError('cannot index with vector containing '
                                     'NA / NaN values')
                return False
            return True
        elif key.dtype == np.bool_:
            return True
    elif isinstance(key, list):
        try:
            arr = np.asarray(key)
            return arr.dtype == np.bool_ and len(arr) == len(key)
        except TypeError:  # pragma: no cover
            return False

    return False
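The variants of this predicate share one contract: bool-dtype arrays, all-bool object arrays, and all-bool lists count as boolean indexers; object arrays that mix booleans with NA raise; everything else is rejected. A short demonstration; is_bool_indexer is internal pandas API, so the pandas.core.common import path below is an assumption that has moved between versions:

import numpy as np
from pandas.core.common import is_bool_indexer

print(is_bool_indexer(np.array([True, False])))                # True
print(is_bool_indexer(np.array([True, False], dtype=object)))  # True
print(is_bool_indexer([True, False, True]))                    # True
print(is_bool_indexer(np.array([1, 0])))                       # False
# Mixing booleans with NA in an object array raises instead:
# is_bool_indexer(np.array([True, np.nan], dtype=object))  # ValueError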
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is
        more efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or
           is_timedelta64_dtype(vals) or
           is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the
        # categorization when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        vals = _hash.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
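The five lines after the dtype dispatch are a self-contained integer finalizer: the same xorshift-multiply constants appear in SplitMix64's avalanche step, which spreads nearby inputs across the full 64-bit space so that similar values do not produce similar hashes. A standalone scalar sketch of just that step (the _mix64 name is made up for illustration):

import numpy as np

def _mix64(value):
    # Work in a 1-element uint64 array so the multiplies wrap mod 2**64
    # silently (numpy warns on overflowing *scalar* arithmetic).
    x = np.array([value], dtype='u8')
    x ^= x >> 30
    x *= np.uint64(0xbf58476d1ce4e5b9)
    x ^= x >> 27
    x *= np.uint64(0x94d049bb133111eb)
    x ^= x >> 31
    return int(x[0])

# Nearby inputs avalanche to unrelated outputs:
print(hex(_mix64(1)), hex(_mix64(2)))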
def __new__(cls, data, sparse_index=None, index=None, kind='integer',
            fill_value=None, dtype=np.float64, copy=False):

    if index is not None:
        if data is None:
            data = np.nan
        if not np.isscalar(data):
            raise Exception("must only pass scalars with an index")
        values = np.empty(len(index), dtype='float64')
        values.fill(data)
        data = values

    if dtype is not None:
        dtype = np.dtype(dtype)
    is_sparse_array = isinstance(data, SparseArray)
    if fill_value is None:
        if is_sparse_array:
            fill_value = data.fill_value
        else:
            fill_value = nan

    if is_sparse_array:
        sparse_index = data.sp_index
        values = np.asarray(data)
    else:
        # array-like
        if sparse_index is None:
            values, sparse_index = make_sparse(data, kind=kind,
                                               fill_value=fill_value)
        else:
            values = data
            if len(values) != sparse_index.npoints:
                raise AssertionError("Non array-like type {0} must have"
                                     " the same length as the"
                                     " index".format(type(values)))

    # Create array, do *not* copy data by default
    if copy:
        subarr = np.array(values, dtype=dtype, copy=True)
    else:
        subarr = np.asarray(values, dtype=dtype)

    # if we have a bool type, make sure that we have a bool fill_value
    if ((dtype is not None and issubclass(dtype.type, np.bool_)) or
            (data is not None and lib.is_bool_array(subarr))):
        if np.isnan(fill_value) or not fill_value:
            fill_value = False
        else:
            fill_value = bool(fill_value)

    # Change the class of the array to be the subclass type.
    output = subarr.view(cls)
    output.sp_index = sparse_index
    output.fill_value = fill_value
    return output
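Unlike the _simple_new versions above, this variant finishes with numpy's view-casting idiom: the freshly built buffer is reinterpreted as the subclass and the sparse metadata is attached as instance attributes. A minimal standalone sketch of that pattern (the TaggedArray class is hypothetical):

import numpy as np

class TaggedArray(np.ndarray):
    """Hypothetical stand-in for an ndarray subclass like SparseArray."""

sparse_values = np.array([1.0, 2.0])
output = sparse_values.view(TaggedArray)  # reinterpret, no copy
output.fill_value = np.nan                # metadata lives on the instance
print(type(output).__name__)              # TaggedArray
print(output.base is sparse_values)       # True: shares the same buffer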
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is
        more efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals
    """

    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif is_datetime64_dtype(vals) or is_timedelta64_dtype(vals):
        vals = vals.view('i8').astype('u8', copy=False)
    elif is_numeric_dtype(vals) and vals.dtype.itemsize <= 8:
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the
        # categorization when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = _hash.hash_object_array(vals.astype(str).astype(object),
                                           hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
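This final variant also survives mixed-type object arrays by retrying the hash after a string cast, and validates its input up front. The routine is reachable through public API; a minimal sketch, assuming pandas >= 0.20 where it is exposed as pandas.util.hash_array:

import numpy as np
import pandas as pd

vals = np.array(['a', 'b', 'a'], dtype=object)
hashed = pd.util.hash_array(vals)
print(hashed.dtype)            # uint64
print(hashed[0] == hashed[2])  # True: equal values hash equally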