Пример #1
0
    def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        # For categoricals, we hash the categories, then remap the codes to the
        # hash values. (This check is above the complex check so that we don't
        # ask numpy if categorical is a subdtype of complex, as it will choke.
        if is_categorical_dtype(vals.dtype):
            return _hash_categorical(vals, encoding, hash_key)

        # we'll be working with everything as 64-bit values, so handle this
        # 128-bit value early
        if np.issubdtype(vals.dtype, np.complex128):
            return hash_array(vals.real) + 23 * hash_array(vals.imag)

        # First, turn whatever array this is into unsigned 64-bit ints, if we
        # can manage it.
        if is_bool_array(vals):
            vals = vals.astype('u8')
        elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)
               or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
            vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
        else:
            # With repeated values, its MUCH faster to categorize object
            # dtypes, then hash and rename categories. We allow skipping the
            # categorization when the values are known/likely to be unique.
            if categorize:
                codes, categories = pd.factorize(vals, sort=False)
                cat = pd.Categorical(codes,
                                     pd.Index(categories),
                                     ordered=False,
                                     fastpath=True)
                return _hash_categorical(cat, encoding, hash_key)

            vals = hash_object_array(vals, hash_key, encoding)

        # Then, redistribute these 64-bit ints within the space of 64-bit ints
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals
Пример #2
0
    def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
        if hash_key is None:
            hash_key = _default_hash_key

        # For categoricals, we hash the categories, then remap the codes to the
        # hash values. (This check is above the complex check so that we don't
        # ask numpy if categorical is a subdtype of complex, as it will choke.
        if is_categorical_dtype(vals.dtype):
            return _hash_categorical(vals, encoding, hash_key)

        # we'll be working with everything as 64-bit values, so handle this
        # 128-bit value early
        if np.issubdtype(vals.dtype, np.complex128):
            return hash_array(vals.real) + 23 * hash_array(vals.imag)

        # First, turn whatever array this is into unsigned 64-bit ints, if we
        # can manage it.
        if is_bool_array(vals):
            vals = vals.astype('u8')
        elif ((is_datetime64_dtype(vals) or
               is_timedelta64_dtype(vals) or
               is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
            vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
        else:
            # With repeated values, its MUCH faster to categorize object
            # dtypes, then hash and rename categories. We allow skipping the
            # categorization when the values are known/likely to be unique.
            if categorize:
                codes, categories = pd.factorize(vals, sort=False)
                cat = pd.Categorical(codes, pd.Index(categories),
                                     ordered=False, fastpath=True)
                return _hash_categorical(cat, encoding, hash_key)

            vals = hash_object_array(vals, hash_key, encoding)

        # Then, redistribute these 64-bit ints within the space of 64-bit ints
        vals ^= vals >> 30
        vals *= np.uint64(0xbf58476d1ce4e5b9)
        vals ^= vals >> 27
        vals *= np.uint64(0x94d049bb133111eb)
        vals ^= vals >> 31
        return vals
Пример #3
0
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif ((is_datetime64_dtype(vals) or is_timedelta64_dtype(vals)
           or is_numeric_dtype(vals)) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes,
                              Index(categories),
                              ordered=False,
                              fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        vals = _hash.hash_object_array(vals, hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
Пример #4
0
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    # work with cagegoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.
    if hash_key is None:
        hash_key = _default_hash_key

    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64) or
       np.issubdtype(vals.dtype, np.timedelta64) or
       np.issubdtype(vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # its MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories,
                               ordered=False, fastpath=True))
        vals = _hash.hash_object_array(categories.values,
                                       hash_key,
                                       encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
Пример #5
0
def hash_array(vals, encoding='utf8', hash_key=None):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    # work with cagegoricals as ints. (This check is above the complex
    # check so that we don't ask numpy if categorical is a subdtype of
    # complex, as it will choke.
    if hash_key is None:
        hash_key = _default_hash_key

    if is_categorical_dtype(vals.dtype):
        vals = vals.codes

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # MAIN LOGIC:
    inferred = infer_dtype(vals)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if inferred == 'boolean':
        vals = vals.astype('u8')

    if (np.issubdtype(vals.dtype, np.datetime64)
            or np.issubdtype(vals.dtype, np.timedelta64) or np.issubdtype(
                vals.dtype, np.number)) and vals.dtype.itemsize <= 8:

        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:

        # its MUCH faster to categorize object dtypes, then hash and rename
        codes, categories = factorize(vals, sort=False)
        categories = Index(categories)
        c = Series(Categorical(codes, categories, ordered=False,
                               fastpath=True))
        vals = _hash.hash_object_array(categories.values, hash_key, encoding)

        # rename & extract
        vals = c.cat.rename_categories(Index(vals)).astype(np.uint64).values

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals
Пример #6
0
def hash_array(vals, encoding='utf8', hash_key=None, categorize=True):
    """
    Given a 1d array, return an array of deterministic integers.

    .. versionadded:: 0.19.2

    Parameters
    ----------
    vals : ndarray, Categorical
    encoding : string, default 'utf8'
        encoding for data & key when strings
    hash_key : string key to encode, default to _default_hash_key
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

        .. versionadded:: 0.20.0

    Returns
    -------
    1d uint64 numpy array of hash values, same length as the vals

    """

    if not hasattr(vals, 'dtype'):
        raise TypeError("must pass a ndarray-like")

    if hash_key is None:
        hash_key = _default_hash_key

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke.
    if is_categorical_dtype(vals.dtype):
        return _hash_categorical(vals, encoding, hash_key)

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(vals.dtype, np.complex128):
        return hash_array(vals.real) + 23 * hash_array(vals.imag)

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    if is_bool_array(vals):
        vals = vals.astype('u8')
    elif (is_datetime64_dtype(vals) or
          is_timedelta64_dtype(vals)):
        vals = vals.view('i8').astype('u8', copy=False)
    elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8):
        vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
    else:
        # With repeated values, its MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories),
                              ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = _hash.hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = _hash.hash_object_array(vals.astype(str).astype(object),
                                           hash_key, encoding)

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xbf58476d1ce4e5b9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94d049bb133111eb)
    vals ^= vals >> 31
    return vals