예제 #1
0
    def _hash_categories(categories, ordered=True):
        from pandas.core.util.hashing import (hash_array, _combine_hash_arrays,
                                              hash_tuples)
        from pandas.core.dtypes.common import is_datetime64tz_dtype, _NS_DTYPE

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            categories = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(categories)
        else:
            if categories.dtype == 'O':
                if len({type(x) for x in categories}) != 1:
                    # TODO: hash_array doesn't handle mixed types. It casts
                    # everything to a str first, which means we treat
                    # {'1', '2'} the same as {'1', 2}
                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed

            if is_datetime64tz_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.astype(_NS_DTYPE)

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack(
                [cat_array,
                 np.arange(len(cat_array), dtype=cat_array.dtype)])
        else:
            cat_array = [cat_array]
        hashed = _combine_hash_arrays(iter(cat_array),
                                      num_items=len(cat_array))
        return np.bitwise_xor.reduce(hashed)
예제 #2
0
    def _hash_categories(categories, ordered=True):
        from pandas.core.util.hashing import (
            hash_array, _combine_hash_arrays, hash_tuples
        )

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            categories = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(categories)
        else:
            if categories.dtype == 'O':
                types = [type(x) for x in categories]
                if not len(set(types)) == 1:
                    # TODO: hash_array doesn't handle mixed types. It casts
                    # everything to a str first, which means we treat
                    # {'1', '2'} the same as {'1', 2}
                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed
            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack([
                cat_array, np.arange(len(cat_array), dtype=cat_array.dtype)
            ])
        else:
            cat_array = [cat_array]
        hashed = _combine_hash_arrays(iter(cat_array),
                                      num_items=len(cat_array))
        return np.bitwise_xor.reduce(hashed)
예제 #3
0
    def _hash_categories(categories, ordered=True):
        from pandas.core.util.hashing import (hash_array, _combine_hash_arrays,
                                              hash_tuples)

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            categories = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(categories)
        else:
            if categories.dtype == 'O':
                types = [type(x) for x in categories]
                if not len(set(types)) == 1:
                    # TODO: hash_array doesn't handle mixed types. It casts
                    # everything to a str first, which means we treat
                    # {'1', '2'} the same as {'1', 2}
                    # find a better solution
                    hashed = hash((tuple(categories), ordered))
                    return hashed
            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack(
                [cat_array,
                 np.arange(len(cat_array), dtype=cat_array.dtype)])
        else:
            cat_array = [cat_array]
        hashed = _combine_hash_arrays(iter(cat_array),
                                      num_items=len(cat_array))
        if len(hashed) == 0:
            # bug in Numpy<1.12 for length 0 arrays. Just return the correct
            # value of 0
            return 0
        else:
            return np.bitwise_xor.reduce(hashed)
def _hash_dataframe_rows_no_categorize(df):
    """ returns a sequence of hashes, for each row, as in original `hash_pandas_object` """

    # hashing whole df was too slow, as it uses categorize=True for each series --> adapted pandas code

    hashes = (hash_array(series._values, categorize=False)
              for _, series in df.items())
    num_items = len(df.columns)
    h = combine_hash_arrays(hashes, num_items)

    return h
예제 #5
0
    def _hash_categories(self) -> int:
        from pandas.core.util.hashing import (
            combine_hash_arrays,
            hash_array,
            hash_tuples,
        )

        categories = self.categories
        ordered = self.ordered

        if len(categories) and isinstance(categories[0], tuple):
            # assumes if any individual category is a tuple, then all our. ATM
            # I don't really want to support just some of the categories being
            # tuples.
            cat_list = list(categories)  # breaks if a np.array of categories
            cat_array = hash_tuples(cat_list)
        else:
            if categories.dtype == "O" and len({type(x)
                                                for x in categories}) != 1:
                # TODO: hash_array doesn't handle mixed types. It casts
                # everything to a str first, which means we treat
                # {'1', '2'} the same as {'1', 2}
                # find a better solution
                hashed = hash((tuple(categories), ordered))
                return hashed

            if DatetimeTZDtype.is_dtype(categories.dtype):
                # Avoid future warning.
                categories = categories.view("datetime64[ns]")

            cat_array = hash_array(np.asarray(categories), categorize=False)
        if ordered:
            cat_array = np.vstack(
                [cat_array,
                 np.arange(len(cat_array), dtype=cat_array.dtype)])
        else:
            # error: Incompatible types in assignment (expression has type
            # "List[ndarray]", variable has type "ndarray")
            cat_array = [cat_array]  # type: ignore[assignment]
        # error: Incompatible types in assignment (expression has type "ndarray",
        # variable has type "int")
        hashed = combine_hash_arrays(  # type: ignore[assignment]
            iter(cat_array),
            num_items=len(cat_array))
        return np.bitwise_xor.reduce(hashed)