def test_multiindex_objects(self):
    """Compare ``_hashed_values`` with ``hash_pandas_object`` for a
    MultiIndex and for its ``_sort_levels_monotonic()`` counterpart."""
    original = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
                          labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
                          names=['col1', 'col2'])
    resorted = original._sort_levels_monotonic()

    # the two indexes compare equal, directly and through their values
    assert original.equals(resorted)
    assert Index(original.values).equals(Index(resorted.values))

    # for each index, _hashed_values must agree with
    # hash_pandas_object(..., index=False)
    for candidate in (original, resorted):
        expected = hash_pandas_object(candidate, index=False).values
        result = candidate._hashed_values
        tm.assert_numpy_array_equal(result, expected)

    # across the two indexes the values should match, but in a
    # different order, hence the sort before comparing
    original_hashes = original._hashed_values
    resorted_hashes = resorted._hashed_values
    tm.assert_numpy_array_equal(np.sort(resorted_hashes),
                                np.sort(original_hashes))
def test_multiindex_objects(self):
    """Compare ``_hashed_values`` with ``hash_pandas_object`` for a
    MultiIndex and for its ``_sort_levels_monotonic()`` counterpart."""
    level_values = [['b', 'd', 'a'], [1, 2, 3]]
    level_codes = [[0, 1, 0, 2], [2, 0, 0, 1]]
    base = MultiIndex(levels=level_values, labels=level_codes,
                      names=['col1', 'col2'])
    monotonic = base._sort_levels_monotonic()

    # these are equal, both as indexes and as flat value indexes
    assert base.equals(monotonic)
    assert Index(base.values).equals(Index(monotonic.values))

    # _hashed_values and hash_pandas_object(..., index=False)
    # must be equivalent for each of the two indexes
    for idx in (base, monotonic):
        expected = hash_pandas_object(idx, index=False).values
        result = idx._hashed_values
        tm.assert_numpy_array_equal(result, expected)

    # values should match, but in different order
    base_hashes = base._hashed_values
    monotonic_hashes = monotonic._hashed_values
    tm.assert_numpy_array_equal(np.sort(monotonic_hashes),
                                np.sort(base_hashes))
def check_not_equal_with_index(self, obj):
    """Assert that hashing ``obj`` with and without its index does not give
    identical results everywhere. Nothing is checked for Index inputs."""
    if isinstance(obj, Index):
        return

    with_index = hash_pandas_object(obj, index=True)
    without_index = hash_pandas_object(obj, index=False)
    # including the index must change at least one hash
    all_same = (with_index == without_index).all()
    self.assertFalse(all_same)
def check_equal(self, obj, **kwargs):
    """Assert that hashing ``obj`` twice gives equal Series, first with the
    given keyword arguments and then again with any ``index`` keyword
    dropped."""
    without_index_kw = {key: value for key, value in kwargs.items()
                        if key != 'index'}

    for options in (kwargs, without_index_kw):
        first = hash_pandas_object(obj, **options)
        second = hash_pandas_object(obj, **options)
        tm.assert_series_equal(first, second)
def test_hash_keys(self):
    """Hashing the same data with two different hash keys should give
    different hashes for every element."""
    # this only matters for object dtypes
    data = Series(list('abc'))

    hashed_first = hash_pandas_object(data, hash_key='9876543210123456')
    hashed_second = hash_pandas_object(data, hash_key='9876543210123465')

    every_hash_differs = (hashed_first != hashed_second).all()
    self.assertTrue(every_hash_differs)
def test_mixed(self):
    """Run the shared checks on a Series mixing str and int values, then
    compare its hashes with those of the all-string equivalent."""
    mixed = Series(['1', 2, 3])
    self.check_equal(mixed)
    self.check_not_equal_with_index(mixed)

    # mixed are actually equal when stringified
    hashed_mixed = hash_pandas_object(mixed)
    all_strings = Series(list('123'))
    hashed_strings = hash_pandas_object(all_strings)
    self.assert_series_equal(hashed_mixed, hashed_strings)
def test_pandas_errors(self):
    """``hash_pandas_object`` should raise TypeError for a Timestamp and
    for the object returned by ``tm.makePanel()``."""
    stamp = pd.Timestamp('20130101')
    with pytest.raises(TypeError):
        hash_pandas_object(stamp)

    # the panel is built, and hashed, inside a warnings-recording context
    with catch_warnings(record=True):
        panel = tm.makePanel()
        with pytest.raises(TypeError):
            hash_pandas_object(panel)
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    """
    Return a uint64 Series holding one hash per element (Index / Series)
    or per row (DataFrame) of ``obj``.

    Parameters
    ----------
    obj : pd.Index, pd.Series or pd.DataFrame
        Object to hash.
    index : bool, default True
        Fold the hash of ``obj.index`` into the result. Not consulted when
        ``obj`` is itself an Index.
    encoding : str, default 'utf8'
        Forwarded unchanged to ``hash_array``.
    hash_key : str, optional
        Forwarded to ``hash_array``; ``_default_hash_key`` is used when
        None.
    categorize : bool, default True
        Forwarded unchanged to ``hash_array``.

    Returns
    -------
    pd.Series
        dtype uint64, indexed by ``obj`` itself for an Index input and by
        ``obj.index`` otherwise.

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series or DataFrame.

    Notes
    -----
    Hashes are combined left to right as ``h = h * 3 + next``. A DataFrame
    without columns hashes to zeros (or to the index hashes alone when
    ``index`` is true) instead of leaking ``StopIteration``.
    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        # in-place fold: h <- h * 3 + hashed_to_add, written into ``h``
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    def index_hashes():
        # hashes of obj.index alone, computed with the same options
        return hash_pandas_object(obj.index, index=False,
                                  encoding=encoding, hash_key=hash_key,
                                  categorize=categorize).values

    if isinstance(obj, pd.Index):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        h = pd.Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, pd.Series):
        # astype() copies, so the in-place fold below never touches the
        # array returned by hash_array
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        if index:
            h = adder(h, index_hashes())
        h = pd.Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, pd.DataFrame):
        # Start from zeros: folding the first column into an all-zero
        # accumulator gives 0 * 3 + hash == hash, which is exactly what
        # seeding with the first column's hash produces, while remaining
        # well defined for a frame that has no columns at all.
        h = np.zeros(len(obj), dtype='uint64')
        # Positional access avoids DataFrame.iteritems(), which newer
        # pandas releases no longer provide, and copes with duplicate
        # column labels.
        for position in range(obj.shape[1]):
            column = obj.iloc[:, position]
            column_hash = hash_array(column.values, encoding, hash_key,
                                     categorize)
            # column hashes are only read, so no defensive copy is needed
            h = adder(h, column_hash.astype('uint64', copy=False))
        if index:
            h = adder(h, index_hashes())
        h = pd.Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
def test_categorical_consistency(self):
    """Categoricals should hash like their values, not their codes
    (GH15143), whatever the dtype of the underlying values."""
    samples = [Series(['a', 'b', 'c', 'd']),
               Series([1000, 2000, 3000, 4000]),
               Series(pd.date_range(0, periods=4))]

    for plain in samples:
        as_category = plain.astype('category').cat.set_categories(plain)
        reversed_categories = as_category.cat.set_categories(
            list(reversed(plain)))

        for categorize in [True, False]:
            # These should all hash identically
            hashed = [hash_pandas_object(series, categorize=categorize)
                      for series in (plain, as_category,
                                     reversed_categories)]
            reference = hashed[0]
            for other in hashed[1:]:
                tm.assert_series_equal(reference, other)
def test_consistency(self):
    """Pin the hashes of a small string Index to known ground-truth
    values, so an accidental change to the hashing code is caught."""
    labels = ['foo', 'bar', 'baz']
    result = hash_pandas_object(Index(labels))

    ground_truth = np.array([3600424527151052760,
                             1374399572096150070,
                             477881037637427054], dtype='uint64')
    expected = Series(ground_truth, index=labels)
    tm.assert_series_equal(result, expected)
def test_hash_tuples(self):
    """``hash_tuples`` should agree with hashing the equivalent
    MultiIndex, for a list of tuples and for a single tuple."""
    pairs = [(1, 'one'), (1, 'two'), (2, 'one')]

    # list of tuples: compare element-wise with the MultiIndex hashes
    hashed_all = hash_tuples(pairs)
    as_multiindex = MultiIndex.from_tuples(pairs)
    expected = hash_pandas_object(as_multiindex).values
    self.assert_numpy_array_equal(hashed_all, expected)

    # single tuple: compare with the first of those hashes
    hashed_single = hash_tuples(pairs[0])
    self.assertEqual(hashed_single, expected[0])
def test_hash_tuples(self):
    """``hash_tuples`` should agree with hashing the equivalent
    MultiIndex, for a list of tuples and for a single tuple."""
    pairs = [(1, 'one'), (1, 'two'), (2, 'one')]

    # list of tuples: compare element-wise with the MultiIndex hashes
    hashed_all = hash_tuples(pairs)
    as_multiindex = MultiIndex.from_tuples(pairs)
    expected = hash_pandas_object(as_multiindex).values
    tm.assert_numpy_array_equal(hashed_all, expected)

    # single tuple: compare with the first of those hashes
    hashed_single = hash_tuples(pairs[0])
    self.assertEqual(hashed_single, expected[0])
def test_categorical_consistency(self):
    """Categoricals should hash like their values, not their codes
    (GH15143), whatever the dtype of the underlying values."""
    value_series = [
        Series(['a', 'b', 'c', 'd']),
        Series([1000, 2000, 3000, 4000]),
        Series(pd.date_range(0, periods=4)),
    ]

    for values in value_series:
        categorical = values.astype('category').cat.set_categories(values)
        flipped = categorical.cat.set_categories(list(reversed(values)))

        for categorize in [True, False]:
            # These should all hash identically
            hashed_values = hash_pandas_object(values,
                                               categorize=categorize)
            hashed_categorical = hash_pandas_object(categorical,
                                                    categorize=categorize)
            hashed_flipped = hash_pandas_object(flipped,
                                                categorize=categorize)
            for other in (hashed_categorical, hashed_flipped):
                tm.assert_series_equal(hashed_values, other)
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
                       categorize=True):
    """
    Return a uint64 Series holding one hash per element (Index / Series)
    or per row (DataFrame) of ``obj``.

    Parameters
    ----------
    obj : pd.Index, pd.Series or pd.DataFrame
        Object to hash.
    index : bool, default True
        Fold the hash of ``obj.index`` into the result. Not consulted when
        ``obj`` is itself an Index.
    encoding : str, default 'utf8'
        Forwarded unchanged to ``hash_array``.
    hash_key : str, optional
        Forwarded to ``hash_array``; ``_default_hash_key`` is used when
        None.
    categorize : bool, default True
        Forwarded unchanged to ``hash_array``.

    Returns
    -------
    pd.Series
        dtype uint64, indexed by ``obj`` itself for an Index input and by
        ``obj.index`` otherwise.

    Raises
    ------
    TypeError
        If ``obj`` is not an Index, Series or DataFrame.

    Notes
    -----
    Hashes are combined left to right as ``h = h * 3 + next``. A DataFrame
    without columns hashes to zeros (or to the index hashes alone when
    ``index`` is true) instead of leaking ``StopIteration``.
    """
    if hash_key is None:
        hash_key = _default_hash_key

    def adder(h, hashed_to_add):
        # in-place fold: h <- h * 3 + hashed_to_add, written into ``h``
        h = np.multiply(h, np.uint(3), h)
        return np.add(h, hashed_to_add, h)

    def index_hashes():
        # hashes of obj.index alone, computed with the same options
        return hash_pandas_object(obj.index, index=False,
                                  encoding=encoding, hash_key=hash_key,
                                  categorize=categorize).values

    if isinstance(obj, pd.Index):
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        h = pd.Series(h, index=obj, dtype='uint64')
    elif isinstance(obj, pd.Series):
        # astype() copies, so the in-place fold below never touches the
        # array returned by hash_array
        h = hash_array(obj.values, encoding, hash_key,
                       categorize).astype('uint64')
        if index:
            h = adder(h, index_hashes())
        h = pd.Series(h, index=obj.index, dtype='uint64')
    elif isinstance(obj, pd.DataFrame):
        # Start from zeros: folding the first column into an all-zero
        # accumulator gives 0 * 3 + hash == hash, which is exactly what
        # seeding with the first column's hash produces, while remaining
        # well defined for a frame that has no columns at all.
        h = np.zeros(len(obj), dtype='uint64')
        # Positional access avoids DataFrame.iteritems(), which newer
        # pandas releases no longer provide, and copes with duplicate
        # column labels.
        for position in range(obj.shape[1]):
            column = obj.iloc[:, position]
            column_hash = hash_array(column.values, encoding, hash_key,
                                     categorize)
            # column hashes are only read, so no defensive copy is needed
            h = adder(h, column_hash.astype('uint64', copy=False))
        if index:
            h = adder(h, index_hashes())
        h = pd.Series(h, index=obj.index, dtype='uint64')
    else:
        raise TypeError("Unexpected type for hashing %s" % type(obj))
    return h
def f():
    """Call ``hash_pandas_object`` with this function object itself.

    NOTE(review): presumably a callable handed to an assertRaises-style
    check, and possibly meant to hash a variable from the enclosing
    scope rather than ``f`` -- confirm against the caller.
    """
    target = f
    hash_pandas_object(target)
def time_frame(self):
    """Hash ``self.df`` with ``hash_pandas_object``; the result is
    discarded (presumably a timing benchmark, going by the ``time_``
    prefix -- confirm)."""
    frame = self.df
    hash_pandas_object(frame)
def time_series_int(self):
    """Hash ``self.df.E`` with ``hash_pandas_object``; the result is
    discarded (presumably the integer column of the benchmark frame --
    confirm against the setup code)."""
    column = self.df.E
    hash_pandas_object(column)
def time_series_string(self):
    """Hash ``self.df.B`` with ``hash_pandas_object``; the result is
    discarded (presumably the string column of the benchmark frame --
    confirm against the setup code)."""
    column = self.df.B
    hash_pandas_object(column)
def f():
    """Hash a three-element string Series using ``'foo'`` as the hash key.

    NOTE(review): presumably a callable for an assertRaises-style check
    that such a key is rejected -- confirm against the caller.
    """
    data = Series(list('abc'))
    hash_pandas_object(data, hash_key='foo')
def time_series_categorical(self):
    """Hash ``self.df.C`` with ``hash_pandas_object``; the result is
    discarded (presumably the categorical column of the benchmark frame
    -- confirm against the setup code)."""
    column = self.df.C
    hash_pandas_object(column)
def test_multiindex_unique(self):
    """A MultiIndex that reports itself as unique should hash to a
    result that is unique as well."""
    pairs = [(118, 472), (236, 118), (51, 204), (102, 51)]
    index = MultiIndex.from_tuples(pairs)
    self.assertTrue(index.is_unique)

    hashed = hash_pandas_object(index)
    self.assertTrue(hashed.is_unique)