示例#1
0
 def test_categorical_with_nan_consistency(self):
     """Hashes of a short datetime categorical must all occur among the
     hashes of a longer categorical built from the same start date.
     """
     # Reference: codes -1 and 0..4 over five timestamps from 2012-01-01.
     # (-1 is presumably the missing-value code, per the test name.)
     reference_categories = pd.date_range('2012-01-01', periods=5, name='B')
     reference = pd.Categorical.from_codes(
         [-1, 0, 1, 2, 3, 4], categories=reference_categories)
     expected = hash_array(reference, categorize=False)

     # Candidate: codes -1 and 0 over a single category, the start date.
     candidate = pd.Categorical.from_codes(
         [-1, 0], categories=[pd.Timestamp('2012-01-01')])
     result = hash_array(candidate, categorize=False)

     for position in (0, 1):
         assert result[position] in expected
示例#2
0
 def test_categorical_with_nan_consistency(self):
     # Hash a datetime categorical with codes -1..4 as the reference set.
     dates = pd.date_range('2012-01-01', periods=5, name='B')
     full = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dates)
     expected = hash_array(full, categorize=False)

     # Hash a two-element categorical whose only category is the start date.
     only_start = [pd.Timestamp('2012-01-01')]
     partial = pd.Categorical.from_codes([-1, 0], categories=only_start)
     result = hash_array(partial, categorize=False)

     # Both hashes must be found among the reference hashes.
     first, second = result[0], result[1]
     assert first in expected
     assert second in expected
示例#3
0
    def test_same_len_hash_collisions(self):
        """Two same-length entries from tm.rands_array must hash differently.

        Lengths checked are 2**k + 1 for k = 8..15 first, then 2**k for
        k = 8..15.
        """
        # ``extra`` selects the odd lengths (1) and then the exact powers
        # of two (0), preserving the order of two back-to-back loops.
        for extra in (1, 0):
            for exponent in range(8, 16):
                length = 2**exponent + extra
                # presumably two random strings of ``length`` characters
                pair = tm.rands_array(length, 2)
                hashed = hash_array(pair, 'utf8')
                self.assertFalse(hashed[0] == hashed[1])
示例#4
0
    def test_same_len_hash_collisions(self):
        # Lengths just above each power of two (2**8 .. 2**15) come first,
        # followed by the exact powers of two themselves.
        exponents = range(8, 16)
        lengths = [2**exp + 1 for exp in exponents]
        lengths += [2**exp for exp in exponents]

        for length in lengths:
            sample = tm.rands_array(length, 2)
            digest = hash_array(sample, 'utf8')
            # The first two hashed values must not collide.
            self.assertFalse(digest[0] == digest[1])
示例#5
0
    def test_hash_collisions(self):
        """Pin the utf8 hashes of two long strings, hashed alone and together.

        Background:
        https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        """
        long_strings = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
                        'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa
        # Pinned hash per string, in the same order as ``long_strings``.
        known_hashes = [14963968704024874985, 16428432627716348016]

        # Each string hashed on its own must match its pinned value.
        expected_parts = []
        for position, known in enumerate(known_hashes):
            single = np.asarray(
                long_strings[position:position + 1], dtype=object)
            expected_single = np.array([known], dtype=np.uint64)
            tm.assert_numpy_array_equal(
                hash_array(single, 'utf8'), expected_single)
            expected_parts.append(expected_single)

        # Hashing both at once must give the same two values, in order.
        combined = hash_array(np.asarray(long_strings, dtype=object), 'utf8')
        tm.assert_numpy_array_equal(
            combined, np.concatenate(expected_parts, axis=0))
示例#6
0
    def test_hash_collisions(self):
        # Two long, distinct strings must keep their pinned utf8 hashes,
        # whether hashed one at a time or in a single array.
        # https://github.com/pandas-dev/pandas/issues/14711#issuecomment-264885726
        samples = ['Ingrid-9Z9fKIZmkO7i7Cn51Li34pJm44fgX6DYGBNj3VPlOH50m7HnBlPxfIwFMrcNJNMP6PSgLmwWnInciMWrCSAlLEvt7JkJl4IxiMrVbXSa8ZQoVaq5xoQPjltuJEfwdNlO6jo8qRRHvD8sBEBMQASrRa6TsdaPTPCBo3nwIBpE7YzzmyH0vMBhjQZLx1aCT7faSEx7PgFxQhHdKFWROcysamgy9iVj8DO2Fmwg1NNl93rIAqC3mdqfrCxrzfvIY8aJdzin2cHVzy3QUJxZgHvtUtOLxoqnUHsYbNTeq0xcLXpTZEZCxD4PGubIuCNf32c33M7HFsnjWSEjE2yVdWKhmSVodyF8hFYVmhYnMCztQnJrt3O8ZvVRXd5IKwlLexiSp4h888w7SzAIcKgc3g5XQJf6MlSMftDXm9lIsE1mJNiJEv6uY6pgvC3fUPhatlR5JPpVAHNSbSEE73MBzJrhCAbOLXQumyOXigZuPoME7QgJcBalliQol7YZ9',  # noqa
                   'Tim-b9MddTxOWW2AT1Py6vtVbZwGAmYCjbp89p8mxsiFoVX4FyDOF3wFiAkyQTUgwg9sVqVYOZo09Dh1AzhFHbgij52ylF0SEwgzjzHH8TGY8Lypart4p4onnDoDvVMBa0kdthVGKl6K0BDVGzyOXPXKpmnMF1H6rJzqHJ0HywfwS4XYpVwlAkoeNsiicHkJUFdUAhG229INzvIAiJuAHeJDUoyO4DCBqtoZ5TDend6TK7Y914yHlfH3g1WZu5LksKv68VQHJriWFYusW5e6ZZ6dKaMjTwEGuRgdT66iU5nqWTHRH8WSzpXoCFwGcTOwyuqPSe0fTe21DVtJn1FKj9F9nEnR9xOvJUO7E0piCIF4Ad9yAIDY4DBimpsTfKXCu1vdHpKYerzbndfuFe5AhfMduLYZJi5iAw8qKSwR5h86ttXV0Mc0QmXz8dsRvDgxjXSmupPxBggdlqUlC828hXiTPD7am0yETBV0F3bEtvPiNJfremszcV8NcqAoARMe']  # noqa

        # First string on its own.
        first_only = np.asarray(samples[0:1], dtype=object)
        expected_first = np.array([14963968704024874985], dtype=np.uint64)
        self.assert_numpy_array_equal(
            hash_array(first_only, 'utf8'), expected_first)

        # Second string on its own.
        second_only = np.asarray(samples[1:2], dtype=object)
        expected_second = np.array([16428432627716348016], dtype=np.uint64)
        self.assert_numpy_array_equal(
            hash_array(second_only, 'utf8'), expected_second)

        # Both together: the per-string hashes, concatenated in order.
        both = np.asarray(samples, dtype=object)
        expected_both = np.concatenate(
            [expected_first, expected_second], axis=0)
        self.assert_numpy_array_equal(hash_array(both, 'utf8'), expected_both)
示例#7
0
 def test_hash_array(self):
     """Hashing the same values twice must yield equal arrays."""
     # NOTE(review): ``iteritems`` was removed in pandas 2.0 -- confirm the
     # pandas version this suite targets before switching to ``items``.
     for _label, column in self.df.iteritems():
         values = column.values
         first_pass = hash_array(values)
         second_pass = hash_array(values)
         tm.assert_numpy_array_equal(first_pass, second_pass)
示例#8
0
 def test_hash_array_mixed(self):
     """Mixed int/str input must hash like its all-string equivalent."""
     # Three spellings of the same data: dtype inferred by numpy, explicit
     # strings, and an object array that keeps the ints as ints.
     inferred = np.array([3, 4, 'All'])
     as_strings = np.array(['3', '4', 'All'])
     as_objects = np.array([3, 4, 'All'], dtype=object)

     result1 = hash_array(inferred)
     result2 = hash_array(as_strings)
     result3 = hash_array(as_objects)

     tm.assert_numpy_array_equal(result1, result2)
     tm.assert_numpy_array_equal(result1, result3)
示例#9
0
 def test_hash_array(self):
     # For every (name, values) pair yielded by self.df, two hash_array
     # calls on the same underlying values must agree.
     # NOTE(review): ``iteritems`` no longer exists in pandas >= 2.0;
     # verify the supported pandas range before modernising this call.
     for _, series in self.df.iteritems():
         raw = series.values
         tm.assert_numpy_array_equal(hash_array(raw), hash_array(raw))
示例#10
0
 def test_hash_array_mixed(self):
     # Baseline: ints and a string mixed in one array, dtype left to numpy.
     baseline = hash_array(np.array([3, 4, 'All']))

     # The all-string and object-dtype spellings must hash identically.
     alternatives = [
         hash_array(np.array(['3', '4', 'All'])),
         hash_array(np.array([3, 4, 'All'], dtype=object)),
     ]
     for alternative in alternatives:
         tm.assert_numpy_array_equal(baseline, alternative)