def test_invalid_n(self):
    """A negative n-gram size must make the tokenizer raise ValueError.

    The FieldHashingProperties constructor validates ``ngram``, so the
    invalid value is injected after construction to exercise the check
    inside the tokenizer itself.
    """
    fhp = FieldHashingProperties(ngram=2, k=20, positional=True)
    fhp.ngram = -6  # bypass constructor validation on purpose
    with self.assertRaises(
            ValueError,
            msg='Expected raise ValueError on invalid n.'):
        tok = get_tokenizer(fhp)
        # Force evaluation: if the tokenizer is lazy (returns a
        # generator), the ValueError is only raised on iteration, and
        # the original un-consumed call would never trigger it.
        list(tok('prawn'))
def test_compare_strategies(self):
    """BitsPerFeature should yield much lower per-record bit-count
    variance than the equivalent BitsPerToken setting."""
    salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3r'
        'fNUPCtLDMOGQHG4pCQpfhiHCyA==')

    def schema_with(strategy):
        # Single 'name' field hashed with the given insertion strategy.
        name_field = StringSpec(
            identifier='name',
            hashing_properties=FieldHashingProperties(
                encoding=FieldHashingProperties._DEFAULT_ENCODING,
                comparator=bigram_tokenizer,
                strategy=strategy,
                hash_type='doubleHash'),
            description=None,
            case=StringSpec._DEFAULT_CASE,
            min_length=1,
            max_length=50)
        return Schema(
            l=1024,
            xor_folds=1,
            kdf_type='HKDF',
            kdf_hash='SHA256',
            kdf_salt=salt,
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            fields=[name_field])

    pii = [('An',), ('Fred',), ('Philhowe',), ('MuhlbachBereznyz',)]
    secret = 'secret'

    mean_k, std_k = _test_stats(
        pii, schema_with(BitsPerTokenStrategy(20)), secret)
    print('test_compare_k_and_num_bits k: ', mean_k, std_k)

    # Re-run with a BitsPerFeature budget matched to the observed mean.
    mean_num_bits, std_num_bits = _test_stats(
        pii, schema_with(BitsPerFeatureStrategy(int(round(mean_k)))),
        secret)
    print('test_compare_k_and_num_bits num_bits: ', mean_num_bits,
          std_num_bits)

    self.assertGreater(std_k, 2 * std_num_bits,
                       'Standard deviation for num_bits should be'
                       ' < half that for the equivalent k')
def test_different_weights(self):
    """Constructing a v1 schema with an explicitly weighted field
    must succeed without raising."""
    hashing_globals = GlobalHashingProperties(
        k=30,
        kdf_hash='SHA256',
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        kdf_type='HKDF',
        l=1024,
        hash_type='blakeHash',
        xor_folds=0,
    )
    info_field = StringSpec(
        identifier='some info',
        hashing_properties=FieldHashingProperties(
            encoding=FieldHashingProperties._DEFAULT_ENCODING,
            ngram=2,
            positional=False,
            weight=1),
        description=None,
        case=StringSpec._DEFAULT_CASE,
        min_length=0,
        max_length=None)
    schema = Schema(version=1,
                    hashing_globals=hashing_globals,
                    fields=[info_field])
def setUp(self):
    """Build one spec of every field type, all sharing the same
    ASCII bigram hashing configuration."""
    bigram_comparator = get_comparator({'type': 'ngram', 'n': 2})
    ascii_hashing = FieldHashingProperties(
        encoding='ascii',
        comparator=bigram_comparator,
        strategy=BitsPerTokenStrategy(20))

    self.fields = [
        StringSpec(identifier='given name',
                   case='lower',
                   min_length=1,
                   max_length=None,
                   hashing_properties=ascii_hashing),
        StringSpec(identifier='surname',
                   case='upper',
                   min_length=1,
                   max_length=None,
                   hashing_properties=ascii_hashing),
        StringSpec(identifier='email address',
                   regex=r'.+@.+\..+',
                   hashing_properties=ascii_hashing),
        IntegerSpec(identifier='age',
                    minimum=18,
                    maximum=99,
                    hashing_properties=ascii_hashing),
        DateSpec(identifier='join date',
                 format='%Y-%m-%d',
                 hashing_properties=ascii_hashing),
        EnumSpec(identifier='account type',
                 values=['free', 'paid'],
                 hashing_properties=ascii_hashing),
    ]
def test_compare_to_legacy(self):
    """HKDF keying must spread identical field values over more bits
    than the legacy KDF does."""
    # Identifier: 'ANY freetext'
    fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)
    schema = Schema(
        l=1024,
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        fields=[StringSpec(identifier='ANY text {}'.format(n + 1),
                           hashing_properties=fhp)
                for n in range(4)])

    row = ['Bobby'] * 4
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode()]
    keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
    keys_legacy = generate_key_lists(master_secrets, len(row),
                                     kdf='legacy')

    bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
    bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
    hkdf_count = bloom_hkdf[0].count()
    legacy_count = bloom_legacy[0].count()

    # The legacy KDF maps the four Bobbys to the same bits, whereas
    # HKDF maps each Bobby to different bits.
    self.assertLessEqual(legacy_count, fhp.k * 6)  # 6 bi-grams
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, len(row) * legacy_count)
def test_different_weights(self):
    """Constructing a schema with a per-field hashing strategy must
    succeed without raising."""
    info_field = StringSpec(
        identifier='some info',
        hashing_properties=FieldHashingProperties(
            encoding=FieldHashingProperties._DEFAULT_ENCODING,
            comparator=bigram_tokenizer,
            strategy=BitsPerTokenStrategy(20)),
        description=None,
        case=StringSpec._DEFAULT_CASE,
        min_length=0,
        max_length=None)
    schema = Schema(
        l=1024,
        xor_folds=0,
        kdf_hash='SHA256',
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        kdf_type='HKDF',
        fields=[info_field])
def test_from_properties_invalid_hash(self):
    """An unknown hash_type must make hashing_function_from_properties
    raise ValueError."""
    fhp = FieldHashingProperties(
        comparator=bigram_tokenizer,
        strategy=BitsPerTokenStrategy(30),
        hash_type='jakubHash'  # <- this is invalid.
    )
    with self.assertRaises(
            ValueError,
            # Fixed: the old message said 'invalid encoding', but the
            # defect under test is an invalid hash type.
            msg='Expected ValueError on invalid hash type.'):
        hashing_function_from_properties(fhp)
def test_from_properties_invalid_hash(self):
    """An unknown hash_type must make hashing_function_from_properties
    raise ValueError."""
    fhp = FieldHashingProperties(
        ngram=2,
        k=30,
        hash_type='jakubHash'  # <- this is invalid.
    )
    with self.assertRaises(
            ValueError,
            # Fixed: the old message said 'invalid encoding', but the
            # defect under test is an invalid hash type.
            msg='Expected ValueError on invalid hash type.'):
        hashing_function_from_properties(fhp)
def test_bug210(self):
    """Regression test for https://github.com/data61/clkhash/issues/210.

    Two records sharing most of their tokens must produce Bloom filters
    whose Dice similarity stays close to the token-level similarity.
    """
    common_tokens = [str(i) for i in range(65)]
    e1 = common_tokens + ['e1']          # 66 tokens
    e2 = common_tokens + ['e2a', 'e2b']  # 67 tokens
    # Dice coefficient over the raw token sets.
    tok_sim = 2.0 * len(common_tokens) / (len(e1) + len(e2))

    fhp = FieldHashingProperties(ngram=2, num_bits=100,
                                 hash_type='doubleHash')

    def encode(tokens):
        # Double-hash the tokens into a 1024-bit Bloom filter, with
        # per-token k derived from the token count (num_bits budget).
        return double_hash_encode_ngrams(
            tokens,
            (self.key_sha1, self.key_md5),
            fhp.ks(len(tokens)),
            1024,
            fhp.encoding)

    b1 = encode(e1)
    b2 = encode(e2)
    intersect = b1 & b2
    sim = 2.0 * intersect.count() / (b1.count() + b2.count())
    self.assertGreater(sim, 0.9 * tok_sim)
def test_positional_unigram_duplicate(self):
    """Positional unigrams keep repeated characters distinct via
    their 1-based index prefix."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=1, positional=True))
    self.assertEqual(list(tokenize("111")),
                     ['1 1', '2 1', '3 1'])
def test_unigram_duplicate(self):
    """Non-positional unigrams preserve duplicates and order."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=1, positional=False))
    self.assertEqual(list(tokenize("1212")),
                     ['1', '2', '1', '2'])
def test_bigram_2(self):
    """Characters listed in `ignore` are stripped before the padded
    bigrams are produced."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=2, positional=False))
    self.assertEqual(list(tokenize("steve", ignore='e')),
                     [' s', 'st', 'tv', 'v '])
def test_bigram_1(self):
    """Bigram tokenization pads the word with a leading and trailing
    space before sliding the window."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=2, positional=False))
    self.assertEqual(list(tokenize("steve")),
                     [' s', 'st', 'te', 'ev', 've', 'e '])
def test_bigram_duplicate(self):
    """Repeated bigrams in the input appear repeated in the output."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=2, positional=False))
    self.assertEqual(list(tokenize("abab")),
                     [' a', 'ab', 'ba', 'ab', 'b '])
def test_unigram_1(self):
    """Unigram tokenization drops `ignore` characters and yields the
    remaining characters one by one."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=1, positional=False))
    self.assertEqual(list(tokenize("1/2/93", ignore='/')),
                     ['1', '2', '9', '3'])
import unittest from clkhash.field_formats import FieldHashingProperties from clkhash.tokenizer import get_tokenizer __author__ = 'shardy' # some tokenizers p1_20 = get_tokenizer( FieldHashingProperties(ngram=1, k=20) ) p2_20 = get_tokenizer( FieldHashingProperties(ngram=2, k=20) ) p1_20_true = get_tokenizer( FieldHashingProperties(ngram=1, k=20, positional=True) ) dummy = get_tokenizer(None) class TestTokenizer(unittest.TestCase): def test_unigram_1(self): self.assertEqual(list(p1_20("1/2/93", ignore='/')), ['1', '2', '9', '3']) def test_unigram_2(self): self.assertEqual(list(p1_20("1*2*93", ignore='*')),
def test_compare_to_legacy(self):
    """Build the legacy-comparison schema using the old (v1) API.

    Four identical free-text fields share the same bigram hashing
    configuration; construction itself is what this exercises.
    """
    # Identifier: 'ANY freetext'
    def text_field(identifier):
        # Non-positional bigram field with unit weight.
        return StringSpec(
            identifier=identifier,
            hashing_properties=FieldHashingProperties(
                encoding=FieldHashingProperties._DEFAULT_ENCODING,
                ngram=2,
                positional=False,
                weight=1),
            description=None,
            case=StringSpec._DEFAULT_CASE,
            min_length=0,
            max_length=None)

    schema = Schema(
        version=1,
        hashing_globals=GlobalHashingProperties(
            k=10,
            kdf_hash='SHA256',
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            kdf_type='HKDF',
            l=1024,
            hash_type='doubleHash',
            hash_prevent_singularity=False,
            xor_folds=0),
        fields=[text_field('ANY text {}'.format(i))
                for i in range(1, 5)])
def test_positional_unigram_2(self):
    """Positional unigrams index every character, including
    non-alphanumeric ones."""
    tokenize = get_tokenizer(
        FieldHashingProperties(ngram=1, positional=True))
    self.assertEqual(list(tokenize("1*2*")),
                     ['1 1', '2 *', '3 2', '4 *'])