def test_compare_to_legacy(self):
    """Compare Bloom filter population for the 'HKDF' and 'legacy' KDFs.

    The same value is hashed in four string fields.  The assertions check
    that the legacy filter sets no more than ``k * 6`` bits, that it sets
    strictly fewer bits than the HKDF filter, and that the HKDF filter
    sets at most ``len(row)`` times as many bits as the legacy one.
    """
    # Identifier: 'ANY freetext'
    fhp = FieldHashingProperties(ngram=2, hash_type='doubleHash', k=10)
    kdf_info = base64.b64decode('c2NoZW1hX2V4YW1wbGU=')
    kdf_salt = base64.b64decode(
        'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
        '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==')
    field_specs = []
    for number in range(1, 5):
        field_specs.append(
            StringSpec(identifier='ANY text {}'.format(number),
                       hashing_properties=fhp))
    schema = Schema(l=1024,
                    kdf_info=kdf_info,
                    kdf_key_size=64,
                    kdf_salt=kdf_salt,
                    fields=field_specs)

    row = ['Bobby'] * 4
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode(),
    ]
    num_fields = len(row)

    keys_hkdf = generate_key_lists(master_secrets, num_fields, kdf='HKDF')
    keys_legacy = generate_key_lists(master_secrets, num_fields,
                                     kdf='legacy')

    hkdf_result = next(stream_bloom_filters([row], keys_hkdf, schema))
    legacy_result = next(stream_bloom_filters([row], keys_legacy, schema))
    hkdf_count = hkdf_result[0].count()
    legacy_count = legacy_result[0].count()

    # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
    # map each Bobby to different bits.
    bigrams_per_value = 6  # 6 bi-grams
    self.assertLessEqual(legacy_count, fhp.k * bigrams_per_value)
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, num_fields * legacy_count)
def test_xor_folding_integration(self):
    """Check that one XOR fold equals XOR-ing the two halves of the filter.

    A copy of the name list's schema is configured with ``xor_folds = 1``
    and half the filter length.  The filter produced with that schema must
    equal the first half of the unfolded filter XOR-ed with its second
    half.
    """
    namelist = randomnames.NameList(1)
    schema_0 = namelist.SCHEMA
    # Precondition on the fixture: use a unittest assertion rather than a
    # bare ``assert`` so the check still runs under ``python -O`` and a
    # failure reports both values.
    self.assertEqual(schema_0.xor_folds, 0,
                     'The reference schema must not use XOR folding.')

    schema_1 = copy(schema_0)
    schema_1.xor_folds = 1
    schema_1.l //= 2

    key_lists = generate_key_lists('secret', len(namelist.schema_types))
    bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_0))
    bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, schema_1))

    half = len(bf_original) // 2
    self.assertEqual(
        bf_folded,
        bf_original[:half] ^ bf_original[half:],
        'Folded filter is not an XOR of the two halves of the original.')
def test_generate_key_lists(self):
    """Run the shared key-list checks for 1 and 10 requested keys.

    The number of hashing methods is left at its default, so the helper is
    told to expect ``DEFAULT_NUM_HASHING_METHODS``.
    """
    phrase = "No, I am your father. No... that's not true! That's impossible!"
    secret = phrase.encode()
    for requested in (1, 10):
        self._test_key_lists(generate_key_lists(secret, requested),
                             requested,
                             DEFAULT_NUM_HASHING_METHODS)
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Time the pure Python and the C++ (cffi) similarity implementations.

    Both implementations are run on the same two sets of Bloom filters and
    their results are required to be equal.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total time
             taken for each implementation
    """
    name_list = NameList(ntotal)
    subset_a, subset_b = name_list.generate_subsets(nsubset, frac)

    key_lists = generate_key_lists(('test1', 'test2'), len(name_list.schema))
    filters_a = calculate_bloom_filters(
        subset_a, get_schema_types(name_list.schema), key_lists)
    filters_b = calculate_bloom_filters(
        subset_b, get_schema_types(name_list.schema), key_lists)

    # Pure Python version
    began = timer()
    python_result = python_filter_similarity(filters_a, filters_b)
    python_time = timer() - began

    # C++ cffi version
    began = timer()
    cffi_result = cffi_filter_similarity_k(filters_a, filters_b, 1, 0.0)
    cffi_time = timer() - began

    assert python_result == cffi_result, \
        "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
def test_generate_key_lists_num_hashes(self):
    """Run the shared key-list checks for 1 and 10 hashing methods.

    Ten keys are requested each time; only ``num_hashing_methods`` varies.
    """
    phrase = "No, I am your father. No... that's not true! That's impossible!"
    secret = phrase.encode()
    num_keys = 10
    for method_count in (1, 10):
        generated = generate_key_lists(
            secret, num_keys, num_hashing_methods=method_count)
        self._test_key_lists(generated, num_keys, method_count)
def setUpClass(cls):
    """Build two sets of Bloom filters shared by the tests of this class.

    Two subsets of 200 entries are drawn from a 300-entry name list using
    ``cls.proportion`` (0.8), and each subset is hashed with the same keys.
    The results are stored on ``cls.filters1`` and ``cls.filters2``.
    """
    cls.proportion = 0.8
    name_list = randomnames.NameList(300)
    subset_a, subset_b = name_list.generate_subsets(200, cls.proportion)

    key_lists = generate_key_lists(('test1', 'test2'),
                                   len(name_list.schema))

    cls.filters1 = bloomfilter.calculate_bloom_filters(
        subset_a, schema.get_schema_types(name_list.schema), key_lists)
    cls.filters2 = bloomfilter.calculate_bloom_filters(
        subset_b, schema.get_schema_types(name_list.schema), key_lists)
def generate_clks(pii_data,  # type: Sequence[Sequence[str]]
                  schema,  # type: Schema
                  secret,  # type: AnyStr
                  validate=True,  # type: bool
                  callback=None,  # type: Optional[Callable[[int, Sequence[int]], None]]
                  use_multiprocessing=True  # type: bool
                  ):
    # type: (...) -> List[str]
    """Hash ``pii_data`` in chunks and return the serialised results.

    :param pii_data: Rows of PII to hash.
    :param schema: Schema supplying the fields and the KDF settings
        (``kdf_key_size``, ``kdf_salt``, ``kdf_info``, ``kdf_type``,
        ``kdf_hash``).
    :param secret: Secret from which the per-field keys are derived.
    :param validate: When true, ``validate_entries`` is run on the data
        against ``schema.fields`` before hashing.
    :param callback: Optional function invoked when a chunk finishes, with
        the number of items in the chunk's first result element and the
        chunk's second result element.
    :param use_multiprocessing: Use a ``ProcessPoolExecutor`` when true,
        otherwise a ``ThreadPoolExecutor``.
    :return: The first result element of every chunk, concatenated in
        chunk submission order.
    """
    # Derive the keys for each identifier from the secret, one key per
    # hashing method used when computing the bloom filters.  More can be
    # created if required via the `num_hashing_methods` parameter of
    # `generate_key_lists`.
    key_lists = generate_key_lists(
        secret,
        len(schema.fields),
        key_size=schema.kdf_key_size,
        salt=schema.kdf_salt,
        info=schema.kdf_info,
        kdf=schema.kdf_type,
        hash_algo=schema.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    if len(pii_data) <= 10000:
        chunk_size = 200
    else:
        chunk_size = 1000

    # Compute Bloom filter from the chunks and then serialise it
    if use_multiprocessing:
        pool_executor = ProcessPoolExecutor  # type: Union[Type[ProcessPoolExecutor], Type[ThreadPoolExecutor]]
    else:
        pool_executor = ThreadPoolExecutor

    pending = []
    with pool_executor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            job = executor.submit(
                hash_and_serialize_chunk, chunk, key_lists, schema)
            if callback is not None:
                notify = cast(Callable[[int, Sequence[int]], None], callback)
                job.add_done_callback(
                    lambda done: notify(len(done.result()[0]),
                                        done.result()[1]))
            pending.append(job)

        # Collect in submission order so the output follows the input.
        results = []
        for job in pending:
            clks, _stats = job.result()
            results.extend(clks)

    return results
def generate_data(samples, proportion=0.75):
    """Create two name subsets and the Bloom filters for each.

    A ``NameList`` is built from ``samples * 2`` and split with
    ``generate_subsets(samples, proportion)``.  Each subset is streamed
    through ``stream_bloom_filters`` and the first element of every
    yielded item is kept.

    :return: tuple ``(s1, s2, filters1, filters2)``
    """
    name_list = randomnames.NameList(samples * 2)
    subset_a, subset_b = name_list.generate_subsets(samples, proportion)

    key_lists = generate_key_lists('secret', len(name_list.schema_types))

    filters_a = [
        encoded[0]
        for encoded in bloomfilter.stream_bloom_filters(
            subset_a, key_lists, name_list.SCHEMA)
    ]
    filters_b = [
        encoded[0]
        for encoded in bloomfilter.stream_bloom_filters(
            subset_b, key_lists, name_list.SCHEMA)
    ]

    return subset_a, subset_b, filters_a, filters_b
def generate_data(samples, proportion=0.75):
    """Create two name subsets and the Bloom filters for each.

    A ``NameList`` is built from ``samples * 2`` and split with
    ``generate_subsets(samples, proportion)``.  Each subset is hashed via
    ``calculate_bloom_filters`` using the same keys.

    :return: tuple ``(s1, s2, filters1, filters2)``
    """
    name_list = randomnames.NameList(samples * 2)
    subset_a, subset_b = name_list.generate_subsets(samples, proportion)

    key_lists = generate_key_lists(('test1', 'test2'),
                                   len(name_list.schema))

    filters_a = bloomfilter.calculate_bloom_filters(
        subset_a, schema.get_schema_types(name_list.schema), key_lists)
    filters_b = bloomfilter.calculate_bloom_filters(
        subset_b, schema.get_schema_types(name_list.schema), key_lists)

    return subset_a, subset_b, filters_a, filters_b
def generate_clks(pii_data,  # type: Sequence[Sequence[str]]
                  schema,  # type: Schema
                  keys,  # type: Tuple[AnyStr, AnyStr]
                  validate=True,  # type: bool
                  callback=None  # type: Optional[Callable[[int, Sequence[int]], None]]
                  ):
    # type: (...) -> List[str]
    """Hash ``pii_data`` in chunks in a process pool and return the results.

    :param pii_data: Rows of PII to hash.
    :param schema: Schema supplying the fields and, through
        ``schema.hashing_globals``, the KDF settings.
    :param keys: Pair of secrets from which the per-field keys are derived.
    :param validate: When true, ``validate_entries`` is run on the data
        against ``schema.fields`` before hashing.
    :param callback: Optional function invoked when a chunk finishes, with
        the number of items in the chunk's first result element and the
        chunk's second result element.
    :return: The first result element of every chunk, concatenated in
        chunk submission order.
    """
    # generate two keys for each identifier
    hashing_globals = schema.hashing_globals
    key_lists = generate_key_lists(
        keys,
        len(schema.fields),
        key_size=hashing_globals.kdf_key_size,
        salt=hashing_globals.kdf_salt,
        info=hashing_globals.kdf_info,
        kdf=hashing_globals.kdf_type,
        hash_algo=hashing_globals.kdf_hash)

    if validate:
        validate_entries(schema.fields, pii_data)

    # Chunks PII
    log.info("Hashing {} entities".format(len(pii_data)))
    if len(pii_data) <= 10000:
        chunk_size = 200
    else:
        chunk_size = 1000

    # Compute Bloom filter from the chunks and then serialise it
    pending = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for chunk in chunks(pii_data, chunk_size):
            job = executor.submit(
                hash_and_serialize_chunk, chunk, key_lists, schema)
            if callback is not None:
                notify = cast(Callable[[int, Sequence[int]], None], callback)
                job.add_done_callback(
                    lambda done: notify(len(done.result()[0]),
                                        done.result()[1]))
            pending.append(job)

        # Collect in submission order so the output follows the input.
        results = []
        for job in pending:
            clks, _stats = job.result()
            results.extend(clks)

    return results
def test_generate_key_lists(self):
    """Check the shape of the generated key lists for 1 and 10 keys.

    For each requested count: the outer list has that many entries, every
    entry holds one key per master secret, and each key of the first entry
    is ``DEFAULT_KEY_SIZE`` long.
    """
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode(),
    ]
    secrets_count = len(master_secrets)

    for requested in (1, 10):
        key_lists = generate_key_lists(master_secrets, requested)
        self.assertEqual(len(key_lists), requested)

        for field_keys in key_lists:
            self.assertEqual(len(field_keys), secrets_count)

        first_field_keys = key_lists[0]
        for key in first_field_keys:
            self.assertEqual(
                len(key),
                DEFAULT_KEY_SIZE,
                msg='key should be of size "default_key_size"')
def test_cffi_manual(self):
    """Check that the Python and accelerated Dice implementations agree.

    Two subsets of 5 entries are drawn from a 30-entry name list, hashed
    with the same keys, and scored by both implementations using the
    class's default threshold and k.
    """
    name_list = randomnames.NameList(30)
    subset_a, subset_b = name_list.generate_subsets(5, 1.0)
    key_lists = generate_key_lists(('test1', 'test2'),
                                   len(name_list.schema_types))

    def first_elements(subset):
        # Keep only the first element of every streamed item.
        streamed = bloomfilter.stream_bloom_filters(
            subset, key_lists, name_list.SCHEMA)
        return tuple(item[0] for item in streamed)

    datasets = (first_elements(subset_a), first_elements(subset_b))

    py_similarity = similarities.dice_coefficient_python(
        datasets, self.default_threshold, self.default_k)
    c_similarity = similarities.dice_coefficient_accelerated(
        datasets, self.default_threshold, self.default_k)

    self.assert_similarity_matrices_equal(py_similarity, c_similarity)
def setup_class(cls):
    """Build the Bloom filters and default parameters shared by the tests.

    Two subsets of 200 entries are drawn from a 300-entry name list using
    ``cls.proportion`` (0.8) and hashed with the same keys.  The resulting
    tuples are stored on ``cls.filters1`` / ``cls.filters2`` and paired in
    ``cls.filters``; ``cls.default_k`` and ``cls.default_threshold`` are
    set to 10 and 0.5.
    """
    cls.proportion = 0.8
    name_list = randomnames.NameList(300)
    subset_a, subset_b = name_list.generate_subsets(200, cls.proportion)
    key_lists = generate_key_lists(('test1', 'test2'),
                                   len(name_list.schema_types))

    def first_elements(subset):
        # Keep only the first element of every streamed item.
        streamed = bloomfilter.stream_bloom_filters(
            subset, key_lists, name_list.SCHEMA)
        return tuple(item[0] for item in streamed)

    cls.filters1 = first_elements(subset_a)
    cls.filters2 = first_elements(subset_b)
    cls.filters = (cls.filters1, cls.filters2)

    cls.default_k = 10
    cls.default_threshold = 0.5
def test_cffi_manual(self):
    """Check that the Python and cffi similarity scores agree.

    Two subsets of 5 entries are drawn from a 30-entry name list, hashed
    with the same keys, and scored by ``python_filter_similarity`` and
    ``cffi_filter_similarity_k``.  The second element of every result
    entry is compared between the two implementations.
    """
    nl = randomnames.NameList(30)
    s1, s2 = nl.generate_subsets(5, 1.0)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema))

    f1 = bloomfilter.calculate_bloom_filters(
        s1, schema.get_schema_types(nl.schema), keys)
    f2 = bloomfilter.calculate_bloom_filters(
        s2, schema.get_schema_types(nl.schema), keys)

    ps = entitymatch.python_filter_similarity(f1, f2)
    cs = entitymatch.cffi_filter_similarity_k(f1, f2, 1, 0.0)

    python_scores = [p[1] for p in ps]
    c_scores = [c[1] for c in cs]

    # assertAlmostEqual cannot compare two lists approximately: unless they
    # are exactly equal it evaluates ``abs(first - second)`` and raises
    # TypeError.  Compare the lengths, then each pair of scores.
    self.assertEqual(len(python_scores), len(c_scores))
    for python_score, c_score in zip(python_scores, c_scores):
        self.assertAlmostEqual(python_score, c_score)
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Time the pure Python and the C++ accelerated Dice implementations.

    Both implementations are run through ``find_candidate_pairs`` on the
    same two sets of Bloom filters and their results are required to be
    equal.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total time
             taken for each implementation
    """
    name_list = NameList(ntotal)
    subset_a, subset_b = name_list.generate_subsets(nsubset, frac)

    key_lists = generate_key_lists(('test1', 'test2'),
                                   len(name_list.schema_types))
    first_of = operator.itemgetter(0)
    filters_a = tuple(
        map(first_of,
            stream_bloom_filters(subset_a, key_lists, name_list.SCHEMA)))
    filters_b = tuple(
        map(first_of,
            stream_bloom_filters(subset_b, key_lists, name_list.SCHEMA)))
    datasets = (filters_a, filters_b)

    # Pure Python version
    began = timer()
    python_result = anonlink.candidate_generation.find_candidate_pairs(
        datasets,
        anonlink.similarities.dice_coefficient_python,
        0.0,
        k=1)
    python_time = timer() - began

    # C++ accelerated version
    began = timer()
    accelerated_result = anonlink.candidate_generation.find_candidate_pairs(
        datasets,
        anonlink.similarities.dice_coefficient_accelerated,
        0.0,
        k=1)
    cffi_time = timer() - began

    assert python_result == accelerated_result, \
        "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
def test_cffi_k(self):
    """Check that every entity mapped via the cffi similarity is identical.

    Two subsets of 150 entries are drawn from a 300-entry name list and
    hashed with the same keys.  The similarities (k=4, threshold 0.8) are
    turned into a mapping, and each mapped pair of rows must be equal.
    """
    name_list = randomnames.NameList(300)
    subset_a, subset_b = name_list.generate_subsets(150, 0.8)

    secrets = ('test1', 'test2')
    key_lists = generate_key_lists(secrets, len(name_list.schema))

    filters_a = bloomfilter.calculate_bloom_filters(
        subset_a, schema.get_schema_types(name_list.schema), key_lists)
    filters_b = bloomfilter.calculate_bloom_filters(
        subset_b, schema.get_schema_types(name_list.schema), key_lists)

    threshold = 0.8
    similarity = entitymatch.cffi_filter_similarity_k(
        filters_a, filters_b, 4, threshold)
    mapping = network_flow.map_entities(
        similarity, threshold=threshold, method=None)

    for index_a in mapping:
        index_b = mapping[index_a]
        self.assertEqual(subset_a[index_a], subset_b[index_b])
def setUp(self):
    """Create the name list, its two subsets and the key lists per test.

    The subsets are drawn using ``self.sample`` and ``self.proportion``,
    which are read from the instance and not set here.
    """
    name_list = randomnames.NameList(300)
    self.nl = name_list
    self.s1, self.s2 = name_list.generate_subsets(self.sample,
                                                  self.proportion)
    field_count = len(name_list.schema_types)
    self.key_lists = generate_key_lists('secret', field_count)
def test_wrong_kdf(self):
    """An unrecognised ``kdf`` name must raise ``ValueError``."""
    self.assertRaises(
        ValueError, generate_key_lists, [b'0'], 1, kdf='breakMe')
identifier='some info', hashing_properties=FieldHashingProperties( encoding=FieldHashingProperties._DEFAULT_ENCODING, comparator=bigram_tokenizer, strategy=BitsPerTokenStrategy(20) ), description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None ) ] ) pii = [['Deckard']] keys = generate_key_lists('secret', 1) schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(0) bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(20) bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(40) bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.strategy = BitsPerTokenStrategy(30) bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) self.assertEqual(bf0[0].count(), 0) n1 = bf1[0].count()
def test_fail_generate_key_lists(self):
    """Boolean values passed as master secrets must raise ``TypeError``."""
    self.assertRaises(TypeError, generate_key_lists, [True, False], 10)
encoding=FieldHashingProperties._DEFAULT_ENCODING, ngram=2, positional=False, weight=1), description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None) ]) row = ['Bobby', 'Bobby', 'Bobby', 'Bobby'] master_secrets = [ 'No, I am your father'.encode(), "No... that's not true! That's impossible!".encode() ] keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF') keys_legacy = generate_key_lists(master_secrets, len(row), kdf='legacy') bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema)) bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema)) hkdf_count = bloom_hkdf[0].count() legacy_count = bloom_legacy[0].count() # lecay will map the 4 Bobbys' to the same bits, whereas hkdf will map each Bobby to different bits. self.assertLessEqual(legacy_count, schema.hashing_globals.k * 6) # 6 bi-grams self.assertLess(legacy_count, hkdf_count) self.assertLessEqual(hkdf_count, len(row) * legacy_count) def test_wrong_kdf(self): with self.assertRaises(ValueError):
fields=[ StringSpec( identifier='some info', hashing_properties=FieldHashingProperties( encoding=FieldHashingProperties._DEFAULT_ENCODING, ngram=2, positional=False, weight=1), description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None) ]) pii = [['Deckard']] keys = generate_key_lists(('secret', ), 1) schema.fields[0].hashing_properties.weight = 0 bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 1 bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 2 bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 1.5 bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) self.assertEqual(bf0[0].count(), 0) n1 = bf1[0].count()
def setUp(self):
    """Create the name list, its two subsets and the key lists per test.

    The subsets are drawn using ``self.sample`` and ``self.proportion``,
    which are read from the instance and not set here.
    """
    name_list = randomnames.NameList(300)
    self.nl = name_list
    self.s1, self.s2 = name_list.generate_subsets(self.sample,
                                                  self.proportion)
    secrets = ('test1', 'test2')
    field_count = len(name_list.schema_types)
    self.key_lists = generate_key_lists(secrets, field_count)
def test_wrong_num_hashing_methods(self):
    """Requesting zero hashing methods must raise ``ValueError``."""
    phrase = "No, I am your father. No... that's not true! That's impossible!"
    secret = phrase.encode()
    # Only the call under test sits inside the assertRaises context.
    with self.assertRaises(ValueError):
        generate_key_lists(secret, 10, num_hashing_methods=0)
fields=[ StringSpec( identifier='some info', hashing_properties=FieldHashingProperties( encoding=FieldHashingProperties._DEFAULT_ENCODING, ngram=2, positional=False, k=20), description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None) ]) pii = [['Deckard']] keys = generate_key_lists(('secret', ), 1) schema.fields[0].hashing_properties.k = 0 bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.k = 20 bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.k = 40 bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.k = 30 bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) self.assertEqual(bf0[0].count(), 0) n1 = bf1[0].count()