def test_compare_to_legacy(self):
    """HKDF and legacy key derivation should place identical fields differently.

    Four identical 'Bobby' fields are hashed under both KDFs. With the
    legacy KDF all four fields land on the same bits, while HKDF derives
    distinct keys per field and spreads them apart, so the HKDF popcount
    must be strictly larger (but at most 4x the legacy popcount).
    """
    # Identifier: 'ANY freetext'
    hashing_props = FieldHashingProperties(
        ngram=2, hash_type='doubleHash', k=10)
    schema = Schema(
        l=1024,
        kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
        kdf_key_size=64,
        kdf_salt=base64.b64decode(
            'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
            '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
        fields=[
            StringSpec(identifier='ANY text {}'.format(field_i + 1),
                       hashing_properties=hashing_props)
            for field_i in range(4)
        ])

    row = ['Bobby'] * 4
    master_secrets = [
        'No, I am your father'.encode(),
        "No... that's not true! That's impossible!".encode(),
    ]
    keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF')
    keys_legacy = generate_key_lists(master_secrets, len(row), kdf='legacy')

    bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema))
    bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema))
    hkdf_count = bloom_hkdf[0].count()
    legacy_count = bloom_legacy[0].count()

    # Legacy maps the 4 Bobbys to the same bits, so at most k bits per
    # bi-gram can be set ('Bobby' has 6 bi-grams with padding).
    self.assertLessEqual(legacy_count, hashing_props.k * 6)
    self.assertLess(legacy_count, hkdf_count)
    self.assertLessEqual(hkdf_count, len(row) * legacy_count)
def test_concurrent(self):
    """Score one chunk via anonlink.concurrency and verify mapping accuracy."""
    schema = self.nl.SCHEMA
    clks_a = tuple(entry[0] for entry in bloomfilter.stream_bloom_filters(
        self.s1, self.key_lists, schema))
    clks_b = tuple(entry[0] for entry in bloomfilter.stream_bloom_filters(
        self.s2, self.key_lists, schema))

    # One chunk covering the full extent of both datasets.
    chunk = [
        {"datasetIndex": 0, "range": [0, len(clks_a)]},
        {"datasetIndex": 1, "range": [0, len(clks_b)]},
    ]
    candidate_pairs = anonlink.concurrency.process_chunk(
        chunk,
        (clks_a, clks_b),
        anonlink.similarities.dice_coefficient,
        0.9,  # similarity threshold
        k=4)

    groups = anonlink.solving.greedy_solve(candidate_pairs)
    mapping = dict(anonlink.solving.pairs_from_groups(groups))
    self.check_accuracy(mapping)
def test_xor_folding_integration(self):
    """Folding once must XOR the two halves of the unfolded filter."""
    namelist = randomnames.NameList(1)

    unfolded_schema = namelist.SCHEMA
    assert unfolded_schema.xor_folds == 0

    # Folded variant: one fold halves the filter length.
    folded_schema = copy(unfolded_schema)
    folded_schema.xor_folds = 1
    folded_schema.l //= 2

    key_lists = generate_key_lists('secret', len(namelist.schema_types))
    bf_original, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, unfolded_schema))
    bf_folded, _, _ = next(bloomfilter.stream_bloom_filters(
        namelist.names, key_lists, folded_schema))

    half = len(bf_original) // 2
    self.assertEqual(
        bf_folded,
        bf_original[:half] ^ bf_original[half:],
        'Folded filter is not an XOR of the two halves of the original.')
def generate_data(samples, proportion=0.75):
    """Generate two overlapping record subsets and their Bloom filters.

    :param samples: Number of records in each subset.
    :param proportion: Fraction of overlap between the two subsets.
    :return: Tuple of (subset1, subset2, filters1, filters2).
    """
    nl = randomnames.NameList(samples * 2)
    s1, s2 = nl.generate_subsets(samples, proportion)
    keys = generate_key_lists('secret', len(nl.schema_types))

    def clks_for(records):
        # Keep only the bitarray (first element) from each streamed tuple.
        return [entry[0] for entry in
                bloomfilter.stream_bloom_filters(records, keys, nl.SCHEMA)]

    return (s1, s2, clks_for(s1), clks_for(s2))
def test_cffi_manual(self):
    """The Python and CFFI dice-coefficient implementations must agree."""
    nl = randomnames.NameList(30)
    subset_a, subset_b = nl.generate_subsets(5, 1.0)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))

    def clks_for(records):
        # Keep only the bitarray (first element) from each streamed tuple.
        return tuple(entry[0] for entry in
                     bloomfilter.stream_bloom_filters(records, keys, nl.SCHEMA))

    clks_a = clks_for(subset_a)
    clks_b = clks_for(subset_b)

    py_similarity = similarities.dice_coefficient_python(
        (clks_a, clks_b), self.default_threshold, self.default_k)
    c_similarity = similarities.dice_coefficient_accelerated(
        (clks_a, clks_b), self.default_threshold, self.default_k)
    self.assert_similarity_matrices_equal(py_similarity, c_similarity)
def setup_class(cls):
    """Build two overlapping CLK sets shared by all tests in this class."""
    cls.proportion = 0.8
    nl = randomnames.NameList(300)
    records_a, records_b = nl.generate_subsets(200, cls.proportion)
    keys = generate_key_lists(('test1', 'test2'), len(nl.schema_types))

    def clks_for(records):
        # Keep only the bitarray (first element) from each streamed tuple.
        return tuple(entry[0] for entry in
                     bloomfilter.stream_bloom_filters(records, keys, nl.SCHEMA))

    cls.filters1 = clks_for(records_a)
    cls.filters2 = clks_for(records_b)
    cls.filters = cls.filters1, cls.filters2

    # Defaults shared by the similarity tests.
    cls.default_k = 10
    cls.default_threshold = 0.5
def compare_python_c(ntotal=10000, nsubset=6000, frac=0.8):
    """Compare results and running time of the Python and C++ versions.

    :param ntotal: Total number of data points to generate
    :param nsubset: Number of points for each database
    :param frac: Fraction of overlap between subsets

    :raises: AssertionError if the results differ
    :return: dict with 'c' and 'python' keys with values of the total
        time taken for each implementation
    """
    nml = NameList(ntotal)
    sl1, sl2 = nml.generate_subsets(nsubset, frac)
    keys = generate_key_lists(('test1', 'test2'), len(nml.schema_types))
    filters1 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl1, keys, nml.SCHEMA)))
    filters2 = tuple(
        map(operator.itemgetter(0),
            stream_bloom_filters(sl2, keys, nml.SCHEMA)))

    def timed_run(similarity_fn):
        # Time a full candidate-pair generation with the given similarity.
        start = timer()
        found = anonlink.candidate_generation.find_candidate_pairs(
            (filters1, filters2), similarity_fn, 0.0, k=1)
        return found, timer() - start

    # Pure Python version first, then the C++ accelerated version,
    # matching the original measurement order.
    python_result, python_time = timed_run(
        anonlink.similarities.dice_coefficient_python)
    cffi_result, cffi_time = timed_run(
        anonlink.similarities.dice_coefficient_accelerated)

    assert python_result == cffi_result, \
        "Results are different between C++ cffi and Python"

    # Results are the same
    return {"c": cffi_time, "python": python_time}
def test_cffi_k(self):
    """End-to-end accelerated matching with k=4 should stay accurate."""
    schema = self.nl.SCHEMA
    clks_a = tuple(entry[0] for entry in bloomfilter.stream_bloom_filters(
        self.s1, self.key_lists, schema))
    clks_b = tuple(entry[0] for entry in bloomfilter.stream_bloom_filters(
        self.s2, self.key_lists, schema))

    candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
        (clks_a, clks_b),
        anonlink.similarities.dice_coefficient_accelerated,
        0.9,  # similarity threshold
        k=4)

    groups = anonlink.solving.greedy_solve(candidate_pairs)
    mapping = dict(anonlink.solving.pairs_from_groups(groups))
    self.check_accuracy(mapping)
def hash_and_serialize_chunk(
        chunk_pii_data,  # type: Sequence[Sequence[str]]
        keys,            # type: Sequence[Sequence[bytes]]
        schema           # type: Schema
    ):
    # type: (...) -> Tuple[List[str], Sequence[int]]
    """Generate Bloom filters (ie hash) from chunks of PII, then serialize
    the generated Bloom filters.

    Also computes and outputs the Hamming weight (or popcount) -- the
    number of bits set to one -- of each generated Bloom filter.

    :param chunk_pii_data: An iterable of indexable records.
    :param keys: A tuple of two lists of secret keys used in the HMAC.
    :param Schema schema: Schema specifying the entry formats and
        hashing settings.
    :return: A list of serialized Bloom filters and a list of
        corresponding popcounts.
    """
    serialized_filters = []
    popcounts = []
    # Each streamed item is a 3-tuple; the bitarray comes first and the
    # popcount last.
    for bloom_filter, _, popcount in stream_bloom_filters(
            chunk_pii_data, keys, schema):
        serialized_filters.append(serialize_bitarray(bloom_filter).strip())
        popcounts.append(popcount)
    return serialized_filters, popcounts
hashing_properties=FieldHashingProperties( encoding=FieldHashingProperties._DEFAULT_ENCODING, ngram=2, positional=False, weight=1), description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None) ]) pii = [['Deckard']] keys = generate_key_lists(('secret', ), 1) schema.fields[0].hashing_properties.weight = 0 bf0 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 1 bf1 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 2 bf2 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) schema.fields[0].hashing_properties.weight = 1.5 bf15 = next(bloomfilter.stream_bloom_filters(pii, keys, schema)) self.assertEqual(bf0[0].count(), 0) n1 = bf1[0].count() n2 = bf2[0].count() n15 = bf15[0].count() self.assertGreater(n1, 0)
description=None, case=StringSpec._DEFAULT_CASE, min_length=0, max_length=None) ]) row = ['Bobby', 'Bobby', 'Bobby', 'Bobby'] master_secrets = [ 'No, I am your father'.encode(), "No... that's not true! That's impossible!".encode() ] keys_hkdf = generate_key_lists(master_secrets, len(row), kdf='HKDF') keys_legacy = generate_key_lists(master_secrets, len(row), kdf='legacy') bloom_hkdf = next(stream_bloom_filters([row], keys_hkdf, schema)) bloom_legacy = next(stream_bloom_filters([row], keys_legacy, schema)) hkdf_count = bloom_hkdf[0].count() legacy_count = bloom_legacy[0].count() # lecay will map the 4 Bobbys' to the same bits, whereas hkdf will map each Bobby to different bits. self.assertLessEqual(legacy_count, schema.hashing_globals.k * 6) # 6 bi-grams self.assertLess(legacy_count, hkdf_count) self.assertLessEqual(hkdf_count, len(row) * legacy_count) def test_wrong_kdf(self): with self.assertRaises(ValueError): generate_key_lists([b'0'], 1, kdf='breakMe') def test_wrong_hash_function(self): with self.assertRaises(ValueError):