def test_missing_value_integration_v1_schema():
    """Missing-value substitution round-trip against a *version 1* schema.

    Renamed from ``test_missing_value_integration``: an identically named
    test (v2 schema) is defined later in this module and would shadow this
    one, so pytest would silently never collect it.

    We create two CLKs: one from PII that already contains the
    'replaceWith' values, and one that contains the sentinels. If
    substitution works, the two CLKs are identical.
    """
    schema_dict = dict(
        version=1,
        clkConfig=dict(l=1024, k=20,
                       hash=dict(type='doubleHash'),
                       kdf=dict(type='HKDF')),
        features=[
            dict(identifier='name',
                 format=dict(type='string', encoding='utf-8'),
                 hashing=dict(ngram=2,
                              missingValue=dict(sentinel='null',
                                                replaceWith='Bob'))),
            dict(identifier='age',
                 format=dict(type='integer'),
                 hashing=dict(ngram=1,
                              missingValue=dict(sentinel='NA',
                                                replaceWith='42')))
        ])
    s = schema.from_json_dict(schema_dict)
    pii = [['Bob', '42'], ['null', 'NA']]
    clks = generate_clks(pii, schema=s, keys=('sec1', 'sec2'))
    assert len(clks) == 2
    # Sentinel row must encode identically to the replacement row.
    assert clks[0] == clks[1]
def test_missing_value_integration():
    """Missing-value substitution round-trip against a version 2 schema.

    Two CLKs are generated: one from PII holding the 'replaceWith'
    values, the other holding the sentinels. Correct substitution makes
    the two CLKs identical.
    """
    schema_json = """
    {
      "version": 2,
      "clkConfig": {
        "l": 1024,
        "kdf": {"type": "HKDF"}
      },
      "features": [
        {
          "identifier": "name",
          "format": {"type": "string", "encoding": "utf-8"},
          "hashing": {
            "ngram": 2,
            "strategy": {"k": 20},
            "missingValue": {"sentinel": "null", "replaceWith": "Bob"}
          }
        },
        {
          "identifier": "age",
          "format": {"type": "integer"},
          "hashing": {
            "ngram": 1,
            "strategy": {"k": 20},
            "missingValue": {"sentinel": "NA", "replaceWith": "42"}
          }
        }
      ]
    }
    """
    loaded = schema.from_json_dict(json.loads(schema_json))
    records = [['Bob', '42'], ['null', 'NA']]
    encodings = generate_clks(records, schema=loaded, keys=('sec1', 'sec2'))
    assert len(encodings) == 2
    # Sentinel row must encode identically to the replacement row.
    assert encodings[0] == encodings[1]
def test_validation_of_illdefined_not_ignored_feature(self):
    """A feature with neither 'format' nor 'hashing' must set 'ignored': true.

    With 'ignored': False and no format/hashing, schema validation must
    raise :class:`SchemaError`.
    """
    schema_dict = {
        'version': 2,
        'clkConfig': {
            'l': 1024,
            'kdf': {'type': 'HKDF'}
        },
        'features': [{
            'identifier': 'rec_id',
            'ignored': False
        }]
    }
    # Idiom fix: assert the specific exception type directly instead of
    # catching a generic Exception and then checking isinstance.
    with self.assertRaises(SchemaError):
        schema.from_json_dict(schema_dict)
def test_expected_number_of_encodings_returned(self):
    """One CLK must be produced per data row of the CSV input (three here)."""
    loaded = schema.from_json_dict(self.SCHEMA_DICT)
    encodings = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.SECRET,
        loaded,
        validate=True,
        header=True,
        progress_bar=False)
    assert len(encodings) == 3
def test_encoding_regression(self):
    """Pin the exact CLK output (keyed variant) so any change to the
    hashing pipeline that alters encodings is caught immediately."""
    loaded = schema.from_json_dict(self.SCHEMA_DICT)
    encodings = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.KEYS,
        loaded,
        validate=True,
        header=True,
        progress_bar=False)
    expected = [
        'THHkzVWFYtzMJzmWobTLN8k8VwRN8+na10bN3N9I9oDPGuRZLGpV/QXZYtRZ6/wc+K3W9wvmDA2KpHmOTlVAY9jDblysQ9zlR86OMSbBn+uG3Qxi8EDpUN6nSI5FfOK1Zt77J0ye8P3wifF6QdkFfm3UXNGWil7CPNnUa/fHG0w=',
        '/r76/u//7+1O/3bG//7N5t3evpe/Wt7+v/f/Xt/+9rpXW//f/p7/v//3/vv7v/7/fv7X//vf3Vf/9vP//nd/3t93dt7/dPr/fj7f1z5B3/7W1u/qr+b3//q6729n6/au7772TPz+2s3u/n/88/9OTG/PxvrOh/7Hb89cz+Z3vmo=',
    ]
    for got, want in zip(encodings, expected):
        assert got == want
def test_encoding_regression(self):
    """Pin the exact CLK output (secret variant) so any change to the
    hashing pipeline that alters encodings is caught immediately."""
    loaded = schema.from_json_dict(self.SCHEMA_DICT)
    encodings = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.SECRET,
        loaded,
        validate=True,
        header=True,
        progress_bar=False)
    expected = [
        'SU9+/O/Jzzi0sfzH8K2l3+qfhn8Ky3jVI21DVdH9j2fXE++JH8GcQGSeYxDZFxALCAT8CHwYJyQcRT3MhUQOFWcOf5fWdr6ofh6DYy8iv////weyunbMahfV9RMWkRwQmBL3fjreUVOCS9D9kAbQC2XgULidKCTHd9ZpbPJ91eE=',
        'Pfl1/d7/31/+9u9x9zv//76/83//0v1Xt/dX/3X/e79XP7vd+Xfkf//2/9Xb/7Fd73e9f/n0f/c7Vb99B/X29d8997Pz/vJ87X/X/vcX9vt1d+/+5bP1fvfevnfX8d/f/j0XPL7f999kc/28/3d4c7t/9b/+Pf411/f2+3z1d/s=',
    ]
    for got, want in zip(encodings, expected):
        assert got == want
def test_issue_111(self):
    """The schema from issue #111 must load without raising.

    The feature list repeats two shapes, so it is built with two small
    local helpers; the resulting dict is identical to the hand-written
    literal it replaces.
    """
    def text(name, **extra_format):
        # utf-8 string feature hashed as bigrams with weight 1.
        fmt = {'type': 'string', 'encoding': 'utf-8'}
        fmt.update(extra_format)
        return {
            'identifier': name,
            'format': fmt,
            'hashing': {'ngram': 2, 'weight': 1}
        }

    def number(name, **extra_format):
        # integer feature hashed positionally as unigrams with weight 1.
        fmt = {'type': 'integer'}
        fmt.update(extra_format)
        return {
            'identifier': name,
            'format': fmt,
            'hashing': {'ngram': 1, 'positional': True, 'weight': 1}
        }

    schema_dict = {
        'version': 1,
        'clkConfig': {
            'l': 1024,
            'k': 20,
            'hash': {'type': 'doubleHash'},
            'kdf': {'type': 'HKDF'}
        },
        'features': [
            {'identifier': 'rec_id', 'ignored': True},
            text('given_name'),
            text('surname'),
            number('street_number'),
            text('address_1'),
            text('address_2'),
            text('suburb'),
            number('postcode', minimum=1000, maximum=9999),
            text('state', maxLength=3),
            number('day_of_birth'),
            {'identifier': 'soc_sec_id', 'ignored': True},
        ]
    }
    # This fails in #111. Now it shouldn't.
    schema.from_json_dict(schema_dict)
class NameList:
    """ Randomly generated PII records. """

    # Schema for the generated records, loaded from package data at
    # class-definition (import) time. A missing data file is reported
    # immediately rather than at first use.
    randomname_schema_bytes = pkgutil.get_data('clkhash', 'data/randomnames-schema.json')
    if randomname_schema_bytes is None:
        raise Exception(
            "Couldn't locate package data. Please file a bug report.")
    randomname_schema = json.loads(randomname_schema_bytes.decode())
    SCHEMA = schema.from_json_dict(randomname_schema)

    def __init__(self, n):
        # type: (int) -> None
        """Load the name/age distributions and generate ``n`` random people."""
        self.load_data()
        self.year = date.today().year - 1
        self.names = [person for person in self.generate_random_person(n)]
        # NOTE(review): these assignments run AFTER load_data() populated the
        # same attributes, so the distributions are discarded once the names
        # list has been built (presumably to release memory — confirm).
        # As written, generate_random_person cannot be called again without
        # first re-running load_data(), since it asserts these are not None.
        self.all_male_first_names = None  # type: Optional[Distribution]
        self.all_female_first_names = None  # type: Optional[Distribution]
        self.all_last_names = None  # type: Optional[Distribution]
        self.all_ages = None  # type: Optional[Distribution]

    @property
    def schema_types(self):
        # type: () -> Sequence[FieldSpec]
        """Field specifications of the record schema."""
        return self.SCHEMA.fields

    def generate_random_person(self, n):
        # type: (int) -> Iterable[Tuple[str, str, str, str]]
        """ Generator that yields details on a person with plausible
            name, sex and age.

            Requires load_data() to have populated the distributions
            (enforced by the asserts below).

            :yields: Generated data for one person
                tuple - (id: str, name: str('First Last'),
                birthdate: str('YYYY/MM/DD'), sex: str('M' | 'F') )
        """
        assert self.all_male_first_names is not None
        assert self.all_female_first_names is not None
        assert self.all_last_names is not None
        for i in range(n):
            # Coin flip for sex; it selects which first-name distribution
            # to draw from.
            sex = 'M' if random.random() > 0.5 else 'F'
            dob = random_date(self.year, self.all_ages).strftime("%Y/%m/%d")
            first_name = self.all_male_first_names.generate(
            ) if sex == 'M' else self.all_female_first_names.generate()
            last_name = self.all_last_names.generate()
            yield (str(i), first_name + ' ' + last_name, dob, sex)

    def load_data(self):
        # type: () -> None
        """ Loads databases from package data

            Uses data files sourced from
            http://www.quietaffiliate.com/free-first-name-and-last-name-databases-csv-and-sql/
            https://www.census.gov/topics/population/genealogy/data/2010_surnames.html
            https://www.abs.gov.au/AUSSTATS/[email protected]/DetailsPage/3101.0Jun%202016
        """
        self.all_male_first_names = Distribution('data/male-first-names.csv')
        self.all_female_first_names = Distribution(
            'data/female-first-names.csv')
        self.all_last_names = Distribution('data/last-names.csv')
        self.all_ages = Distribution('data/ages.csv')

    def generate_subsets(self, sz, overlap=0.8, subsets=2):
        # type: (int, float, int) -> Tuple[List, ...]
        """ Return random subsets with nonempty intersection.

            The random subsets are of specified size. If an element is
            common to two subsets, then it is common to all subsets.
            This overlap is controlled by a parameter.

            :param sz: size of subsets to generate
            :param overlap: size of the intersection, as fraction of the
                subset length
            :param subsets: number of subsets to generate

            :raises ValueError: if there aren't sufficiently many names
                in the list to satisfy the request; more precisely,
                raises if
                (1 - subsets) * floor(overlap * sz) + subsets * sz
                > len(self.names).

            :return: tuple of subsets
        """
        overlap_sz = int(math.floor(overlap * sz))
        unique_sz = sz - overlap_sz  # Unique names per subset
        total_unique_sz = unique_sz * subsets  # Uniques in all subsets
        total_sz = overlap_sz + total_unique_sz
        if total_sz > len(self.names):
            msg = 'insufficient names for requested size and overlap'
            raise ValueError(msg)
        # Sample everything we need in one draw so the overlap pool and
        # the unique pools are disjoint.
        sset = random.sample(self.names, total_sz)
        # Overlapping subset, pool of unique names
        sset_overlap, sset_unique = sset[:overlap_sz], sset[overlap_sz:]
        assert len(sset_unique) == subsets * unique_sz
        # Split pool of unique names into `subsets` chunks
        uniques = (sset_unique[p * unique_sz:(p + 1) * unique_sz]
                   for p in range(subsets))
        return tuple(sset_overlap + u for u in uniques)