def test_ignore_header(self):
    """header='ignore' must skip the first row regardless of its content.

    The three inputs with a (possibly wrong) header row hash to 3 CLKs;
    the headerless input loses its first data row to the skip, leaving 2.
    """
    cases = [
        (self.csv_correct_header, 3),
        (self.csv_incorrect_header_name, 3),
        (self.csv_incorrect_count, 3),
        (self.csv_no_header, 2),  # first data row is consumed as the "header"
    ]
    for csv_text, expected_count in cases:
        out = clk.generate_clk_from_csv(
            io.StringIO(csv_text),
            ('open', 'sesame'),
            self.schema,
            header='ignore',
            progress_bar=False)
        self.assertEqual(len(out), expected_count)
def compute_hash_speed(num, quiet=False):
    # type: (int, bool) -> float
    """Hash time.

    Generates ``num`` fake PII records, writes them to a temporary CSV,
    times how long ``generate_clk_from_csv`` takes to hash them, and
    returns the hash rate in hashes per second.

    :param num: number of records to generate and hash.
    :param quiet: when True, suppress the progress bar and summary line.
    :return: hashing throughput, ``num / elapsed_time``.
    """
    namelist = NameList(num)
    os_fd, tmpfile_name = tempfile.mkstemp(text=True)
    # Close the mkstemp fd immediately: the file is reopened by name below,
    # and the original code leaked this descriptor (and the file itself)
    # whenever hashing raised before the cleanup lines were reached.
    os.close(os_fd)

    schema = NameList.SCHEMA
    header_row = ','.join(f.identifier for f in schema.fields)

    try:
        with open(tmpfile_name, 'wt') as f:
            f.write(header_row)
            f.write('\n')
            for person in namelist.names:
                print(','.join(str(field) for field in person), file=f)

        with open(tmpfile_name, 'rt') as f:
            start = timer()
            generate_clk_from_csv(f, ('key1', 'key2'), schema,
                                  progress_bar=not quiet)
            end = timer()
    finally:
        # Always remove the temp file, even if writing or hashing failed.
        os.remove(tmpfile_name)

    elapsed_time = end - start
    if not quiet:
        print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(
            num, elapsed_time, num / (1000 * elapsed_time)))
    return num / elapsed_time
def setUp(self):
    """Load the sample schema and pre-hash both sample datasets as JSON payloads.

    Fix over the original: the schema and the two data files were opened
    without ever being closed, leaking three file handles per test; each
    open is now wrapped in a context manager.
    """
    super(TestRestClientInteractionWithService, self).setUp()
    self.url = os.environ['TEST_ENTITY_SERVICE']
    with open(SAMPLE_DATA_SCHEMA_PATH, 'rt') as schema_file:
        schema_object = clkhash.schema.Schema.from_json_file(
            schema_file=schema_file)
    keys = ('secret', 'key')
    with open(SAMPLE_DATA_PATH_1, 'rt') as data_file:
        self.clk_data_1 = json.dumps(
            {'clks': generate_clk_from_csv(data_file, keys, schema_object,
                                           header='ignore')})
    with open(SAMPLE_DATA_PATH_2, 'rt') as data_file:
        self.clk_data_2 = json.dumps(
            {'clks': generate_clk_from_csv(data_file, keys, schema_object,
                                           header='ignore')})
def test_header(self):
    """With header=True, the header row must match the schema exactly.

    A correct header hashes cleanly; a wrong name, a wrong column count,
    or a missing header all raise FormatError.
    """
    secret = ('open', 'sesame')
    out = clk.generate_clk_from_csv(
        io.StringIO(self.csv_correct_header),
        secret,
        self.schema,
        header=True,
        progress_bar=False)
    self.assertEqual(len(out), 3)

    bad_inputs = (self.csv_incorrect_header_name,
                  self.csv_incorrect_count,
                  self.csv_no_header)
    for csv_text in bad_inputs:
        with self.assertRaises(validate_data.FormatError):
            clk.generate_clk_from_csv(
                io.StringIO(csv_text),
                secret,
                self.schema,
                header=True,
                progress_bar=False)
def test_expected_number_of_encodings_returned(self):
    """Three CSV data rows should produce exactly three encodings."""
    loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)
    encodings = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.SECRET,
        loaded_schema,
        validate=True,
        header=True,
        progress_bar=False)
    assert len(encodings) == 3
def setup_class(cls):
    """Load the sample schema and pre-hash both sample datasets as JSON payloads.

    Fix over the original: the schema and the two data files were opened
    without ever being closed, leaking three file handles; each open is
    now wrapped in a context manager.
    """
    cls.url = os.environ['TEST_ENTITY_SERVICE']
    with open(SAMPLE_DATA_SCHEMA_PATH, 'rt') as schema_file:
        schema_object = clkhash.schema.from_json_file(schema_file=schema_file)
    keys = ('secret', 'key')
    with open(SAMPLE_DATA_PATH_1, 'rt') as data_file:
        cls.clk_data_1 = json.dumps({
            'clks': generate_clk_from_csv(data_file, keys, schema_object,
                                          header='ignore')
        })
    with open(SAMPLE_DATA_PATH_2, 'rt') as data_file:
        cls.clk_data_2 = json.dumps({
            'clks': generate_clk_from_csv(data_file, keys, schema_object,
                                          header='ignore')
        })
    cls._created_projects = []
def test_encoding_regression(self):
    """Pin the first two encodings against known-good base64 output."""
    loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)
    results = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.KEYS,
        loaded_schema,
        validate=True,
        header=True,
        progress_bar=False)
    expected = [
        'THHkzVWFYtzMJzmWobTLN8k8VwRN8+na10bN3N9I9oDPGuRZLGpV/QXZYtRZ6/wc+K3W9wvmDA2KpHmOTlVAY9jDblysQ9zlR86OMSbBn+uG3Qxi8EDpUN6nSI5FfOK1Zt77J0ye8P3wifF6QdkFfm3UXNGWil7CPNnUa/fHG0w=',
        '/r76/u//7+1O/3bG//7N5t3evpe/Wt7+v/f/Xt/+9rpXW//f/p7/v//3/vv7v/7/fv7X//vf3Vf/9vP//nd/3t93dt7/dPr/fj7f1z5B3/7W1u/qr+b3//q6729n6/au7772TPz+2s3u/n/88/9OTG/PxvrOh/7Hb89cz+Z3vmo=',
    ]
    assert results[0] == expected[0]
    assert results[1] == expected[1]
def test_doesnt_crash(self):
    """Hashing a small, varied CSV against a full inline schema must not raise."""
    csv_input = io.StringIO(
        'name,id,dob,gender,children\n'
        'KÉVIN,kev007,1963-12-13,M,1\n'
        '"JOHN HOWARD, ESQ.",stv534,1992-02-29,M,16\n'
        'JULIA,alp423,0123-01-12,F,0\n')

    # One feature definition per CSV column, exercising every format type.
    features = [
        dict(identifier='name',
             format=dict(type='string', encoding='utf-8', case='upper'),
             hashing=dict(ngram=2, weight=2)),
        dict(identifier='id',
             format=dict(type='string', encoding='ascii',
                         pattern=r'[a-z][a-z][a-z]\d\d\d'),
             hashing=dict(ngram=1, positional=True)),
        dict(identifier='dob',
             format=dict(type='date', format='%Y-%m-%d',
                         description='When were ya born?'),
             hashing=dict(ngram=2, positional=True, weight=.5)),
        dict(identifier='gender',
             format=dict(type='enum', values=['M', 'F']),
             hashing=dict(ngram=1, positional=False)),
        dict(identifier='children',
             format=dict(type='integer', maximum=20),
             hashing=dict(ngram=1, positional=True)),
    ]
    schema_dict = dict(
        version=1,
        clkConfig=dict(
            l=1024,
            k=30,
            kdf=dict(
                type='HKDF',
                hash='SHA256',
                salt='SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==',
                info='c2NoZW1hX2V4YW1wbGU=',
                keySize=64),
            hash=dict(type='doubleHash')),
        features=features)

    loaded_schema = schema.Schema.from_json_dict(schema_dict)
    clk.generate_clk_from_csv(csv_input, ('chicken', 'nuggets'),
                              loaded_schema, validate=True, header=True,
                              progress_bar=False)
def test_encoding_regression(self):
    """Pin the first two encodings against known-good base64 output."""
    loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)
    results = clk.generate_clk_from_csv(
        io.StringIO(self.CSV_INPUT),
        self.SECRET,
        loaded_schema,
        validate=True,
        header=True,
        progress_bar=False)
    expected = [
        'SU9+/O/Jzzi0sfzH8K2l3+qfhn8Ky3jVI21DVdH9j2fXE++JH8GcQGSeYxDZFxALCAT8CHwYJyQcRT3MhUQOFWcOf5fWdr6ofh6DYy8iv////weyunbMahfV9RMWkRwQmBL3fjreUVOCS9D9kAbQC2XgULidKCTHd9ZpbPJ91eE=',
        'Pfl1/d7/31/+9u9x9zv//76/83//0v1Xt/dX/3X/e79XP7vd+Xfkf//2/9Xb/7Fd73e9f/n0f/c7Vb99B/X29d8997Pz/vJ87X/X/vcX9vt1d+/+5bP1fvfevnfX8d/f/j0XPL7f999kc/28/3d4c7t/9b/+Pf411/f2+3z1d/s=',
    ]
    assert results[0] == expected[0]
    assert results[1] == expected[1]
def hash(pii_csv, secret, schema, clk_json, no_header, check_header,
         validate, multiprocessing, verbose):
    """Process data to create CLKs

    Given a file containing CSV data as PII_CSV, and a JSON document
    defining the expected schema, verify the schema, then hash the data
    to create CLKs writing them as JSON to CLK_JSON. Note the CSV file
    should contain a header row - however this row is not used by this
    tool.

    It is important that the secret is only known by the two data
    providers. One word must be provided. For example:

    $clkutil hash pii.csv horse-staple pii-schema.json clk.json

    Use "-" for CLK_JSON to write JSON to stdout.
    """
    try:
        schema_object = clkhash.schema.from_json_file(schema_file=schema)
    except SchemaError as e:
        log(str(e))
        raise SystemExit(-1)

    # Map the two CLI flags onto the three-valued `header` argument:
    # True = validate header, 'ignore' = skip it, False = no header row.
    header = True
    if not check_header:
        header = 'ignore'
    if no_header:
        header = False

    try:
        clk_data = clk.generate_clk_from_csv(
            pii_csv, secret, schema_object,
            validate=validate,
            header=header,
            progress_bar=verbose,
            use_multiprocessing=multiprocessing)
    except (validate_data.EntryError, validate_data.FormatError) as e:
        msg, = e.args
        log(msg)
        log('Hashing failed.')
        # Fix: exit nonzero on failure, matching the SchemaError path above.
        # Previously the command logged the error but exited with status 0.
        raise SystemExit(-1)
    else:
        json.dump({'clks': clk_data}, clk_json)
        if hasattr(clk_json, 'name'):
            log("CLK data written to {}".format(clk_json.name))
def hash(input, keys, schema, output, quiet, no_header, check_header,
         validate):
    """Process data to create CLKs

    Given a file containing csv data as INPUT, and a json document
    defining the expected schema, verify the schema, then hash the data
    to create CLKs writing to OUTPUT. Note the CSV file should contain a
    header row - however this row is not used by this tool.

    It is important that the keys are only known by the two data
    providers. Two words should be provided. For example:

    $clkutil hash input.txt horse staple output.txt

    Use "-" to output to stdout.
    """
    schema_object = clkhash.schema.Schema.from_json_file(schema_file=schema)

    # Map the two CLI flags onto the three-valued `header` argument:
    # True = validate header, 'ignore' = skip it, False = no header row.
    header = True
    if not check_header:
        header = 'ignore'
    if no_header:
        header = False

    try:
        clk_data = clk.generate_clk_from_csv(
            input, keys, schema_object,
            validate=validate,
            header=header,
            progress_bar=not quiet)
    except (validate_data.EntryError, validate_data.FormatError) as e:
        msg, = e.args
        log(msg)
        log('Hashing failed.')
        # Fix: exit nonzero on failure. Previously the command logged the
        # error but still exited with status 0, hiding the failure from
        # scripts and CI.
        raise SystemExit(-1)
    else:
        json.dump({'clks': clk_data}, output)
        if hasattr(output, 'name'):
            log("CLK data written to {}".format(output.name))
def generate_clks(filename, schema):
    """Hash the CSV rows in *filename* with the module SECRET and return
    the deserialized Bloom filters."""
    with open(filename, 'rt') as csv_file:
        raw_clks = clk.generate_clk_from_csv(csv_file, SECRET, schema)
    return deserialize_filters(raw_clks)