Exemplo n.º 1
0
    def test_ignore_header(self):
        """With header='ignore' the first row is skipped unconditionally."""
        secret = ('open', 'sesame')

        # A correct header, a wrongly-named header and a wrong-arity header
        # are all treated the same: the first line is dropped, leaving the
        # three data rows to be hashed.
        for csv_text in (self.csv_correct_header,
                         self.csv_incorrect_header_name,
                         self.csv_incorrect_count):
            out = clk.generate_clk_from_csv(io.StringIO(csv_text),
                                            secret,
                                            self.schema,
                                            header='ignore',
                                            progress_bar=False)
            self.assertEqual(len(out), 3)

        # With no header at all, the first data row is consumed as the
        # "header", so only two encodings come back.
        out = clk.generate_clk_from_csv(io.StringIO(self.csv_no_header),
                                        secret,
                                        self.schema,
                                        header='ignore',
                                        progress_bar=False)
        self.assertEqual(len(out), 2)
Exemplo n.º 2
0
def compute_hash_speed(num, quiet=False):
    # type: (int, bool) -> float
    """Time the hashing of ``num`` generated records.

    Writes a temporary CSV of fake person records, hashes it with
    ``generate_clk_from_csv`` and returns the achieved hash rate.

    :param num: number of records to generate and hash.
    :param quiet: if True, suppress the progress bar and summary line.
    :return: hashes per second.
    """
    namelist = NameList(num)

    os_fd, tmpfile_name = tempfile.mkstemp(text=True)
    # Only the file name is needed below; close the low-level descriptor
    # immediately so it cannot leak if anything later raises.
    os.close(os_fd)

    try:
        schema = NameList.SCHEMA
        header_row = ','.join(f.identifier for f in schema.fields)

        with open(tmpfile_name, 'wt') as f:
            f.write(header_row)
            f.write('\n')
            for person in namelist.names:
                print(','.join(str(field) for field in person), file=f)

        with open(tmpfile_name, 'rt') as f:
            start = timer()
            generate_clk_from_csv(f, ('key1', 'key2'), schema,
                                  progress_bar=not quiet)
            end = timer()
    finally:
        # Remove the temp file even when hashing fails (the original
        # leaked both the fd and the file on any exception).
        os.remove(tmpfile_name)

    elapsed_time = end - start
    if not quiet:
        print("{:6d} hashes in {:.6f} seconds. {:.2f} KH/s".format(num, elapsed_time, num / (1000 * elapsed_time)))
    return num / elapsed_time
Exemplo n.º 3
0
    def setUp(self):
        """Point the test at the live service and pre-hash both sample files.

        Reads the service URL from TEST_ENTITY_SERVICE and builds the JSON
        upload payloads for the two sample datasets.
        """
        super(TestRestClientInteractionWithService, self).setUp()
        self.url = os.environ['TEST_ENTITY_SERVICE']

        # Use context managers so the schema and data files are closed;
        # the original left all three handles open.
        with open(SAMPLE_DATA_SCHEMA_PATH, 'rt') as schema_file:
            schema_object = clkhash.schema.Schema.from_json_file(schema_file=schema_file)
        keys = ('secret', 'key')
        with open(SAMPLE_DATA_PATH_1, 'rt') as data_file:
            self.clk_data_1 = json.dumps(
                {'clks': generate_clk_from_csv(data_file, keys, schema_object, header='ignore')})
        with open(SAMPLE_DATA_PATH_2, 'rt') as data_file:
            self.clk_data_2 = json.dumps(
                {'clks': generate_clk_from_csv(data_file, keys, schema_object, header='ignore')})
Exemplo n.º 4
0
    def test_header(self):
        """With header=True a matching header passes; any mismatch raises."""
        secret = ('open', 'sesame')

        out = clk.generate_clk_from_csv(io.StringIO(self.csv_correct_header),
                                        secret,
                                        self.schema,
                                        header=True,
                                        progress_bar=False)
        self.assertEqual(len(out), 3)

        # A wrongly-named header, a wrong column count, and a missing
        # header must all be rejected with a FormatError.
        for bad_csv in (self.csv_incorrect_header_name,
                        self.csv_incorrect_count,
                        self.csv_no_header):
            with self.assertRaises(validate_data.FormatError):
                clk.generate_clk_from_csv(io.StringIO(bad_csv),
                                          secret,
                                          self.schema,
                                          header=True,
                                          progress_bar=False)
Exemplo n.º 5
0
    def test_expected_number_of_encodings_returned(self):
        """Three data rows in the CSV should produce exactly three CLKs."""
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        csv_stream = io.StringIO(self.CSV_INPUT)
        results = clk.generate_clk_from_csv(csv_stream,
                                            self.SECRET,
                                            loaded_schema,
                                            validate=True,
                                            header=True,
                                            progress_bar=False)

        assert len(results) == 3
Exemplo n.º 6
0
    def setup_class(cls):
        """Load the test schema, hash both sample datasets and cache the payloads.

        Reads the service URL from TEST_ENTITY_SERVICE and stores the JSON
        CLK uploads on the class for the tests to reuse.
        """
        cls.url = os.environ['TEST_ENTITY_SERVICE']

        # Context managers close the files; the original leaked all three
        # handles (schema plus both sample data files).
        with open(SAMPLE_DATA_SCHEMA_PATH, 'rt') as schema_file:
            schema_object = clkhash.schema.from_json_file(
                schema_file=schema_file)
        keys = ('secret', 'key')

        def _clks_payload(path):
            # Hash one CSV file and wrap the CLKs in the upload payload.
            with open(path, 'rt') as f:
                return json.dumps({
                    'clks': generate_clk_from_csv(f,
                                                  keys,
                                                  schema_object,
                                                  header='ignore')
                })

        cls.clk_data_1 = _clks_payload(SAMPLE_DATA_PATH_1)
        cls.clk_data_2 = _clks_payload(SAMPLE_DATA_PATH_2)
        cls._created_projects = []
Exemplo n.º 7
0
    def test_encoding_regression(self):
        """Pin the exact base64 CLKs so any change to hashing is detected."""
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        results = clk.generate_clk_from_csv(
            io.StringIO(self.CSV_INPUT),
            self.KEYS,
            loaded_schema,
            validate=True,
            header=True,
            progress_bar=False)

        expected_first_two = (
            'THHkzVWFYtzMJzmWobTLN8k8VwRN8+na10bN3N9I9oDPGuRZLGpV/QXZYtRZ6/wc+K3W9wvmDA2KpHmOTlVAY9jDblysQ9zlR86OMSbBn+uG3Qxi8EDpUN6nSI5FfOK1Zt77J0ye8P3wifF6QdkFfm3UXNGWil7CPNnUa/fHG0w=',
            '/r76/u//7+1O/3bG//7N5t3evpe/Wt7+v/f/Xt/+9rpXW//f/p7/v//3/vv7v/7/fv7X//vf3Vf/9vP//nd/3t93dt7/dPr/fj7f1z5B3/7W1u/qr+b3//q6729n6/au7772TPz+2s3u/n/88/9OTG/PxvrOh/7Hb89cz+Z3vmo=',
        )
        for index, expected in enumerate(expected_first_two):
            assert results[index] == expected
Exemplo n.º 8
0
    def test_doesnt_crash(self):
        """Smoke test: hashing a small, slightly messy CSV must not raise."""
        csv_input = io.StringIO('name,id,dob,gender,children\n'
                                'KÉVIN,kev007,1963-12-13,M,1\n'
                                '"JOHN HOWARD, ESQ.",stv534,1992-02-29,M,16\n'
                                'JULIA,alp423,0123-01-12,F,0\n')

        kdf_config = {
            'type': 'HKDF',
            'hash': 'SHA256',
            'salt': 'SCbL2zHNnmsckfzchsNkZY9XoHk96P/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA==',
            'info': 'c2NoZW1hX2V4YW1wbGU=',
            'keySize': 64,
        }
        features = [
            {'identifier': 'name',
             'format': {'type': 'string', 'encoding': 'utf-8',
                        'case': 'upper'},
             'hashing': {'ngram': 2, 'weight': 2}},
            {'identifier': 'id',
             'format': {'type': 'string', 'encoding': 'ascii',
                        'pattern': r'[a-z][a-z][a-z]\d\d\d'},
             'hashing': {'ngram': 1, 'positional': True}},
            {'identifier': 'dob',
             'format': {'type': 'date', 'format': '%Y-%m-%d',
                        'description': 'When were ya born?'},
             'hashing': {'ngram': 2, 'positional': True, 'weight': .5}},
            {'identifier': 'gender',
             'format': {'type': 'enum', 'values': ['M', 'F']},
             'hashing': {'ngram': 1, 'positional': False}},
            {'identifier': 'children',
             'format': {'type': 'integer', 'maximum': 20},
             'hashing': {'ngram': 1, 'positional': True}},
        ]
        schema_dict = {
            'version': 1,
            'clkConfig': {'l': 1024,
                          'k': 30,
                          'kdf': kdf_config,
                          'hash': {'type': 'doubleHash'}},
            'features': features,
        }
        keys = ('chicken', 'nuggets')

        loaded_schema = schema.Schema.from_json_dict(schema_dict)

        # Completing without an exception is the whole assertion here.
        clk.generate_clk_from_csv(csv_input,
                                  keys,
                                  loaded_schema,
                                  validate=True,
                                  header=True,
                                  progress_bar=False)
Exemplo n.º 9
0
    def test_encoding_regression(self):
        """Pin the exact base64 CLKs so any change to hashing is detected."""
        loaded_schema = schema.from_json_dict(self.SCHEMA_DICT)

        results = clk.generate_clk_from_csv(io.StringIO(self.CSV_INPUT),
                                            self.SECRET,
                                            loaded_schema,
                                            validate=True,
                                            header=True,
                                            progress_bar=False)

        expected_first_two = (
            'SU9+/O/Jzzi0sfzH8K2l3+qfhn8Ky3jVI21DVdH9j2fXE++JH8GcQGSeYxDZFxALCAT8CHwYJyQcRT3MhUQOFWcOf5fWdr6ofh6DYy8iv////weyunbMahfV9RMWkRwQmBL3fjreUVOCS9D9kAbQC2XgULidKCTHd9ZpbPJ91eE=',
            'Pfl1/d7/31/+9u9x9zv//76/83//0v1Xt/dX/3X/e79XP7vd+Xfkf//2/9Xb/7Fd73e9f/n0f/c7Vb99B/X29d8997Pz/vJ87X/X/vcX9vt1d+/+5bP1fvfevnfX8d/f/j0XPL7f999kc/28/3d4c7t/9b/+Pf411/f2+3z1d/s=',
        )
        for index, expected in enumerate(expected_first_two):
            assert results[index] == expected
Exemplo n.º 10
0
def hash(pii_csv, secret, schema, clk_json, no_header, check_header, validate,
         multiprocessing, verbose):
    """Process data to create CLKs

    Given a file containing CSV data as PII_CSV, and a JSON
    document defining the expected schema, verify the schema, then
    hash the data to create CLKs writing them as JSON to CLK_JSON. Note the CSV
    file should contain a header row - however this row is not used
    by this tool.

    It is important that the secret is only known by the two data providers. One word must be provided. For example:

    $clkutil hash pii.csv horse-staple pii-schema.json clk.json

    Use "-" for CLK_JSON to write JSON to stdout.
    """
    try:
        schema_object = clkhash.schema.from_json_file(schema_file=schema)
    except SchemaError as e:
        log(str(e))
        raise SystemExit(-1)

    # header is True (validate it), 'ignore' (skip it unchecked) or
    # False (file has no header). --no-header takes precedence.
    header = True
    if not check_header:
        header = 'ignore'
    if no_header:
        header = False

    try:
        clk_data = clk.generate_clk_from_csv(
            pii_csv,
            secret,
            schema_object,
            validate=validate,
            header=header,
            progress_bar=verbose,
            use_multiprocessing=multiprocessing)
    except (validate_data.EntryError, validate_data.FormatError) as e:
        msg, = e.args
        log(msg)
        log('Hashing failed.')
        # Exit non-zero so scripted callers can detect the failure —
        # previously this fell through and the command exited 0,
        # inconsistent with the SchemaError path above.
        raise SystemExit(-1)
    else:
        json.dump({'clks': clk_data}, clk_json)
        if hasattr(clk_json, 'name'):
            log("CLK data written to {}".format(clk_json.name))
Exemplo n.º 11
0
def hash(input, keys, schema, output, quiet, no_header, check_header,
         validate):
    """Process data to create CLKs

    Given a file containing csv data as INPUT, and a json
    document defining the expected schema, verify the schema, then
    hash the data to create CLKs writing to OUTPUT. Note the CSV
    file should contain a header row - however this row is not used
    by this tool.

    It is important that the keys are only known by the two data providers. Two words should be provided. For example:

    $clkutil hash input.txt horse staple output.txt

    Use "-" to output to stdout.
    """

    schema_object = clkhash.schema.Schema.from_json_file(schema_file=schema)

    # Decide how the CSV header row is handled: --no-header wins over
    # --check-header; by default the header is validated.
    if no_header:
        header = False
    elif not check_header:
        header = 'ignore'
    else:
        header = True

    try:
        clk_data = clk.generate_clk_from_csv(input,
                                             keys,
                                             schema_object,
                                             validate=validate,
                                             header=header,
                                             progress_bar=not quiet)
    except (validate_data.EntryError, validate_data.FormatError) as e:
        # Report the problem and fall through without writing any output.
        (msg,) = e.args
        log(msg)
        log('Hashing failed.')
    else:
        json.dump({'clks': clk_data}, output)
        if hasattr(output, 'name'):
            log("CLK data written to {}".format(output.name))
Exemplo n.º 12
0
def generate_clks(filename, schema):
    """Hash the CSV at *filename* with the module-level SECRET and return
    the deserialized Bloom filters."""
    with open(filename, 'rt') as csv_file:
        hashed = clk.generate_clk_from_csv(csv_file, SECRET, schema)
    return deserialize_filters(hashed)