示例#1
0
 def setUp(self):
     """Populate ``self.fields`` with one specification per test column.

     All specifications share a single ``FieldHashingProperties`` object
     configured with ASCII encoding, an n-gram comparator with ``n = 2``
     and ``BitsPerTokenStrategy(20)``.
     """
     bigram_comparator = get_comparator({'type': 'ngram', 'n': 2})
     shared_hashing = FieldHashingProperties(
         encoding='ascii',
         comparator=bigram_comparator,
         strategy=BitsPerTokenStrategy(20))

     # Both name columns take the same length bounds and hashing setup;
     # they differ only in identifier and expected letter case.
     name_options = {
         'min_length': 1,
         'max_length': None,
         'hashing_properties': shared_hashing,
     }
     given_name = StringSpec(identifier='given name', case='lower',
                             **name_options)
     surname = StringSpec(identifier='surname', case='upper',
                          **name_options)

     email = StringSpec(identifier='email address',
                        regex=r'.+@.+\..+',
                        hashing_properties=shared_hashing)
     age = IntegerSpec(identifier='age',
                       minimum=18,
                       maximum=99,
                       hashing_properties=shared_hashing)
     join_date = DateSpec(identifier='join date',
                          format='%Y-%m-%d',
                          hashing_properties=shared_hashing)
     account_type = EnumSpec(identifier='account type',
                             values=['free', 'paid'],
                             hashing_properties=shared_hashing)

     self.fields = [given_name, surname, email, age, join_date,
                    account_type]
示例#2
0
def fhp_from_json_dict(json_dict  # type: Dict[str, Any]
                       ):
    # type: (...) -> FieldHashingProperties
    """
    Make a :class:`FieldHashingProperties` object from a dictionary.

    The input dictionary is treated as read-only: defaults are filled
    into a copy of its ``'comparison'`` entry, never into the caller's
    own data.

    :param dict json_dict:
        Conforming to the `hashingConfig` definition
        in the `v2` linkage schema. Must contain the keys
        ``'comparison'`` and ``'strategy'``; ``'hash'`` and
        ``'missingValue'`` are optional.
    :return: A :class:`FieldHashingProperties` instance.
    :raises KeyError: If ``'comparison'`` or ``'strategy'`` is missing,
        or if a ``'hash'`` entry is given without a ``'type'``.
    """
    # When no hash configuration is supplied, fall back to 'blakeHash'.
    hash_config = json_dict.get('hash', {'type': 'blakeHash'})

    # Shallow-copy before applying defaults so that parsing the same
    # configuration dictionary does not alter it as a side effect.
    comparison = dict(json_dict['comparison'])
    if comparison.get('type', '') == 'ngram':  # setting default
        comparison.setdefault(
            'positional', FieldHashingProperties._DEFAULT_POSITIONAL)
    comparator = comparators.get_comparator(comparison)

    return FieldHashingProperties(
        comparator=comparator,
        hash_type=hash_config['type'],
        prevent_singularity=hash_config.get('prevent_singularity'),
        strategy=StrategySpec.from_json_dict(json_dict['strategy']),
        # The missing-value specification is optional in the input.
        missing_value=MissingValueSpec.from_json_dict(
            json_dict['missingValue'])
        if 'missingValue' in json_dict else None)
示例#3
0
    def test_compare_to_legacy(self):
        """Compare Bloom filter bit counts for 'HKDF' and 'legacy' keys.

        The same row of four identical values is hashed once with key
        lists generated with ``kdf='HKDF'`` and once with
        ``kdf='legacy'``; the resulting bit counts are then checked
        against each other.
        """
        # Identifier: 'ANY freetext'
        bigram_comparator = comparators.get_comparator({
            'type': 'ngram',
            'n': 2
        })
        hashing = FieldHashingProperties(
            comparator=bigram_comparator,
            hash_type='doubleHash',
            strategy=BitsPerTokenStrategy(bits_per_token=10))

        field_specs = [
            StringSpec(identifier='ANY text {}'.format(number),
                       hashing_properties=hashing)
            for number in range(1, 5)
        ]
        test_schema = Schema(
            l=1024,
            kdf_info=base64.b64decode('c2NoZW1hX2V4YW1wbGU='),
            kdf_key_size=64,
            kdf_salt=base64.b64decode(
                'SCbL2zHNnmsckfzchsNkZY9XoHk96P'
                '/G5nUBrM7ybymlEFsMV6PAeDZCNp3rfNUPCtLDMOGQHG4pCQpfhiHCyA=='),
            fields=field_specs)

        record = ['Bobby', 'Bobby', 'Bobby', 'Bobby']
        quote = "No, I am your father. No... that's not true! That's impossible!"
        secret = quote.encode()
        field_count = len(record)

        hkdf_keys = generate_key_lists(secret, field_count, kdf='HKDF')
        legacy_keys = generate_key_lists(secret, field_count, kdf='legacy')
        hkdf_result = next(
            stream_bloom_filters([record], hkdf_keys, test_schema))
        legacy_result = next(
            stream_bloom_filters([record], legacy_keys, test_schema))
        hkdf_count = hkdf_result[0].count()
        legacy_count = legacy_result[0].count()

        # legacy will map the 4 Bobbys to the same bits, whereas hkdf will
        # map each Bobby to different bits.
        bits_for_one_token = hashing.strategy.bits_per_token(1)[0]
        self.assertLessEqual(legacy_count,
                             bits_for_one_token * 6)  # 6 bi-grams
        self.assertLess(legacy_count, hkdf_count)
        self.assertLessEqual(hkdf_count, field_count * legacy_count)
示例#4
0
import base64
import os
import unittest

from clkhash import bloomfilter, clk, randomnames, schema
from clkhash.field_formats import FieldHashingProperties, StringSpec, BitsPerTokenStrategy, BitsPerFeatureStrategy
from clkhash.key_derivation import generate_key_lists
from clkhash.schema import Schema
from clkhash.serialization import deserialize_bitarray
from clkhash.stats import OnlineMeanVariance
from clkhash.comparators import get_comparator

# Path of the 'testdata' directory located next to this module.
TEST_DATA_DIRECTORY = os.path.join(os.path.dirname(__file__), 'testdata')

# Comparator obtained from an n-gram configuration with n = 2.
bigram_tokenizer = get_comparator({'type': 'ngram', 'n': 2})

def _test_data_file_path(file_name):
    """Return the path of ``file_name`` inside ``TEST_DATA_DIRECTORY``."""
    path_parts = (TEST_DATA_DIRECTORY, file_name)
    return os.path.join(*path_parts)


def _test_schema(file_name):
    """Load a schema from the named file in the test data directory.

    The file is opened with ``open``'s default mode and handed to
    ``schema.from_json_file``; whatever that call returns is returned.
    """
    schema_path = _test_data_file_path(file_name)
    with open(schema_path) as schema_file:
        loaded_schema = schema.from_json_file(schema_file)
    return loaded_schema


def _test_stats(pii, schema, keys):
    """Gather ``count()`` statistics over the CLKs generated for ``pii``.

    Every item produced by ``clk.generate_clks(pii, schema, keys)`` is
    passed through ``deserialize_bitarray`` and its ``count()`` value
    (presumably the number of set bits -- confirm against the bitarray
    API) is collected. The counts are printed and fed into an
    ``OnlineMeanVariance`` accumulator.

    Note: the ``schema`` parameter shadows the module-level ``schema``
    import within this function.

    NOTE(review): no ``return`` is visible after ``ov.update(counts)``
    in this excerpt; the definition may continue beyond it -- confirm.
    """
    counts = [deserialize_bitarray(c).count() for c in
              clk.generate_clks(pii, schema, keys)]
    # Debug output of the raw per-CLK counts.
    print('_test_stats: counts = ', counts)
    ov = OnlineMeanVariance()
    ov.update(counts)
示例#5
0
def test_invalid_n():
    """An n-gram configuration with a negative ``n`` raises ValueError."""
    negative_n_config = {'type': 'ngram', 'n': -6}
    with pytest.raises(ValueError):
        comparators.get_comparator(negative_n_config)
示例#6
0
def test_invalid_comparison():
    """The comparison type "apples_and_oranges" raises ValueError."""
    unknown_type_config = {"type": "apples_and_oranges"}
    with pytest.raises(ValueError):
        comparators.get_comparator(unknown_type_config)