Exemplo n.º 1
0
 def get_filtered_records(data_a, data_b):
     """Build candidate blocks for both parties and return the final blocks.

     Relies on ``blocking_config`` from the enclosing scope.
     """
     # One candidate-block object per party, in (Alice, Bob) order.
     candidates = [
         generate_candidate_blocks(records, blocking_config)
         for records in (data_a, data_b)
     ]
     # Filter the candidates down to the final two-party blocks.
     return generate_blocks(candidates, K=2)
Exemplo n.º 2
0
    def test_psig(self):
        """Test block generator for PPRLPsig method."""
        alice_records = [('id1', 'Joyce', 'Wang', 'Ashfield'),
                         ('id2', 'Joyce', 'Hsu', 'Burwood'),
                         ('id3', 'Joyce', 'Shan', 'Lewishm'),
                         ('id4', 'Fred', 'Yu', 'Strathfield'),
                         ('id5', 'Fred', 'Zhang', 'Chippendale'),
                         ('id6', 'Lindsay', 'Jone', 'Narwee')]
        bob_records = [('4', 'Fred', 'Yu', 'Strathfield'),
                       ('5', 'Fred', 'Zhang', 'Chippendale'),
                       ('6', 'Li', 'Jone', 'Narwee')]

        psig_config = {
            "blocking-features": [1],
            "filter": {"type": "count", "max": 5, "min": 0},
            "blocking-filter": {
                "type": "bloom filter",
                "number-hash-functions": 20,
                "bf-len": 2048,
            },
            "signatureSpecs": [
                [{"type": "feature-value", "feature": 1}],
                [{
                    "type": "characters-at",
                    "config": {"pos": ["0:2"]},
                    "feature": 1,
                }],
            ],
        }
        blocking_config = {'type': 'p-sig', 'version': 1, 'config': psig_config}

        # Candidate blocks per party, then the filtered final blocks.
        candidates = [
            generate_candidate_blocks(records, blocking_config)
            for records in (alice_records, bob_records)
        ]
        filtered_alice, filtered_bob = generate_blocks(candidates, K=2)

        # Every surviving Alice block key must be the bloom-filter set of a
        # signature the two parties have in common.
        expected_keys = set()
        for signature in ('1_Fr', '0_Fred', '1_Li'):
            bits = flip_bloom_filter(
                signature, psig_config['blocking-filter']['bf-len'],
                psig_config['blocking-filter']['number-hash-functions'])
            expected_keys.add(str(tuple(bits)))

        assert all(key in expected_keys for key in filtered_alice)
        assert filtered_alice.keys() == filtered_bob.keys()
Exemplo n.º 3
0
    def test_generate_candidate_blocks_psig(self):
        """Test generation of candidate blocks for p-sig."""
        global data
        n_hashes = 4
        filter_len = 2048
        psig_config = {
            "blocking-features": [1],
            "record-id-col": 0,
            "filter": {"type": "ratio", "max": 0.5, "min": 0.0},
            "blocking-filter": {
                "type": "bloom filter",
                "number-hash-functions": n_hashes,
                "bf-len": filter_len,
            },
            "signatureSpecs": [[{"type": "feature-value", "feature": 1}]],
        }

        candidate_block_obj = generate_candidate_blocks(
            data, {'type': 'p-sig', 'version': 1, 'config': psig_config})

        # Block keys are the stringified bloom-filter bit sets of the
        # surviving first-name signatures.
        expected = {
            str(tuple(flip_bloom_filter(signature, filter_len, n_hashes))): ids
            for signature, ids in (('0_Fred', ['id4', 'id5']),
                                   ('0_Lindsay', ['id6']))
        }
        assert candidate_block_obj.blocks == expected
Exemplo n.º 4
0
    def test_generate_candidate_blocks_assertion(self):
        """Invalid blocking configs must raise the documented exceptions."""
        global data

        cases = [
            # type of blocking is not specified
            (ValueError, {'version': 1, 'config': {}}),
            # type of blocking is not implemented
            (NotImplementedError, {
                'type': 'fancy-block',
                'version': 1,
                'config': {'blocking-features': [1, 2]},
            }),
            # config of blocking is not specified
            (ValueError, {'type': 'p-sig', 'version': 1}),
            # blocking feature is a mix of integer and string
            (AssertionError, {
                'type': 'p-sig',
                'version': 1,
                'config': {'blocking-features': [1, 'name']},
            }),
            # blocking features are string but header is not given
            (AssertionError, {
                'type': 'p-sig',
                'version': 1,
                'config': {'blocking-features': ['name']},
            }),
        ]
        for expected_exc, block_config in cases:
            with pytest.raises(expected_exc):
                generate_candidate_blocks(data, block_config)
Exemplo n.º 5
0
def _read_party_csv(path):
    """Load a party CSV as a list of row lists, with NaNs replaced by ''."""
    frame = pd.read_csv(path).replace(np.nan, '', regex=True)
    return frame.to_dict(orient='split')['data']


def match(file_alice, file_bob, config):
    """Run record matching experiment with blocklib API.

    :param file_alice: path to Alice's CSV; the file name starts with
        '<n>_' where n is the record count.
    :param file_bob: path to Bob's CSV, named the same way.
    :param config: blocking configuration passed to blocklib.
    :return: tuple (dbo_time, lu_time, num_blocks, rr, pc, num_recs_alice,
        block_obj_alice, block_obj_bob).
    """
    num_recs_alice = int(file_alice.split('/')[-1].split('_')[0])
    num_recs_bob = int(file_bob.split('/')[-1].split('_')[0])

    print('Loading dataset Alice n={}'.format(num_recs_alice))
    alice_data = _read_party_csv(file_alice)

    print('Loading dataset Bob n={}'.format(num_recs_bob))
    # BUG FIX: this previously re-read file_alice, so Bob's data was a
    # duplicate of Alice's and file_bob was never actually loaded.
    bob_data = _read_party_csv(file_bob)
    print('Example data = {}'.format(alice_data[0]))

    # build candidate blocks (reversed indices), timing the step
    start_time = time.time()
    print('Building reversed index of Alice')
    block_obj_alice = generate_candidate_blocks(alice_data, config)

    print('Building reversed index of Bob')
    block_obj_bob = generate_candidate_blocks(bob_data, config)
    dbo_time = time.time() - start_time

    # build final blocks, timing the filtering step
    start_time = time.time()
    print('Filtering reversed index - Generate final blocks')
    filtered_alice, filtered_bob = generate_blocks(
        [block_obj_alice, block_obj_bob], K=2)
    lu_time = time.time() - start_time
    num_blocks = len(filtered_alice)

    # assess reduction ratio (rr) and pair completeness (pc) using the
    # record-id column (first column of each record) as ground truth
    subdata1 = [x[0] for x in alice_data]
    subdata2 = [x[0] for x in bob_data]
    rr, pc = assess_blocks_2party([filtered_alice, filtered_bob],
                                  [subdata1, subdata2])

    return dbo_time, lu_time, num_blocks, rr, pc, num_recs_alice, block_obj_alice, block_obj_bob
Exemplo n.º 6
0
    def test_lambdafold(self):
        """Test block generator for PPRLLambdaFold method."""
        blocking_config = {
            'type': 'lambda-fold',
            'version': 1,
            'config': {
                "blocking-features": [1, 2],
                "Lambda": 5,
                "bf-len": 2000,
                "num-hash-funcs": 500,
                "K": 30,
                "random_state": 0,
                "record-id-col": 0,
                "input-clks": False,
            },
        }
        # party Alice
        records_alice = [['id1', "Joyce", "Wang"], ['id2', "Fred", "Yu"]]
        # party Bob
        records_bob = [['id3', "Joyce", "Wang"], ['id4', "Lindsay", "Lin"]]

        # generate candidate blocks, then the filtered final blocks
        candidates = [
            generate_candidate_blocks(records, blocking_config)
            for records in (records_alice, records_bob)
        ]
        filtered_alice, filtered_bob = generate_blocks(candidates, K=2)

        # Only the matching 'Joyce Wang' record survives on each side,
        # once per Lambda fold (Lambda=5).
        assert list(filtered_alice.values()) == [['id1']] * 5
        assert list(filtered_bob.values()) == [['id3']] * 5
Exemplo n.º 7
0
def generate_candidates(filenames, config, parties, rec_id_col):
    """Generate candidate blocks for each party's CSV file.

    :param filenames: one CSV path per party, aligned with ``parties``.
    :param config: blocking configuration for ``generate_candidate_blocks``.
    :param parties: party labels used to name the truth-table id columns.
    :param rec_id_col: name of the record-id column in each CSV.
    :return: tuple (block_objs, data, truth) with one entry per party.
    """
    block_objs, data, truth = [], [], []

    for party, filename in zip(parties, filenames):
        print('Loading file {} and Generate candidate blocks'.format(filename))
        frame = pd.read_csv(filename).astype(str)
        records = frame.to_dict(orient='split')['data']
        data.append(frame[rec_id_col].values)

        # map row index -> record id for later truth evaluation
        truth.append(pd.DataFrame({
            'id{}'.format(party): frame.index,
            rec_id_col: frame[rec_id_col],
        }))
        print('Loaded {} records from file {}'.format(len(frame), filename))
        block_objs.append(generate_candidate_blocks(records, config))

    return block_objs, data, truth
Exemplo n.º 8
0
def generate_candidate_blocks_from_csv(input_f: TextIO,
                                       schema_f: TextIO,
                                       header: bool = True,
                                       verbose: bool = False):
    """ Generate candidate blocks from CSV file

         This function also computes and outputs the Hamming weight
         (a.k.a popcount -- the number of bits set to high) of the
         generated Bloom filters.

         :param input_f: A file-like object of csv data to hash.
         :param schema_f: Schema specifying the blocking configuration
         :param header: Set to `False` if the CSV file does not have
             a header. Set to `'ignore'` if the CSV file does have a
             header but it should not be checked against the schema.
         :param verbose: enables output of extra information, i.e.: the stats for the individual PSig strategies.
         :return: A dictionary of blocks, state and config
     """
    if header not in {False, True, 'ignore'}:
        raise ValueError("header must be False, True or 'ignore' but is {!s}."
                         .format(header))

    log.info("Hashing data")

    # read blocking config as a dictionary
    start_time = time.time()
    try:
        blocking_config = json.load(schema_f)
    except ValueError as e:  # JSONDecodeError subclasses ValueError
        msg = 'The schema is not a valid JSON file'
        raise ValueError(msg) from e

    blocking_method = blocking_config['type']
    suffix_input = input_f.name.split('.')[-1]

    pii_data = []  # type: List[Any]
    headers = None
    # read from clks (pre-computed encodings supplied as JSON)
    if blocking_method == 'lambda-fold' and blocking_config['config']['input-clks']:
        try:
            pii_data = json.load(input_f)['clks']
        except ValueError:  # since JSONDecodeError is inherited from ValueError
            raise TypeError(f'Upload should be CLKs not {suffix_input.upper()} file')

    # read from CSV file
    else:
        # sentinel check for input (was a placeholder-free f-string)
        if suffix_input == 'json':
            raise TypeError('Upload should be CSVs not CLKs')
        reader = csv.reader(input_f)
        if header:
            headers = next(reader)
        for line in reader:
            pii_data.append(tuple(element.strip() for element in line))

    # generate candidate blocks
    blocking_obj = generate_candidate_blocks(pii_data, blocking_config, verbose=verbose, header=headers)
    log.info("Blocking took {:.2f} seconds".format(time.time() - start_time))

    # step1 - normalize blocks in one pass: stringify keys and convert
    # numpy.int64 row indices to plain ints (previously two separate loops)
    blocks = {str(key): [int(x) for x in indices]
              for key, indices in blocking_obj.blocks.items()}

    # invert block_key -> row_indices into row_index -> [block_keys]
    encoding_to_blocks_map = defaultdict(list)  # type: Dict[int, List[str]]
    for block_key, row_indices in blocks.items():
        for ind in row_indices:
            encoding_to_blocks_map[ind].append(block_key)

    result = {'blocks': encoding_to_blocks_map}  # type: Dict[str, Any]

    # step2 - collect plain data attributes of the blocking state
    # (skip dunders, callables and 'stats', which is reported separately)
    state = blocking_obj.state
    block_state_vars = {
        name: getattr(state, name)
        for name in dir(state)
        if '__' not in name and not callable(getattr(state, name)) and name != 'stats'
    }

    # step3 - assemble metadata: state, config, source counts
    result['meta'] = {
        'state': block_state_vars,
        'config': blocking_config,
        'source': {'clk_count': [len(pii_data)]},
    }

    # step4 - attach blocking statistics; drop the bulky per-record block
    # counts defensively (pop instead of del so a missing key is not fatal)
    state.stats.pop('num_of_blocks_per_rec', None)
    result['meta']['stats'] = state.stats
    return result
Exemplo n.º 9
0
    def test_psig_multiparty(self):
        """Test block generator for PPRLPsig method."""
        party_data = [
            [
                ('m1-1', 'Joyce', 'Wang', 'Ashfield'),
                ('m1-2', 'Fred', 'Yu', 'Strathfield'),
                ('m1-3', 'Max', 'Zhang', 'Chippendale'),
            ],
            [
                ('m2-1', 'Fred', 'Yu', 'Strathfield'),
                ('m2-2', 'Jone', 'Zhang', 'Chippendale'),
                ('m2-3', 'Li', 'Jone', 'Narwee'),
            ],
            [
                ('m3-1', 'Joyce', 'Hsu', 'Burwood'),
                ('m3-2', 'Max', 'Shan', 'Lewishm'),
            ],
            [
                ('m4-1', 'Lindsay', 'Jone', 'Narwee'),
                ('m4-2', 'Fredrick', 'Cheung', 'Narwee'),
            ],
        ]

        config = {
            "blocking-features": [1],
            "record-id-col": 0,
            "filter": {"type": "count", "max": 5, "min": 0},
            "blocking-filter": {
                "type": "bloom filter",
                "number-hash-functions": 20,
                "bf-len": 2048,
            },
            "signatureSpecs": [
                [{"type": "feature-value", "feature": 1}],
                [{
                    "type": "characters-at",
                    "config": {"pos": ["0:2"]},
                    "feature": 1,
                }],
            ],
        }
        blocking_config = {'type': 'p-sig', 'version': 1, 'config': config}

        # candidate blocks for each of the four parties
        candidate_objs = [
            generate_candidate_blocks(records, blocking_config)
            for records in party_data
        ]
        # keep only blocks shared by at least K=3 parties
        filtered_m1, filtered_m2, filtered_m3, filtered_m4 = generate_blocks(
            candidate_objs, K=3)

        # map each surviving signature to its stringified bloom-filter key
        bf_key = {}
        for signature in ('1_Fr', '1_Jo'):
            bits = flip_bloom_filter(
                signature, config['blocking-filter']['bf-len'],
                config['blocking-filter']['number-hash-functions'])
            bf_key[signature] = str(tuple(bits))

        assert filtered_m1 == {
            bf_key['1_Fr']: ['m1-2'],
            bf_key['1_Jo']: ['m1-1'],
        }
        assert filtered_m2 == {
            bf_key['1_Fr']: ['m2-1'],
            bf_key['1_Jo']: ['m2-2'],
        }
        assert filtered_m3 == {bf_key['1_Jo']: ['m3-1']}
        assert filtered_m4 == {bf_key['1_Fr']: ['m4-2']}
Exemplo n.º 10
0
    file_alice, file_bob = data_sets_pairs[0]
    # File names encode the record count as a leading '<n>_' prefix.
    num_recs_alice = int(file_alice.split('/')[-1].split('_')[0])
    num_recs_bob = int(file_bob.split('/')[-1].split('_')[0])

    print('Loading dataset Alice n={}'.format(num_recs_alice))
    alice_data = pd.read_csv(file_alice)
    alice_data = alice_data.replace(np.nan, '', regex=True)
    alice_data = alice_data.to_dict(orient='split')['data']

    print('Loading dataset Bob n={}'.format(num_recs_bob))
    # BUG FIX: this previously re-read file_alice, so Bob's dataset was a
    # duplicate of Alice's and the disclosure-risk comparison was meaningless.
    bob_data = pd.read_csv(file_bob)
    bob_data = bob_data.replace(np.nan, '', regex=True)
    bob_data = bob_data.to_dict(orient='split')['data']
    print('Example data = {}'.format(alice_data[0]))

    # build candidate blocks (reversed indices), timing the step
    start_time = time.time()
    print('Building reversed index of Alice')
    block_obj_alice = generate_candidate_blocks(alice_data, config)

    print('Building reversed index of Bob')
    block_obj_bob = generate_candidate_blocks(bob_data, config)
    dbo_time = time.time() - start_time

    # final blocks, then per-party disclosure risk written out as CSV
    filtered_alice, filtered_bob = generate_blocks(
        [block_obj_alice, block_obj_bob], K=2)
    arisk = disclosure_risk(filtered_alice)
    brisk = disclosure_risk(filtered_bob)
    df = pd.DataFrame(dict(arisk=arisk, brisk=brisk))
    print('Saving to', 'risk_{}.csv'.format(config['type']))
    df.to_csv('risk_{}.csv'.format(config['type']))