def test_permutation(requests, the_truth):
    """Run a 'permutations' project and validate the mask and both
    parties' permutations against the ground-truth groups."""
    project_data, (r_a, r_b) = create_project_upload_data(
        requests,
        (the_truth['clks_a'], the_truth['clks_b']),
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=240)
    perm_a_result = get_run_result(
        requests, project_data, run,
        result_token=r_a['receipt_token'], wait=False)
    perm_b_result = get_run_result(
        requests, project_data, run,
        result_token=r_b['receipt_token'], wait=False)

    # compare permutations and mask against mapping of the truth
    inv_perm_a = inverse_of_permutation(perm_a_result['permutation'])
    inv_perm_b = inverse_of_permutation(perm_b_result['permutation'])

    # Use a mapping output to simplify the checking.
    mapping = dict(anonlink.solving.pairs_from_groups(the_truth['groups']))

    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    for a, b, m in zip(inv_perm_a, inv_perm_b, mask_result['mask']):
        if m != 1:
            assert a not in mapping, f"Expected link was masked out - run {run}"
            continue
        assert a in mapping, f"Unexpected link was included - run {run}"
        assert mapping[a] == b, f"Expected link from {a} was incorrect - run {run}"
def test_project_json_data_upload_with_mismatched_encoded_size(
        requests, result_type_number_parties):
    """Parties uploading encodings of different sizes must fail the run."""
    result_type, number_parties = result_type_number_parties
    # First party uses 64-bit encodings, every other party uses 256-bit.
    data = []
    for party_index in range(number_parties):
        encoding_bits = 64 if party_index == 0 else 256
        data.append(generate_json_serialized_clks(500, encoding_bits))

    new_project_data, _ = create_project_upload_data(
        requests, data, result_type=result_type)

    with pytest.raises(AssertionError):
        run_id = post_run(requests, new_project_data, 0.9)
        get_run_result(requests, new_project_data, run_id, wait=True)
def test_project_json_data_upload_with_too_small_encoded_size(
        requests, result_type_number_parties):
    """An encoding size below the supported minimum must fail the run."""
    result_type, number_parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=4
    )
    with pytest.raises(AssertionError):
        run_id = post_run(requests, project, 0.9)
        get_run_result(requests, project, run_id, wait=True)
def test_groups_correctness(requests):
    """Compare the service's 'groups' output with anonlink run locally."""
    # We assume that anonlink computes the right results.
    with open(DATA_PATH, 'rb') as f:
        # Here's some filters I prepared earlier.
        filters = pickle.load(f)

    candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
        filters, anonlink.similarities.dice_coefficient_accelerated, THRESHOLD)
    expected_groups = anonlink.solving.greedy_solve(candidate_pairs)

    # Every filter must have the same length before we pack them.
    filter_size = len(filters[0][0])
    for dataset in filters:
        for filter_ in dataset:
            assert len(filter_) == filter_size

    packed_filters = [
        b''.join(binary_pack_for_upload(dataset, filter_size))
        for dataset in filters
    ]
    project_data, _ = create_project_upload_data(
        requests,
        packed_filters,
        result_type='groups',
        binary=True,
        hash_size=DATA_HASH_SIZE)
    try:
        run = post_run(requests, project_data, threshold=THRESHOLD)
        service_groups = get_run_result(requests, project_data, run)['groups']
    finally:
        delete_project(requests, project_data)

    # Compare ES result with anonlink.
    service_group_set = {frozenset(map(tuple, g)) for g in service_groups}
    expected_group_set = {frozenset(g) for g in expected_groups}
    assert service_group_set == expected_group_set
def test_project_binary_data_uploaded(requests, valid_project_params,
                                      binary_test_file_path):
    """Upload the same binary file for every party and check the result
    links each record index across all parties."""
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        **valid_project_params
    }).json()
    update_tokens = new_project_data['update_tokens']
    expected_number_parties = get_expected_number_parties(valid_project_params)
    assert len(update_tokens) == expected_number_parties

    project_id = new_project_data['project_id']
    for token in update_tokens:
        upload_binary_data_from_file(
            requests, binary_test_file_path, project_id, token, 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    if valid_project_params['result_type'] != 'groups':
        return
    assert 'groups' in result
    groups = result['groups']
    assert len(groups) == 1000
    seen_record_indices = set()
    for group in groups:
        dataset_indices = {di for di, _ in group}
        record_indices = {ri for _, ri in group}
        # Every group joins one record index across all parties.
        assert len(record_indices) == 1
        assert dataset_indices == set(range(expected_number_parties))
        seen_record_indices.add(next(iter(group))[1])
    # Check every record is represented
    assert seen_record_indices == set(range(1000))
def test_project_json_data_upload_with_invalid_encoded_size(
        requests, result_type_number_parties):
    """An encoding size that is not a multiple of 8 must fail the run."""
    result_type, number_parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=20  # not multiple of 8
    )
    with pytest.raises(AssertionError):
        run_id = post_run(requests, project, 0.9)
        get_run_result(requests, project, run_id, wait=True, timeout=240)
def test_run_permutations_results(requests, permutations_project, threshold):
    """The mask and both parties' permutations must agree on row count."""
    run_id = post_run(requests, permutations_project, threshold)
    mask_result = get_run_result(requests, permutations_project, run_id,
                                 timeout=240)
    assert 'mask' in mask_result
    mask_length = len(mask_result['mask'])
    assert mask_length == min(permutations_project['size'])

    # Get results using receipt_token A and B
    row_counts = []
    for dp_response in permutations_project['dp_responses'][:2]:
        party_result = get_run_result(
            requests, permutations_project, run_id,
            dp_response['receipt_token'], wait=False)
        assert 'permutation' in party_result
        assert 'rows' in party_result
        assert party_result['rows'] == mask_length
        row_counts.append(party_result['rows'])
    assert row_counts[0] == row_counts[1]
def test_run_similarity_score_results(requests, similarity_scores_project,
                                      threshold):
    """Similarity-score rows must be well-formed.

    Each row is ((party_id_1, rec_id_1), (party_id_2, rec_id_2), score)
    with the score in [0, 1], non-negative indices, and the two party
    ids distinct.
    """
    run_id = post_run(requests, similarity_scores_project, threshold)
    result = get_run_result(requests, similarity_scores_project, run_id,
                            timeout=240)
    assert 'similarity_scores' in result
    for (party_id_1, rec_id_1), (party_id_2, rec_id_2), score \
            in result['similarity_scores']:
        # Fixed: was `0.0 <= score >= 1.0`, a chained comparison that
        # demanded score >= 1.0 instead of bounding it above by 1.0.
        assert 0.0 <= score <= 1.0
        assert 0 <= party_id_1
        assert 0 <= party_id_2
        assert party_id_1 != party_id_2
        assert 0 <= rec_id_1
        assert 0 <= rec_id_2
def test_groups(requests, the_truth):
    """A 'groups' run must reproduce the ground-truth groups exactly."""
    project_data, _ = create_project_upload_data(
        requests,
        (the_truth['clks_a'], the_truth['clks_b']),
        result_type='groups')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=240)

    # compare mapping with the truth
    # Normalise both sides to frozensets of frozensets before comparing.
    actual = frozenset(
        frozenset(tuple(record) for record in group)
        for group in result['groups'])
    expected = frozenset(frozenset(g) for g in the_truth['groups'])
    assert actual == expected
def test_run_groups_results(requests, groups_project, threshold):
    """Sanity-check the shape of a 'groups' result."""
    run_id = post_run(requests, groups_project, threshold)
    result = get_run_result(requests, groups_project, run_id, timeout=240)
    assert 'groups' in result
    for group in result['groups']:
        # A group must link at least two records.
        assert len(group) >= 2
        for record in group:
            # Each record is a (dataset index, record index) pair of ints.
            assert len(record) == 2
            dataset_index, record_index = record
            assert isinstance(dataset_index, int)
            assert isinstance(record_index, int)
def test_project_binary_data_upload_with_different_encoded_size(
        requests, encoding_size, valid_project_params):
    """Binary upload round-trip for a given encoding size; the encoding
    shared by every party must come back as one linked group."""
    expected_number_parties = get_expected_number_parties(valid_project_params)
    new_project_data = requests.post(url + '/projects', json={
        'schema': {},
        **valid_project_params
    }).json()

    # One encoding shared by all parties gives the linkage a known group.
    common = next(
        binary_pack_for_upload(generate_clks(1, encoding_size), encoding_size))

    def party_payload(party_index):
        # 499 random encodings; party 0 appends the shared encoding,
        # everyone else prepends it.
        packed = b''.join(binary_pack_for_upload(
            generate_clks(499, encoding_size), encoding_size))
        return packed + common if party_index == 0 else common + packed

    data = [party_payload(i) for i in range(expected_number_parties)]

    project_id = new_project_data['project_id']
    for payload, token in zip(data, new_project_data['update_tokens']):
        assert len(payload) == 500 * encoding_size
        upload_binary_data(
            requests, payload, project_id, token, 500, size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id,
                            wait=True, timeout=240)

    if valid_project_params['result_type'] == 'groups':
        assert 'groups' in result
        groups_set = {frozenset(map(tuple, group))
                      for group in result['groups']}
        common_set = frozenset(
            (i, 499 if i == 0 else 0)
            for i in range(expected_number_parties))
        assert common_set in groups_set
def test_project_json_data_upload_with_various_encoded_sizes(
        requests, encoding_size, result_type_number_parties):
    """Runs must succeed for every supported encoding size."""
    result_type, number_parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=encoding_size
    )

    run_id = post_run(requests, project, 0.9)
    result = get_run_result(requests, project, run_id, wait=True)

    if result_type != 'groups':
        return
    assert 'groups' in result
    # This is a pretty bad bound, but we're not testing the
    # accuracy.
    assert len(result['groups']) >= 400
def test_similarity_scores(requests, the_truth):
    """Check a 'similarity_scores' run against the precomputed truth.

    The project is now deleted in a ``finally`` block so a failing
    assertion no longer leaks the project, matching the cleanup style of
    ``test_groups_correctness``.
    """
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='similarity_scores')
    try:
        run = post_run(requests, project_data,
                       threshold=the_truth['threshold'])
        result = get_run_result(requests, project_data, run, timeout=60)

        true_scores = the_truth['similarity_scores']
        # Key each result row by its sorted pair of record indices.
        result_scores = {
            tuple(index for _, index in sorted([a, b])): score
            for a, b, score in result['similarity_scores']}

        # Anonlink is more strict on enforcing the k parameter. Hence the
        # subset.
        assert true_scores.keys() <= result_scores.keys()
        for pair in true_scores:
            assert true_scores[pair] == result_scores[pair]
    finally:
        delete_project(requests, project_data)
def test_run_mapping_results_no_data(requests):
    """Fetching results for a run on a data-less project must yield 404."""
    project = create_project_no_data(requests)
    run_id = post_run(requests, project, 0.95)
    get_run_result(requests, project, run_id,
                   expected_status=404, wait=False)
def test_project_upload_external_data(requests, a_blocking_project,
                                      binary_test_file_path):
    """Exercise the external-upload flow: each data provider may only
    notify the service about its own upload path."""
    project = a_blocking_project
    blocking_data = json.dumps(
        {str(encoding_id): list({str(encoding_id % 3),
                                 str(encoding_id % 13)})
         for encoding_id in range(1000)}).encode()

    def notify_upload(token, info):
        # Tell the service where this provider's encodings and blocks live.
        return requests.post(
            url + f"projects/{project['project_id']}/clks",
            headers={'Authorization': token},
            json={
                'encodings': {
                    'file': {
                        'bucket': info['bucket'],
                        'path': info['path'] + "/encodings",
                    }
                },
                'blocks': {
                    'file': {
                        'bucket': info['bucket'],
                        'path': info['path'] + "/blocks",
                    }
                }
            })

    first_token, second_token = project['update_tokens'][:2]

    mc, upload_info = get_temp_upload_client(project, requests, first_token)
    _upload_encodings_and_blocks(mc, upload_info, blocking_data,
                                 binary_test_file_path)

    # Should be able to notify the service that we've uploaded data
    assert notify_upload(first_token, upload_info).status_code == 201

    # If the second data provider uses the same path to upload data, that
    # shouldn't work
    assert notify_upload(second_token, upload_info).status_code == 403

    mc2, upload_info2 = get_temp_upload_client(project, requests,
                                               second_token)
    _upload_encodings_and_blocks(mc2, upload_info2, blocking_data,
                                 binary_test_file_path)

    # If the second data provider uses the correct path to upload data,
    # that should work
    assert notify_upload(second_token, upload_info2).status_code == 201

    run_id = post_run(requests, project, threshold=0.95)
    result = get_run_result(requests, project, run_id, timeout=120)
    assert 'groups' in result