Exemplo n.º 1
0
def test_delete_project_after_creating_run_with_clks(
        requests, result_type_number_parties):
    """A project with uploaded CLKs and a posted run can still be deleted."""
    result_type, parties = result_type_number_parties
    dataset_sizes = [100] * parties
    project, _ = create_project_upload_fake_data(
        requests, dataset_sizes, overlap=0.5, result_type=result_type)
    post_run(requests, project, 0.9)
    delete_project(requests, project)
Exemplo n.º 2
0
def test_project_binary_data_upload_with_different_encoded_size(
        requests, encoding_size):
    """Two parties upload binary CLKs sharing one common encoding; the run
    must link party A's last row (499) to party B's first row (0)."""
    project = requests.post(url + '/projects',
                            json={
                                'schema': {},
                                'result_type': 'mapping',
                            }).json()

    # One encoding common to both parties, plus 499 random ones each.
    shared = next(binary_pack_filters(generate_clks(1, encoding_size),
                                      encoding_size))
    party_a = b''.join(binary_pack_filters(
        generate_clks(499, encoding_size), encoding_size)) + shared
    party_b = shared + b''.join(binary_pack_filters(
        generate_clks(499, encoding_size), encoding_size))

    for party_index, payload in enumerate((party_a, party_b)):
        upload_binary_data(requests, payload, project['project_id'],
                           project['update_tokens'][party_index], 500,
                           encoding_size)

    run_id = post_run(requests, project, 0.99)
    result = get_run_result(requests, project, run_id, wait=True)
    assert 'mapping' in result
    # The shared encoding sits at index 499 of party A and index 0 of party B.
    assert result['mapping']['499'] == '0'
Exemplo n.º 3
0
def test_project_binary_data_uploaded(requests):
    """Both parties upload the identical 1000-record binary file; every row
    must be mapped to itself.

    Fix: the verification loop previously started at row 1, silently
    skipping row 0; it now covers all 1000 uploaded rows.
    """
    new_project_data = requests.post(url + '/projects',
                                     json={
                                         'schema': {},
                                         'result_type': 'mapping',
                                     }).json()

    small_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'testdata/clks_128B_1k.bin')

    upload_binary_data_from_file(requests, small_file_path,
                                 new_project_data['project_id'],
                                 new_project_data['update_tokens'][0], 1000)
    upload_binary_data_from_file(requests, small_file_path,
                                 new_project_data['project_id'],
                                 new_project_data['update_tokens'][1], 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)
    assert 'mapping' in result

    # Since we uploaded the same file it should have identified the same
    # rows as matches -- all 1000 of them, including row 0.
    for i in range(1000):
        assert str(i) in result['mapping']
        assert result['mapping'][str(i)] == str(i)
Exemplo n.º 4
0
def test_groups_correctness(requests):
    """Compare the service's 'groups' output against groups computed
    locally with anonlink on the same pre-prepared filters."""
    with open(DATA_PATH, 'rb') as f:
        # Here's some filters I prepared earlier.
        filters = pickle.load(f)

    candidate_pairs = anonlink.candidate_generation.find_candidate_pairs(
        filters, anonlink.similarities.dice_coefficient_accelerated, THRESHOLD)
    expected_groups = anonlink.solving.greedy_solve(candidate_pairs)

    # Every filter must share one size before binary packing.
    filter_size = len(filters[0][0])
    for dataset in filters:
        for filter_ in dataset:
            assert len(filter_) == filter_size

    packed_filters = [
        b''.join(binary_pack_filters(f, filter_size)) for f in filters
    ]
    project_data, _ = create_project_upload_data(requests,
                                                 packed_filters,
                                                 result_type='groups',
                                                 binary=True,
                                                 hash_size=DATA_HASH_SIZE)
    try:
        run = post_run(requests, project_data, threshold=THRESHOLD)
        result_groups = get_run_result(requests, project_data, run)['groups']
    finally:
        delete_project(requests, project_data)

    # Compare ES result with anonlink; group and record order are irrelevant.
    actual = {frozenset(map(tuple, g)) for g in result_groups}
    expected = set(map(frozenset, expected_groups))
    assert actual == expected
Exemplo n.º 5
0
def test_permutation(requests, the_truth):
    """Check permutations and mask against the ground-truth mapping."""
    project_data, (r_a, r_b) = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=60)
    # Fetch each party's permutation via its own receipt token.
    party_results = [
        get_run_result(requests, project_data, run,
                       result_token=receipt['receipt_token'], wait=False)
        for receipt in (r_a, r_b)
    ]
    permutation_a = inverse_of_permutation(party_results[0]['permutation'])
    permutation_b = inverse_of_permutation(party_results[1]['permutation'])
    mapping = the_truth['mapping']

    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    for a, b, m in zip(permutation_a, permutation_b, mask_result['mask']):
        if m == 1:
            assert a in mapping, f"Unexpected link was included - run {run}"
            assert mapping[a] == b, \
                f"Expected link from {a} was incorrect - run {run}"
        else:
            assert a not in mapping, f"Expected link was masked out - run {run}"
def test_run_description_no_data(requests, project):
    """A run posted before data upload still exposes its metadata."""
    run = get_run(requests, project, post_run(requests, project, 0.95))
    for field in ('run_id', 'notes', 'threshold'):
        assert field in run
Exemplo n.º 7
0
def test_run_permutations_results(requests, permutations_project, threshold):
    """The mask and both parties' permutations agree on the row count."""
    run_id = post_run(requests, permutations_project, threshold)
    mask_result = get_run_result(requests, permutations_project, run_id)
    assert 'mask' in mask_result
    mask_length = len(mask_result['mask'])
    # The mask is as long as the smaller of the two datasets.
    assert mask_length == min(permutations_project['size'])

    # Fetch each party's result via its receipt token.
    party_results = []
    for dp_response in permutations_project['dp_responses'][:2]:
        party_result = get_run_result(requests,
                                      permutations_project,
                                      run_id,
                                      dp_response['receipt_token'],
                                      wait=False)
        assert 'permutation' in party_result
        assert 'rows' in party_result
        assert party_result['rows'] == mask_length
        party_results.append(party_result)

    assert party_results[0]['rows'] == party_results[1]['rows']
Exemplo n.º 8
0
def test_run_mapping_results(requests, mapping_project):
    """After waiting roughly the run time, a mapping result dict is served."""
    run_id = post_run(requests, mapping_project, 0.95)
    wait_approx_run_time(mapping_project['size'])

    result = get_run_result(requests, mapping_project, run_id)
    assert 'mapping' in result
    mapping = result['mapping']
    assert isinstance(mapping, dict)
Exemplo n.º 9
0
def test_run_status_without_clks(requests):
    """A run posted before any CLK upload stays in the 'created' state."""
    project = create_project_no_data(requests)
    status = get_run_status(requests, project,
                            post_run(requests, project, 0.9))

    is_run_status(status)
    assert status['state'] == 'created'
Exemplo n.º 10
0
def test_list_run_after_posting_runs(requests):
    """Each posted run must immediately appear in the project's run listing.

    Fix: the unused local ``run_id`` has been removed — only the listing
    length is checked here.
    """
    with temporary_blank_project(requests, result_type='mapping') as project:
        for expected_count in range(1, 11):
            post_run(requests, project, 0.95)
            # Check run listing has grown by exactly one.
            runs = get_runs(requests, project)
            assert len(runs) == expected_count
Exemplo n.º 11
0
def test_run_mapping_results_no_data(requests):
    """Requesting results for a run on an empty project must yield a 404."""
    empty_project = create_project_no_data(requests)
    run_id = post_run(requests, empty_project, 0.95)
    get_run_result(requests, empty_project, run_id,
                   expected_status=404, wait=False)
def test_run_description(requests, result_type):
    """Run metadata includes its id, notes and threshold."""
    project, _, _ = create_project_upload_fake_data(
        requests, [100, 100], overlap=0.5, result_type=result_type)
    run = get_run(requests, project, post_run(requests, project, 0.98))

    for key in ('run_id', 'notes', 'threshold'):
        assert key in run
Exemplo n.º 13
0
def test_project_json_data_upload_with_too_small_encoded_size(requests):
    """Encodings below the minimum size must make the run fail."""
    project_data, _, _ = create_project_upload_fake_data(
        requests, [500, 500],
        overlap=0.95,
        result_type='mapping',
        encoding_size=4)

    with pytest.raises(AssertionError):
        run_id = post_run(requests, project_data, 0.9)
        get_run_result(requests, project_data, run_id, wait=True)
def test_posting_run_before_data_upload(requests, project):
    """A run posted before data upload is listed once, in 'created' state.

    Fix: the listed run is now also checked to be the one we just posted
    (``run['run_id'] == run_id``), consistent with
    test_posting_run_after_data_upload.
    """
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)

    assert len(runs) == 1
    run = runs[0]
    assert 'run_id' in run
    assert run['run_id'] == run_id
    assert 'time_added' in run
    assert 'state' in run
    assert run['state'] == 'created'
def test_posting_run_after_data_upload(requests, project):
    """The posted run appears in the listing with its id and metadata."""
    run_id = post_run(requests, project, 0.95)
    runs = get_runs(requests, project)

    assert len(runs) == 1
    (run,) = runs
    assert 'run_id' in run
    assert run['run_id'] == run_id
    assert 'time_added' in run
    assert 'state' in run
Exemplo n.º 16
0
def test_project_json_data_upload_with_mismatched_encoded_size(requests):
    """Parties uploading different encoding sizes must make the run fail."""
    clks_64 = generate_json_serialized_clks(500, 64)
    clks_256 = generate_json_serialized_clks(500, 256)

    project_data, _, _ = create_project_upload_data(
        requests, clks_64, clks_256, result_type='mapping')

    with pytest.raises(AssertionError):
        run_id = post_run(requests, project_data, 0.9)
        get_run_result(requests, project_data, run_id, wait=True)
Exemplo n.º 17
0
def test_mapping(requests, the_truth):
    """The service's mapping must equal the ground-truth mapping exactly."""
    project_data, _, _ = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='mapping')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # Compare mapping with the truth (keys/values come back as strings).
    mapping = {int(a): int(b) for a, b in result['mapping'].items()}
    expected = the_truth['mapping']
    assert mapping.keys() == expected.keys()
    for key, value in mapping.items():
        assert value == expected[key]
        assert the_truth['entity_ids_a'][key] == the_truth['entity_ids_b'][value]
Exemplo n.º 18
0
def test_project_json_data_upload_with_various_encoded_sizes(
        requests, encoding_size):
    """JSON uploads work and link most records for every encoding size."""
    project_data, _, _ = create_project_upload_fake_data(
        requests, [500, 500],
        overlap=0.95,
        result_type='mapping',
        encoding_size=encoding_size)

    run_id = post_run(requests, project_data, 0.9)
    result = get_run_result(requests, project_data, run_id, wait=True)
    assert 'mapping' in result
    # With 95% overlap of 500 records, expect the vast majority linked.
    assert len(result['mapping']) >= 475
def test_project_json_data_upload_with_mismatched_encoded_size(
        requests, result_type_number_parties):
    """One party using a different encoding size must make the run fail."""
    result_type, number_parties = result_type_number_parties

    # First party uploads 64-byte encodings, every other party 256 bytes.
    data = [generate_json_serialized_clks(500, 64 if party == 0 else 256)
            for party in range(number_parties)]

    project_data, _ = create_project_upload_data(
        requests, data, result_type=result_type)

    with pytest.raises(AssertionError):
        run_id = post_run(requests, project_data, 0.9)
        get_run_result(requests, project_data, run_id, wait=True)
def test_project_json_data_upload_with_too_small_encoded_size(
        requests, result_type_number_parties):
    """Encodings below the minimum size fail for any number of parties."""
    result_type, number_parties = result_type_number_parties
    project_data, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=4)

    with pytest.raises(AssertionError):
        run_id = post_run(requests, project_data, 0.9)
        get_run_result(requests, project_data, run_id, wait=True)
Exemplo n.º 21
0
def test_similarity_scores(requests, the_truth):
    """Similarity scores from the service match the precomputed truth."""
    project_data, _, _ = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='similarity_scores')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=60)

    actual = result['similarity_scores']
    expected = the_truth['similarity_scores']
    assert len(actual) == len(expected)
    for es_score, true_score in zip(actual, expected):
        # NOTE(review): indices suggest the truth rows are ordered
        # (a, score, b) while the service returns (a, b, score) — confirm.
        assert es_score[0] == true_score[0] and es_score[1] == true_score[2]
        assert es_score[2] == pytest.approx(true_score[1], 1e-10), \
            'similarity scores are different'

    delete_project(requests, project_data)
def test_run_description(requests, result_type_number_parties):
    """The run description echoes back the threshold it was posted with."""
    threshold = .98

    result_type, parties = result_type_number_parties
    project, _ = create_project_upload_fake_data(
        requests, [100] * parties, overlap=0.5, result_type=result_type)
    run = get_run(requests, project, post_run(requests, project, threshold))

    assert 'run_id' in run
    assert 'notes' in run
    assert run['threshold'] == threshold
Exemplo n.º 23
0
def test_run_groups_results(requests, groups_project, threshold):
    """Every returned group has >= 2 records, each an (int, int) pair."""
    run_id = post_run(requests, groups_project, threshold)
    result = get_run_result(requests, groups_project, run_id)

    assert 'groups' in result
    for group in result['groups']:
        # A group links at least two records.
        assert len(group) >= 2
        for record in group:
            # Each record is a (dataset index, record index) pair of ints.
            assert len(record) == 2
            dataset_index, record_index = record
            assert isinstance(dataset_index, int)
            assert isinstance(record_index, int)
Exemplo n.º 24
0
def test_groups(requests, the_truth):
    """Service groups equal the ground-truth groups, order-insensitively."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='groups')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # Normalise both sides to sets-of-sets so ordering is irrelevant.
    actual = frozenset(
        frozenset(tuple(record) for record in group)
        for group in result['groups'])
    expected = frozenset(frozenset(group) for group in the_truth['groups'])

    assert actual == expected
Exemplo n.º 25
0
def test_mapping(requests, the_truth):
    """The service's mapping must equal the ground truth."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='mapping')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run)

    # Keys and values come back as strings; normalise to ints.
    mapping = {int(a): int(b) for a, b in result['mapping'].items()}
    truth = the_truth['mapping']

    # NB: Anonlink is more strict on enforcing the k parameter, so there
    # is a small chance the below won't hold. This should only be the
    # case for more noisy problems.
    assert mapping.keys() == truth.keys()
    for key, value in mapping.items():
        assert value == truth[key]
        assert (the_truth['entity_ids_a'][key]
                == the_truth['entity_ids_b'][value])
Exemplo n.º 26
0
def test_permutation(requests, the_truth):
    """Permutations and mask are consistent with the ground-truth mapping."""
    project_data, r_a, r_b = create_project_upload_data(
        requests, the_truth['clks_a'], the_truth['clks_b'],
        result_type='permutations')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    mask_result = get_run_result(requests, project_data, run, timeout=60)
    # Fetch each party's permutation via its own receipt token.
    party_results = [
        get_run_result(requests, project_data, run,
                       result_token=receipt['receipt_token'], wait=False)
        for receipt in (r_a, r_b)
    ]
    permutation_a = inverse_of_permutation(party_results[0]['permutation'])
    permutation_b = inverse_of_permutation(party_results[1]['permutation'])
    mapping = the_truth['mapping']
    for a, b, m in zip(permutation_a, permutation_b, mask_result['mask']):
        if m == 1:
            assert a in mapping, f"Unexpected link was included - run {run}"
            assert mapping[a] == b, \
                f"Expected link from {a} was incorrect - run {run}"
        else:
            assert a not in mapping, f"Expected link was masked out - run {run}"
Exemplo n.º 27
0
def test_similarity_scores(requests, the_truth):
    """Every true similarity score appears, unchanged, in the service result."""
    project_data, _ = create_project_upload_data(
        requests, (the_truth['clks_a'], the_truth['clks_b']),
        result_type='similarity_scores')
    run = post_run(requests, project_data, threshold=the_truth['threshold'])
    result = get_run_result(requests, project_data, run, timeout=60)

    true_scores = the_truth['similarity_scores']
    result_scores = {(a, b): sim for a, b, sim in result['similarity_scores']}

    # Anonlink is more strict on enforcing the k parameter. Hence the
    # subset.
    assert true_scores.keys() <= result_scores.keys()

    for pair, score in true_scores.items():
        assert score == result_scores[pair]

    delete_project(requests, project_data)
def test_project_binary_data_upload_with_different_encoded_size(
        requests,
        encoding_size, valid_project_params):
    """Each party uploads 500 binary encodings, one shared by all parties;
    the shared record must be linked in the result."""
    expected_number_parties = get_expected_number_parties(valid_project_params)
    new_project_data = requests.post(url + '/projects',
                                     json={
                                         'schema': {},
                                         **valid_project_params
                                     }).json()

    # One encoding shared by every party.
    common = next(binary_pack_filters(generate_clks(1, encoding_size),
                                      encoding_size))

    data = []
    for party in range(expected_number_parties):
        own = b''.join(binary_pack_filters(
            generate_clks(499, encoding_size), encoding_size))
        # Party 0 gets the shared record last (index 499); the rest first (0).
        data.append(own + common if party == 0 else common + own)

    project_id = new_project_data['project_id']
    for payload, token in zip(data, new_project_data['update_tokens']):
        assert len(payload) == 500 * encoding_size
        upload_binary_data(
            requests, payload, project_id, token, 500, size=encoding_size)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)
    result_type = valid_project_params['result_type']
    if result_type == 'mapping':
        assert 'mapping' in result
        assert result['mapping']['499'] == '0'
    elif result_type == 'groups':
        assert 'groups' in result
        groups_set = {frozenset(map(tuple, group))
                      for group in result['groups']}
        common_set = frozenset(
            (i, 499 if i == 0 else 0) for i in range(expected_number_parties))
        assert common_set in groups_set
def test_project_binary_data_uploaded(requests, valid_project_params):
    """All parties upload the identical 1000-record binary file; every
    record must be matched with itself across all parties.

    Fix: the mapping branch's verification loop previously started at
    row 1, skipping row 0 — inconsistent with the groups branch, which
    already checks all of ``set(range(1000))``. It now covers all rows.
    """
    new_project_data = requests.post(url + '/projects',
                                     json={
                                         'schema': {},
                                         **valid_project_params
                                     }).json()
    update_tokens = new_project_data['update_tokens']
    expected_number_parties = get_expected_number_parties(valid_project_params)
    assert len(update_tokens) == expected_number_parties

    small_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'testdata/clks_128B_1k.bin')

    for token in update_tokens:
        upload_binary_data_from_file(
            requests,
            small_file_path, new_project_data['project_id'], token, 1000)

    run_id = post_run(requests, new_project_data, 0.99)
    result = get_run_result(requests, new_project_data, run_id, wait=True)

    if valid_project_params['result_type'] == 'mapping':
        assert 'mapping' in result

        # Since we uploaded the same file it should have identified the
        # same rows as matches -- all 1000 of them, including row 0.
        for i in range(1000):
            assert str(i) in result['mapping']
            assert result['mapping'][str(i)] == str(i)
    elif valid_project_params['result_type'] == 'groups':
        assert 'groups' in result
        groups = result['groups']
        assert len(groups) == 1000
        for group in groups:
            dataset_indices = {di for di, _ in group}
            record_indices = {ri for _, ri in group}
            # Each group is one record index shared by every dataset.
            assert len(record_indices) == 1
            assert dataset_indices == set(range(expected_number_parties))
        # Check every record is represented
        all_record_indices = {next(iter(group))[1] for group in groups}
        assert all_record_indices == set(range(1000))
def test_project_json_data_upload_with_various_encoded_sizes(
        requests,
        encoding_size, result_type_number_parties):
    """JSON uploads succeed and link most records for every encoding size."""
    result_type, number_parties = result_type_number_parties
    project_data, _ = create_project_upload_fake_data(
        requests,
        [500] * number_parties,
        overlap=0.8,
        result_type=result_type,
        encoding_size=encoding_size)

    run_id = post_run(requests, project_data, 0.9)
    result = get_run_result(requests, project_data, run_id, wait=True)
    if result_type == 'mapping':
        assert 'mapping' in result
        assert len(result['mapping']) >= 400
    elif result_type == 'groups':
        assert 'groups' in result
        # This is a pretty bad bound, but we're not testing the
        # accuracy.
        assert len(result['groups']) >= 400