Example #1
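These snippets come from the experiment scripts of the thesis-data-anonymisation project and omit the module preamble. A minimal sketch of the imports they rely on is shown below; the project-specific helpers used throughout (read_data, read_data_path, create_taxonomy, add_semantic_distances, add_semantic_distances_all, compute_boundary, microaggregation, sanitise, record_linkage, compute_score_iterations2, generate_filename_combos, normalise_scores, compute_mean_var, and related functions) live in the project's own modules, whose import paths are not shown in the snippets and are not guessed here.

import os
import re
import time

import numpy as np
import pandas as pd

# Project-specific helpers are imported from the repository's own modules;
# their module layout is not visible in these snippets, so no import lines
# are assumed for them here.
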
def main():
    data_path = os.getcwd() + '/../data'

    dataset_name = 'housing'
    original_dataset, attributes = read_data_path(data_path + '/' +
                                                  dataset_name + '/' +
                                                  dataset_name + '.csv')
    anonymised_dataset = pd.read_csv(data_path + '/result/sc_test/1_1/' +
                                     dataset_name + '/datasets/eps-1.0_1.csv')

    #dataset_postfix = '/test_dataset/test_dataset.csv'
    #anonymised_postfix = '/test_dataset/test_anonymised.csv'
    #original_dataset, attributes = read_data_path(data_path + dataset_postfix)
    #anonymised_dataset = pd.read_csv(data_path + anonymised_postfix)
    #taxonomies = [Taxonomy({'*': [TaxNode('*', None, True)]}, 1) for i in range(len(attributes))]

    cropped_dataset = original_dataset.values[:1000]
    original_dataset = pd.DataFrame(cropped_dataset, columns=attributes)

    cropped_dataset_a = anonymised_dataset.values[:1000]
    anonymised_dataset = pd.DataFrame(cropped_dataset_a, columns=attributes)

    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))
    print('Taxonomies found')

    # reverse_mapping(original_dataset, anonymised_dataset, taxonomies)
    #permutation_distance(original_dataset.values[2], anonymised_dataset, taxonomies)
    start_time = time.time()
    record_linkages = record_linkage(anonymised_dataset, original_dataset,
                                     taxonomies)
    print('record linkage', record_linkages, 'time:', time.time() - start_time)
    return
Example #2
def test_3_1(dataset_name, model):
    iterations = 25

    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    epsilons = ['1.0', '2.0']
    original_dataset_path = (path + 'data/' + dataset_name + '/'
                             + dataset_name + '.csv')
    dataset_top_path = (path + 'data/result/' + model + '_test/3_1/'
                        + dataset_name)
    datasets_path = dataset_top_path + '/datasets'

    metrics = ['sse', 'record_linkage']

    original_dataset, attributes = read_data_path(original_dataset_path)

    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    if model == 'safepub':
        ks = [45, 74]
        filename_combinations = generate_filename_combos(
            'granularity_eps-', '.csv', [epsilons], iterations)
    elif model == 'sc':
        ks = [32] * len(epsilons)
        filename_combinations = generate_filename_combos(
            'eps-', '.csv', [epsilons], iterations)
    elif model == 'k-anonym':
        ks = [5]
        filename_combinations = [
            generate_filename_combos('k5_suppression', '.csv', [[""]])
        ]
    else:
        raise RuntimeError('Does not recognise model', model)

    compute_score_iterations2(datasets_path, dataset_top_path,
                              filename_combinations, metrics, ks, taxonomies,
                              original_dataset)
    result_filenames = [
        filen for filen in os.listdir(dataset_top_path)
        if re.match("^result_", filen)
    ]
    normalise_scores(dataset_name, dataset_top_path, result_filenames, ks)
    norm_result_filenames = [
        filen for filen in os.listdir(dataset_top_path)
        if re.match("^norm_result_", filen)
    ]
    compute_mean_var(dataset_top_path, norm_result_filenames)
    return
Example #3
def run_experiment_3(dataset_name, attributes, k, num_attributes, float_vals, num_decimals):
    epsilon = 1.0
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    datasets_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'

    iterations = 25

    attr_to_idx = {attr: n for n, attr in enumerate(attributes)}

    all_taxonomies = [create_taxonomy(dataset_name, attribute) for attribute in attributes]
    add_semantic_distances(all_taxonomies)
    for taxonomy in all_taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    # range(num_attributes, num_attributes + 1) contains a single value,
    # so this loop body runs exactly once, for a = num_attributes.
    for a in range(num_attributes, num_attributes+1):
        current_datasets_path = datasets_path+'/'+str(a)
        files = os.listdir(current_datasets_path)

        output_path = path+'data/result/sc_test/2_1/'+dataset_name+'/datasets/'+str(a)
        cluster_root_path = path+'anonymisation/S_C/clusters/'+dataset_name +'/clusters_attribute_subsets/'+str(a)
        for f, filename in enumerate(files):
            data, current_attrs = read_data_path(current_datasets_path+'/'+filename)
            current_taxonomies = [all_taxonomies[attr_to_idx[attr]] for attr in current_attrs]
            cluster_path = cluster_root_path+'/'+'-'.join(current_attrs)+'_'+str(k)+'.csv'
            current_float_vals = [float_vals[attr_to_idx[attr]] for attr in current_attrs]
            current_num_decimals = [num_decimals[attr_to_idx[attr]] for attr in current_attrs]
            for i in range(iterations):
                output_file = output_path+'/'+'-'.join(current_attrs)+'_'+str(i+1)+'.csv'
                try:
                    with open(output_file, 'r') as test_file:
                        print("File found")
                except FileNotFoundError:
                    X_bar = microaggregation(data, k, current_taxonomies,
                                             epsilon, add_cluster_noise=True,
                                             cluster_path=cluster_path)
                    anon_data = sanitise(X_bar.values, epsilon, k,
                                         current_taxonomies,
                                         current_float_vals,
                                         current_num_decimals)
                    anonymised = pd.DataFrame(anon_data, columns=data.columns)
                    anonymised.to_csv(output_file, index=False)
            print("---------------- file", f, "of", len(files), "done")
        print("#########################", a, 'attributes done')
    return
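run_experiment_3 above (and run_experiment_2 below) regenerates an output file only when it is not already on disk, probing for it by opening the file and catching FileNotFoundError. A minimal equivalent of that check, sketched here with a hypothetical helper around os.path.exists, could be:

def output_missing(output_file):
    # Same decision as the try/open + FileNotFoundError probe used above:
    # the anonymised dataset is regenerated only when the file is absent.
    return not os.path.exists(output_file)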
Example #4
def run_experiment_2(dataset_name, float_vals, num_decimals):
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    dataset_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'

    data, attributes = read_data_path(dataset_path)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    if dataset_name in ('adult', 'housing'):
        ks = list(range(200, 6000, 300))[:16]
    else:
        raise RuntimeError('Does not recognise dataset', dataset_name)

    iterations = 50
    epsilon = 1.0

    output_path = path+'data/result/sc_test/1_2/' + dataset_name + '/datasets'


    for k in ks:
        cluster_root_path = (path + 'anonymisation/S_C/clusters/'
                             + dataset_name + '/k_' + str(k) + '.csv')
        print('####################### k ' + str(k))
        for i in range(iterations):
            output_file = output_path + '/k_'+str(k)+'_'+str(i+1)+'.csv'

            try:
                with open(output_file, 'r') as test_file:
                    print("File found")
            except FileNotFoundError:
                X_bar = microaggregation(data, k, taxonomies, epsilon, add_cluster_noise=True, cluster_path=cluster_root_path)
                anon_data = sanitise(X_bar.values, epsilon, k, taxonomies, float_vals, num_decimals)
                anonymised = pd.DataFrame(anon_data, columns=data.columns)
                anonymised.to_csv(output_file, index=False)
Example #5
def run_experiment4(dataset_name, k, float_vals, num_decimals):
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    dataset_path = path + 'data/' + dataset_name + '/' + dataset_name + '.csv'

    data, attributes = read_data_path(dataset_path)

    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    iterations = 25
    epsilons = [1.0, 2.0]

    output_path = path + 'data/result/sc_test/3_1/' + dataset_name + '/datasets'

    cluster_root_path = path + 'anonymisation/S_C/clusters/' + dataset_name + '/k_' + str(
        k) + '.csv'

    for epsilon in epsilons:
        print('####################### epsilon ' + str(epsilon))
        for i in range(iterations):
            output_file = output_path + '/eps-' + str(epsilon) + '_' + str(
                i + 1) + '.csv'
            X_bar = microaggregation(data,
                                     k,
                                     taxonomies,
                                     epsilon,
                                     add_cluster_noise=True,
                                     cluster_path=cluster_root_path)
            anon_data = sanitise(X_bar.values, epsilon, k, taxonomies,
                                 float_vals, num_decimals)
            anonymised = pd.DataFrame(anon_data, columns=data.columns)
            anonymised.to_csv(output_file, index=False)
Example #6
def test_2_1(dataset_name, model, attributes):
    # Compute the scores and the normalised scores for each attribute combination

    iterations = 25

    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    original_dataset_path = path + 'data/' + dataset_name + '/attribute_subsets'

    metrics = ['discernibility', 'entropy', 'groupsize', 'sse']

    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    attr_to_idx = {attr: n for n, attr in enumerate(attributes)}

    if dataset_name == 'adult':
        attribute_range = list(range(2, 9))
    elif dataset_name == 'housing':
        attribute_range = list(range(2, 10))
    elif dataset_name == 'musk':
        attribute_range = list(range(2, 21))
    else:
        raise RuntimeError("Does not recognise dataset", dataset_name)

    if model == 'safepub':
        # Safepub
        top_path = path + 'data/result/safepub_test/2_1/' + dataset_name
        dataset_paths = top_path + '/datasets'
        if dataset_name == 'musk':
            ks = [45]
        else:
            ks = [59]
        for a in attribute_range:
            output_path = top_path + '/' + str(a)
            original_filenames = os.listdir(original_dataset_path + '/' +
                                            str(a))
            for n, filename in enumerate(original_filenames):
                original_dataset, current_attrs = read_data_path(
                    original_dataset_path + '/' + str(a) + '/' + filename)
                filenames = [
                    '-'.join(current_attrs) + '_' + str(i + 1) + '.csv'
                    for i in range(iterations)
                ]
                current_taxonomies = [
                    taxonomies[attr_to_idx[attr]] for attr in current_attrs
                ]
                compute_score_iterations2(dataset_paths + '/' + str(a),
                                          output_path, [filenames], metrics,
                                          ks, current_taxonomies,
                                          original_dataset)
                print("File", n, "of", len(original_filenames))
            result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^result_", filen)
            ]
            normalise_scores_subattributes(dataset_name, output_path,
                                           result_filenames,
                                           ks * len(result_filenames), a)

            norm_result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^norm_result_", filen)
            ]
            compute_mean_var(output_path, norm_result_filenames)
            print("Attribute", a, "done")
    elif model == 'sc':
        # S-C
        top_path = path + 'data/result/sc_test/2_1/' + dataset_name
        dataset_paths = top_path + '/datasets'
        if dataset_name == 'adult':
            ks = [174]
        elif dataset_name == 'housing':
            ks = [143]
        elif dataset_name == 'musk':
            ks = [82]
        else:
            raise RuntimeError('Does not recognise dataset', dataset_name)

        for a in attribute_range:
            output_path = top_path + '/' + str(a)
            original_filenames = os.listdir(original_dataset_path + '/' +
                                            str(a))
            for n, filename in enumerate(original_filenames):
                original_dataset, current_attrs = read_data_path(
                    original_dataset_path + '/' + str(a) + '/' + filename)
                filenames = [
                    '-'.join(current_attrs) + '_' + str(i + 1) + '.csv'
                    for i in range(iterations)
                ]
                current_taxonomies = [
                    taxonomies[attr_to_idx[attr]] for attr in current_attrs
                ]
                compute_score_iterations2(dataset_paths + '/' + str(a),
                                          output_path, [filenames], metrics,
                                          ks, current_taxonomies,
                                          original_dataset)
                print("File", n, "of", len(original_filenames))
            result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^result_", filen)
            ]
            normalise_scores_subattributes(dataset_name, output_path,
                                           result_filenames,
                                           ks * len(result_filenames), a)
            norm_result_filenames = [
                filen for filen in os.listdir(output_path)
                if re.match("^norm_result_", filen)
            ]
            compute_mean_var(output_path, norm_result_filenames)
            print("Attribute", a, "done")
    return
Example #7
def test_1_2(dataset_name, model):
    # Read the datasets, compute the score for each dataset
    iterations = 50
    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    original_dataset, attributes = read_data_path(path + 'data/' +
                                                  dataset_name + '/' +
                                                  dataset_name + '.csv')
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    if model == 'safepub':
        # SafePub
        top_path = path + "data/result/safepub_test/1_2/" + dataset_name
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        ks = [
            59, 74, 88, 100, 114, 129, 141, 155, 170, 184, 199, 211, 225, 240,
            252, 266
        ]
        deltas = [
            1E-5, 1E-6, 1E-7, 1E-8, 1E-9, 1E-10, 1E-11, 1E-12, 1E-13, 1E-14,
            1E-15, 1E-16, 1E-17, 1E-18, 1E-19, 1E-20
        ]
        deltas_string = [
            '1.0E-5', '1.0E-6', '1.0E-7', '1.0E-8', '1.0E-9', '1.0E-10',
            '1.0E-11', '1.0E-12', '1.0E-13', '1.0E-14', '1.0E-15', '1.0E-16',
            '1.0E-17', '1.0E-18', '1.0E-19', '1.0E-20'
        ]
        filename_combinations = generate_filename_combos(
            'delta_', '.csv', [deltas_string], iterations)
        compute_score_iterations2(dataset_path, top_path,
                                  filename_combinations, metrics, ks,
                                  taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_delta_', '.csv', [deltas_string], 1)
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_combinations = generate_filename_combos('norm_result_delta_',
                                                     '.csv', [deltas_string])
        compute_mean_var(top_path, norm_combinations)

    elif model == 'sc':
        # Soria-Comas
        top_path = path + "data/result/sc_test/1_2/" + dataset_name
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        if dataset_name in ('adult', 'housing'):
            ks = list(range(200, 4701, 300))
        else:
            raise RuntimeError('Does not recognise dataset name', dataset_name)

        ks_string = [str(k) for k in ks]
        filename_combinations = generate_filename_combos(
            'k_', '.csv', [ks_string], 50)
        compute_score_iterations2(dataset_path, top_path,
                                  filename_combinations, metrics, ks,
                                  taxonomies, original_dataset)
        result_file_combinations = generate_filename_combos(
            'result_k_', '.csv', [ks_string], 1)
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_combinations = generate_filename_combos('norm_result_k_', '.csv',
                                                     [ks_string])
        compute_mean_var(top_path, norm_combinations)
Example #8
def test_1_1(dataset_name, model):
    iterations = 50
    epsilons = [
        2.0, 1.5, 1.25, 1.0986122886681098, 1.0, 0.75, 0.6931471805599453, 0.5,
        0.1, 0.01
    ]

    eps_str = [str(eps) for eps in epsilons]

    path_in = os.getcwd()
    pattern = '^.*/thesis-data-anonymisation/'
    path = re.search(pattern, path_in).group(0)

    original_dataset, attributes = read_data(dataset_name)
    taxonomies = [create_taxonomy(dataset_name, attr) for attr in attributes]
    add_semantic_distances_all(taxonomies)
    for taxonomy in taxonomies:
        taxonomy.add_boundary(compute_boundary(taxonomy))

    # SafePub
    if model == 'safepub':
        top_path = path + "data/result/safepub_test/1_1/" + dataset_name
        data_metrics = [
            'granularity', 'intensity', 'discernibility', 'entropy',
            'groupsize'
        ]
        ks = [103, 72, 67, 61, 59, 54, 53, 50, 50, 50] * len(data_metrics)
        dataset_path = top_path + '/datasets'
        eval_metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        filename_combinations = generate_filename_combos(
            '', '.csv', [data_metrics, ['eps'], eps_str], iterations)
        filename_combinations = [[
            filename.replace('eps_', 'eps-') for filename in file_list
        ] for file_list in filename_combinations]
        compute_score_iterations2(dataset_path, top_path,
                                  filename_combinations, eval_metrics, ks,
                                  taxonomies, original_dataset)

        result_file_combinations = generate_filename_combos(
            'result_', '.csv', [data_metrics, ['eps'], eps_str])
        result_file_combinations = [
            filename.replace('eps_', 'eps-')
            for filename in result_file_combinations
        ]
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)
        norm_result_combinations = generate_filename_combos(
            'norm_result_', '.csv', [data_metrics, ['eps'], eps_str])
        norm_result_combinations = [
            filename.replace('eps_', 'eps-')
            for filename in norm_result_combinations
        ]
        compute_mean_var(top_path, norm_result_combinations)
    elif model == 'sc':
        # Soria-Comas
        top_path = path + "data/result/sc_test/1_1/" + dataset_name
        if dataset_name == 'adult':
            ks = [174] * len(epsilons)
        elif dataset_name == 'housing':
            ks = [143] * len(epsilons)
        else:
            raise RuntimeError('Does not recognise the dataset')
        dataset_path = top_path + '/datasets'
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        filename_combinations = generate_filename_combos(
            'eps-', '.csv', [eps_str], iterations)
        compute_score_iterations2(dataset_path, top_path,
                                  filename_combinations, metrics, ks,
                                  taxonomies, original_dataset)

        result_file_combinations = generate_filename_combos(
            'result_eps-', '.csv', [eps_str])
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)

        norm_result_combinations = generate_filename_combos(
            'norm_result_eps-', '.csv', [eps_str])
        compute_mean_var(top_path, norm_result_combinations)
    elif model == 'sc_spec':
        top_path = path + "data/result/sc_spec_test/1_1/" + dataset_name
        ks = [800] * len(epsilons)
        result_file_combinations = generate_filename_combos(
            'result_eps-', '.csv', [eps_str])
        normalise_scores(dataset_name, top_path, result_file_combinations, ks)

        norm_result_combinations = generate_filename_combos(
            'norm_result_eps-', '.csv', [eps_str])
        compute_mean_var(top_path, norm_result_combinations)

    elif model == 'k-anonym':
        # k5-anon
        k_name = 'k5_suppression.csv'
        top_path = path + "data/result/k-anonym_test/1_1/" + dataset_name
        dataset_path = top_path + "/datasets"
        data = read_data_path(dataset_path + '/' + k_name)[0]
        metrics = ['discernibility', 'entropy', 'groupsize', 'sse']
        k = 5
        scores = np.array(
            compute_metric_scores(data, metrics, k, taxonomies,
                                  original_dataset)).reshape(1, -1)
        df = pd.DataFrame(scores, columns=metrics)
        df.to_csv(top_path + '/result_' + k_name, index=False)
        normalise_scores(dataset_name, top_path, ['result_' + k_name], [k])

    else:
        raise RuntimeError('Does not recognise model', model)
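The test_* and run_experiment* functions above are meant to be called directly with a dataset name and a model identifier. A minimal, hypothetical driver (the argument values below are illustrative, not the thesis configuration) might look like:

if __name__ == '__main__':
    # Hypothetical invocation with example arguments.
    test_1_1('housing', 'sc')
    test_1_2('housing', 'sc')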