forked from qiyuangong/APA
-
Notifications
You must be signed in to change notification settings - Fork 0
/
APA.py
147 lines (136 loc) · 4.62 KB
/
APA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python
#coding=utf-8
# by Qiyuan Gong
# qiyuangong@gmail.com
# http://github.com/qiyuangong
# http://cn.linkedin.com/pub/qiyuan-gong/6b/831/407/
from partition_for_transaction import partition, list_to_str
from anatomizer import anatomizer
import time, random
import pdb
# When true, APA() prints the suppression and group counts after merging.
_DEBUG = True
# Attribute tree of the current run; rebound by APA() on entry.
gl_att_tree = []
# Dataset of the current run; rebound by APA() on entry and read by
# check_diversity() to look records up by index.
gl_data = []
def get_range(att_tree, tran):
    """Compute the probability weight of a generalized transaction.

    For example, age value 10 is generalized to 10-20.
    So the probability is 1/10, which means that this
    range is 10 with probability 1/10.

    att_tree: container indexed by the items of ``tran``; every entry
        exposes a ``support`` attribute.
    tran: iterable of items (keys of ``att_tree``).

    Returns 1.0 divided by the product of the non-zero supports of the
    items in ``tran`` (1.0 for an empty ``tran``).
    """
    prob = 1.0
    for t in tran:
        support = att_tree[t].support
        # A falsy support (0 / None) contributes nothing; skipping it also
        # avoids a division by zero.
        if support:
            prob /= support
    return prob
def check_diversity(group, L):
    """Check whether a group of records satisfies l-diversity.

    group: iterable of indexes into the module-level ``gl_data``.
    L: minimum number of distinct sensitive values required.

    Returns True when the records of ``group`` carry at least ``L``
    distinct sensitive values, False otherwise.  The sensitive value of a
    record is its last field; two values count as equal when
    ``list_to_str`` maps them to the same key.
    """
    # `cmp` is the Python 2 builtin comparison function, passed through to
    # list_to_str exactly as the rest of this module does.
    SA_values = set(list_to_str(gl_data[index][-1], cmp) for index in group)
    return len(SA_values) >= L
def mergeable(group1, group2, L):
    """Tell whether group1 and group2, taken together, reach l-diversity.

    Both groups are lists of record indexes; they are concatenated and the
    combined list is handed to ``check_diversity`` with the same ``L``.
    """
    combined = group1 + group2
    return check_diversity(combined, L)
def APA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize the SA (transaction) part of the data,
    while applying Anatomizer to separate QID and SA.

    The sensitive attribute (SA) of a record is its last field.

    Parameters:
        att_tree: attribute tree, handed to ``partition`` and ``get_range``.
        data: list of records (lists).  NOTE: modified in place -- the last
            field of every partitioned record is replaced by a copy of its
            generalized transaction.
        K: parameter forwarded to ``partition``.
        L: l-diversity parameter, forwarded to ``anatomizer`` and used for
            the diversity checks while merging groups.

    Returns:
        A list with one entry per grouped record: a copy of the record
        followed by the group id, ``1.0 / group size`` and a dict mapping
        each original SA string covered by the group to its accumulated
        probability.  That dict object is shared by all records of the same
        group.  Records that could not be placed in any l-diverse group are
        suppressed, i.e. left out of the result.

    Side effects: rebinds the module globals ``gl_att_tree`` / ``gl_data``
    and prints progress information to stdout.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # generalized SA string -> set of the original SA strings it replaced
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    # Begin Anatomy
    print("Begin Anatomy")
    # groups of record indexes (they are used below to index gl_data)
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        leaves = tran_tree.setdefault(parent, set())
        for t in index_list:
            # Record the original SA *before* it is overwritten with the
            # generalized value.
            leaves.add(list_to_str(gl_data[t][-1], cmp))
            gl_data[t][-1] = tran_value[:]
    # Merge groups to achieve l-diversity: generalization may have collapsed
    # SA values, so the anatomy groups are re-checked here.
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while residue:
        g = residue.pop()
        for pos, group in enumerate(residue):
            if mergeable(g, group, L):
                grouped_index.append(g + group)
                residue.pop(pos)
                break
        else:
            # No residual partner found: add the group's elements to a
            # random group which already satisfies l-diversity.
            if grouped_index:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                # Nothing can absorb g yet, so its records are suppressed.
                print("Error: group cannot satisfy l-diversity")
                suppress.extend(gl_data[index] for index in g)
    if _DEBUG:
        print('NO. of Suppress after Group Merge = %d' % len(suppress))
        print('NO. of groups = %d' % len(grouped_index))
    grouped_result = [[gl_data[index] for index in indexes]
                      for indexes in grouped_index]
    print("--- %s seconds ---" % (time.time()-start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob).
    # 1/|group size| and the SA_list dict will be used in evaluation.
    for index, group in enumerate(grouped_result):
        length = len(group)
        SA_list = {}
        # generalized SA string -> probability weight from get_range
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            parent_list[parent] = get_range(att_tree, t[-1])
        # Every original SA covered by one of the group's generalized values
        # receives that value's weight divided by the group size; an SA
        # covered by several generalized values accumulates all of them.
        for p, gen_range in parent_list.items():
            share = gen_range / length
            for leaf in tran_tree[p]:
                SA_list[leaf] = SA_list.get(leaf, 0.0) + share
        for t in group:
            record = t[:]
            record.extend((index, 1.0/length, SA_list))
            result.append(record)
    return result