예제 #1
0
def m_generalization(att_trees, data, k=10, l=5):
    """Generalize the transaction column, then hand the rows to
    mondrian_l_diversity.

    The last element of every record is treated as its transaction. The
    transactions are grouped by partition(att_trees[-1], ..., k); each
    record's transaction is overwritten in place with a copy of its group's
    value, and copies of the rewritten records are passed to
    mondrian_l_diversity together with l.

    NOTE: `data` is modified in place and is also published, with
    `att_trees`, through the module globals DATA and ATT_TREES.

    return (result, eval_result)
    result is whatever mondrian_l_diversity returns as its first value
    eval_result is a tuple (rncp, tncp, rtime), rtime being the elapsed
    wall-clock seconds as a float
    """
    global ATT_TREES, DATA
    ATT_TREES, DATA = att_trees, data
    began = time.time()
    if _DEBUG:
        print("size of dataset %d" % len(data))
    # transaction part of every record, in record order
    transactions = [record[-1] for record in data]
    groups, tncp = partition(att_trees[-1], transactions, k)
    generalized = []
    for members, shared_value in groups:
        for pos in members:
            # every record receives its own copy of the group's value
            DATA[pos][-1] = shared_value[:]
            generalized.append(DATA[pos][:])
    if _DEBUG:
        print("Begin Mondrian")
    result, rncp = mondrian_l_diversity(ATT_TREES, generalized, l)
    rtime = float(time.time() - began)
    if _DEBUG:
        print("Total running time = %.2f seconds" % rtime)
    return (result, (rncp, tncp, rtime))
def Separation_Gen(att_trees, data, k=10, l=5):
    """Anonymize the transaction part with partition, then the whole
    record set with mondrian_l_diversity.

    Step 1: the last element of each record (its transaction) is grouped by
    partition(att_trees[-1], ..., k) and replaced, in place, by a copy of the
    group's value.
    Step 2: copies of the rewritten records are given to
    mondrian_l_diversity together with l.

    NOTE: `data` is modified in place; `att_trees` and `data` are also stored
    in the module globals ATT_TREES and DATA.

    return (result, eval_result)
    result is the first value returned by mondrian_l_diversity
    eval_result is a tuple (qid_ncp, sa_ncp, rtime)
    """
    global ATT_TREES, DATA
    ATT_TREES = att_trees
    DATA = data
    clock_start = time.time()
    if _DEBUG:
        print("size of dataset %d" % len(data))
    # step 1: generalize the transaction column
    sa_column = [row[-1] for row in data]
    sa_groups, sa_ncp = partition(att_trees[-1], sa_column, k)
    rewritten = []
    for row_ids, group_value in sa_groups:
        for row_id in row_ids:
            row = DATA[row_id]
            row[-1] = group_value[:]
            rewritten.append(row[:])
    if _DEBUG:
        print("Begin Mondrian")
    # step 2: run mondrian_l_diversity on the rewritten records
    result, qid_ncp = mondrian_l_diversity(ATT_TREES, rewritten, l)
    rtime = float(time.time() - clock_start)
    if _DEBUG:
        print("Total running time = %.2f seconds" % rtime)
    return (result, (qid_ncp, sa_ncp, rtime))
def get_result_dataset(att_tree, data, k=10, num_test=10, joint=5000):
    """
    Fix k while changing the size of the dataset.

    For every size joint, 2*joint, ... that does not exceed len(data),
    draw num_test random samples of that size, run partition on each and
    print the averaged NCP and running time.

    att_tree is passed through to partition unchanged
    data is the full dataset; samples are drawn from it with random.sample
    k is passed through to partition
    num_test is the number of runs per size (0 raises ZeroDivisionError
    as soon as one size is tested)
    joint is the step between tested sizes; must be a positive integer

    Nothing is returned; results are printed.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    # range() stops at the largest multiple of joint that is <= length,
    # so no separate "pos > length" guard is needed.
    for pos in range(joint, length + 1, joint):
        ncp = rtime = 0
        print('#' * 30)
        print("size of dataset %d" % pos)
        for _run in range(num_test):
            temp = random.sample(data, pos)
            _, eval_result = partition(att_tree, temp, k)
            ncp += eval_result[0]
            rtime += eval_result[1]
            # the sample shares its records with data, so start every run
            # from a fresh copy in case partition modified them
            data = copy.deepcopy(data_back)
        # float() keeps these true averages under Python 2 as well
        ncp /= float(num_test)
        rtime /= float(num_test)
        print("Average NCP %0.2f" % ncp + "%")
        print("Running time %0.2f" % rtime + " seconds")
        print('#' * 30)
def get_result_one(att_tree, data, k=10):
    """
    Run partition a single time (k defaults to 10) and print the first
    two entries of its evaluation result as NCP and running time.
    """
    print("K=%d" % k)
    outcome = partition(att_tree, data, k)
    _, metrics = outcome
    ncp_line = "NCP %0.2f" % metrics[0] + "%"
    time_line = "Running time %0.2f" % metrics[1] + " seconds"
    print(ncp_line)
    print(time_line)
def get_result_k(att_tree, data):
    """
    Change k while fixing the size of the dataset.

    partition is run once for each k in 2, 5, 10, 25, 50, 100 and its NCP
    and running time are printed. The dataset is restored from a deep copy
    after every run.
    """
    pristine = copy.deepcopy(data)
    banner = '#' * 30
    for k in (2, 5, 10, 25, 50, 100):
        print(banner)
        print("K=%d" % k)
        _, metrics = partition(att_tree, data, k)
        # next run starts from an untouched copy of the dataset
        data = copy.deepcopy(pristine)
        print("NCP %0.2f" % metrics[0] + "%")
        print("Running time %0.2f" % metrics[1] + " seconds")
예제 #6
0
 def test_case_from_paper(self):
     """Check partition on a fixed seven-transaction example.

     The transactions are partitioned with a third argument of 2; each
     resulting item is rendered with list_to_str and the set of rendered
     strings is compared with the expected set.
     """
     init_tree()
     transactions = [
         ["a1"],
         ["a1", "a2"],
         ["b1", "b2"],
         ["b1", "b2"],
         ["a1", "a2", "b2"],
         ["a1", "a2", "b2"],
         ["a1", "a2", "b1", "b2"],
     ]
     partitioned, _ = partition(ATT_TREE, transactions, 2)
     observed = set(list_to_str(item) for item in partitioned)
     # duplicates collapse in the set, so only distinct strings are compared
     expected = set(["A", "A", "a1;a2;B", "a1;a2;B", "a1;a2;B", "b1;b2", "b1;b2"])
     self.assertEqual(observed, expected)
예제 #7
0
파일: PAA.py 프로젝트: zshwuhan/PAA
def PAA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition,
    while applying Anatomize to separate QID and SA.

    The last element of every record is its transaction. partition groups
    the transactions; each group becomes a list of record copies whose
    transaction is replaced by a copy of the group's value, and the list of
    groups is handed to anatomizer together with L.

    att_tree is passed to partition and get_range
    data is the list of records; it is not modified, but it is stored,
    with att_tree, in the module globals gl_data and gl_att_tree
    K is passed to partition
    L is passed to anatomizer

    Returns a list with one row per record of the anatomized groups:
    a copy of the record followed by
        the index of its group,
        1.0 / (size of the group),
        SA_list, a dict mapping each original transaction (as str) covered
        by the group to its accumulated get_range(...) / (group size).
    All rows of one group share the same SA_list object.
    """
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    # generalized transaction (str) -> set of original transactions (str)
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    result = []
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    grouped_data = []
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # setdefault replaces the former bare try/except initialisation
        leaves = tran_tree.setdefault(parent, set())
        gtemp = []
        for t in index_list:
            temp = gl_data[t][:]
            leaves.add(list_to_str(temp[-1], cmp))
            temp[-1] = tran_value[:]
            gtemp.append(temp)
        grouped_data.append(gtemp)
    print("Begin Anatomy")
    grouped_result = anatomizer(grouped_data, L)
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        # generalized transaction (str) -> its get_range value
        parent_list = {}
        for t in group:
            parent_list[list_to_str(t[-1], cmp)] = get_range(att_tree, t[-1])
        SA_list = {}
        for parent, gen_range in parent_list.items():
            # float() keeps true division under Python 2 even if get_range
            # returns an int
            share = gen_range / float(length)
            # walk this parent's own leaves instead of testing every leaf
            # of the group against every parent
            for leaf in tran_tree[parent]:
                SA_list[leaf] = SA_list.get(leaf, 0.0) + share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result
예제 #8
0
파일: APA.py 프로젝트: zshwuhan/APA
def APA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition,
    while applying Anatomizer to separate QID and SA.

    anatomizer(data, L) is run first and yields groups of record indexes.
    partition then groups the transactions (last element of every record),
    and each record's transaction is overwritten in place with a copy of its
    partition's value. Index groups that fail check_diversity are merged
    with another failing group when mergeable allows it, otherwise appended
    to a randomly chosen passing group; if no passing group exists their
    records are dropped from the output.

    att_tree is passed to partition and get_range
    data is the list of records; it is MODIFIED IN PLACE and stored, with
    att_tree, in the module globals gl_data and gl_att_tree
    K is passed to partition
    L is passed to anatomizer, check_diversity and mergeable

    Returns a list with one row per kept record: a copy of the record
    followed by
        the index of its group,
        1.0 / (size of the group),
        SA_list, a dict mapping each original transaction (as str) covered
        by the group to its accumulated get_range(...) / (group size).
    All rows of one group share the same SA_list object.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # generalized transaction (str) -> set of original transactions (str)
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    # Begin Anatomy
    print("Begin Anatomy")
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # setdefault replaces the former bare try/except initialisation
        leaves = tran_tree.setdefault(parent, set())
        for t in index_list:
            # remember the original transaction before overwriting it
            leaves.add(list_to_str(gl_data[t][-1], cmp))
            gl_data[t][-1] = tran_value[:]
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while residue:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                grouped_index.append(g + group)
                # safe to pop while iterating: the loop is left right away
                residue.pop(index)
                break
        else:
            # no partner found: add the group's elements to a random group
            # which already satisfies l-diversity
            if grouped_index:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                print("Error: group cannot satisfy l-diversity")
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print('NO. of Suppress after Group Merge = %d' % len(suppress))
        print('NO. of groups = %d' % len(grouped_index))
    grouped_result = [[gl_data[index] for index in indexes]
                      for indexes in grouped_index]
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        # generalized transaction (str) -> its get_range value
        parent_list = {}
        for t in group:
            parent_list[list_to_str(t[-1], cmp)] = get_range(att_tree, t[-1])
        SA_list = {}
        for parent, gen_range in parent_list.items():
            # float() keeps true division under Python 2 even if get_range
            # returns an int
            share = gen_range / float(length)
            # walk this parent's own leaves instead of testing every leaf
            # of the group against every parent
            for leaf in tran_tree[parent]:
                SA_list[leaf] = SA_list.get(leaf, 0.0) + share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result
예제 #9
0
def APA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition,
    while applying Anatomizer to separate QID and SA.

    anatomizer(data, L) is run first and yields groups of record indexes.
    partition then groups the transactions (last element of every record),
    and each record's transaction is overwritten in place with a copy of its
    partition's value. Index groups that fail check_diversity are merged
    with another failing group when mergeable allows it, otherwise appended
    to a randomly chosen passing group; if no passing group exists their
    records are dropped from the output.

    att_tree is passed to partition and get_range
    data is the list of records; it is MODIFIED IN PLACE and stored, with
    att_tree, in the module globals gl_data and gl_att_tree
    K is passed to partition
    L is passed to anatomizer, check_diversity and mergeable

    Returns a list with one row per kept record: a copy of the record
    followed by
        the index of its group,
        1.0 / (size of the group),
        SA_list, a dict mapping each original transaction (as str) covered
        by the group to its accumulated get_range(...) / (group size).
    All rows of one group share the same SA_list object.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # generalized transaction (str) -> set of original transactions (str)
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    # Begin Anatomy
    print("Begin Anatomy")
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # setdefault replaces the former bare try/except initialisation
        leaves = tran_tree.setdefault(parent, set())
        for t in index_list:
            # remember the original transaction before overwriting it
            leaves.add(list_to_str(gl_data[t][-1], cmp))
            gl_data[t][-1] = tran_value[:]
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while residue:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                grouped_index.append(g + group)
                # safe to pop while iterating: the loop is left right away
                residue.pop(index)
                break
        else:
            # no partner found: add the group's elements to a random group
            # which already satisfies l-diversity
            if grouped_index:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                print("Error: group cannot satisfy l-diversity")
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print('NO. of Suppress after Group Merge = %d' % len(suppress))
        print('NO. of groups = %d' % len(grouped_index))
    grouped_result = [[gl_data[index] for index in indexes]
                      for indexes in grouped_index]
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        # generalized transaction (str) -> its get_range value
        parent_list = {}
        for t in group:
            parent_list[list_to_str(t[-1], cmp)] = get_range(att_tree, t[-1])
        SA_list = {}
        for parent, gen_range in parent_list.items():
            # float() keeps true division under Python 2 even if get_range
            # returns an int
            share = gen_range / float(length)
            # walk this parent's own leaves instead of testing every leaf
            # of the group against every parent
            for leaf in tran_tree[parent]:
                SA_list[leaf] = SA_list.get(leaf, 0.0) + share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result