def greedy_bayes(dataset, k=0, epsilon=0):
    """Greedily build a Bayesian network over the columns of ``dataset``.

    Starting from one randomly chosen column, each round scores every
    (child, parents) candidate by mutual information and adds one candidate
    to the network, until every column has been placed.

    Parameters
    ----------
    dataset : DataFrame
        Input dataset, which only contains categorical attributes.
    k : int
        Maximum degree of the constructed BN. A falsy value (the default 0)
        means k is obtained from ``calculate_k``.
    epsilon : float
        Parameter of differential privacy. When falsy, the candidate with the
        highest mutual information is taken; otherwise a candidate is sampled
        from the distribution returned by ``exponential_mechanism``.

    Returns
    -------
    list
        ``(child, parents)`` tuples in the order they were selected, where
        ``parents`` is a list of column names. The randomly chosen starting
        column does not appear as a child.
    """
    row_count, column_count = dataset.shape
    if not k:
        k = calculate_k(column_count, row_count)

    all_columns = set(dataset.columns)
    network = []
    # Columns already placed in the network, seeded with a random root.
    placed = {random.choice(dataset.columns)}

    print('================== Constructing Bayesian Network ==================')
    # One round per remaining column: every round places exactly one column.
    for _ in range(len(all_columns) - 1):
        print('Looking for next attribute-parents pair.')
        candidates = []
        scores = []
        # `placed` is not modified while candidates are enumerated, so the
        # parent-set size is fixed for the whole round.
        parent_count = min(k, len(placed))
        for child in all_columns - placed:
            print(' Considering attribute {}'.format(child))
            for combo in combinations(placed, parent_count):
                parent_names = list(combo)
                candidates.append((child, parent_names))
                # TODO consider to change the computation of MI by combined integers instead of strings.
                scores.append(
                    mutual_information(dataset[child], dataset[parent_names]))

        if not epsilon:
            # Non-private selection: first candidate with the top score.
            pick = scores.index(max(scores))
        else:
            weights = exponential_mechanism(dataset, scores, epsilon)
            pick = np.random.choice(len(scores), p=weights)

        winner = candidates[pick]
        network.append(winner)
        placed.add(winner[0])

    print('========================= BN constructed =========================')
    return network
def worker(paras):
    """Score the parent sets for ``child`` that contain ``V[split]``.

    Every candidate parent set consists of ``V[split]`` plus
    ``num_parents - 1`` elements drawn from the items that follow it in ``V``.

    Parameters
    ----------
    paras : tuple
        ``(child, V, num_parents, split, dataset)`` packed into one argument
        (presumably so the function can be used as a map target -- confirm
        against callers). ``V`` must support indexing and slicing;
        ``dataset`` is indexed by ``child`` and by a list of parent names.

    Returns
    -------
    tuple
        ``(parents_pair_list, mutual_info_list)``: parallel lists holding the
        ``(child, parents)`` candidates and their mutual-information scores.
        Both are empty when fewer than ``num_parents`` items remain in ``V``
        from position ``split`` onwards.
    """
    child, V, num_parents, split, dataset = paras
    candidates = []
    scores = []

    # Not enough items from `split` onwards to form a full parent set.
    if split + num_parents - 1 >= len(V):
        return candidates, scores

    for tail in combinations(V[split + 1:], num_parents - 1):
        parent_names = [*tail, V[split]]
        candidates.append((child, parent_names))
        # TODO consider to change the computation of MI by combined integers instead of strings.
        scores.append(mutual_information(dataset[child], dataset[parent_names]))

    return candidates, scores