示例#1
0
def greedy_bayes(dataset, k=0, epsilon=0):
    """Greedily build a Bayesian Network (BN) over the columns of a dataset.

    One column is picked at random as the starting node. Then, once per
    remaining column, every (child, parents) candidate is scored with
    ``mutual_information`` and a single candidate is selected and added
    to the network.

    Parameters
    ----------
        dataset : DataFrame
            Input dataset, which only contains categorical attributes.
        k : int
            Maximum degree of the constructed BN. A falsy value (the
            default 0) means k is derived via ``calculate_k`` from the
            dataset's shape.
        epsilon : float
            Parameter of differential privacy. When falsy, the candidate
            with the highest mutual information is taken; otherwise a
            candidate is sampled from the distribution returned by
            ``exponential_mechanism``.

    Returns
    -------
        list
            ``(child, parents)`` tuples in the order they were selected,
            where ``parents`` is a list of columns already in the network.
            The randomly chosen starting column never appears as a child.
    """

    num_tuples, num_attributes = dataset.shape
    k = k or calculate_k(num_attributes, num_tuples)

    all_attributes = set(dataset.columns)
    network = []
    # Seed the set of placed attributes with one randomly chosen column.
    placed = {random.choice(dataset.columns)}

    print(
        '================== Constructing Bayesian Network ==================')
    # Every round places exactly one more attribute, so one round per
    # attribute other than the seed is needed.
    for _ in range(len(all_attributes) - 1):
        print('Looking for next attribute-parents pair.')
        candidates = []
        scores = []
        for child in all_attributes - placed:
            print('    Considering attribute {}'.format(child))
            # Parent sets have size k, or fewer while the network is small.
            parent_count = min(k, len(placed))
            for parent_combo in combinations(placed, parent_count):
                parent_columns = list(parent_combo)
                candidates.append((child, parent_columns))
                # TODO consider to change the computation of MI by combined integers instead of strings.
                scores.append(
                    mutual_information(dataset[child], dataset[parent_columns]))

        if not epsilon:
            # Non-private mode: take the first candidate with the top score.
            chosen = scores.index(max(scores))
        else:
            # Private mode: sample a candidate index with the probabilities
            # produced by the exponential mechanism.
            probabilities = exponential_mechanism(dataset, scores, epsilon)
            candidate_indices = list(range(len(scores)))
            chosen = np.random.choice(candidate_indices, p=probabilities)

        selected = candidates[chosen]
        network.append(selected)
        placed.add(selected[0])

    print('========================= BN constructed =========================')

    return network
def worker(paras):
    """Score every parent set for ``child`` that contains ``V[split]``.

    ``paras`` is a 5-tuple ``(child, V, num_parents, split, dataset)``.
    Each candidate parent set consists of ``num_parents - 1`` elements
    taken from ``V[split + 1:]`` followed by ``V[split]``; it is scored
    with ``mutual_information`` against the ``child`` column of
    ``dataset``.

    Returns a ``(parents_pair_list, mutual_info_list)`` tuple of two
    parallel lists: the ``(child, parents)`` pairs and their scores.
    Both lists are empty when ``split + num_parents - 1 >= len(V)``.
    """
    child, V, num_parents, split, dataset = paras
    pairs = []
    scores = []

    # Not enough elements from index `split` onward to form a parent set.
    if split + num_parents - 1 >= len(V):
        return pairs, scores

    for companions in combinations(V[split + 1:], num_parents - 1):
        # The element at `split` is always the last member of the set.
        parents = [*companions, V[split]]
        pairs.append((child, parents))
        # TODO consider to change the computation of MI by combined integers instead of strings.
        scores.append(mutual_information(dataset[child], dataset[parents]))

    return pairs, scores