Example #1
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """The ROC of incorrect predictions should have FPR < TPR"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1000)] + [[a, b, 1] for a in range(3)
                                      for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(fpr > tpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Example #2
    def test_auc_for_nonnumeric_features(self):
        """AUC of accurate predictions should be 1 even after remapping numbers to strings"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        # remap values in column c
        train["c"] = train["c"].map({0: "f", 1: "g"})

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Example #3
def compare_result_with_ideal(
    em_cpds: Dict[str, pd.DataFrame],
    sm: StructureModel,
    data: pd.DataFrame,
    true_values_lv: np.ndarray,
    node_states: Dict[AnyStr, Union[List, Set]],
) -> Tuple[float, float]:
    """
    Compare learned CPDs with ideal CPDs

    Args:
        em_cpds: Learned CPDs for different nodes
        sm: Structure model
        data: Input dataset
        true_values_lv: Ideal values of the latent variable
        node_states: Possible states of different nodes

    Returns:
        Maximum absolute difference and root mean square of differences
    """
    data["z"] = true_values_lv.reshape(-1)
    bn = BayesianNetwork(sm)
    bn.fit_node_states(states_to_df(node_states))
    bn.fit_cpds(data)

    max_delta = -1
    avg_delta = 0

    for node in em_cpds:
        deltas = (em_cpds[node] - bn.cpds[node]).abs().values
        max_delta = max(max_delta, deltas.max())
        avg_delta += np.mean(deltas ** 2)

    avg_delta = np.sqrt(avg_delta / len(em_cpds))
    return max_delta, avg_delta
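
A sketch of the intended call pattern for the helper above. The inputs here (em_cpds, sm, df, true_lv, node_states) are hypothetical placeholders standing in for objects produced by an EM run elsewhere, and the thresholds are illustrative only.

# hypothetical inputs: CPDs learned by EM, the structure used, the observed data,
# the ground-truth latent assignments and the node-state dictionary
max_delta, rms_delta = compare_result_with_ideal(em_cpds, sm, df.copy(), true_lv, node_states)

# smaller values mean the EM-learned CPDs are closer to CPDs fitted with the true
# latent values; the tolerances below are illustrative, not taken from the source
assert max_delta < 0.15
assert rms_delta < 0.05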
Example #4
    def test_auc_with_missing_state_in_test(self):
        """AUC should still be calculated correctly with states missing in test set"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        test = train[train["c"] == 1]
        assert len(test["c"].unique()) == 1

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, test, "c")
        assert math.isclose(auc, 1, abs_tol=0.01)
Example #5
    def test_auc_node_with_no_parents(self):
        """Should be possible to compute auc for state with no parent nodes"""

        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "a")
        assert math.isclose(auc, 0.5, abs_tol=0.01)
Example #6
    def test_roc_of_accurate_predictions(self):
        """TPR should always be better than FPR for accurate predictions"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(10)] + [[a, b, 1] for a in range(0, 2)
                                    for b in range(0, 2)
                                    for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(10)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        roc, _ = roc_auc(bn, train, "c")
        assert all(tpr > fpr for fpr, tpr in roc if tpr not in [0.0, 1.0])
Example #7
    def test_auc_of_accurate_predictions(self):
        """AUC of accurate predictions should be 1"""

        # equal class (c) weighting to guarantee high ROC expected
        train = pd.DataFrame(
            [[a, b, 0] for a in range(0, 2) for b in range(0, 2)
             for _ in range(1)] + [[a, b, 1] for a in range(0, 2)
                                   for b in range(0, 2)
                                   for _ in range(a * 10 + b * 10 + 1000)] +
            [[a, b, 0] for a in range(2, 4) for b in range(2, 4)
             for _ in range(a * 10 + b * 10 + 1000)] + [[a, b, 1]
                                                        for a in range(2, 4)
                                                        for b in range(2, 4)
                                                        for _ in range(1)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Example #8
    def test_auc_of_incorrect_close_to_zero(self):
        """The AUC of incorrect predictions should be close to zero"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # in test, c=0 is always more likely (opposite of train)
        test = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1000)] + [[a, b, 1] for a in range(3)
                                      for b in range(3) for _ in range(1)],
            columns=["a", "b", "c"],
        )

        _, auc = roc_auc(bn, test, "c")

        assert math.isclose(auc, 0, abs_tol=0.001)
Example #9
    def test_roc_of_random_has_unit_gradient(self):
        """The ROC curve for random predictions should be a line from (0,0) to (1,1)"""

        # regardless of a or b, c=1 is always more likely to varying amounts (to create multiple threshold
        # points in roc curve)
        train = pd.DataFrame(
            [[a, b, 0] for a in range(3) for b in range(3)
             for _ in range(1)] + [[a, b, 1] for a in range(3)
                                   for b in range(3)
                                   for _ in range(a * 1000 + b * 1000 + 1000)],
            columns=["a", "b", "c"],
        )

        cg = StructureModel()
        cg.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(cg)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        test = pd.DataFrame(
            [[a, b, random.randint(0, 1)] for a in range(3) for b in range(3)
             for _ in range(1000)],
            columns=["a", "b", "c"],
        )

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        assert all(math.isclose(a, b, abs_tol=0.03) for a, b in roc)
Example #10
    def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
        """Do should update the probability of the given observation=state to 1"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", 1)
        assert math.isclose(ie.query()["d"][1], 1)
Example #11
    def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
        """Do should accept a map of state->p and update p accordingly"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {0: 0.7, 1: 0.3})
        assert math.isclose(ie.query()["d"][0], 0.7)
        assert math.isclose(ie.query()["d"][1], 0.3)
Example #12
    def test_do_sets_other_state_probabilitys_to_zero(self, train_model,
                                                      train_data_idx):
        """Do should update the probability of every other state for the observation to zero"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", 1)
        assert ie.query()["d"][0] == 0
Example #13
    def test_do_reflected_in_query(self, train_model, train_data_idx):
        """Do should adjust marginals returned by query when given a different observation"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        assert ie.query({"a": 1})["d"][1] != 1
        ie.do_intervention("d", 1)
        assert ie.query({"a": 1})["d"][1] == 1
Example #14
    def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
        """Do should not allow the introduction of new states"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        with pytest.raises(
            ValueError, match="The cpd states do not match expected states*"
        ):
            ie.do_intervention("d", {0: 0.7, 1: 0.3, 2: 0.0})
Example #15
    def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
        """It should not be possible to create an isolated node in the network"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        with pytest.raises(
            ValueError,
            match="Do calculus cannot be applied because it would result in an isolate",
        ):
            ie.do_intervention("a", 1)
Example #16
    def test_do_expects_all_states_have_a_probability(self, train_model,
                                                      train_data_idx):
        """Do should accept only state probabilities where all states in the original cpds are present"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        with pytest.raises(
                ValueError,
                match="The cpd states do not match expected states*"):
            ie.do_intervention("d", {1: 1})
Example #17
    def test_do_expects_all_state_probabilities_sum_to_one(
            self, train_model, train_data_idx):
        """Do should accept only state probabilities where the full distribution is provided"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        with pytest.raises(
                ValueError,
                match="The cpd for the provided observation must sum to 1"):
            ie.do_intervention("d", {0: 0.7, 1: 0.4})
Example #18
    def test_observations_affect_marginals(self, train_model, train_data_idx):
        """Observing the state of a node should affect the marginals of dependent nodes"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        m1 = ie.query({})
        m2 = ie.query({"d": 1})

        assert m2["d"][0] == 0
        assert m2["d"][1] == 1
        assert not math.isclose(m2["b"][1], m1["b"][1], abs_tol=0.01)
Example #19
    def test_reset_do_sets_probabilities_back_to_initial_state(
        self, train_model, train_data_idx, train_data_idx_marginals
    ):
        """Resetting Do operator should re-introduce the original conditional dependencies"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {0: 0.7, 1: 0.3})
        ie.reset_do("d")

        assert math.isclose(ie.query()["d"][0], train_data_idx_marginals["d"][0])
        assert math.isclose(ie.query()["d"][1], train_data_idx_marginals["d"][1])
Example #20
    def test_do_expects_all_state_probabilities_within_0_and_1(
            self, train_model, train_data_idx):
        """Do should accept only state probabilities where the full distribution is provided"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        with pytest.raises(
                ValueError,
                match="The cpd for the provided observation must be between 0 and 1",
        ):
            ie.do_intervention("d", {0: -1.0, 1: 2.0})
Example #21
    def test_empty_query_returns_marginals(self, train_model, train_data_idx,
                                           train_data_idx_marginals):
        """An empty query should return all the marginal probabilities of the model's distribution"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        dist = ie.query({})

        for node, states in dist.items():
            for state, p in states.items():
                assert math.isclose(train_data_idx_marginals[node][state],
                                    p,
                                    abs_tol=0.05)
Example #22
def chain_network() -> BayesianNetwork:
    """
    Builds a Bayesian Network on a chain structure, used to test do-interventions
    that split the graph into subgraphs.

    a → b → c → d → e
    """
    n = 50
    nodes_names = list("abcde")
    random_binary_matrix = (np.random.randint(10, size=(n, len(nodes_names))) >
                            6).astype(int)
    df = pd.DataFrame(data=random_binary_matrix, columns=nodes_names)

    model = StructureModel()
    model.add_edges_from([
        ("a", "b"),
        ("b", "c"),
        ("c", "d"),
        ("d", "e"),
    ])
    chain_bn = BayesianNetwork(model)
    chain_bn = chain_bn.fit_node_states(df)
    chain_bn = chain_bn.fit_cpds(df,
                                 method="BayesianEstimator",
                                 bayes_prior="K2")
    return chain_bn
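
A sketch of how the chain fixture above might be exercised with a do-intervention that splits the chain; the choice of node "c" and state 1 is illustrative, and InferenceEngine / math are assumed to be imported as in the tests above.

chain_bn = chain_network()
ie = InferenceEngine(chain_bn)

before = ie.query()
ie.do_intervention("c", 1)  # conceptually cuts the incoming edge b -> c, leaving {a, b} and {c, d, e}
after = ie.query()

assert math.isclose(after["c"][1], 1)                              # the intervened state is now certain
assert math.isclose(before["a"][1], after["a"][1], abs_tol=0.01)   # upstream marginals are unchanged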
Example #23
def train_bn(data, graph):
    """Fit node states and CPDs for the given structure on the provided data."""
    bn = BayesianNetwork(graph)
    bn = bn.fit_node_states(data)
    bn = bn.fit_cpds(data, method="BayesianEstimator", bayes_prior="K2")

    return bn
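
A possible call site for train_bn, using a hand-specified structure and synthetic binary data; the structure and data are assumptions for illustration, and the imports (pandas, numpy, StructureModel) follow the other examples.

sm = StructureModel()
sm.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

data = pd.DataFrame(
    (np.random.randint(10, size=(200, 3)) > 6).astype(int), columns=["a", "b", "c"]
)

bn = train_bn(data, sm)
print(bn.cpds["c"])  # conditional probability table of c given a and b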
Example #24
    def test_fit_missing_states(self):
        """test issues/15: should be possible to fit with missing states"""

        sm = StructureModel([("a", "b"), ("c", "b")])
        bn = BayesianNetwork(sm)

        train = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 1]],
                             columns=["a", "b", "c"])
        test = pd.DataFrame(data=[[0, 0, 1], [1, 0, 1], [1, 1, 2]],
                            columns=["a", "b", "c"])
        data = pd.concat([train, test])

        bn.fit_node_states(data)
        bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

        assert bn.cpds["c"].loc[1][0] == 0.8
        assert bn.cpds["c"].loc[2][0] == 0.2
Example #25
    def test_behaves_same_as_seperate_calls(self, train_data_idx, train_data_discrete):
        """fit_node_states_and_cpds should behave the same as calling the two methods separately"""
        bn1 = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))
        bn2 = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))

        bn1.fit_node_states(train_data_discrete).fit_cpds(train_data_discrete)
        bn2.fit_node_states_and_cpds(train_data_discrete)

        assert bn1.edges == bn2.edges
        assert bn1.node_states == bn2.node_states

        cpds1 = bn1.cpds
        cpds2 = bn2.cpds

        assert cpds1.keys() == cpds2.keys()

        for k in cpds1:
            assert cpds1[k].equals(cpds2[k])
Example #26
    def test_invalid_observations(self, train_model, train_data_idx):
        """Test with invalid observations type"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
        ie = InferenceEngine(bn)

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query("123")

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query({"123", "abc"})

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query(("123", "abc"))
Example #27
def get_avg_auc_all_info(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute the average AUC using information from all nodes, not only the parent nodes

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number
        n_cpus: Number of CPU cores to use

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        train_df = df.loc[train_idx, :]
        test_df = df.loc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")
        chunks = [[bn, test_df, target] for target in bn.nodes]

        with multiprocessing.Pool(n_cpus) as p:
            result = p.starmap(_compute_auc_stub, chunks)

        total_auc += sum(result) / len(bn.nodes)
        print(
            f"Processing fold {fold} using {n_cpus} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
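
Because get_avg_auc_all_info fans work out to a multiprocessing.Pool, a call would normally sit behind a __main__ guard. The sketch below assumes _compute_auc_stub is defined at module level alongside the function, and uses a hand-specified structure and synthetic data for illustration.

if __name__ == "__main__":
    sm = StructureModel()
    sm.add_edges_from([("a", "c"), ("b", "c")])

    df = pd.DataFrame(
        (np.random.randint(10, size=(500, 3)) > 6).astype(int), columns=["a", "b", "c"]
    )

    avg_auc = get_avg_auc_all_info(df, BayesianNetwork(sm), n_splits=5, n_cpus=2)
    print(f"Average AUC using full observations: {avg_auc:.3f}")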
Example #28
    def __init__(self, bayesian_network: BayesianNetwork,
                 dataset: pd.DataFrame):
        self.df = dataset
        self.bn = bayesian_network.fit_node_states(dataset).fit_cpds(dataset)

        # SALib-style problem definition ("bounds" is left as a placeholder here)
        self.problem = {
            "num_vars": self.df.shape[1],
            "names": list(self.df.columns),
            "bounds": [[]],
        }

        # Saltelli sampler and Sobol analyzer
        self.sampler = saltelli
        self.analyzer = sobol
Example #29
def get_auc_data(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> pd.Series:
    """
    Utility function to compute AUC based only on data observations

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        Average AUC per node, sorted in ascending order
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    nodes_auc = defaultdict(list)

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        train_df = df.loc[train_idx, :]
        test_df = df.loc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            nodes_auc[var].append(auc)

        print(f"Processing fold {fold} takes {time() - t0} seconds")

    nodes_auc = pd.DataFrame(nodes_auc)
    col = nodes_auc.mean(axis=0).idxmin()
    val = nodes_auc.mean(axis=0).min()
    print(f"Variable with lowest AUC is {col} with the value of {val}")
    return nodes_auc.mean().sort_values()
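
A sketch of how get_auc_data might be used to find the node that is hardest to predict from the data alone; the structure and synthetic data below are assumptions for illustration.

sm = StructureModel()
sm.add_edges_from([("a", "c"), ("b", "c")])

df = pd.DataFrame(
    (np.random.randint(10, size=(500, 3)) > 6).astype(int), columns=["a", "b", "c"]
)

per_node_auc = get_auc_data(df, BayesianNetwork(sm))
print(per_node_auc)  # pd.Series of mean AUC per node, lowest (weakest) first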
Example #30
def get_avg_auc(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> float:
    """
    Estimate the average AUC of all nodes in a Bayesian Network, given a structure and a dataset,
    using k-fold cross-validation. This function uses the bn.predict method in causalnex and cannot
    be used with latent-variable models.

    Args:
        df: Input dataframe
        bn: Bayesian network
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        cur_auc = 0
        train_df = df.loc[train_idx, :]
        test_df = df.loc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            cur_auc += auc

        print(f"Processing fold {fold} takes {time() - t0} seconds")
        total_auc += cur_auc / len(bn.nodes)

    return total_auc / n_splits
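
One plausible use of get_avg_auc is scoring competing candidate structures on the same data; everything below (structures, data, names) is synthetic and illustrative.

df = pd.DataFrame(
    (np.random.randint(10, size=(500, 3)) > 6).astype(int), columns=["a", "b", "c"]
)

scores = {}
for name, edges in [("collider", [("a", "c"), ("b", "c")]),
                    ("chain", [("a", "b"), ("b", "c")])]:
    sm = StructureModel()
    sm.add_edges_from(edges)
    scores[name] = get_avg_auc(df, BayesianNetwork(sm))

print(scores)  # a higher average AUC suggests a better-fitting structure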