Exemplo n.º 1
0
def chain_network() -> BayesianNetwork:
    """
    Build a fitted five-node chain network from random binary data.

    The structure is a single directed path, which makes it convenient for
    testing do-interventions that split the graph into subgraphs:

    a → b → c → d → e

    Returns:
        The network after its node states and CPDs were fitted on the random
        data (``BayesianEstimator`` with a ``K2`` prior).
    """
    node_labels = list("abcde")
    sample_count = 50

    # draw integers in [0, 10) and encode values above 6 as 1, all others as 0
    draws = np.random.randint(10, size=(sample_count, len(node_labels)))
    samples = pd.DataFrame(data=(draws > 6).astype(int), columns=node_labels)

    # pairing each label with its successor yields the edges of the chain
    chain = StructureModel()
    chain.add_edges_from(list(zip(node_labels, node_labels[1:])))

    return (
        BayesianNetwork(chain)
        .fit_node_states(samples)
        .fit_cpds(samples, method="BayesianEstimator", bayes_prior="K2")
    )
def train_bn(data, graph):
    """
    Build a Bayesian Network on ``graph`` and fit it to ``data``.

    Args:
        data: data passed to both ``fit_node_states`` and ``fit_cpds``
        graph: structure handed to the ``BayesianNetwork`` constructor
    Returns:
        The network after fitting node states and CPDs
        (``BayesianEstimator`` with a ``K2`` prior)
    """
    fit_options = {"method": "BayesianEstimator", "bayes_prior": "K2"}
    network = BayesianNetwork(graph)
    return network.fit_node_states(data).fit_cpds(data, **fit_options)
Exemplo n.º 3
0
    def test_auc_of_random_is_half(self):
        """Scoring coin-flip labels should give an AUC of roughly 0.5"""

        grid = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve ends up with several threshold points
        rare_rows = [[a, b, 0] for _ in range(10) for a in grid for b in grid]
        common_rows = [
            [a, b, 1]
            for a in grid
            for b in grid
            for _ in range(1000 * (a + b + 1))
        ]
        train = pd.DataFrame(rare_rows + common_rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        # sanity check: the fitted model puts (almost) all mass on c=1
        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # NOTE(review): labels come from the unseeded global RNG, so this test
        # is not deterministic - consider seeding if it ever turns out flaky
        coin_flips = [
            [a, b, random.randint(0, 1)]
            for a in grid
            for b in grid
            for _ in range(1000)
        ]
        test = pd.DataFrame(coin_flips, columns=["a", "b", "c"])

        _roc, area = roc_auc(bn, test, "c")

        assert math.isclose(area, 0.5, abs_tol=0.03)
Exemplo n.º 4
0
def bn_train_model(train_model) -> BayesianNetwork:
    """
    Extend ``train_model`` with extra edges and wrap it in a Bayesian Network.

    Used when testing the Markov blanket method. The edges a → f, f → g and
    e → f are added to ``train_model`` in place before the network is built.
    """
    extra_edges = [("a", "f"), ("f", "g"), ("e", "f")]
    train_model.add_edges_from(extra_edges)

    network = BayesianNetwork(train_model)
    return network
Exemplo n.º 5
0
    def test_auc_of_accurate_predictions(self):
        """A model scored on the data it was fitted to should reach an AUC of 1"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows: low (a, b) values are
        # almost always c=1 and high (a, b) values are almost always c=0
        rows = (
            [[a, b, 0] for a in low for b in low]
            + [[a, b, 1] for a in low for b in low
               for _ in range(1000 + 10 * (a + b))]
            + [[a, b, 0] for a in high for b in high
               for _ in range(1000 + 10 * (a + b))]
            + [[a, b, 1] for a in high for b in high]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        _roc, area = roc_auc(bn, train, "c")
        assert math.isclose(area, 1, abs_tol=0.001)
Exemplo n.º 6
0
    def test_auc_for_nonnumeric_features(self):
        """AUC should still be 1 when the target states are strings instead of numbers"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows: low (a, b) values are
        # almost always c=1 and high (a, b) values are almost always c=0
        mostly_one = [[a, b, 0] for a in low for b in low] + [
            [a, b, 1] for a in low for b in low
            for _ in range(1000 + 10 * (a + b))
        ]
        mostly_zero = [
            [a, b, 0] for a in high for b in high
            for _ in range(1000 + 10 * (a + b))
        ] + [[a, b, 1] for a in high for b in high]
        train = pd.DataFrame(mostly_one + mostly_zero, columns=["a", "b", "c"])

        # replace the numeric states of column c with string labels
        string_labels = {0: "f", 1: "g"}
        train["c"] = train["c"].map(string_labels)

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        _roc, area = roc_auc(bn, train, "c")
        assert math.isclose(area, 1, abs_tol=0.001)
Exemplo n.º 7
0
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """The ROC of systematically wrong predictions should have FPR > TPR

        NOTE(review): the test name says "fpr_lt_tpr", but the assertion below
        checks ``fpr > tpr`` for every interior point of the curve.
        """

        grid = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve ends up with several threshold points
        rare_zero = [[a, b, 0] for a in grid for b in grid]
        common_one = [
            [a, b, 1]
            for a in grid
            for b in grid
            for _ in range(1000 * (a + b + 1))
        ]
        train = pd.DataFrame(rare_zero + common_one, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        # sanity check: the fitted model puts (almost) all mass on c=1
        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # the test data is the opposite of the training data: c=0 dominates
        common_zero = [[a, b, 0] for a in grid for b in grid for _ in range(1000)]
        rare_one = [[a, b, 1] for a in grid for b in grid]
        test = pd.DataFrame(common_zero + rare_one, columns=["a", "b", "c"])

        roc, _area = roc_auc(bn, test, "c")

        assert len(roc) > 3
        # points whose tpr is exactly 0 or 1 are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(fpr > tpr for fpr, tpr in interior)
Exemplo n.º 8
0
    def test_auc_with_missing_state_in_test(self):
        """AUC should stay computable and correct when the test set lacks a state of c"""

        def once(a, b):
            return 1

        def many(a, b):
            return 1000 + 10 * a + 10 * b

        def rows(values, label, repeats):
            # one [a, b, label] row per repetition, for every (a, b) pair
            return [
                [a, b, label]
                for a in values
                for b in values
                for _ in range(repeats(a, b))
            ]

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows: low (a, b) values are
        # almost always c=1 and high (a, b) values are almost always c=0
        train = pd.DataFrame(
            rows(low, 0, once) + rows(low, 1, many)
            + rows(high, 0, many) + rows(high, 1, once),
            columns=["a", "b", "c"],
        )

        # keep only the c=1 rows so that state 0 never occurs in the test set
        test = train.loc[train["c"] == 1]
        assert len(test["c"].unique()) == 1

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        _roc, area = roc_auc(bn, test, "c")
        assert math.isclose(area, 1, abs_tol=0.01)
Exemplo n.º 9
0
    def test_auc_node_with_no_parents(self):
        """It should be possible to compute the AUC for a node that has no parents"""

        low, high = range(0, 2), range(2, 4)

        rows = []
        # low (a, b) values: a single c=0 row, then many c=1 rows
        rows += [[a, b, 0] for a in low for b in low]
        rows += [[a, b, 1] for a in low for b in low
                 for _ in range(1000 + 10 * (a + b))]
        # high (a, b) values: many c=0 rows, then a single c=1 row
        rows += [[a, b, 0] for a in high for b in high
                 for _ in range(1000 + 10 * (a + b))]
        rows += [[a, b, 1] for a in high for b in high]
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        # "a" only has an outgoing edge in this structure
        _roc, area = roc_auc(bn, train, "a")
        assert math.isclose(area, 0.5, abs_tol=0.01)
Exemplo n.º 10
0
 def test_fit_with_null_states_raises_error(self):
     """Fitting node states on data that contains None should raise a ValueError"""
     structure = StructureModel()
     structure.add_weighted_edges_from([("a", "b", 1)])

     # a single row whose value for node "a" is missing
     null_frame = pd.DataFrame([[None, 1]], columns=["a", "b"])

     with pytest.raises(ValueError, match="node '.*' contains None state"):
         BayesianNetwork(structure).fit_node_states(null_frame)
Exemplo n.º 11
0
def get_markov_blanket(bn: BayesianNetwork,
                       target_node: str) -> "BayesianNetwork":
    """
    Generate the markov blanket of a node in the network

    The blanket consists of the target node, its parents, its children and the
    other parents of its children. Every other node is removed from a deep copy
    of the input network, so ``bn`` itself is left untouched.

    Args:
        bn (BayesianNetwork): A BayesianNetwork object that contains the structure of the full graph
        target_node (str): Name of the target node that we want the markov boundary for
    Returns:
        A Bayesian Network object containing the structure of the input's markov blanket
    Raises:
        KeyError: if target_node is not in the network
    """

    if target_node not in bn.nodes:
        raise KeyError(f"{target_node} is not found in the network")

    mb_graph = deepcopy(bn)

    # Materialise the neighbourhoods once; the previous version rebuilt and
    # linearly scanned the predecessor/successor iterators for every node.
    parents = set(mb_graph.structure.predecessors(target_node))
    children = set(mb_graph.structure.successors(target_node))

    keep_nodes = parents | children
    for child in children:
        # co-parents of each child also belong to the blanket
        keep_nodes.update(mb_graph.structure.predecessors(child))

    # Decide what to drop before mutating the structure
    drop_nodes = [
        node for node in mb_graph.nodes
        if node not in keep_nodes and node != target_node
    ]
    for node in drop_nodes:
        mb_graph.structure.remove_node(node)

    return BayesianNetwork(mb_graph.structure)
Exemplo n.º 12
0
    def test_roc_of_accurate_predictions(self):
        """For accurate predictions every interior ROC point should have TPR > FPR"""

        low, high = range(0, 2), range(2, 4)

        # both classes of c get a similar number of rows: low (a, b) values are
        # mostly c=1 and high (a, b) values are mostly c=0, with ten rows of the
        # opposite class for each (a, b) pair
        rows = (
            [[a, b, 0] for a in low for b in low for _ in range(10)]
            + [[a, b, 1] for a in low for b in low
               for _ in range(1000 + 10 * (a + b))]
            + [[a, b, 0] for a in high for b in high
               for _ in range(1000 + 10 * (a + b))]
            + [[a, b, 1] for a in high for b in high for _ in range(10)]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        roc, _area = roc_auc(bn, train, "c")

        # points whose tpr is exactly 0 or 1 are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(tpr > fpr for fpr, tpr in interior)
Exemplo n.º 13
0
 def test_all_states_included(self):
     """Every value observed for a node should appear in that node's states"""
     structure = StructureModel()
     structure.add_weighted_edges_from([("a", "b", 1)])

     values = range(10)
     frame = pd.DataFrame([[v, v] for v in values], columns=["a", "b"])
     bn = BayesianNetwork(structure).fit_node_states(frame)

     missing = [v for v in values if v not in bn.node_states["a"]]
     assert not missing
Exemplo n.º 14
0
    def test_roc_of_random_has_unit_gradient(self):
        """For coin-flip labels the ROC curve should follow the line from (0,0) to (1,1)"""

        grid = range(3)

        # c=1 dominates for every (a, b), but by a different margin each time, so
        # that the roc curve ends up with several threshold points
        rare_zero = [[a, b, 0] for a in grid for b in grid]
        common_one = [
            [a, b, 1]
            for a in grid
            for b in grid
            for _ in range(1000 * (a + b + 1))
        ]
        train = pd.DataFrame(rare_zero + common_one, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train).fit_cpds(train)

        # sanity check: the fitted model puts (almost) all mass on c=1
        assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

        # NOTE(review): labels come from the unseeded global RNG, so this test
        # is not deterministic - consider seeding if it ever turns out flaky
        coin_flips = [
            [a, b, random.randint(0, 1)]
            for a in grid
            for b in grid
            for _ in range(1000)
        ]
        test = pd.DataFrame(coin_flips, columns=["a", "b", "c"])

        roc, _area = roc_auc(bn, test, "c")

        assert len(roc) > 3
        # both coordinates of every point must agree to within the tolerance
        assert all(math.isclose(x, y, abs_tol=0.03) for x, y in roc)
Exemplo n.º 15
0
    def test_behaves_same_as_seperate_calls(self, train_data_idx, train_data_discrete):
        """fit_node_states_and_cpds should match fit_node_states followed by fit_cpds"""

        separate = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))
        combined = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))

        # same data, fitted in two steps versus the single combined call
        separate.fit_node_states(train_data_discrete).fit_cpds(train_data_discrete)
        combined.fit_node_states_and_cpds(train_data_discrete)

        assert separate.edges == combined.edges
        assert separate.node_states == combined.node_states

        separate_cpds = separate.cpds
        combined_cpds = combined.cpds

        assert separate_cpds.keys() == combined_cpds.keys()

        for node in separate_cpds:
            assert separate_cpds[node].equals(combined_cpds[node])
Exemplo n.º 16
0
    def test_cycles_in_structure(self):
        """Building a network from a cyclic structure should raise a ValueError"""

        # 0 → 1 → 2 → 0 closes a cycle
        cyclic_edges = [(0, 1), (1, 2), (2, 0)]
        expected_message = (
            r"The given structure is not acyclic\. "
            r"Please review the following cycle\.*"
        )

        with pytest.raises(ValueError, match=expected_message):
            BayesianNetwork(StructureModel(cyclic_edges))
Exemplo n.º 17
0
    def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
        """After do(d=1) the queried probability of d being in state 1 should be 1"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", 1)

        marginals = engine.query()
        assert math.isclose(marginals["d"][1], 1)
Exemplo n.º 18
0
    def test_disconnected_components(self, test_input, n_components):
        """Building a network from a structure with several components should raise"""

        expected_message = (
            rf"The given structure has {n_components} separated graph components\. "
            r"Please make sure it has only one\."
        )

        with pytest.raises(ValueError, match=expected_message):
            BayesianNetwork(StructureModel(test_input))
Exemplo n.º 19
0
    def test_do_sets_other_state_probabilitys_to_zero(self, train_model,
                                                      train_data_idx):
        """After do(d=1) the queried probability of the other state of d should be zero"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", 1)

        marginals = engine.query()
        assert marginals["d"][0] == 0
Exemplo n.º 20
0
    def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
        """Do should accept a state->probability mapping and apply it to the node"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", {0: 0.7, 1: 0.3})

        # each state is checked against a fresh query, state 0 first
        for state, expected in ((0, 0.7), (1, 0.3)):
            assert math.isclose(engine.query()["d"][state], expected)
Exemplo n.º 21
0
    def test_query_when_cpds_not_fit(self, train_data_idx, train_data_discrete):
        """Creating an inference engine before the CPDs are fit should raise a ValueError"""

        # only the node states are fitted; fit_cpds is deliberately not called
        structure = from_pandas(train_data_idx, w_threshold=0.3)
        network = BayesianNetwork(structure).fit_node_states(train_data_discrete)

        expected_message = r"Bayesian Network does not contain any CPDs.*"
        with pytest.raises(ValueError, match=expected_message):
            InferenceEngine(network)
Exemplo n.º 22
0
    def test_do_reflected_in_query(self, train_model, train_data_idx):
        """An intervention should change the marginals that query returns for an observation"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        observation = {"a": 1}

        # before the intervention d=1 is not certain given the observation
        before = engine.query(observation)
        assert before["d"][1] != 1

        engine.do_intervention("d", 1)

        # after the intervention it is
        after = engine.query(observation)
        assert after["d"][1] == 1
    def test_fit_invalid_lv_name(self, lv_name):
        """An error should be raised if the latent variable name is of an invalid type"""

        # Build the network outside the ``raises`` block so that only the call
        # under test can satisfy the expectation.
        df, sm, _, _ = naive_bayes_plus_parents()
        bn = BayesianNetwork(StructureModel(list(sm.edges)))

        with pytest.raises(
                ValueError,
                match=r"Invalid latent variable name *",
        ):
            bn.fit_latent_cpds(lv_name, [0, 1, 2], df)
    def test_fit_lv_not_added(self):
        """An error should be raised if the latent variable is not added to the network yet"""

        # Build the network outside the ``raises`` block so that only the call
        # under test can satisfy the expectation. Node "d" is never added here.
        df, sm, _, _ = naive_bayes_plus_parents()
        bn = BayesianNetwork(StructureModel(list(sm.edges)))

        with pytest.raises(
                ValueError,
                match=r"Latent variable 'd' not added to the network",
        ):
            bn.fit_latent_cpds("d", [0, 1, 2], df)
Exemplo n.º 25
0
    def test_report_ignores_unrequired_columns_in_data(self, train_data_idx,
                                                       train_data_discrete,
                                                       test_data_c_discrete):
        """Classification report should ignore any columns that are not needed by predict"""

        structure = from_pandas(train_data_idx, w_threshold=0.3)
        network = BayesianNetwork(structure).fit_node_states(train_data_discrete)

        # NOTE(review): the extra column is added to the training data (after the
        # node states were fitted), not to the data handed to
        # classification_report - confirm this is the intended scenario
        row_count = len(train_data_discrete)
        train_data_discrete["NEW_COL"] = [1] * row_count
        network.fit_cpds(train_data_discrete)

        # passes as long as no exception is raised
        classification_report(network, test_data_c_discrete, "c")
    def test_add_node_in_edges_to_remove(self):
        """An error should be raised if the latent variable is part of the edges to remove"""

        # Build the network outside the ``raises`` block so that only the call
        # under test can satisfy the expectation.
        _, sm, _, _ = naive_bayes_plus_parents()
        bn = BayesianNetwork(StructureModel(list(sm.edges)))

        with pytest.raises(
                ValueError,
                match="Should only remove edges NOT containing node 'd'",
        ):
            # both edges to remove contain the node that is being added
            bn.add_node("d", [], [("a", "d"), ("b", "d")])
Exemplo n.º 27
0
    def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
        """An intervention that introduces a previously unseen state should be rejected"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        # state 2 is the extra state in this distribution
        with_new_state = {0: 0.7, 1: 0.3, 2: 0.0}
        expected_message = "The cpd states do not match expected states*"

        with pytest.raises(ValueError, match=expected_message):
            engine.do_intervention("d", with_new_state)
    def test_fit_invalid_lv_states(self, lv_states):
        """An error should be raised if the latent variable has invalid states"""

        # Build the network and add the latent node outside the ``raises`` block
        # so that only the fitting call can satisfy the expectation.
        df, sm, _, _ = naive_bayes_plus_parents()
        bn = BayesianNetwork(StructureModel(list(sm.edges)))
        bn.add_node("d", [("z", "d")], [])

        with pytest.raises(
                ValueError,
                match="Latent variable 'd' contains no states",
        ):
            bn.fit_latent_cpds("d", lv_states, df)
Exemplo n.º 29
0
    def __init__(
        self,
        list_of_edges: List[Tuple[str, str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify the algorithm used to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            If None, the training data will not be discretised.
            discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model.
            Defaults to ``{"method": "BayesianEstimator", "bayes_prior": "K2"}``
            when None or empty.
            return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
            (both presumably raised by ``_validate_discretiser``, which is not
            part of this block - confirm there)
        """

        # ``or`` means an empty dict is also replaced by the defaults
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given. "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        # validate before any attribute is set on the instance
        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        self._target_name = None
        self._discretise_data = None
Exemplo n.º 30
0
    def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
        """An intervention that would leave a node isolated in the network should be rejected"""

        network = BayesianNetwork(train_model)
        network.fit_node_states(train_data_idx)
        network.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_message = (
            "Do calculus cannot be applied because it would result in an isolate"
        )
        with pytest.raises(ValueError, match=expected_message):
            engine.do_intervention("a", 1)