Пример #1
0
def chain_network() -> BayesianNetwork:
    """
    Build a fitted Bayesian Network over a five-node chain.

    The chain structure is used to test do interventions that split the graph
    into subgraphs:

    a → b → c → d → e

    Node states and CPDs are fitted on 50 rows of randomly generated binary
    data.
    """
    columns = list("abcde")
    sample_count = 50

    # Draw integers in [0, 10) and threshold them: a cell is 1 when draw > 6.
    draws = np.random.randint(10, size=(sample_count, len(columns)))
    data = pd.DataFrame(data=(draws > 6).astype(int), columns=columns)

    # Link every node to its successor in the alphabetically ordered chain.
    structure = StructureModel()
    structure.add_edges_from(list(zip(columns[:-1], columns[1:])))

    network = BayesianNetwork(structure)
    network = network.fit_node_states(data)
    return network.fit_cpds(data,
                            method="BayesianEstimator",
                            bayes_prior="K2")
def train_bn(data, graph):
    """
    Build a Bayesian Network from ``graph`` and fit it on ``data``.

    Node states are fitted first, then the CPDs are fitted with the
    "BayesianEstimator" method and a "K2" prior.

    Args:
        data: data passed to both fitting steps.
        graph: structure passed to the ``BayesianNetwork`` constructor.

    Returns:
        The object returned by the final ``fit_cpds`` call.
    """
    cpd_options = {"method": "BayesianEstimator", "bayes_prior": "K2"}

    network = BayesianNetwork(graph).fit_node_states(data)
    return network.fit_cpds(data, **cpd_options)
Пример #3
0
    def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
        """Intervening on a node with a single state should give that state probability 1"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", 1)

        marginals = engine.query()
        assert math.isclose(marginals["d"][1], 1)
Пример #4
0
    def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
        """An intervention given as a state->probability map should be reflected in the queried marginals"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", {0: 0.7, 1: 0.3})

        # Each state is checked against a fresh query, in state order.
        for state, expected in ((0, 0.7), (1, 0.3)):
            assert math.isclose(engine.query()["d"][state], expected)
Пример #5
0
    def test_do_sets_other_state_probabilitys_to_zero(self, train_model,
                                                      train_data_idx):
        """After an intervention fixing one state, the other state of that node should have probability zero"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", 1)

        d_marginal = engine.query()["d"]
        assert d_marginal[0] == 0
Пример #6
0
    def test_report_ignores_unrequired_columns_in_data(self, train_data_idx,
                                                       train_data_discrete,
                                                       test_data_c_discrete):
        """Classification report should ignore any columns that are not needed by predict"""

        bn = BayesianNetwork(
            from_pandas(train_data_idx,
                        w_threshold=0.3)).fit_node_states(train_data_discrete)

        # Add the superfluous column to a copy so the fixture data is not
        # mutated for any other consumer of the same object.
        data_with_extra = train_data_discrete.copy()
        data_with_extra["NEW_COL"] = [1] * len(data_with_extra)

        bn.fit_cpds(data_with_extra)
        # The test passes as long as building the report does not raise.
        classification_report(bn, test_data_c_discrete, "c")
Пример #7
0
    def test_do_reflected_in_query(self, train_model, train_data_idx):
        """A query conditioned on another observation should reflect an intervention applied to the node"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        # Before the intervention, observing "a" does not pin "d" to state 1.
        before = engine.query({"a": 1})
        assert before["d"][1] != 1

        engine.do_intervention("d", 1)

        after = engine.query({"a": 1})
        assert after["d"][1] == 1
    def test_fit_lv_not_added(self):
        """Fitting latent CPDs for a variable that is not in the network should raise a ValueError"""

        expected_error = r"Latent variable 'd' not added to the network"

        with pytest.raises(ValueError, match=expected_error):
            data, structure, _, _ = naive_bayes_plus_parents()
            # Rebuild the structure from its edge list only.
            edges_only = StructureModel(list(structure.edges))
            network = BayesianNetwork(edges_only)
            network.fit_latent_cpds("d", [0, 1, 2], data)
    def test_fit_invalid_lv_name(self, lv_name):
        """Fitting latent CPDs with an invalid latent variable name should raise a ValueError"""

        expected_error = r"Invalid latent variable name *"

        with pytest.raises(ValueError, match=expected_error):
            data, structure, _, _ = naive_bayes_plus_parents()
            # Rebuild the structure from its edge list only.
            edges_only = StructureModel(list(structure.edges))
            network = BayesianNetwork(edges_only)
            network.fit_latent_cpds(lv_name, [0, 1, 2], data)
    def test_add_node_in_edges_to_remove(self):
        """Adding a node while asking to remove edges that contain it should raise a ValueError"""

        expected_error = "Should only remove edges NOT containing node 'd'"

        with pytest.raises(ValueError, match=expected_error):
            _, structure, _, _ = naive_bayes_plus_parents()
            # Rebuild the structure from its edge list only.
            edges_only = StructureModel(list(structure.edges))
            network = BayesianNetwork(edges_only)
            network.add_node("d", [], [("a", "d"), ("b", "d")])
Пример #11
0
    def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
        """An intervention that mentions a state unknown to the node should be rejected"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_error = "The cpd states do not match expected states*"
        with pytest.raises(ValueError, match=expected_error):
            # State 2 is the extra state in this intervention.
            engine.do_intervention("d", {0: 0.7, 1: 0.3, 2: 0.0})
Пример #12
0
    def __init__(
        self,
        list_of_edges: List[Tuple[str]],
        discretiser_alg: Optional[Dict[str, str]] = None,
        discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
        probability_kwargs: Optional[Dict[str, Any]] = None,
        return_prob: bool = False,
    ):
        """
        Set up the graph structure, the Bayesian Network and the
        discretisation / probability-fitting configuration.

        Args:
            list_of_edges (list): Edge list to construct graph
            discretiser_alg (dict): Specify an algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
            If None, an empty mapping is used and an info message is logged.
            discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
            probability_kwargs (dict): keyword arguments for the probability model.
            If omitted (or empty), defaults to
            ``{"method": "BayesianEstimator", "bayes_prior": "K2"}``.
            return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

        Raises:
            KeyError: If an incorrect argument is passed
            ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
        """

        # `or` is deliberate: both None and an empty dict select the defaults.
        probability_kwargs = probability_kwargs or {
            "method": "BayesianEstimator",
            "bayes_prior": "K2",
        }

        if discretiser_alg is None:
            logging.info("No discretiser algorithm was given "
                         "The training data will not be discretised")
            discretiser_alg = {}

        discretiser_kwargs = discretiser_kwargs or {}

        # Validate before any attribute is set, so a bad configuration does
        # not leave a half-initialised instance behind.
        self._validate_discretiser(discretiser_alg, discretiser_kwargs)

        self.list_of_edges = list_of_edges
        self.structure = StructureModel(self.list_of_edges)
        self.bn = BayesianNetwork(self.structure)
        self.return_prob = return_prob
        self.probability_kwargs = probability_kwargs
        self.discretiser_kwargs = discretiser_kwargs
        self.discretiser_alg = discretiser_alg
        # Initialised to None here; nothing in this method assigns them a value.
        self._target_name = None
        self._discretise_data = None
Пример #13
0
    def test_do_expects_all_states_have_a_probability(self, train_model,
                                                      train_data_idx):
        """An intervention that omits one of the node's states should be rejected"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_error = "The cpd states do not match expected states*"
        with pytest.raises(ValueError, match=expected_error):
            # Only state 1 is given; state 0 is missing.
            engine.do_intervention("d", {1: 1})
Пример #14
0
    def test_do_expects_all_state_probabilities_sum_to_one(
            self, train_model, train_data_idx):
        """An intervention whose state probabilities do not sum to 1 should be rejected"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_error = "The cpd for the provided observation must sum to 1"
        with pytest.raises(ValueError, match=expected_error):
            # 0.7 + 0.4 = 1.1
            engine.do_intervention("d", {0: 0.7, 1: 0.4})
Пример #15
0
    def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
        """An intervention that would leave a node isolated in the network should be rejected"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_error = (
            "Do calculus cannot be applied because it would result in an isolate"
        )
        with pytest.raises(ValueError, match=expected_error):
            engine.do_intervention("a", 1)
Пример #16
0
    def test_reset_do_sets_probabilities_back_to_initial_state(
        self, train_model, train_data_idx, train_data_idx_marginals
    ):
        """After reset_do, the node's queried marginals should match the expected marginals again"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        engine.do_intervention("d", {0: 0.7, 1: 0.3})
        engine.reset_do("d")

        # Each state is compared against a fresh query, in state order.
        for state in (0, 1):
            assert math.isclose(
                engine.query()["d"][state], train_data_idx_marginals["d"][state]
            )
Пример #17
0
    def test_observations_affect_marginals(self, train_model, train_data_idx):
        """Observing a node's state should pin that node and shift the marginals of a dependent node"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        baseline = engine.query({})
        observed = engine.query({"d": 1})

        # The observed node collapses onto the observed state.
        assert observed["d"][0] == 0
        assert observed["d"][1] == 1
        # The marginal of "b" must move by more than the tolerance.
        assert not math.isclose(observed["b"][1], baseline["b"][1], abs_tol=0.01)
Пример #18
0
    def test_empty_query_returns_marginals(self, train_model, train_data_idx,
                                           train_data_idx_marginals):
        """A query without observations should return marginals close to the expected ones for every node state"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)
        marginals = engine.query({})

        # Flatten to (node, state, probability) triples, preserving order.
        observed = [(node, state, probability)
                    for node, states in marginals.items()
                    for state, probability in states.items()]

        for node, state, probability in observed:
            assert math.isclose(train_data_idx_marginals[node][state],
                                probability,
                                abs_tol=0.05)
Пример #19
0
    def test_create_inference_with_bad_variable_names_fails(
            self, train_model, train_data_idx):
        """Creating an InferenceEngine for a network with a '$'-prefixed variable name should raise a ValueError"""

        # Rebuild the structure with every "a" in a node name replaced by "$a".
        model = StructureModel()
        model.add_edges_from([(str(u).replace("a",
                                              "$a"), str(v).replace("a", "$a"))
                              for u, v in train_model.edges])

        # Rename on a new frame rather than in place, so the fixture data is
        # not mutated for any other consumer of the same object.
        renamed_data = train_data_idx.rename(columns={"a": "$a"})

        bn = BayesianNetwork(model).fit_node_states(renamed_data)
        bn.fit_cpds(renamed_data)

        with pytest.raises(ValueError, match="Variable names must match.*"):
            InferenceEngine(bn)
Пример #20
0
    def test_do_expects_all_state_probabilities_within_0_and_1(
            self, train_model, train_data_idx):
        """An intervention with state probabilities outside the range 0 to 1 should be rejected"""

        network = BayesianNetwork(train_model)
        fitted = network.fit_node_states(train_data_idx)
        fitted.fit_cpds(train_data_idx)

        engine = InferenceEngine(network)

        expected_error = (
            "The cpd for the provided observation must be between 0 and 1")
        with pytest.raises(ValueError, match=expected_error):
            # The values sum to 1 but each lies outside the valid range.
            engine.do_intervention("d", {0: -1.0, 1: 2.0})
Пример #21
0
 def test_all_states_included(self):
     """Every state seen for a node in the data should appear in the fitted node states"""
     structure = StructureModel()
     structure.add_weighted_edges_from([("a", "b", 1)])

     # Ten rows, each holding a distinct value shared by both columns.
     data = pd.DataFrame([[i, i] for i in range(10)], columns=["a", "b"])
     network = BayesianNetwork(structure).fit_node_states(data)

     missing = [v for v in range(10) if v not in network.node_states["a"]]
     assert not missing
Пример #22
0
 def test_fit_with_null_states_raises_error(self):
     """Fitting node states on data that contains a null value should raise a ValueError"""
     structure = StructureModel()
     structure.add_weighted_edges_from([("a", "b", 1)])

     expected_error = "node '.*' contains None state"
     with pytest.raises(ValueError, match=expected_error):
         network = BayesianNetwork(structure)
         network.fit_node_states(
             pd.DataFrame([[None, 1]], columns=["a", "b"]))
Пример #23
0
def get_markov_blanket(bn: BayesianNetwork,
                       target_node: str) -> "BayesianNetwork":
    """
    Generate the markov blanket of a node in the network

    The blanket consists of the target node, its parents, its children and
    the other parents of its children. The input network is not modified;
    the pruning is performed on a deep copy.

    Args:
        bn (BayesianNetwork): A BayesianNetwork object that contains the structure of the full graph
        target_node (str): Name of the target node that we want the markov boundary for
    Returns:
        A Bayesian Network object containing the structure of the input's markov blanket
    Raises:
        KeyError: if target_node is not in the network
    """

    if target_node not in bn.nodes:
        raise KeyError(f"{target_node} is not found in the network")

    mb_graph = deepcopy(bn)

    # Query the neighbourhood of the target once instead of once per node.
    parents = set(mb_graph.structure.predecessors(target_node))
    children = set(mb_graph.structure.successors(target_node))

    keep_nodes = parents | children
    for child in children:
        # Co-parents of each child (this also re-adds the target itself).
        keep_nodes.update(mb_graph.structure.predecessors(child))

    # Iterate over a snapshot so removals cannot disturb the iteration.
    for node in list(mb_graph.nodes):
        if node not in keep_nodes and node != target_node:
            mb_graph.structure.remove_node(node)

    return BayesianNetwork(mb_graph.structure)
Пример #24
0
def bn_train_model(train_model) -> BayesianNetwork:
    """
    Extend ``train_model`` with extra edges and wrap it in a Bayesian Network.

    Used in testing the Markov blanket method. The edges a → f, f → g and
    e → f are added to ``train_model`` in place before the network is built.
    """
    extra_edges = [("a", "f"), ("f", "g"), ("e", "f")]
    train_model.add_edges_from(extra_edges)

    network = BayesianNetwork(train_model)
    return network
Пример #25
0
    def test_set_structure(self):
        """An error should be raised if setting the structure"""

        sm = StructureModel()
        sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
        sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
        sm.add_weighted_edges_from([(3, 5, 0.7)], origin="expert")

        bn = BayesianNetwork(sm)

        # Build a distinct replacement structure. These edges must go onto
        # new_sm, not onto sm, which already backs the network above.
        new_sm = StructureModel()
        new_sm.add_weighted_edges_from([(2, 5, 3.0)], origin="unknown")
        new_sm.add_weighted_edges_from([(2, 3, 2.0)], origin="learned")
        new_sm.add_weighted_edges_from([(3, 4, 1.7)], origin="expert")

        # NOTE(review): the matched message is the pre-3.11 CPython wording for
        # assigning to a read-only property; confirm against the supported
        # Python versions.
        with pytest.raises(AttributeError, match=r"can't set attribute"):
            bn.structure = new_sm
Пример #26
0
def roc_auc(bn: BayesianNetwork, data: pd.DataFrame,
            node: str) -> Tuple[List[Tuple[float, float]], float]:
    """
    Build a report of the micro-average Receiver-Operating Characteristics (ROC), and the Area Under the ROC curve
    Micro-average computes roc_auc over all predictions for all states of node.

    Args:
        bn (BayesianNetwork): model to compute roc_auc.
        data (pd.DataFrame): test data that will be used to calculate ROC.
        node (str): name of the variable to generate the report for.

    Returns:
        roc - auc tuple
         - roc (List[Tuple[float, float]]): list of [(fpr, tpr)] observations.
         - auc float: auc for the node predictions.

    Example:
    ::
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states_and_cpds(data)
        >>> test_data = pd.DataFrame({
        >>>                         'rush_hour': [False, False, True, True],
        >>>                         'weather': ['Good', 'Bad', 'Good', 'Bad'],
        >>>                         'traffic': ['light', 'heavy', 'heavy', 'light']
        >>>                         })
        >>> from causalnex.evaluation import roc_auc
        >>> roc, auc = roc_auc(bn, test_data, "traffic")
        >>> print(auc)
        0.75
    """

    ground_truth = _build_ground_truth(bn, data, node)
    predictions = bn.predict_probability(data, node)

    # Update column names to match those of ground_truth by dropping the
    # leading "<node>_" prefix. str.lstrip is not suitable here: it strips a
    # *set of characters*, so it would also eat the start of any state name
    # beginning with characters found in the node name or "_".
    prefix = node + "_"
    predictions.rename(
        columns=lambda col: col[len(prefix):] if col.startswith(prefix) else col,
        inplace=True,
    )
    predictions = predictions[sorted(predictions.columns)]

    # Micro-average: flatten every (row, state) cell into one binary problem.
    fpr, tpr, _ = metrics.roc_curve(ground_truth.values.ravel(),
                                    predictions.values.ravel())
    roc = list(zip(fpr, tpr))
    auc = metrics.auc(fpr, tpr)

    return roc, auc
Пример #27
0
    def test_cycles_in_structure(self):
        """Building a network from a structure that contains a cycle should raise a ValueError"""

        expected_error = (r"The given structure is not acyclic\. "
                          r"Please review the following cycle\.*")

        with pytest.raises(ValueError, match=expected_error):
            # 0 -> 1 -> 2 -> 0 forms a cycle.
            cyclic_structure = StructureModel([(0, 1), (1, 2), (2, 0)])
            BayesianNetwork(cyclic_structure)
Пример #28
0
    def test_disconnected_components(self, test_input, n_components):
        """Building a network from a structure with several separated components should raise a ValueError"""

        expected_error = (
            rf"The given structure has {n_components!s}"
            r" separated graph components\. "
            r"Please make sure it has only one\."
        )

        with pytest.raises(ValueError, match=expected_error):
            structure = StructureModel(test_input)
            BayesianNetwork(structure)
Пример #29
0
def get_avg_auc_lvs(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    lv_states: List,
    n_splits: int = 5,
    seed: int = 2021,
    markov_blanket: bool = False,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute the cross-validated average AUC over all
    nodes other than the latent variable "LV"

    For every fold, the latent variable CPDs are re-fitted on the training
    split and a per-node score is computed on the test split in a pool of
    worker processes. Note that ``bn`` is re-fitted in place on each fold.

    Args:
        df: Input dataframe
        bn: Bayesian network
        lv_states: the states the LV can assume
        n_splits: Number of cross-validation folds
        seed: Random seed number
        markov_blanket: Whether we predict only using the Markov blanket
        n_cpus: Number of CPU cores to use. Values below 1 (e.g. the default
            on a single-core machine) are treated as 1

    Returns:
        Average AUC
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # multiprocessing.Pool rejects a process count below 1.
    n_workers = max(1, n_cpus)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional indices, so rows must be selected by
        # position; label-based .loc is only equivalent for a default index.
        train_df = df.iloc[train_idx, :]
        test_df = df.iloc[test_idx, :]
        bn.fit_latent_cpds("LV", lv_states, train_df, n_runs=30)
        chunks = [[bn, test_df, target, markov_blanket] for target in bn.nodes
                  if target != "LV"]
        with multiprocessing.Pool(n_workers) as p:
            result = p.starmap(_compute_auc_lv_stub, chunks)

        # Average over every node except the latent variable.
        total_auc += sum(result) / (len(bn.nodes) - 1)
        print(
            f"Processing fold {fold} using {n_workers} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
Пример #30
0
    def test_query_when_cpds_not_fit(self, train_data_idx, train_data_discrete):
        """Creating an InferenceEngine for a network whose CPDs were never fitted should raise a ValueError"""

        structure = from_pandas(train_data_idx, w_threshold=0.3)
        # Only node states are fitted; fit_cpds is deliberately not called.
        network = BayesianNetwork(structure).fit_node_states(train_data_discrete)

        expected_error = r"Bayesian Network does not contain any CPDs.*"
        with pytest.raises(ValueError, match=expected_error):
            InferenceEngine(network)