def chain_network() -> BayesianNetwork:
    """
    Build a small, fitted Bayesian Network shaped as a single chain.

    Used to test do-interventions that split the graph into subgraphs.

    a → b → c → d → e

    Returns:
        The chain network with node states and CPDs fitted on randomly
        generated binary data (50 rows, one column per node).
    """
    sample_count = 50
    columns = list("abcde")

    # Draw integers in 0..9 and mark a cell as 1 when the draw exceeds 6
    draws = np.random.randint(10, size=(sample_count, len(columns)))
    frame = pd.DataFrame(data=(draws > 6).astype(int), columns=columns)

    # Link every node to its successor in the alphabet: a→b, b→c, c→d, d→e
    structure = StructureModel()
    structure.add_edges_from(list(zip(columns[:-1], columns[1:])))

    network = BayesianNetwork(structure)
    network = network.fit_node_states(frame)
    return network.fit_cpds(frame, method="BayesianEstimator", bayes_prior="K2")
def train_bn(data, graph):
    """
    Build a Bayesian Network from ``graph`` and fit it on ``data``.

    Node states are fitted first, then the CPDs using the Bayesian estimator
    with a K2 prior.

    Returns:
        The network returned by the CPD fit.
    """
    network = BayesianNetwork(graph)
    with_states = network.fit_node_states(data)
    return with_states.fit_cpds(data, method="BayesianEstimator", bayes_prior="K2")
def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
    """After an intervention fixing d to state 1, the queried probability of d=1 should be 1"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)

    engine = InferenceEngine(network)
    engine.do_intervention("d", 1)

    marginals = engine.query()
    assert math.isclose(marginals["d"][1], 1)
def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
    """An intervention given as a state->probability map should be reflected in the query result"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)

    engine = InferenceEngine(network)
    expected = {0: 0.7, 1: 0.3}
    # Hand over a copy so the expectations below cannot be touched by the engine
    engine.do_intervention("d", dict(expected))

    for state, probability in expected.items():
        assert math.isclose(engine.query()["d"][state], probability)
def test_do_sets_other_state_probabilitys_to_zero(self, train_model, train_data_idx):
    """After an intervention fixing d to state 1, the other state of d should have probability zero"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)

    engine = InferenceEngine(network)
    engine.do_intervention("d", 1)

    marginals = engine.query()
    assert marginals["d"][0] == 0
def test_report_ignores_unrequired_columns_in_data(self, train_data_idx, train_data_discrete, test_data_c_discrete):
    """Classification report should ignore any columns that are not needed by predict"""
    learned_structure = from_pandas(train_data_idx, w_threshold=0.3)
    network = BayesianNetwork(learned_structure).fit_node_states(train_data_discrete)

    # The extra column is introduced after node states were fitted, before the CPD fit
    train_data_discrete["NEW_COL"] = [1] * len(train_data_discrete)
    network.fit_cpds(train_data_discrete)

    # Passing means the report is produced without raising
    classification_report(network, test_data_c_discrete, "c")
def test_do_reflected_in_query(self, train_model, train_data_idx):
    """A query conditioned on an observation should reflect an intervention applied beforehand"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    before = engine.query({"a": 1})
    assert before["d"][1] != 1

    engine.do_intervention("d", 1)

    after = engine.query({"a": 1})
    assert after["d"][1] == 1
def test_fit_lv_not_added(self):
    """An error should be raised if the latent variable is not added to the network yet"""
    # Set up outside the raises block so that only the call under test can
    # satisfy the expectation; a ValueError during setup now fails the test.
    df, sm, _, _ = naive_bayes_plus_parents()
    # Rebuild the structure from its edge list alone
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    with pytest.raises(
        ValueError,
        match=r"Latent variable 'd' not added to the network",
    ):
        bn.fit_latent_cpds("d", [0, 1, 2], df)
def test_fit_invalid_lv_name(self, lv_name):
    """An error should be raised if the latent variable is of an invalid type"""
    # Set up outside the raises block so that only the call under test can
    # satisfy the expectation; a ValueError during setup now fails the test.
    df, sm, _, _ = naive_bayes_plus_parents()
    # Rebuild the structure from its edge list alone
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    with pytest.raises(
        ValueError,
        match=r"Invalid latent variable name *",
    ):
        bn.fit_latent_cpds(lv_name, [0, 1, 2], df)
def test_add_node_in_edges_to_remove(self):
    """An error should be raised if the latent variable is part of the edges to remove"""
    # Set up outside the raises block so that only the call under test can
    # satisfy the expectation; a ValueError during setup now fails the test.
    _, sm, _, _ = naive_bayes_plus_parents()
    # Rebuild the structure from its edge list alone
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    with pytest.raises(
        ValueError,
        match="Should only remove edges NOT containing node 'd'",
    ):
        bn.add_node("d", [], [("a", "d"), ("b", "d")])
def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
    """An intervention that names a state beyond 0 and 1 for d should be rejected"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_error = "The cpd states do not match expected states*"
    with pytest.raises(ValueError, match=expected_error):
        engine.do_intervention("d", {0: 0.7, 1: 0.3, 2: 0.0})
def __init__(
    self,
    list_of_edges: List[Tuple[str]],
    discretiser_alg: Optional[Dict[str, str]] = None,
    discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
    probability_kwargs: Optional[Dict[str, Any]] = None,
    return_prob: bool = False,
):
    """
    Set up the structure, the network and the discretisation/probability settings.

    Args:
        list_of_edges (list): Edge list to construct graph

        discretiser_alg (dict): Specify an algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method

        discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.

        probability_kwargs (dict): keyword arguments for the probability model.
            When omitted or empty, defaults to
            {"method": "BayesianEstimator", "bayes_prior": "K2"}.

        return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

    Raises:
        KeyError: If an incorrect argument is passed
        ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
    """
    probability_kwargs = probability_kwargs or {
        "method": "BayesianEstimator",
        "bayes_prior": "K2",
    }

    if discretiser_alg is None:
        logging.info(
            "No discretiser algorithm was given. "
            "The training data will not be discretised"
        )
        discretiser_alg = {}

    discretiser_kwargs = discretiser_kwargs or {}

    # Validate before any attribute is set, so a rejected configuration
    # leaves the instance without partially initialised state
    self._validate_discretiser(discretiser_alg, discretiser_kwargs)

    self.list_of_edges = list_of_edges
    self.structure = StructureModel(self.list_of_edges)
    self.bn = BayesianNetwork(self.structure)
    self.return_prob = return_prob
    self.probability_kwargs = probability_kwargs
    self.discretiser_kwargs = discretiser_kwargs
    self.discretiser_alg = discretiser_alg
    # Not known at construction time; initialised to None here
    self._target_name = None
    self._discretise_data = None
def test_do_expects_all_states_have_a_probability(self, train_model, train_data_idx):
    """An intervention map that leaves out one of the node's states should be rejected"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_error = "The cpd states do not match expected states*"
    with pytest.raises(ValueError, match=expected_error):
        engine.do_intervention("d", {1: 1})
def test_do_expects_all_state_probabilities_sum_to_one(self, train_model, train_data_idx):
    """An intervention whose state probabilities do not add up to 1 should be rejected"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_error = "The cpd for the provided observation must sum to 1"
    with pytest.raises(ValueError, match=expected_error):
        # 0.7 + 0.4 = 1.1
        engine.do_intervention("d", {0: 0.7, 1: 0.4})
def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
    """An intervention that would leave a node isolated in the network should be rejected"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_error = "Do calculus cannot be applied because it would result in an isolate"
    with pytest.raises(ValueError, match=expected_error):
        engine.do_intervention("a", 1)
def test_reset_do_sets_probabilities_back_to_initial_state(self, train_model, train_data_idx, train_data_idx_marginals):
    """Resetting an intervention on d should bring its queried probabilities back to the data marginals"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", {0: 0.7, 1: 0.3})
    engine.reset_do("d")

    for state in (0, 1):
        assert math.isclose(
            engine.query()["d"][state],
            train_data_idx_marginals["d"][state],
        )
def test_observations_affect_marginals(self, train_model, train_data_idx):
    """Observing d=1 should pin d's marginal and shift the marginal of b"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    baseline = engine.query({})
    observed = engine.query({"d": 1})

    # The observed node itself becomes certain
    assert observed["d"][0] == 0
    assert observed["d"][1] == 1
    # b must move by more than the 0.01 tolerance
    assert not math.isclose(observed["b"][1], baseline["b"][1], abs_tol=0.01)
def test_empty_query_returns_marginals(self, train_model, train_data_idx, train_data_idx_marginals):
    """A query with no observations should return marginals close to those of the training data"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    marginals = engine.query({})
    flattened = [
        (node, state, probability)
        for node, states in marginals.items()
        for state, probability in states.items()
    ]

    for node, state, probability in flattened:
        assert math.isclose(
            train_data_idx_marginals[node][state], probability, abs_tol=0.05
        )
def test_create_inference_with_bad_variable_names_fails(self, train_model, train_data_idx):
    """Creating an inference engine should fail when a variable is named "$a" """

    def with_dollar(node):
        # Same substitution as applied to the data column below
        return str(node).replace("a", "$a")

    renamed_edges = [(with_dollar(u), with_dollar(v)) for u, v in train_model.edges]
    structure = StructureModel()
    structure.add_edges_from(renamed_edges)

    train_data_idx.rename(columns={"a": "$a"}, inplace=True)

    network = BayesianNetwork(structure).fit_node_states(train_data_idx)
    network.fit_cpds(train_data_idx)

    with pytest.raises(ValueError, match="Variable names must match.*"):
        InferenceEngine(network)
def test_do_expects_all_state_probabilities_within_0_and_1(self, train_model, train_data_idx):
    """An intervention with state probabilities outside the range 0 to 1 should be rejected"""
    network = BayesianNetwork(train_model)
    with_states = network.fit_node_states(train_data_idx)
    with_states.fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_error = "The cpd for the provided observation must be between 0 and 1"
    with pytest.raises(ValueError, match=expected_error):
        # The values sum to 1 but each lies outside the valid range
        engine.do_intervention("d", {0: -1.0, 1: 2.0})
def test_all_states_included(self):
    """Every value present in the fitted data should appear among the node's states"""
    graph = StructureModel()
    graph.add_weighted_edges_from([("a", "b", 1)])

    # Ten rows; both columns hold the values 0..9
    frame = pd.DataFrame([[value, value] for value in range(10)], columns=["a", "b"])
    network = BayesianNetwork(graph).fit_node_states(frame)

    missing = [value for value in range(10) if value not in network.node_states["a"]]
    assert not missing
def test_fit_with_null_states_raises_error(self):
    """Fitting node states on data that contains a null value should raise an error"""
    graph = StructureModel()
    graph.add_weighted_edges_from([("a", "b", 1)])

    with pytest.raises(ValueError, match="node '.*' contains None state"):
        network = BayesianNetwork(graph)
        frame_with_null = pd.DataFrame([[None, 1]], columns=["a", "b"])
        network.fit_node_states(frame_with_null)
def get_markov_blanket(bn: BayesianNetwork, target_node: str) -> "BayesianNetwork":
    """
    Generate the markov blanket of a node in the network

    The nodes kept are the target node, its parents, its children and the
    parents of those children; every other node is removed from a copy of the
    network's structure.

    Args:
        bn (BayesianNetwork): A BayesianNetwork object that contains the structure of the full graph
        target_node (str): Name of the target node that we want the markov boundary for

    Returns:
        A Bayesian Network object containing the structure of the input's markov blanket

    Raises:
        KeyError: if target_node is not in the network
    """
    if target_node not in bn.nodes:
        raise KeyError(f"{target_node} is not found in the network")

    # Work on a deep copy so the caller's network is not modified
    mb_graph = deepcopy(bn)
    structure = mb_graph.structure

    # Collect the blanket directly from the target instead of testing every
    # node of the network against freshly built predecessor/successor iterators
    keep_nodes = {target_node}
    keep_nodes.update(structure.predecessors(target_node))
    for child in structure.successors(target_node):
        keep_nodes.add(child)
        keep_nodes.update(structure.predecessors(child))

    # Iterate over a snapshot: nodes are removed from the structure in the loop
    for node in list(mb_graph.nodes):
        if node not in keep_nodes:
            structure.remove_node(node)

    return BayesianNetwork(structure)
def bn_train_model(train_model) -> BayesianNetwork:
    """
    Build the Bayesian Network used when testing the Markov blanket method.

    Three edges (a→f, f→g, e→f) are added to ``train_model`` in place before
    the network is created from it.
    """
    extra_edges = [("a", "f"), ("f", "g"), ("e", "f")]
    train_model.add_edges_from(extra_edges)
    network = BayesianNetwork(train_model)
    return network
def test_set_structure(self):
    """An error should be raised if setting the structure"""
    sm = StructureModel()
    sm.add_weighted_edges_from([(1, 2, 2.0)], origin="unknown")
    sm.add_weighted_edges_from([(1, 3, 1.0)], origin="learned")
    sm.add_weighted_edges_from([(3, 5, 0.7)], origin="expert")

    bn = BayesianNetwork(sm)

    # Populate the replacement structure itself. These edges were previously
    # added to `sm`, which left `new_sm` empty and altered the structure that
    # had already been handed to the network.
    new_sm = StructureModel()
    new_sm.add_weighted_edges_from([(2, 5, 3.0)], origin="unknown")
    new_sm.add_weighted_edges_from([(2, 3, 2.0)], origin="learned")
    new_sm.add_weighted_edges_from([(3, 4, 1.7)], origin="expert")

    with pytest.raises(AttributeError, match=r"can't set attribute"):
        bn.structure = new_sm
def roc_auc(bn: BayesianNetwork, data: pd.DataFrame, node: str) -> Tuple[List[Tuple[float, float]], float]:
    """
    Build a report of the micro-average Receiver-Operating Characteristics (ROC), and the Area Under the ROC curve
    Micro-average computes roc_auc over all predictions for all states of node.

    Args:
        bn (BayesianNetwork): model to compute roc_auc.
        data (pd.DataFrame): test data that will be used to calculate ROC.
        node (str): name of the variable to generate the report for.

    Returns:
        roc - auc tuple
         - roc (List[Tuple[float, float]]): list of [(fpr, tpr)] observations.
         - auc float: auc for the node predictions.

    Example:
    ::
        >>> from causalnex.structure import StructureModel
        >>> from causalnex.network import BayesianNetwork
        >>>
        >>> sm = StructureModel()
        >>> sm.add_edges_from([
        >>>                    ('rush_hour', 'traffic'),
        >>>                    ('weather', 'traffic')
        >>>                    ])
        >>> bn = BayesianNetwork(sm)
        >>> import pandas as pd
        >>> data = pd.DataFrame({
        >>>                      'rush_hour': [True, False, False, False, True, False, True],
        >>>                      'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'],
        >>>                      'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy']
        >>>                      })
        >>> bn = bn.fit_node_states_and_cpds(data)
        >>> test_data = pd.DataFrame({
        >>>                          'rush_hour': [False, False, True, True],
        >>>                          'weather': ['Good', 'Bad', 'Good', 'Bad'],
        >>>                          'traffic': ['light', 'heavy', 'heavy', 'light']
        >>>                         })
        >>> from causalnex.evaluation import roc_auc
        >>> roc, auc = roc_auc(bn, test_data, "traffic")
        >>> print(auc)
        0.75
    """
    ground_truth = _build_ground_truth(bn, data, node)
    predictions = bn.predict_probability(data, node)

    # update column names to match those of ground_truth
    # Remove the "<node>_" prefix exactly once. str.lstrip would treat the
    # prefix as a set of characters and also eat leading characters of the
    # state name itself (e.g. "traffic_cat" would become "").
    prefix = node + "_"
    predictions.rename(
        columns=lambda x: x[len(prefix):] if x.startswith(prefix) else x,
        inplace=True,
    )
    # Order the columns by name before both frames are flattened together
    predictions = predictions[sorted(predictions.columns)]

    fpr, tpr, _ = metrics.roc_curve(
        ground_truth.values.ravel(), predictions.values.ravel()
    )
    roc = list(zip(fpr, tpr))
    auc = metrics.auc(fpr, tpr)
    return roc, auc
def test_cycles_in_structure(self):
    """Building a network from a structure that contains a cycle should raise an error"""
    cycle_error = (
        r"The given structure is not acyclic\. "
        r"Please review the following cycle\.*"
    )
    with pytest.raises(ValueError, match=cycle_error):
        # 0 → 1 → 2 → 0
        cyclic = StructureModel([(0, 1), (1, 2), (2, 0)])
        BayesianNetwork(cyclic)
def test_disconnected_components(self, test_input, n_components):
    """Building a network from a structure with several separate components should raise an error"""
    expected_error = "".join(
        [
            r"The given structure has ",
            str(n_components),
            r" separated graph components\. ",
            r"Please make sure it has only one\.",
        ]
    )
    with pytest.raises(ValueError, match=expected_error):
        disconnected = StructureModel(test_input)
        BayesianNetwork(disconnected)
def get_avg_auc_lvs(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    lv_states: List,
    n_splits: int = 5,
    seed: int = 2021,
    markov_blanket: bool = False,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute AUC using only the parent nodes

    For every cross-validation fold, the latent variable "LV" is fitted on the
    training rows and one score per remaining node is computed on the test rows
    in a process pool; scores are averaged over nodes and then over folds.

    Args:
        df: Input dataframe
        bn: Bayesian network
        lv_states: the states the LV can assume
        n_splits: Number of cross-validation folds
        seed: Random seed number
        markov_blanket: Whether we predict only using the Markov blanket
        n_cpus: Number of CPU cores to use. Values below 1 are raised to 1.

    Returns:
        Average AUC
    """
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    # The default is cpu_count() - 1, which is 0 on a single-core host,
    # and a process pool needs at least one worker
    n_workers = max(1, n_cpus)

    total_auc = 0
    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()

        # The splitter yields row positions, not index labels, so select by
        # position; this also works when df does not have a default index
        train_df = df.iloc[train_idx, :]
        test_df = df.iloc[test_idx, :]

        bn.fit_latent_cpds("LV", lv_states, train_df, n_runs=30)

        # One task per node other than the latent variable
        chunks = [
            [bn, test_df, target, markov_blanket]
            for target in bn.nodes
            if target != "LV"
        ]

        with multiprocessing.Pool(n_workers) as p:
            result = p.starmap(_compute_auc_lv_stub, chunks)

        total_auc += sum(result) / (len(bn.nodes) - 1)
        print(
            f"Processing fold {fold} using {n_workers} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
def test_query_when_cpds_not_fit(self, train_data_idx, train_data_discrete):
    """Creating an inference engine for a network whose CPDs were never fitted should raise an error"""
    learned_structure = from_pandas(train_data_idx, w_threshold=0.3)
    # Only node states are fitted; fit_cpds is never called
    network = BayesianNetwork(learned_structure).fit_node_states(train_data_discrete)

    expected_error = r"Bayesian Network does not contain any CPDs.*"
    with pytest.raises(ValueError, match=expected_error):
        InferenceEngine(network)