def chain_network() -> BayesianNetwork:
    """
    Build a fitted five-node chain network for do-intervention tests.

    The structure is a single path, so intervening on an interior node
    splits the graph into subgraphs:

        a → b → c → d → e

    Returns:
        A BayesianNetwork over the chain whose node states and CPDs were
        fitted on 50 rows of randomly generated binary data.
    """
    labels = list("abcde")
    sample_count = 50

    # Draw integers below 10 and threshold them: a cell is 1 when its draw exceeds 6.
    draws = np.random.randint(10, size=(sample_count, len(labels)))
    frame = pd.DataFrame(data=(draws > 6).astype(int), columns=labels)

    structure = StructureModel()
    # Link every label to the one after it: a→b, b→c, c→d, d→e.
    structure.add_edges_from(list(zip(labels, labels[1:])))

    network = BayesianNetwork(structure)
    network = network.fit_node_states(frame)
    return network.fit_cpds(frame, method="BayesianEstimator", bayes_prior="K2")
def train_bn(data, graph):
    """
    Create a Bayesian Network from ``graph`` and fit it on ``data``.

    Args:
        data: training data passed to both fitting steps.
        graph: structure handed to the ``BayesianNetwork`` constructor.

    Returns:
        The network after fitting node states and then CPDs
        (``BayesianEstimator`` with a ``K2`` prior).
    """
    network = BayesianNetwork(graph).fit_node_states(data)
    return network.fit_cpds(data, method="BayesianEstimator", bayes_prior="K2")
def test_auc_of_random_is_half(self):
    """Randomly labelled test data should give an AUC of about 0.5"""
    columns = ["a", "b", "c"]

    # Whatever a and b are, c=1 dominates the training data, by an amount that
    # varies with a and b (so the ROC curve gets several threshold points).
    negatives = [[a, b, 0] for _ in range(10) for a in range(3) for b in range(3)]
    positives = [
        [a, b, 1]
        for a in range(3)
        for b in range(3)
        for _ in range(a * 1000 + b * 1000 + 1000)
    ]
    train = pd.DataFrame(negatives + positives, columns=columns)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test labels are drawn at random, independently of a and b.
    random_rows = [
        [a, b, random.randint(0, 1)]
        for a in range(3)
        for b in range(3)
        for _ in range(1000)
    ]
    test = pd.DataFrame(random_rows, columns=columns)

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 0.5, abs_tol=0.03)
def bn_train_model(train_model) -> BayesianNetwork:
    """
    Extend ``train_model`` with extra edges and wrap it in a Bayesian Network.

    Used when testing the Markov blanket method. The edges a→f, f→g and e→f
    are added to ``train_model`` itself before the network is built.
    """
    extra_edges = [("a", "f"), ("f", "g"), ("e", "f")]
    train_model.add_edges_from(extra_edges)
    return BayesianNetwork(train_model)
def test_auc_of_accurate_predictions(self):
    """A model scored on the data it was fitted on should reach an AUC of 1"""
    # Both classes of c carry similar weight, so a high ROC is expected:
    # low a/b values are almost always c=1, high a/b values almost always c=0.
    low, high = range(0, 2), range(2, 4)
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(1)]
        + [[a, b, 1] for a in low for b in low for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 0] for a in high for b in high for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 1] for a in high for b in high for _ in range(1)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_auc_for_nonnumeric_features(self):
    """AUC should still be 1 once the numeric target states are remapped to strings"""
    # Both classes of c carry similar weight, so a high ROC is expected:
    # low a/b values are almost always c=1, high a/b values almost always c=0.
    low, high = range(0, 2), range(2, 4)
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(1)]
        + [[a, b, 1] for a in low for b in low for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 0] for a in high for b in high for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 1] for a in high for b in high for _ in range(1)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # Replace the numeric states of column c with string labels.
    string_labels = {0: "f", 1: "g"}
    train["c"] = train["c"].map(string_labels)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_roc_of_incorrect_has_fpr_lt_tpr(self):
    """Predictions that oppose the test labels should give a ROC with FPR > TPR"""
    columns = ["a", "b", "c"]

    # Training: c=1 dominates for every (a, b), by an amount that varies with
    # a and b (so the ROC curve gets several threshold points).
    rare_zero = [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
    common_one = [
        [a, b, 1]
        for a in range(3)
        for b in range(3)
        for _ in range(a * 1000 + b * 1000 + 1000)
    ]
    train = pd.DataFrame(rare_zero + common_one, columns=columns)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test: the opposite of training, c=0 dominates for every (a, b).
    test = pd.DataFrame(
        [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1000)]
        + [[a, b, 1] for a in range(3) for b in range(3) for _ in range(1)],
        columns=columns,
    )

    roc, _ = roc_auc(bn, test, "c")
    assert len(roc) > 3
    # Skip the points where TPR is exactly 0 or 1.
    interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
    assert all(fpr > tpr for fpr, tpr in interior)
def test_auc_with_missing_state_in_test(self):
    """AUC should still be computed when the test set lacks one of the target states"""
    # Both classes of c carry similar weight, so a high ROC is expected:
    # low a/b values are almost always c=1, high a/b values almost always c=0.
    low, high = range(0, 2), range(2, 4)
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(1)]
        + [[a, b, 1] for a in low for b in low for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 0] for a in high for b in high for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 1] for a in high for b in high for _ in range(1)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # Keep only the c=1 rows, so state 0 never appears in the test set.
    only_positive = train["c"] == 1
    test = train[only_positive]
    assert len(test["c"].unique()) == 1

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 1, abs_tol=0.01)
def test_auc_node_with_no_parents(self):
    """AUC should be computable for a node that has no parent nodes"""
    low, high = range(0, 2), range(2, 4)
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(1)]
        + [[a, b, 1] for a in low for b in low for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 0] for a in high for b in high for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 1] for a in high for b in high for _ in range(1)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    # "a" only has outgoing edges in this structure.
    _, auc = roc_auc(bn, train, "a")
    assert math.isclose(auc, 0.5, abs_tol=0.01)
def test_fit_with_null_states_raises_error(self):
    """Fitting node states on data that contains None should raise a ValueError"""
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # A single row whose value for "a" is None.
    data_with_null = pd.DataFrame([[None, 1]], columns=["a", "b"])

    with pytest.raises(ValueError, match="node '.*' contains None state"):
        BayesianNetwork(structure).fit_node_states(data_with_null)
def get_markov_blanket(bn: BayesianNetwork, target_node: str) -> "BayesianNetwork":
    """
    Generate the markov blanket of a node in the network

    The nodes kept are the target node itself, its parents, its children and
    the parents of those children; every other node is removed.

    Args:
        bn (BayesianNetwork): A BayesianNetwork object that contains the structure of the full graph
        target_node (str): Name of the target node that we want the markov blanket for

    Returns:
        A Bayesian Network object containing the structure of the input's markov blanket

    Raises:
        KeyError: if target_node is not in the network
    """
    if target_node not in bn.nodes:
        raise KeyError(f"{target_node} is not found in the network")

    # Work on a copy so the caller's network is left untouched.
    mb_graph = deepcopy(bn)
    structure = mb_graph.structure

    # Materialise each neighbourhood once instead of re-scanning the
    # predecessor/successor iterators for every node in the graph.
    children = set(structure.successors(target_node))
    keep_nodes = set(structure.predecessors(target_node)) | children
    for child in children:
        keep_nodes.update(structure.predecessors(child))
    keep_nodes.add(target_node)

    for node in mb_graph.nodes:
        if node not in keep_nodes:
            structure.remove_node(node)

    return BayesianNetwork(structure)
def test_roc_of_accurate_predictions(self):
    """For accurate predictions every interior ROC point should have TPR above FPR"""
    # Both classes of c carry similar weight, so a high ROC is expected:
    # low a/b values are mostly c=1, high a/b values mostly c=0.
    low, high = range(0, 2), range(2, 4)
    rows = (
        [[a, b, 0] for a in low for b in low for _ in range(10)]
        + [[a, b, 1] for a in low for b in low for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 0] for a in high for b in high for _ in range(a * 10 + b * 10 + 1000)]
        + [[a, b, 1] for a in high for b in high for _ in range(10)]
    )
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    roc, _ = roc_auc(bn, train, "c")
    # Skip the points where TPR is exactly 0 or 1.
    interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
    assert all(tpr > fpr for fpr, tpr in interior)
def test_all_states_included(self):
    """Every value observed for a node should appear in its fitted states"""
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "b", 1)])

    # Ten rows in which both columns take each of the values 0..9 once.
    data = pd.DataFrame([[i, i] for i in range(10)], columns=["a", "b"])
    bn = BayesianNetwork(structure).fit_node_states(data)

    states_of_a = bn.node_states["a"]
    assert all(value in states_of_a for value in range(10))
def test_roc_of_random_has_unit_gradient(self):
    """Randomly labelled test data should give a ROC curve along the (0,0)-(1,1) diagonal"""
    columns = ["a", "b", "c"]

    # Whatever a and b are, c=1 dominates the training data, by an amount that
    # varies with a and b (so the ROC curve gets several threshold points).
    rare_zero = [[a, b, 0] for a in range(3) for b in range(3) for _ in range(1)]
    common_one = [
        [a, b, 1]
        for a in range(3)
        for b in range(3)
        for _ in range(a * 1000 + b * 1000 + 1000)
    ]
    train = pd.DataFrame(rare_zero + common_one, columns=columns)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test labels are drawn at random, independently of a and b.
    random_rows = [
        [a, b, random.randint(0, 1)]
        for a in range(3)
        for b in range(3)
        for _ in range(1000)
    ]
    test = pd.DataFrame(random_rows, columns=columns)

    roc, _ = roc_auc(bn, test, "c")
    assert len(roc) > 3
    # On the diagonal both coordinates of every point are (nearly) equal.
    assert all(math.isclose(fpr, tpr, abs_tol=0.03) for fpr, tpr in roc)
def test_behaves_same_as_seperate_calls(self, train_data_idx, train_data_discrete):
    """Fitting states and CPDs in one call should match fitting them one after the other"""
    separately = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))
    together = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))

    separately.fit_node_states(train_data_discrete).fit_cpds(train_data_discrete)
    together.fit_node_states_and_cpds(train_data_discrete)

    assert separately.edges == together.edges
    assert separately.node_states == together.node_states

    expected_cpds, actual_cpds = separately.cpds, together.cpds
    assert expected_cpds.keys() == actual_cpds.keys()
    # Compare the tables node by node.
    for node in expected_cpds:
        assert expected_cpds[node].equals(actual_cpds[node])
def test_cycles_in_structure(self):
    """Building a network from a cyclic structure should raise a ValueError"""
    # 0 → 1 → 2 → 0 forms a cycle.
    cyclic_edges = [(0, 1), (1, 2), (2, 0)]
    expected_message = (
        r"The given structure is not acyclic\. "
        r"Please review the following cycle\.*"
    )
    with pytest.raises(ValueError, match=expected_message):
        BayesianNetwork(StructureModel(cyclic_edges))
def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
    """After do(d=1) the queried probability of d=1 should be 1"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)

    engine = InferenceEngine(bn)
    engine.do_intervention("d", 1)

    marginals = engine.query()
    assert math.isclose(marginals["d"][1], 1)
def test_disconnected_components(self, test_input, n_components):
    """A structure with more than one graph component should be rejected with a ValueError"""
    # The message is expected to report the number of components found.
    expected_message = (
        r"The given structure has "
        + str(n_components)
        + r" separated graph components\. "
        r"Please make sure it has only one\."
    )
    with pytest.raises(ValueError, match=expected_message):
        BayesianNetwork(StructureModel(test_input))
def test_do_sets_other_state_probabilitys_to_zero(self, train_model, train_data_idx):
    """After do(d=1) the queried probability of the other state of d should be zero"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)

    engine = InferenceEngine(bn)
    engine.do_intervention("d", 1)

    marginals = engine.query()
    assert marginals["d"][0] == 0
def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
    """Do should accept a state->probability mapping and apply it to the node"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)

    engine = InferenceEngine(bn)
    distribution = {0: 0.7, 1: 0.3}
    engine.do_intervention("d", distribution)

    # Each state is checked against a fresh query.
    assert math.isclose(engine.query()["d"][0], 0.7)
    assert math.isclose(engine.query()["d"][1], 0.3)
def test_query_when_cpds_not_fit(self, train_data_idx, train_data_discrete):
    """Creating an inference engine before CPDs are fit should raise a ValueError"""
    structure = from_pandas(train_data_idx, w_threshold=0.3)
    # Only node states are fitted; fit_cpds is deliberately never called.
    bn = BayesianNetwork(structure).fit_node_states(train_data_discrete)

    expected_message = r"Bayesian Network does not contain any CPDs.*"
    with pytest.raises(ValueError, match=expected_message):
        InferenceEngine(bn)
def test_do_reflected_in_query(self, train_model, train_data_idx):
    """An intervention should change the marginals returned by a query with an observation"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)
    engine = InferenceEngine(bn)
    observation = {"a": 1}

    # Before the intervention d=1 is not certain under the observation.
    before = engine.query(observation)
    assert before["d"][1] != 1

    engine.do_intervention("d", 1)

    # After the intervention d=1 is certain under the same observation.
    after = engine.query(observation)
    assert after["d"][1] == 1
def test_fit_invalid_lv_name(self, lv_name):
    """An error should be raised if the latent variable is of an invalid type"""
    # Build the network outside the raises-context so that only the call
    # under test can satisfy the expected exception.
    df, sm, _, _ = naive_bayes_plus_parents()
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    with pytest.raises(ValueError, match=r"Invalid latent variable name *"):
        bn.fit_latent_cpds(lv_name, [0, 1, 2], df)
def test_fit_lv_not_added(self):
    """An error should be raised if the latent variable is not added to the network yet"""
    # Build the network outside the raises-context so that only the call
    # under test can satisfy the expected exception.
    df, sm, _, _ = naive_bayes_plus_parents()
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    # "d" is fitted here without ever being added via add_node.
    with pytest.raises(ValueError, match=r"Latent variable 'd' not added to the network"):
        bn.fit_latent_cpds("d", [0, 1, 2], df)
def test_report_ignores_unrequired_columns_in_data(
    self, train_data_idx, train_data_discrete, test_data_c_discrete
):
    """Classification report should ignore columns that predict does not need"""
    structure = from_pandas(train_data_idx, w_threshold=0.3)
    bn = BayesianNetwork(structure).fit_node_states(train_data_discrete)

    # Add an extra constant column after the node states were fitted.
    row_count = len(train_data_discrete)
    train_data_discrete["NEW_COL"] = [1] * row_count
    bn.fit_cpds(train_data_discrete)

    # The test passes as long as building the report does not raise.
    classification_report(bn, test_data_c_discrete, "c")
def test_add_node_in_edges_to_remove(self):
    """An error should be raised if the latent variable is part of the edges to remove"""
    # Build the network outside the raises-context so that only the call
    # under test can satisfy the expected exception.
    _, sm, _, _ = naive_bayes_plus_parents()
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)

    # Every edge in the third argument contains the node "d" being added.
    with pytest.raises(ValueError, match="Should only remove edges NOT containing node 'd'"):
        bn.add_node("d", [], [("a", "d"), ("b", "d")])
def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
    """An intervention that introduces an additional state should be rejected"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)
    engine = InferenceEngine(bn)

    # Includes a third state (2) on top of the states 0 and 1.
    with_extra_state = {0: 0.7, 1: 0.3, 2: 0.0}
    expected_message = "The cpd states do not match expected states*"
    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("d", with_extra_state)
def test_fit_invalid_lv_states(self, lv_states):
    """An error should be raised if the latent variable has invalid states"""
    # Build the network and add the latent node outside the raises-context so
    # that only the fitting call under test can satisfy the expected exception.
    df, sm, _, _ = naive_bayes_plus_parents()
    sm = StructureModel(list(sm.edges))
    bn = BayesianNetwork(sm)
    bn.add_node("d", [("z", "d")], [])

    with pytest.raises(ValueError, match="Latent variable 'd' contains no states"):
        bn.fit_latent_cpds("d", lv_states, df)
def __init__(
    self,
    list_of_edges: List[Tuple[str]],
    discretiser_alg: Optional[Dict[str, str]] = None,
    discretiser_kwargs: Optional[Dict[str, Dict[str, Any]]] = None,
    probability_kwargs: Optional[Dict[str, Any]] = None,
    return_prob: bool = False,
):
    """
    Build the structure and Bayesian Network from the edge list and store the fitting options.

    Args:
        list_of_edges (list): Edge list to construct graph
        discretiser_alg (dict): Specify a supervised algorithm to discretise
            each feature in the data. Available options for the dictionary values
            are ['unsupervised', 'tree', 'mdlp']
            - if 'unsupervised': discretise the data using unsupervised method
            - if 'tree': discretise the data using decision tree method
            - if 'mdlp': discretise the data using MDLP method
        discretiser_kwargs (dict): Keyword arguments for discretisation methods.
            Only applicable if discretiser_alg is not None.
        probability_kwargs (dict): keyword arguments for the probability model.
            When omitted (or empty) it falls back to
            ``{"method": "BayesianEstimator", "bayes_prior": "K2"}``.
        return_prob (bool): choose to return predictions or probability
            - if True: return pandas dataframe with predicted probability for each state
            - if False: return a 1-D prediction array

    Raises:
        KeyError: If an incorrect argument is passed
        ValueError: If the keys in discretiser_alg and discretiser_kwargs differ
    """
    probability_kwargs = probability_kwargs or {
        "method": "BayesianEstimator",
        "bayes_prior": "K2",
    }

    if discretiser_alg is None:
        logging.info(
            "No discretiser algorithm was given. "
            "The training data will not be discretised"
        )
        discretiser_alg = {}

    discretiser_kwargs = discretiser_kwargs or {}

    # Validate before any state is stored on the instance.
    self._validate_discretiser(discretiser_alg, discretiser_kwargs)

    self.list_of_edges = list_of_edges
    self.structure = StructureModel(self.list_of_edges)
    self.bn = BayesianNetwork(self.structure)
    self.return_prob = return_prob
    self.probability_kwargs = probability_kwargs
    self.discretiser_kwargs = discretiser_kwargs
    self.discretiser_alg = discretiser_alg
    # Neither attribute is set by the constructor; both start as None.
    self._target_name = None
    self._discretise_data = None
def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
    """An intervention that would leave a node isolated in the network should be rejected"""
    bn = BayesianNetwork(train_model)
    bn.fit_node_states(train_data_idx)
    bn.fit_cpds(train_data_idx)
    engine = InferenceEngine(bn)

    expected_message = (
        "Do calculus cannot be applied because it would result in an isolate"
    )
    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("a", 1)