def test_roc_of_incorrect_has_fpr_lt_tpr(self):
    """ROC points of systematically wrong predictions should satisfy FPR > TPR"""

    # NOTE(review): the test name says "fpr_lt_tpr" but the assertion below checks
    # fpr > tpr; the name is kept so existing test selection keeps working.

    # Train: a single c=0 row per (a, b) cell against thousands of c=1 rows, so c=1 is
    # always the likelier class. The c=1 count differs per cell (intended to create
    # multiple threshold points in the ROC curve).
    cells = [(a, b) for a in range(3) for b in range(3)]
    train_rows = [[a, b, 0] for a, b in cells]
    train_rows += [
        [a, b, 1] for a, b in cells for _ in range(1000 * (a + b + 1))
    ]
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    # sanity check: the fitted CPD puts (almost) all mass on c=1
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test: the opposite of train, c=0 dominates every cell
    test_rows = [[a, b, 0] for a, b in cells for _ in range(1000)]
    test_rows += [[a, b, 1] for a, b in cells]
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    roc, _ = roc_auc(bn, test, "c")
    assert len(roc) > 3

    # ignore curve end points where tpr is exactly 0 or 1
    interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
    assert all(fpr > tpr for fpr, tpr in interior)
def test_auc_for_nonnumeric_features(self):
    """AUC of accurate predictions should stay at 1 when the target states are strings"""

    low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
    high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

    # Low (a, b) cells are dominated by c=1 and high cells by c=0, with a single
    # counter-example row per cell (equal class weighting, high ROC expected).
    rows = [[a, b, 0] for a, b in low_cells]
    rows += [[a, b, 1] for a, b in low_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 0] for a, b in high_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 1] for a, b in high_cells]
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # replace the numeric labels of column c with string labels
    label_names = {0: "f", 1: "g"}
    train["c"] = train["c"].map(label_names)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def compare_result_with_ideal(
    em_cpds: Dict[str, pd.DataFrame],
    sm: StructureModel,
    data: pd.DataFrame,
    true_values_lv: np.array,
    node_states: Dict[AnyStr, Union[List, Set]],
) -> Tuple[float, float]:
    """
    Compare learned CPDs against CPDs fitted with the latent variable filled in

    The reference network is fitted on ``data`` after the ideal latent values have been
    written into its ``"z"`` column. Note that this assignment modifies ``data`` in place.

    Args:
        em_cpds: Learned CPDs for different nodes
        sm: Structure model
        data: Input dataset (receives a ``"z"`` column)
        true_values_lv: Ideal values of the latent variable
        node_states: Possible states of different nodes

    Returns:
        Maximum absolute difference and root mean square of differences
    """
    data["z"] = true_values_lv.reshape(-1)

    ideal_bn = BayesianNetwork(sm)
    ideal_bn.fit_node_states(states_to_df(node_states))
    ideal_bn.fit_cpds(data)

    worst_gap = -1
    squared_gap_sum = 0

    for node, learned_cpd in em_cpds.items():
        gaps = (learned_cpd - ideal_bn.cpds[node]).abs().values
        worst_gap = max(worst_gap, gaps.max())
        # per-node mean squared difference; averaged over nodes below
        squared_gap_sum += np.mean(gaps ** 2)

    rms_gap = np.sqrt(squared_gap_sum / len(em_cpds))
    return worst_gap, rms_gap
def test_auc_with_missing_state_in_test(self):
    """AUC should still be computed when the test set lacks one of the target states"""

    low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
    high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

    # Low (a, b) cells are dominated by c=1 and high cells by c=0, with a single
    # counter-example row per cell (equal class weighting, high ROC expected).
    rows = [[a, b, 0] for a, b in low_cells]
    rows += [[a, b, 1] for a, b in low_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 0] for a, b in high_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 1] for a, b in high_cells]
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # the test set keeps only the c=1 rows, so state 0 never appears in it
    is_positive = train["c"] == 1
    test = train[is_positive]
    assert len(test["c"].unique()) == 1

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 1, abs_tol=0.01)
def test_auc_node_with_no_parents(self):
    """It should be possible to compute the AUC of a node that has no parents"""

    low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
    high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

    rows = [[a, b, 0] for a, b in low_cells]
    rows += [[a, b, 1] for a, b in low_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 0] for a, b in high_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 1] for a, b in high_cells]
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # "a" only has an outgoing edge in this structure, i.e. it is a root node
    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "a")
    assert math.isclose(auc, 0.5, abs_tol=0.01)
def test_roc_of_accurate_predictions(self):
    """Every interior ROC point of accurate predictions should have TPR > FPR"""

    low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
    high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

    # Low (a, b) cells are dominated by c=1 and high cells by c=0, with ten
    # counter-example rows per cell (equal class weighting, high ROC expected).
    rows = [[a, b, 0] for a, b in low_cells for _ in range(10)]
    rows += [[a, b, 1] for a, b in low_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 0] for a, b in high_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 1] for a, b in high_cells for _ in range(10)]
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    roc, _ = roc_auc(bn, train, "c")

    # ignore curve end points where tpr is exactly 0 or 1
    interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
    assert all(tpr > fpr for fpr, tpr in interior)
def test_auc_of_accurate_predictions(self):
    """Accurate predictions should yield an AUC of 1"""

    low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
    high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

    # Low (a, b) cells are dominated by c=1 and high cells by c=0, with a single
    # counter-example row per cell (equal class weighting, high ROC expected).
    rows = [[a, b, 0] for a, b in low_cells]
    rows += [[a, b, 1] for a, b in low_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 0] for a, b in high_cells for _ in range(10 * (a + b) + 1000)]
    rows += [[a, b, 1] for a, b in high_cells]
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    _, auc = roc_auc(bn, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_auc_of_incorrect_close_to_zero(self):
    """Systematically wrong predictions should yield an AUC close to zero"""

    # Train: a single c=0 row per (a, b) cell against thousands of c=1 rows, so c=1 is
    # always the likelier class. The c=1 count differs per cell (intended to create
    # multiple threshold points in the ROC curve).
    cells = [(a, b) for a in range(3) for b in range(3)]
    train_rows = [[a, b, 0] for a, b in cells]
    train_rows += [
        [a, b, 1] for a, b in cells for _ in range(1000 * (a + b + 1))
    ]
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    # sanity check: the fitted CPD puts (almost) all mass on c=1
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test: the opposite of train, c=0 dominates every cell
    test_rows = [[a, b, 0] for a, b in cells for _ in range(1000)]
    test_rows += [[a, b, 1] for a, b in cells]
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    _, auc = roc_auc(bn, test, "c")
    assert math.isclose(auc, 0, abs_tol=0.001)
def test_roc_of_random_has_unit_gradient(self):
    """For random labels the ROC curve should follow the diagonal from (0,0) to (1,1)"""

    # Train: a single c=0 row per (a, b) cell against thousands of c=1 rows, so c=1 is
    # always the likelier class. The c=1 count differs per cell (intended to create
    # multiple threshold points in the ROC curve).
    cells = [(a, b) for a in range(3) for b in range(3)]
    train_rows = [[a, b, 0] for a, b in cells]
    train_rows += [
        [a, b, 1] for a, b in cells for _ in range(1000 * (a + b + 1))
    ]
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])
    bn = BayesianNetwork(structure)
    bn.fit_node_states(train)
    bn.fit_cpds(train)

    # sanity check: the fitted CPD puts (almost) all mass on c=1
    assert np.allclose(bn.cpds["c"].loc[1].values, 1, atol=0.02)

    # Test: 1000 rows per cell with a coin-flip label.
    # NOTE(review): the labels come from the unseeded global `random` generator, so
    # the outcome of this test is not reproducible run to run.
    test_rows = [
        [a, b, random.randint(0, 1)] for a, b in cells for _ in range(1000)
    ]
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    roc, _ = roc_auc(bn, test, "c")
    assert len(roc) > 3

    # each (fpr, tpr) point must sit on the diagonal within tolerance
    assert all(math.isclose(fpr, tpr, abs_tol=0.03) for fpr, tpr in roc)
def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
    """After do(d=1) the queried probability of d being in state 1 should be 1"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", 1)

    prob_d_is_1 = engine.query()["d"][1]
    assert math.isclose(prob_d_is_1, 1)
def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
    """Do should accept a state->probability map and the query should reflect it"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", {0: 0.7, 1: 0.3})

    # a fresh query is issued for each state that is checked
    for state, expected in ((0, 0.7), (1, 0.3)):
        assert math.isclose(engine.query()["d"][state], expected)
def test_do_sets_other_state_probabilitys_to_zero(self, train_model, train_data_idx):
    """After do(d=1) every other state of d should be queried with probability zero"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    engine.do_intervention("d", 1)

    prob_d_is_0 = engine.query()["d"][0]
    assert prob_d_is_0 == 0
def test_do_reflected_in_query(self, train_model, train_data_idx):
    """An intervention should override the marginals returned for a query with observations"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    def prob_d_is_1_given_a():
        # probability of d=1 when a=1 is observed
        return engine.query({"a": 1})["d"][1]

    # before the intervention d=1 is not certain
    assert prob_d_is_1_given_a() != 1

    engine.do_intervention("d", 1)

    # after the intervention it is
    assert prob_d_is_1_given_a() == 1
def test_do_prevents_new_states_being_added(self, train_model, train_data_idx):
    """An intervention that introduces an unknown state should be rejected"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # state 2 is not part of the states used by the other tests for "d" (0 and 1)
    distribution_with_extra_state = {0: 0.7, 1: 0.3, 2: 0.0}
    expected_message = "The cpd states do not match expected states*"

    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("d", distribution_with_extra_state)
def test_do_on_node_with_no_effects_not_allowed(self, train_model, train_data_idx):
    """An intervention that would leave an isolated node in the network should be rejected"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_message = (
        "Do calculus cannot be applied because it would result in an isolate"
    )

    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("a", 1)
def test_do_expects_all_states_have_a_probability(self, train_model, train_data_idx):
    """An intervention that omits one of the node's existing states should be rejected"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # only state 1 is given a probability; state 0 is missing
    incomplete_distribution = {1: 1}
    expected_message = "The cpd states do not match expected states*"

    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("d", incomplete_distribution)
def test_do_expects_all_state_probabilities_sum_to_one(
    self, train_model, train_data_idx
):
    """An intervention whose state probabilities do not sum to 1 should be rejected"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # 0.7 + 0.4 = 1.1
    unnormalised_distribution = {0: 0.7, 1: 0.4}
    expected_message = "The cpd for the provided observation must sum to 1"

    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("d", unnormalised_distribution)
def test_observations_affect_marginals(self, train_model, train_data_idx):
    """Observing a node's state should change the marginals of dependent nodes"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    unconditioned = engine.query({})
    given_d_is_1 = engine.query({"d": 1})

    # the observed node collapses onto the observed state
    assert given_d_is_1["d"][0] == 0
    assert given_d_is_1["d"][1] == 1

    # and the marginal of "b" moves by more than the tolerance
    b_before = unconditioned["b"][1]
    b_after = given_d_is_1["b"][1]
    assert not math.isclose(b_after, b_before, abs_tol=0.01)
def test_reset_do_sets_probabilities_back_to_initial_state(
    self, train_model, train_data_idx, train_data_idx_marginals
):
    """Resetting an intervention should restore the original marginals of the node"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # intervene, then undo the intervention
    engine.do_intervention("d", {0: 0.7, 1: 0.3})
    engine.reset_do("d")

    # a fresh query is issued for each state that is checked
    for state in (0, 1):
        assert math.isclose(
            engine.query()["d"][state], train_data_idx_marginals["d"][state]
        )
def test_do_expects_all_state_probabilities_within_0_and_1(
    self, train_model, train_data_idx
):
    """An intervention with state probabilities outside of [0, 1] should be rejected"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    # sums to 1, but neither value is a valid probability
    out_of_range_distribution = {0: -1.0, 1: 2.0}
    expected_message = "The cpd for the provided observation must be between 0 and 1"

    with pytest.raises(ValueError, match=expected_message):
        engine.do_intervention("d", out_of_range_distribution)
def test_empty_query_returns_marginals(
    self, train_model, train_data_idx, train_data_idx_marginals
):
    """A query without observations should return the model's marginal probabilities"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    marginals = engine.query({})

    # collect every (node, state) whose probability is off by more than the tolerance
    mismatches = [
        (node, state)
        for node, state_probs in marginals.items()
        for state, prob in state_probs.items()
        if not math.isclose(
            train_data_idx_marginals[node][state], prob, abs_tol=0.05
        )
    ]
    assert not mismatches
def chain_network() -> BayesianNetwork:
    """
    Build a fitted chain-shaped Bayesian Network, used to test do interventions
    that split the graph into subgraphs.

    a → b → c → d → e

    The network is fitted on 50 rows of random binary data.
    """
    n = 50
    nodes_names = list("abcde")

    # draw integers in [0, 10) and mark those greater than 6 as 1
    draws = np.random.randint(10, size=(n, len(nodes_names)))
    df = pd.DataFrame(data=(draws > 6).astype(int), columns=nodes_names)

    # consecutive nodes are linked: (a, b), (b, c), (c, d), (d, e)
    chain_edges = list(zip(nodes_names, nodes_names[1:]))
    model = StructureModel()
    model.add_edges_from(chain_edges)

    return (
        BayesianNetwork(model)
        .fit_node_states(df)
        .fit_cpds(df, method="BayesianEstimator", bayes_prior="K2")
    )
def train_bn(data, graph):
    """
    Create a Bayesian Network for the given graph and fit it on the data

    Args:
        data: dataset used to fit both the node states and the CPDs
        graph: structure passed to the ``BayesianNetwork`` constructor

    Returns:
        The network returned by ``fit_cpds`` (Bayesian estimator with a K2 prior)
    """
    network = BayesianNetwork(graph)
    return network.fit_node_states(data).fit_cpds(
        data, method="BayesianEstimator", bayes_prior="K2"
    )
def test_fit_missing_states(self):
    """test issues/15: should be possible to fit with missing states"""
    import math  # local import keeps this block self-contained

    sm = StructureModel([("a", "b"), ("c", "b")])
    bn = BayesianNetwork(sm)

    train = pd.DataFrame(
        data=[[0, 0, 1], [1, 0, 1], [1, 1, 1]], columns=["a", "b", "c"]
    )
    # the last test row introduces c=2, a state that never occurs in train
    test = pd.DataFrame(
        data=[[0, 0, 1], [1, 0, 1], [1, 1, 2]], columns=["a", "b", "c"]
    )
    data = pd.concat([train, test])

    # node states come from train + test, the CPDs are fitted on train only
    bn.fit_node_states(data)
    bn.fit_cpds(train, method="BayesianEstimator", bayes_prior="K2")

    # The probabilities are computed floats, so compare with a tolerance instead of
    # exact equality to avoid failures caused by rounding in the estimator.
    c_cpd = bn.cpds["c"]
    assert math.isclose(c_cpd.loc[1][0], 0.8)
    assert math.isclose(c_cpd.loc[2][0], 0.2)
def test_behaves_same_as_seperate_calls(self, train_data_idx, train_data_discrete):
    """fit_node_states_and_cpds should give the same network as the two separate fit calls"""

    # two networks built from identically learned structures
    stepwise = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))
    combined = BayesianNetwork(from_pandas(train_data_idx, w_threshold=0.3))

    stepwise.fit_node_states(train_data_discrete).fit_cpds(train_data_discrete)
    combined.fit_node_states_and_cpds(train_data_discrete)

    assert stepwise.edges == combined.edges
    assert stepwise.node_states == combined.node_states

    stepwise_cpds = stepwise.cpds
    combined_cpds = combined.cpds
    assert stepwise_cpds.keys() == combined_cpds.keys()

    # every CPD table must match between the two networks
    assert all(
        cpd.equals(combined_cpds[node]) for node, cpd in stepwise_cpds.items()
    )
def test_invalid_observations(self, train_model, train_data_idx):
    """Querying with observations of an unsupported type should raise a TypeError"""

    network = BayesianNetwork(train_model)
    network.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
    engine = InferenceEngine(network)

    expected_message = "Expecting observations to be a dict, list or None"

    # a string, a set and a tuple, checked in this order
    unsupported_observations = ("123", {"123", "abc"}, ("123", "abc"))

    for observations in unsupported_observations:
        with pytest.raises(TypeError, match=expected_message):
            engine.query(observations)
def get_avg_auc_all_info(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
    n_cpus: int = multiprocessing.cpu_count() - 1,
) -> float:
    """
    Utility function to compute AUC using all nodes beyond the parent nodes

    For every cross-validation fold the CPDs are re-fitted on the training part and
    the per-node AUCs on the test part are computed in a process pool.

    Args:
        df: Input dataframe
        bn: Bayesian network (its node states and CPDs are re-fitted by this function)
        n_splits: Number of cross-validation folds
        seed: Random seed number
        n_cpus: Number of CPU cores to use. Values below 1 (e.g. the default on a
            single-core machine) are raised to 1

    Returns:
        Average AUC
    """
    # cpu_count() - 1 is 0 on a single-core machine and a pool needs at least
    # one worker process
    n_cpus = max(1, n_cpus)

    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()

        # KFold yields positional indices, so select rows by position; label-based
        # selection only works when df carries a default 0..n-1 index
        train_df = df.iloc[train_idx, :]
        test_df = df.iloc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        # one task per node of the network
        chunks = [[bn, test_df, target] for target in bn.nodes]

        with multiprocessing.Pool(n_cpus) as p:
            result = p.starmap(_compute_auc_stub, chunks)

        total_auc += sum(result) / len(bn.nodes)
        print(
            f"Processing fold {fold} using {n_cpus} cores takes {time() - t0} seconds"
        )

    return total_auc / n_splits
def __init__(self, bayesian_network: BayesianNetwork, dataset: pd.DataFrame):
    """
    Fit the network on the dataset and prepare the problem definition

    Args:
        bayesian_network: network whose node states and CPDs are fitted on ``dataset``
        dataset: data to fit on; its columns become the variable names of the problem
    """
    self.df = dataset

    states_fitted = bayesian_network.fit_node_states(dataset)
    self.bn = states_fitted.fit_cpds(dataset)

    # one variable per dataframe column; "bounds" holds a single empty list here
    variable_names = list(self.df.columns)
    self.problem = {
        "num_vars": self.df.shape[1],
        "names": variable_names,
        "bounds": [[]],
    }

    # NOTE(review): presumably SALib's saltelli sampler and sobol analyzer —
    # confirm against the file's imports
    self.sampler = saltelli
    self.analyzer = sobol
def get_auc_data(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> pd.Series:
    """
    Utility function to compute AUC based only on data observations

    For every cross-validation fold the CPDs are re-fitted on the training part and
    the AUC of each node is evaluated on the test part.

    Args:
        df: Input dataframe
        bn: Bayesian network (its node states and CPDs are re-fitted by this function)
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        AUC of each node averaged over the folds, sorted in ascending order
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    nodes_auc = defaultdict(list)

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()

        # KFold yields positional indices, so select rows by position; label-based
        # selection only works when df carries a default 0..n-1 index
        train_df = df.iloc[train_idx, :]
        test_df = df.iloc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            nodes_auc[var].append(auc)

        print(f"Processing fold {fold} takes {time() - t0} seconds")

    # one column per node, one row per fold; average over the folds once
    mean_auc = pd.DataFrame(nodes_auc).mean(axis=0)
    col = mean_auc.idxmin()
    val = mean_auc.min()
    print(f"Variable with lowest AUC is {col} with the value of {val}")

    return mean_auc.sort_values()
def get_avg_auc(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> float:
    """
    Estimate the average auc of all nodes in a Bayesian Network given a structure and a
    dataset using k-fold cross-validation. As stated by the original author, this relies
    on the bn.predict method in causalnex and cannot be used with latent variable models

    Args:
        df: a dataset in the pandas format
        bn: a bayesian network EM object (its node states and CPDs are re-fitted by
            this function)
        n_splits: Number of folds in k-fold cv
        seed: random seed used in k-fold cv

    Returns:
        Average AUC
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        cur_auc = 0

        # KFold yields positional indices, so select rows by position; label-based
        # selection only works when df carries a default 0..n-1 index
        train_df = df.iloc[train_idx, :]
        test_df = df.iloc[test_idx, :]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            cur_auc += auc

        print(f"Processing fold {fold} takes {time() - t0} seconds")

        # average over the nodes of this fold
        total_auc += cur_auc / len(bn.nodes)

    return total_auc / n_splits