Example #1
    def test_do_accepts_all_state_probabilities(self, bn):
        """Do should accept a map of state->p and update p accordingly"""

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {False: 0.7, True: 0.3})
        assert math.isclose(ie.query()["d"][False], 0.7)
        assert math.isclose(ie.query()["d"][True], 0.3)
Example #2
    def test_do_reflected_in_query(self, bn):
        """Do should adjust marginals returned by query when given a different observation"""

        ie = InferenceEngine(bn)

        assert ie.query({"a": "b"})["d"][True] != 1
        ie.do_intervention("d", True)
        assert ie.query({"a": "b"})["d"][True] == 1
Example #3
    def test_do_reflected_in_query(self, train_model, train_data_idx):
        """Do should adjust marginals returned by query when given a different observation"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        assert ie.query({"a": 1})["d"][1] != 1
        ie.do_intervention("d", 1)
        assert ie.query({"a": 1})["d"][1] == 1
Example #4
    def test_observations_affect_marginals(self, bn):
        """Observing the state of a node should affect the marginals of dependent nodes"""

        ie = InferenceEngine(bn)

        m1 = ie.query({})
        m2 = ie.query({"d": True})

        assert m2["d"][False] == 0
        assert m2["d"][True] == 1
        assert not math.isclose(m2["b"]["x"], m1["b"]["x"], abs_tol=0.05)
Example #5
    def test_observations_does_not_affect_marginals_of_independent_nodes(self, bn):
        """Observing the state of a node should not affect the marginal probability of an independent node"""

        ie = InferenceEngine(bn)

        m1 = ie.query({})
        m2 = ie.query({"d": True})

        assert m2["d"][False] == 0
        assert m2["d"][True] == 1
        assert math.isclose(m2["e"][True], m1["e"][True], abs_tol=0.05)
Example #6
    def test_reset_do_sets_probabilities_back_to_initial_state(
            self, bn, train_data_discrete_marginals):
        """Resetting Do operator should re-introduce the original conditional dependencies"""

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {False: 0.7, True: 0.3})
        ie.reset_do("d")

        assert math.isclose(ie.query()["d"][False],
                            train_data_discrete_marginals["d"][False])
        assert math.isclose(ie.query()["d"][False],
                            train_data_discrete_marginals["d"][False])
Example #7
    def test_observations_affect_marginals(self, train_model, train_data_idx):
        """Observing the state of a node should affect the marginals of dependent nodes"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)

        m1 = ie.query({})
        m2 = ie.query({"d": 1})

        assert m2["d"][0] == 0
        assert m2["d"][1] == 1
        assert not math.isclose(m2["b"][1], m1["b"][1], abs_tol=0.01)
Example #8
    def test_reset_do_sets_probabilities_back_to_initial_state(
            self, train_model, train_data_idx, train_data_idx_marginals):
        """Resetting Do operator should re-introduce the original conditional dependencies"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {0: 0.7, 1: 0.3})
        ie.reset_do("d")

        assert math.isclose(ie.query()["d"][0],
                            train_data_idx_marginals["d"][0])
        assert math.isclose(ie.query()["d"][1],
                            train_data_idx_marginals["d"][1])
Example #9
    def intervention(cls, input):
        """Apply an intervention to a node in the data, updating its distribution
        with a do operator, examine the effect of that intervention by querying the
        marginals of a target node, and then reset the intervention.

        Args:
            input (dict): Intervention specification with keys "node" (the node to
                intervene on), "states" (a list of state -> probability dicts) and
                "target_node" (the node whose marginals are queried).
        """

        from causalnex.inference import InferenceEngine

        bn = cls.get_model()
        ie = InferenceEngine(bn)
        i_node = input["node"]
        i_states = input["states"]
        i_target = input["target_node"]

        print(i_node, i_states, i_target)
        lst = []

        # i_states is a list of dict
        for state in i_states:
            state = {int(k): int(v) for k, v in state.items()}
            ie.do_intervention(i_node, state)
            intervention_result = ie.query()[i_target]
            lst.append(intervention_result)
            print("Updated marginal", intervention_result)
            ie.reset_do(i_node)

        return lst
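# A standalone sketch (not part of the original example) of the same intervention
# loop, assuming `bn` is an already-fitted causalnex BayesianNetwork with a binary
# node "d" and a target node "b"; the node names here are illustrative only.
from causalnex.inference import InferenceEngine

ie = InferenceEngine(bn)
results = []
for state in [{0: 0.7, 1: 0.3}, {0: 0.2, 1: 0.8}]:
    ie.do_intervention("d", state)      # force P(d) to the given distribution
    results.append(ie.query()["b"])     # marginals of the target under the intervention
    ie.reset_do("d")                    # restore the original CPD of "d"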
Example #10
    def test_do_sets_state_probability_to_one(self, train_model, train_data_idx):
        """Do should update the probability of the given observation=state to 1"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", 1)
        assert math.isclose(ie.query()["d"][1], 1)
Example #11
    def test_do_accepts_all_state_probabilities(self, train_model, train_data_idx):
        """Do should accept a map of state->p and update p accordingly"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", {0: 0.7, 1: 0.3})
        assert math.isclose(ie.query()["d"][0], 0.7)
        assert math.isclose(ie.query()["d"][1], 0.3)
Example #12
    def test_do_sets_other_state_probabilitys_to_zero(self, train_model,
                                                      train_data_idx):
        """Do should update the probability of every other state for the observation to zero"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        ie.do_intervention("d", 1)
        assert ie.query()["d"][0] == 0
Example #13
    def test_empty_query_returns_marginals(self, bn, train_data_discrete_marginals):
        """An empty query should return all the marginal probabilities of the model's distribution"""

        ie = InferenceEngine(bn)
        dist = ie.query({})

        for node, states in dist.items():
            for state, p in states.items():
                assert math.isclose(
                    train_data_discrete_marginals[node][state], p, abs_tol=0.05
                )
Example #14
    def test_empty_query_returns_marginals(self, train_model, train_data_idx,
                                           train_data_idx_marginals):
        """An empty query should return all the marginal probabilities of the model's distribution"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)

        ie = InferenceEngine(bn)
        dist = ie.query({})

        for node, states in dist.items():
            for state, p in states.items():
                assert math.isclose(train_data_idx_marginals[node][state],
                                    p,
                                    abs_tol=0.05)
Example #15
    def test_multi_query(self, bn):
        """Test query with a list of observations and multiprocessing"""

        ie = InferenceEngine(bn)
        results_parallel = ie.query(
            [{"a": "a", "b": "x"}, {"a": "c", "e": False}, {"b": "x"}],
            parallel=True,
        )
        results_loop = ie.query(
            [{"a": "a", "b": "x"}, {"a": "c", "e": False}, {"b": "x"}],
            parallel=False,
        )
        single_0 = ie.query({"a": "a", "b": "x"})
        single_1 = ie.query({"a": "c", "e": False})
        single_2 = ie.query({"b": "x"})

        assert len(results_parallel) == 3
        assert results_parallel == results_loop
        assert results_parallel[0]["a"]["a"] == 1
        assert results_parallel[1]["e"][False] == 1
        assert results_parallel[2]["b"]["x"] == 1
        assert single_0 == results_parallel[0]
        assert single_1 == results_parallel[1]
        assert single_2 == results_parallel[2]
Example #16
def marginal_probs(graph, query, observations, verbose=1):
    '''
    Return the marginal probabilities of the `query` node given `observations`.

    [graph]        : causalnex BayesianNetwork object
    [query]        : str, name of the node to query
    [observations] : dict mapping node name -> observed state
    '''

    ie = InferenceEngine(graph)
    marginals = ie.query(observations)
    marg_probs = marginals[query]

    if verbose:
        print('Marginal probabilities of "{}" | {} = {}'.format(
                                            query, observations, marg_probs))

    return marg_probs
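# Hypothetical usage of marginal_probs (node and state names are illustrative and
# assume `bn` is a fitted causalnex BayesianNetwork with nodes "G1" and "studytime"):
g1_probs = marginal_probs(bn, query="G1", observations={"studytime": "short-studytime"})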
Example #17
def predict_using_all_nodes(
    bn: BayesianNetwork,
    data: pd.DataFrame,
    target_var: str,
    markov_blanket: bool = False,
    lv_name: str = "LV",
) -> pd.DataFrame:
    """
    Compute the target variable's marginals for each data row, using all nodes
    (or only the target's Markov blanket) as observations

    Args:
        bn: Bayesian network
        data: Input dataframe
        target_var: Target variable name
        markov_blanket: Whether to compute marginals based only on the Markov blanket of the target variable
        lv_name: Latent variable name

    Returns:
        Marginal dataframe
    """
    # Extract columns of interest
    if markov_blanket:
        blanket = bn.structure.get_markov_blanket([target_var, lv_name])
        cols_to_keep = blanket.nodes
    else:
        cols_to_keep = bn.nodes

    # Further drop target variable and latent variable (if applicable)
    cols_to_keep = [
        col for col in cols_to_keep if col not in {target_var, lv_name}
    ]

    # Perform inference
    ie = InferenceEngine(bn)
    observations = data[cols_to_keep].to_dict(orient="records")
    marginals = [prob[target_var] for prob in ie.query(observations)]
    return pd.DataFrame(marginals)
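# Hypothetical usage (names are illustrative), assuming `bn` is fitted and `df` is a
# discretised dataframe whose columns match the network's nodes, including "G1":
g1_marginals = predict_using_all_nodes(bn, df, target_var="G1", markov_blanket=False)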
Example #18
    def test_invalid_observations(self, train_model, train_data_idx):
        """Test with invalid observations type"""

        bn = BayesianNetwork(train_model)
        bn.fit_node_states(train_data_idx).fit_cpds(train_data_idx)
        ie = InferenceEngine(bn)

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query("123")

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query({"123", "abc"})

        with pytest.raises(
                TypeError,
                match="Expecting observations to be a dict, list or None"):
            ie.query(("123", "abc"))
Example #19
bayesNetFull: BayesianNetwork = bayesNetFull.fit_cpds(
    data=discrData, method="BayesianEstimator", bayes_prior="K2")
# %% markdown [markdown]
# This produces warnings, showing that we are replacing the previously existing CPDs.
#
# **Second**: For inference, we must create a new `InferenceEngine` from our `BayesianNetwork`, which lets us query the model. The `query` method computes the marginal likelihood of every state for every node, i.e. the marginal distributions obtained by marginalizing out the conditioning variable(s) of each node.

# %% codecell
from causalnex.inference import InferenceEngine

eng = InferenceEngine(bn=bayesNetFull)
eng
# %% markdown [markdown]
# Query the baseline marginal distributions, which means querying marginals **as learned from data**:
# %% codecell
marginalDistLearned: Dict[str, Dict[str, float]] = eng.query()
marginalDistLearned
# %% codecell
marginalDistLearned['address']
# %% codecell
marginalDistLearned['G1']

# %% markdown [markdown]
# The output tells us that `P(G1=Fail) ~ 0.25` and `P(G1=Pass) ~ 0.75`. As a quick sanity check, we can compute what proportion of our data are `Fail` and `Pass`, which should give nearly the same result:
# %% codecell
import numpy as np

labels, counts = np.unique(discrData['G1'], return_counts=True)

print(list(zip(labels, counts)))
print('\nProportion failures = {}'.format(counts[0] / sum(counts)))
Example #20
    def test_em_algorithm(self):  # pylint: disable=too-many-locals
        """
        Test if `BayesianNetwork` works with the EM algorithm.
        We use a naive Bayes structure + parents + an extra node not related to the latent variable.
        """

        # p0   p1  p2
        #   \  |  /
        #      z
        #   /  |  \
        # c0  c1  c2
        # |
        # cc0
        np.random.seed(22)

        data, sm, _, true_lv_values = naive_bayes_plus_parents(
            percentage_not_missing=0.1,
            samples=1000,
            p_z=0.7,
            p_c=0.7,
        )
        data["cc_0"] = np.where(
            np.random.random(len(data)) < 0.5, data["c_0"],
            (data["c_0"] + 1) % 3)
        data.drop(columns=["z"], inplace=True)

        complete_data = data.copy(deep=True)
        complete_data["z"] = true_lv_values

        # Baseline model: the structure of the figure trained with complete data. We try to reproduce it
        complete_bn = BayesianNetwork(
            StructureModel(list(sm.edges) + [("c_0", "cc_0")]))
        complete_bn.fit_node_states_and_cpds(complete_data)

        # BN without latent variable: All `p`s are connected to all `c`s + `c0` ->`cc0`
        sm_no_lv = StructureModel([(f"p_{p}", f"c_{c}") for p in range(3)
                                   for c in range(3)] + [("c_0", "cc_0")])
        bn = BayesianNetwork(sm_no_lv)
        bn.fit_node_states(data)
        bn.fit_cpds(data)

        # TEST 1: cc_0 does not depend on the latent variable so:
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # BN with latent variable
        # When we add the latent variable, we add the edges in the image above
        # and remove the connection among `p`s and `c`s
        edges_to_add = list(sm.edges)
        edges_to_remove = [(f"p_{p}", f"c_{c}") for p in range(3)
                           for c in range(3)]
        bn.add_node("z", edges_to_add, edges_to_remove)
        bn.fit_latent_cpds("z", [0, 1, 2], data, stopping_delta=0.001)

        # TEST 2: cc_0 CPD should remain untouched by the EM algorithm
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # TEST 3: We should recover the correct CPDs quite accurately
        assert bn.cpds.keys() == complete_bn.cpds.keys()
        assert self.mean_absolute_error(bn.cpds, complete_bn.cpds) < 0.01

        # TEST 4: Inference over recovered CPDs should be also accurate
        eng = InferenceEngine(bn)
        query = eng.query()
        n_rows = complete_data.shape[0]

        for node in query:
            assert (np.abs(query[node][0] -
                           sum(complete_data[node] == 0) / n_rows) < 1e-2)
            assert (np.abs(query[node][1] -
                           sum(complete_data[node] == 1) / n_rows) < 1e-2)

        # TEST 5: Inference using predict and predict_probability functions
        report = classification_report(bn, complete_data, "z")
        _, auc = roc_auc(bn, complete_data, "z")
        complete_report = classification_report(complete_bn, complete_data,
                                                "z")
        _, complete_auc = roc_auc(complete_bn, complete_data, "z")

        for category, metrics in report.items():
            if isinstance(metrics, dict):
                for key, val in metrics.items():
                    assert np.abs(val - complete_report[category][key]) < 1e-2
            else:
                assert np.abs(metrics - complete_report[category]) < 1e-2

        assert np.abs(auc - complete_auc) < 1e-2
Example #21
    def test_do_sets_other_state_probabilitys_to_zero(self, bn):
        """Do should update the probability of every other state for the observation to zero"""

        ie = InferenceEngine(bn)
        ie.do_intervention("d", True)
        assert ie.query()["d"][False] == 0
Example #22
    def test_do_sets_state_probability_to_one(self, bn):
        """Do should update the probability of the given observation=state to 1"""

        ie = InferenceEngine(bn)
        ie.do_intervention("d", True)
        assert math.isclose(ie.query()["d"][True], 1)
Example #23
bayesNetFull: BayesianNetwork = bayesNetFull.fit_cpds(
    data=data, method="BayesianEstimator", bayes_prior="K2")
# %% markdown [markdown]
# This produces warnings, showing that we are replacing the previously existing CPDs.
#
# **Second**: For inference, we must create a new `InferenceEngine` from our `BayesianNetwork`, which lets us query the model. The `query` method computes the marginal likelihood of every state for every node, i.e. the marginal distributions obtained by marginalizing out the conditioning variable(s) of each node.

# %% codecell
from causalnex.inference import InferenceEngine

eng = InferenceEngine(bn=bayesNetFull)
eng
# %% markdown [markdown]
# Query the baseline marginal distributions, which means querying marginals **as learned from data**:
# %% codecell
marginalDistLearned: Dict[str, Dict[str, float]] = eng.query()
marginalDistLearned
# %% codecell
marginalDistLearned['injury_type']
# %% codecell
marginalDistLearned['absenteeism_level']

# %% markdown [markdown]
# As a quick sanity check, we can compute the corresponding proportions in our data, which should give nearly the same result:
# %% codecell
import numpy as np

labels, counts = np.unique(data['absenteeism_level'], return_counts=True)

print(list(zip(labels, counts)))
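# (Sketch completing the sanity check, not in the original snippet) print the label
# proportions so they can be compared against the queried marginals above:
print('\nProportions = {}'.format(dict(zip(labels, counts / counts.sum()))))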
Example #24
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'G1_prediction']))
print('The ground truth is \'{truth}\''.format(truth=discretised_data.loc[18, 'G1']))

# Evaluation
classification_report(bn, test, "G1")

roc, auc = roc_auc(bn, test, "G1")
print(auc)


# Marginal probability baseline (same as above)
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

# Compute the marginal likelihood of every state for every node
ie = InferenceEngine(bn)
marginals = ie.query()
print('Marginal Likelihood of Target: ', marginals["G1"])

# Check that this is close to the likelihood obtained by counting the actual label distribution
labels, counts = np.unique(discretised_data["G1"], return_counts=True)
list(zip(labels, counts))


# Compute marginal probabilities of G1 for each state (label) of the studytime variable
marginals_short = ie.query({"studytime": "short-studytime"})
marginals_long = ie.query({"studytime": "long-studytime"})
print("Marginal G1 | Short Studtyime", marginals_short["G1"])
print("Marginal G1 | Long Studytime", marginals_long["G1"])

"""
Marginal G1 | Short Studytime {'Fail': 0.2776556433482524, 'Pass': 0.7223443566517477}
Example #25
    def test_query_after_do_intervention_has_split_graph(self, chain_network):
        """
        chain network: a → b → c → d → e

        test 1.
        - do intervention on node c generates 2 graphs (a → b) and (c → d → e)
        - assert the query can be run (it used to hang before)
        - assert reset_do works
        """
        ie = InferenceEngine(chain_network)
        original_margs = ie.query()

        var = "c"
        state_dict = {0: 1.0, 1: 0.0}
        ie.do_intervention(var, state_dict)
        # assert the intervention node has indeed the right state
        assert ie.query()[var][0] == state_dict[0]
        assert ie.query()[var][1] == state_dict[1]

        # assert the upstream nodes have the default marginals (no info
        # propagates in the upstream graph)
        assert ie.query()["a"][0] == original_margs["a"][0]
        assert ie.query()["a"][1] == original_margs["a"][1]
        assert ie.query()["b"][0] == original_margs["b"][0]
        assert ie.query()["b"][1] == original_margs["b"][1]

        # assert the _cpds of the upstream nodes are stored correctly
        orig_cpds = ie._cpds_original  # pylint: disable=protected-access
        upstream_cpds = ie._detached_cpds  # pylint: disable=protected-access
        assert orig_cpds["a"] == upstream_cpds["a"]
        assert orig_cpds["b"] == upstream_cpds["b"]

        ie.reset_do(var)
        reset_margs = ie.query()

        for node in original_margs.keys():
            dict_left = original_margs[node]
            dict_right = reset_margs[node]
            for (kl, kr) in zip(dict_left.keys(), dict_right.keys()):
                assert math.isclose(dict_left[kl], dict_right[kr])

        # repeating the above tests intervening on b, so that the upstream
        # graph is a single isolated node
        var_b = "b"
        state_dict_b = {0: 1.0, 1: 0.0}
        ie.do_intervention(var_b, state_dict_b)
        # assert the intervention node has indeed the right state
        assert ie.query()[var_b][0] == state_dict_b[0]
        assert ie.query()[var_b][1] == state_dict_b[1]

        # assert the upstream nodes have the default marginals (no info
        # propagates in the upstream graph)
        assert ie.query()["a"][0] == original_margs["a"][0]
        assert ie.query()["a"][1] == original_margs["a"][1]

        # assert the _cpds of the upstream nodes are stored correctly
        orig_cpds = ie._cpds_original  # pylint: disable=protected-access
        upstream_cpds = ie._detached_cpds  # pylint: disable=protected-access
        assert orig_cpds["a"] == upstream_cpds["a"]

        ie.reset_do(var_b)
        reset_margs = ie.query()

        for node in original_margs.keys():
            dict_left = original_margs[node]
            dict_right = reset_margs[node]
            for (kl, kr) in zip(dict_left.keys(), dict_right.keys()):
                assert math.isclose(dict_left[kl], dict_right[kr])
Example #26
# * $\color{red}{\text{TODO}}:$ why is it true that there are equally likely probabilities everywhere else?
# %% codecell
bayesNet.cpds[AbsenteeismLevel.var]



# %% markdown [markdown]
# ## Step 4: Inference (querying marginals)
# %% codecell
from causalnex.inference import InferenceEngine


eng = InferenceEngine(bn = bayesNet)

# querying the baseline marginals as learned from the data
marginalDist: Dict[Name, Dict[State, Probability]] = eng.query()
marginalDist

# %% markdown [markdown]
# Checking marginal distribution of **work-capacity**:
# %% codecell
eng.query()[WorkCapacity.var]
# %% markdown [markdown]
# Biasing the query so that the probability of low work-capacity gets higher:
# %% codecell
# NOTE: in the data, at time = 30, when exertion, training and experience are all HIGH, work-capacity = LOW
eng.query({Time.var : 30, ExertionLevel.var : 'High', TrainingLevel.var : 'High', ExperienceLevel.var : 'High'})[WorkCapacity.var]
# %% codecell
# Different from the data: at time = 30 the data has exertion, experience and training all High, so we test what happens to work-capacity when they are set to Medium:
eng.query({Time.var : 30, ExertionLevel.var : 'Medium', TrainingLevel.var : 'Medium', ExperienceLevel.var : 'Medium'})[WorkCapacity.var]
# %% codecell