Exemplo n.º 1
0
    def test_roc_of_incorrect_has_fpr_lt_tpr(self):
        """When the model is systematically wrong, interior ROC points should satisfy FPR > TPR"""

        columns = ["a", "b", "c"]
        cells = [(a, b) for a in range(3) for b in range(3)]

        # train: a single c=0 row per (a, b) cell against thousands of c=1 rows, so c=1 always
        # dominates; the c=1 count grows with a and b so cells get different probabilities
        # (which creates multiple threshold points in the roc curve)
        rare_zero = [[a, b, 0] for a, b in cells]
        common_one = [[a, b, 1] for a, b in cells
                      for _ in range(a * 1000 + b * 1000 + 1000)]
        train = pd.DataFrame(rare_zero + common_one, columns=columns)

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        # sanity check: the fitted CPD puts (almost) all probability on c=1
        prob_c_is_one = bn.cpds["c"].loc[1].values
        assert np.allclose(prob_c_is_one, 1, atol=0.02)

        # test: the opposite of train - c=0 is always the more likely state
        common_zero = [[a, b, 0] for a, b in cells for _ in range(1000)]
        rare_one = [[a, b, 1] for a, b in cells]
        test = pd.DataFrame(common_zero + rare_one, columns=columns)

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        # the end points of the curve (tpr of 0 or 1) are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(fpr > tpr for fpr, tpr in interior)
Exemplo n.º 2
0
    def test_auc_of_incorrect_close_to_zero(self):
        """A model whose predictions are systematically wrong should score an AUC near zero"""

        columns = ["a", "b", "c"]
        cells = [(a, b) for a in range(3) for b in range(3)]

        # train: c=1 dominates every (a, b) cell (one c=0 row versus thousands of c=1 rows);
        # the c=1 count varies with a and b to create multiple threshold points in the roc curve
        train_rows = [[a, b, 0] for a, b in cells]
        train_rows += [[a, b, 1] for a, b in cells
                       for _ in range(a * 1000 + b * 1000 + 1000)]
        train = pd.DataFrame(train_rows, columns=columns)

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        # sanity check: the fitted CPD puts (almost) all probability on c=1
        prob_c_is_one = bn.cpds["c"].loc[1].values
        assert np.allclose(prob_c_is_one, 1, atol=0.02)

        # test: reversed class balance - c=0 is always the more likely state
        test_rows = [[a, b, 0] for a, b in cells for _ in range(1000)]
        test_rows += [[a, b, 1] for a, b in cells]
        test = pd.DataFrame(test_rows, columns=columns)

        _, auc = roc_auc(bn, test, "c")

        assert math.isclose(auc, 0, abs_tol=0.001)
Exemplo n.º 3
0
    def test_auc_node_with_no_parents(self):
        """roc_auc should also work for a node that has no parents in the structure"""

        low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
        high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

        # low cells are dominated by c=1, high cells by c=0; the minority class contributes a
        # single row per cell
        rows = (
            [[a, b, 0] for a, b in low_cells]
            + [[a, b, 1] for a, b in low_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a, b in high_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a, b in high_cells]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        # "a" only has outgoing edges in this structure, i.e. it is a parentless node
        _, auc = roc_auc(bn, train, "a")
        assert math.isclose(auc, 0.5, abs_tol=0.01)
Exemplo n.º 4
0
    def test_auc_for_nonnumeric_features(self):
        """Accurate predictions should still give an AUC of 1 once the target states are strings"""

        low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
        high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

        # both classes of c are similarly represented overall: c=1 dominates the low cells and
        # c=0 dominates the high cells, so a high ROC is expected
        rows = (
            [[a, b, 0] for a, b in low_cells]
            + [[a, b, 1] for a, b in low_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a, b in high_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a, b in high_cells]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        # replace the numeric states of column c with string labels
        string_labels = {0: "f", 1: "g"}
        train["c"] = train["c"].map(string_labels)

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Exemplo n.º 5
0
    def test_auc_with_missing_state_in_test(self):
        """AUC should still be computed correctly when the test set lacks one of the states"""

        low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
        high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

        # both classes of c are similarly represented overall: c=1 dominates the low cells and
        # c=0 dominates the high cells, so a high ROC is expected
        rows = (
            [[a, b, 0] for a, b in low_cells]
            + [[a, b, 1] for a, b in low_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a, b in high_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a, b in high_cells]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        # the test set keeps only the c=1 rows, so state c=0 never appears in it
        only_ones = train["c"] == 1
        test = train[only_ones]
        assert len(test["c"].unique()) == 1

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, test, "c")
        assert math.isclose(auc, 1, abs_tol=0.01)
Exemplo n.º 6
0
    def test_auc_of_accurate_predictions(self):
        """A model evaluated on the data it fits well should reach an AUC of 1"""

        low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
        high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

        # both classes of c are similarly represented overall: c=1 dominates the low cells and
        # c=0 dominates the high cells, so a high ROC is expected
        rows = (
            [[a, b, 0] for a, b in low_cells]
            + [[a, b, 1] for a, b in low_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a, b in high_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a, b in high_cells]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        _, auc = roc_auc(bn, train, "c")
        assert math.isclose(auc, 1, abs_tol=0.001)
Exemplo n.º 7
0
    def test_roc_of_accurate_predictions(self):
        """For accurate predictions every interior ROC point should have TPR above FPR"""

        low_cells = [(a, b) for a in range(0, 2) for b in range(0, 2)]
        high_cells = [(a, b) for a in range(2, 4) for b in range(2, 4)]

        # both classes of c are similarly represented overall: c=1 dominates the low cells and
        # c=0 dominates the high cells; the minority class gets ten rows per cell here
        rows = (
            [[a, b, 0] for a, b in low_cells for _ in range(10)]
            + [[a, b, 1] for a, b in low_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 0] for a, b in high_cells
               for _ in range(a * 10 + b * 10 + 1000)]
            + [[a, b, 1] for a, b in high_cells for _ in range(10)]
        )
        train = pd.DataFrame(rows, columns=["a", "b", "c"])

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        roc, _ = roc_auc(bn, train, "c")
        # the end points of the curve (tpr of 0 or 1) are excluded from the comparison
        interior = [(fpr, tpr) for fpr, tpr in roc if tpr not in [0.0, 1.0]]
        assert all(tpr > fpr for fpr, tpr in interior)
Exemplo n.º 8
0
    def test_roc_of_random_has_unit_gradient(self):
        """With random test labels the ROC curve should follow the diagonal from (0,0) to (1,1)"""

        columns = ["a", "b", "c"]
        cells = [(a, b) for a in range(3) for b in range(3)]

        # train: c=1 dominates every (a, b) cell (one c=0 row versus thousands of c=1 rows);
        # the c=1 count varies with a and b to create multiple threshold points in the roc curve
        train_rows = [[a, b, 0] for a, b in cells]
        train_rows += [[a, b, 1] for a, b in cells
                       for _ in range(a * 1000 + b * 1000 + 1000)]
        train = pd.DataFrame(train_rows, columns=columns)

        structure = StructureModel()
        structure.add_weighted_edges_from(
            [(parent, "c", 1) for parent in ("a", "b")])

        bn = BayesianNetwork(structure)
        bn.fit_node_states(train)
        bn.fit_cpds(train)

        # sanity check: the fitted CPD puts (almost) all probability on c=1
        prob_c_is_one = bn.cpds["c"].loc[1].values
        assert np.allclose(prob_c_is_one, 1, atol=0.02)

        # test: 1000 rows per cell whose label is drawn at random (module-level `random`,
        # not seeded inside this test), so the label is independent of a and b
        test_rows = [[a, b, random.randint(0, 1)] for a, b in cells
                     for _ in range(1000)]
        test = pd.DataFrame(test_rows, columns=columns)

        roc, _ = roc_auc(bn, test, "c")

        assert len(roc) > 3
        # every point of the curve should have (approximately) equal coordinates
        assert all(math.isclose(fpr, tpr, abs_tol=0.03) for fpr, tpr in roc)
Exemplo n.º 9
0
def get_auc_data(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> pd.Series:
    """
    Utility function to compute the cross-validated AUC of every node based only on data observations

    The node states are fitted once on the full dataframe; the CPDs are then re-fitted on the
    training part of each fold and every node is scored with ``roc_auc`` on the held-out part.

    Args:
        df: Input dataframe
        bn: Bayesian network (its node states and CPDs are re-fitted in place)
        n_splits: Number of cross-validation folds
        seed: Random seed number

    Returns:
        AUC of each node averaged over the folds, sorted in ascending order
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    nodes_auc = defaultdict(list)

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        # KFold yields positional row numbers, so they must be used with iloc: label-based
        # loc only happens to work when df carries the default 0..n-1 index
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            nodes_auc[var].append(auc)

        print(f"Processing fold {fold} takes {time() - t0} seconds")

    # one column per node, one row per fold -> average over the folds once and reuse it
    mean_auc = pd.DataFrame(nodes_auc).mean(axis=0)
    col = mean_auc.idxmin()
    val = mean_auc.min()
    print(f"Variable with lowest AUC is {col} with the value of {val}")
    return mean_auc.sort_values()
Exemplo n.º 10
0
def get_avg_auc(
    df: pd.DataFrame,
    bn: BayesianNetwork,
    n_splits: int = 5,
    seed: int = 2021,
) -> float:
    """
    Estimate the average auc of all nodes in a Bayesian Network given a structure and a dataset using
    k-fold cross-validation. This function uses the bn.predict method in causalnex and cannot be used
    with latent variable models

    The node states are fitted once on the full dataset; the CPDs are re-fitted on the training
    part of each fold and every node is scored with ``roc_auc`` on the held-out part.

    Args:
        df: a dataset in the pandas format
        bn: a bayesian network EM object (its node states and CPDs are re-fitted in place)
        n_splits: Number of folds in k-fold cv
        seed: random seed used in k-fold cv

    Returns:
        Average AUC (mean over the nodes within a fold, then mean over the folds)
    """
    bn.fit_node_states(df)
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    total_auc = 0

    for fold, (train_idx, test_idx) in enumerate(cv.split(df)):
        t0 = time()
        cur_auc = 0
        # KFold yields positional row numbers, so they must be used with iloc: label-based
        # loc only happens to work when df carries the default 0..n-1 index
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        bn.fit_cpds(train_df, method="BayesianEstimator", bayes_prior="K2")

        for var in bn.nodes:
            _, auc = roc_auc(bn, test_df, var)
            cur_auc += auc

        print(f"Processing fold {fold} takes {time() - t0} seconds")
        # per-fold score is the mean AUC over all nodes
        total_auc += cur_auc / len(bn.nodes)

    return total_auc / n_splits
Exemplo n.º 11
0
# %% codecell
from causalnex.evaluation import classification_report

# Classification report for target node 'G1', computed on the `test` dataframe;
# as the last expression of the cell, its value is what the notebook displays
classification_report(bn=bayesNetCPD, data=test, node='G1')
# %% markdown [markdown]
# **Interpret Results of classification report:** this report shows that the model can classify reasonably well whether a student passes the exam. For predictions where the student fails, the precision is adequate but recall is bad. This implies that we can rely on predictions for `G1_Fail` but we are likely to miss some of the predictions we should have made. Perhaps these missing predictions are a result of something missing in our structure.
# * ALERT - explore graph structure when the recall is bad
#
#
# ## ROC / AUC
# The ROC and AUC can be obtained with the `roc_auc` function of the CausalNex evaluation module.
# The ROC curve is computed by micro-averaging predictions made across all states (classes) of the target node.
# %% codecell
from causalnex.evaluation import roc_auc

# roc_auc returns a pair: the ROC curve and the AUC value for the target node
roc, auc = roc_auc(bn=bayesNetCPD, data=test, node='G1')

print(f"ROC = \n{roc}\n")
print(f"AUC = {auc}")
# %% markdown [markdown]
# A high AUC value gives confidence in model performance.
#
#
#
# # 5/ Querying Marginals
# After iterating over our model structure, CPDs, and validating our model quality, we can **query our model under different observations** to gain insights.
#
# ## Baseline Marginals
# To query the model for baseline marginals that reflect the population as a whole, a `query` method can be used.
#
# **First:** update the model using the complete dataset since the one we currently have is built only from training data.
# Inspect the target
print(bn.cpds["G1"])  # exam G1 grade - Pass/Fail

# Inspect the inputs (row 18), excluding the target
print(discretised_data.loc[18, discretised_data.columns != 'G1'])


# Predict
predictions = bn.predict(discretised_data, "G1")
print('The prediction is \'{prediction}\''.format(prediction=predictions.loc[18, 'G1_prediction']))
print('The ground truth is \'{truth}\''.format(truth=discretised_data.loc[18, 'G1']))

# Evaluate
# NOTE(review): the classification report's return value is discarded here, so nothing is shown
# when this runs as a plain script - confirm whether it was meant to be printed
classification_report(bn, test, "G1")

roc, auc = roc_auc(bn, test, "G1")
print(auc)


# Baseline marginal probabilities (same as above)
# The CPDs are re-fitted on the full discretised dataset before querying
bn = bn.fit_cpds(discretised_data, method="BayesianEstimator", bayes_prior="K2")

# Compute the marginal likelihood for every state and node
ie = InferenceEngine(bn)
marginals = ie.query()
print('Marginal Likelihood of Target: ', marginals["G1"])

# Check whether the computed likelihood is similar to the distribution obtained by counting the actual labels
labels, counts = np.unique(discretised_data["G1"], return_counts=True)
list(zip(labels, counts))
# %% codecell
from causalnex.evaluation import classification_report

# Classification report for target node 'absenteeism_level', computed on the `data` dataframe;
# as the last expression of the cell, its value is what the notebook displays
classification_report(bn=bayesNetCPD, data=data, node='absenteeism_level')
# %% markdown [markdown]
# **Interpret Results of classification report:** Precision is very low for the no-absentee level, and both precision and recall are very low for the other absentee levels, implying we are likely to miss some of the predictions we should have made. Perhaps these missing predictions are a result of something missing in our structure.
# * $\color{red}{\text{ALERT:}}$  explore graph structure when the recall is bad
#
#
# ## Measure 2: ROC / AUC
# The ROC and AUC can be obtained with the `roc_auc` function of the CausalNex evaluation module.
# The ROC curve is computed by micro-averaging predictions made across all states (classes) of the target node.
# %% codecell
from causalnex.evaluation import roc_auc

# roc_auc returns a pair: the ROC curve and the AUC value for the target node
roc, auc = roc_auc(bn=bayesNetCPD, data=data, node='absenteeism_level')

print(f"ROC = \n{roc}\n")
print(f"AUC = {auc}")
# %% markdown [markdown]
# A high AUC value gives confidence in model performance; a low AUC value implies poor model performance.
#
#
#
# # 5/ Querying Marginals
# After iterating over our model structure, CPDs, and validating our model quality, we can **query our model under different observations** to gain insights.
#
# ## Baseline Marginals
# To query the model for baseline marginals that reflect the population as a whole, a `query` method can be used.
#
# **First:** update the model using the complete dataset since the one we currently have is built only from training data.
    def test_em_algorithm(self):  # pylint: disable=too-many-locals
        """
        Test if `BayesianNetwork` works with EM algorithm.
        We use a naive bayes + parents + an extra node not related to the latent variable.

        A reference network is fitted on the complete data (latent column `z` included) and a
        second network is fitted without `z`, extended with `z` as a latent node and trained via
        `fit_latent_cpds`. The CPDs, marginals, classification report and AUC of both networks
        are then compared.
        """

        # p0   p1  p2
        #   \  |  /
        #      z
        #   /  |  \
        # c0  c1  c2
        # |
        # cc0
        # fixed seed: the data generation and the random draw for cc_0 below depend on it
        np.random.seed(22)

        data, sm, _, true_lv_values = naive_bayes_plus_parents(
            percentage_not_missing=0.1,
            samples=1000,
            p_z=0.7,
            p_c=0.7,
        )
        # cc_0 copies c_0 for roughly half of the rows and is shifted to (c_0 + 1) % 3 otherwise,
        # so it depends on c_0 only
        data["cc_0"] = np.where(
            np.random.random(len(data)) < 0.5, data["c_0"],
            (data["c_0"] + 1) % 3)
        # hide the latent column from the training data
        data.drop(columns=["z"], inplace=True)

        # same observations, but with the true latent values attached as column z
        complete_data = data.copy(deep=True)
        complete_data["z"] = true_lv_values

        # Baseline model: the structure of the figure trained with complete data. We try to reproduce it
        complete_bn = BayesianNetwork(
            StructureModel(list(sm.edges) + [("c_0", "cc_0")]))
        complete_bn.fit_node_states_and_cpds(complete_data)

        # BN without latent variable: All `p`s are connected to all `c`s + `c0` ->`cc0`
        sm_no_lv = StructureModel([(f"p_{p}", f"c_{c}") for p in range(3)
                                   for c in range(3)] + [("c_0", "cc_0")])
        bn = BayesianNetwork(sm_no_lv)
        bn.fit_node_states(data)
        bn.fit_cpds(data)

        # TEST 1: cc_0 does not depend on the latent variable so:
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # BN with latent variable
        # When we add the latent variable, we add the edges in the image above
        # and remove the connection among `p`s and `c`s
        edges_to_add = list(sm.edges)
        edges_to_remove = [(f"p_{p}", f"c_{c}") for p in range(3)
                           for c in range(3)]
        bn.add_node("z", edges_to_add, edges_to_remove)
        # z is declared with the three states 0, 1 and 2
        bn.fit_latent_cpds("z", [0, 1, 2], data, stopping_delta=0.001)

        # TEST 2: cc_0 CPD should remain untouched by the EM algorithm
        assert np.all(bn.cpds["cc_0"] == complete_bn.cpds["cc_0"])

        # TEST 3: We should recover the correct CPDs quite accurately
        assert bn.cpds.keys() == complete_bn.cpds.keys()
        assert self.mean_absolute_error(bn.cpds, complete_bn.cpds) < 0.01

        # TEST 4: Inference over recovered CPDs should be also accurate
        eng = InferenceEngine(bn)
        query = eng.query()
        n_rows = complete_data.shape[0]

        # compare each queried marginal with the empirical frequency in the complete data
        # NOTE(review): only states 0 and 1 are compared here; state 2 is not checked - confirm
        # whether that is intentional
        for node in query:
            assert (np.abs(query[node][0] -
                           sum(complete_data[node] == 0) / n_rows) < 1e-2)
            assert (np.abs(query[node][1] -
                           sum(complete_data[node] == 1) / n_rows) < 1e-2)

        # TEST 5: Inference using predict and predict_probability functions
        report = classification_report(bn, complete_data, "z")
        _, auc = roc_auc(bn, complete_data, "z")
        complete_report = classification_report(complete_bn, complete_data,
                                                "z")
        _, complete_auc = roc_auc(complete_bn, complete_data, "z")

        # report entries are either a dict of metrics or a single number; both kinds must agree
        # with the baseline report within 1e-2
        for category, metrics in report.items():
            if isinstance(metrics, dict):
                for key, val in metrics.items():
                    assert np.abs(val - complete_report[category][key]) < 1e-2
            else:
                assert np.abs(metrics - complete_report[category]) < 1e-2

        assert np.abs(auc - complete_auc) < 1e-2