def test_cvl_local_score_gbn():
    """CVLikelihood local scores on a Gaussian network must match the numpy reference."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)
    cvl = pbn.CVLikelihood(df, 10, seed)

    # Every node is scored with an explicit evidence list and compared against
    # the reference score of a linear Gaussian factor.
    explicit_cases = [
        ('a', []),
        ('b', ['a']),
        ('c', ['a', 'b']),
        ('d', ['a', 'b', 'c']),
    ]
    for node, parents in explicit_cases:
        score = cvl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(), df, node, parents)
        assert np.isclose(score, expected)

    # Reordering the evidence must not change the score.
    ordered = cvl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = cvl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(gbn, node) == cvl.local_score(gbn, node, gbn.parents(node))
def test_opposite():
    """The opposite of each operator has the expected type, endpoints and negated delta."""
    bn = pbn.SemiparametricBN(["a", "b"])

    # (operator class, expected opposite class, expected source, expected target)
    arc_cases = [
        (pbn.AddArc, pbn.RemoveArc, 'a', 'b'),
        (pbn.RemoveArc, pbn.AddArc, 'a', 'b'),
        (pbn.FlipArc, pbn.FlipArc, 'b', 'a'),
    ]
    for operator_cls, opposite_cls, expected_source, expected_target in arc_cases:
        operator = operator_cls("a", "b", 1)
        reverse = operator.opposite(bn)
        assert reverse.source() == expected_source
        assert reverse.target() == expected_target
        assert reverse.delta() == -1
        assert type(reverse) == opposite_cls

    # For a node type change, the opposite restores the type currently set in
    # the network (linear Gaussian here).
    bn.set_node_type("a", pbn.LinearGaussianCPDType())
    type_change = pbn.ChangeNodeType("a", pbn.CKDEType(), 1)
    reverse = type_change.opposite(bn)
    assert reverse.node() == 'a'
    assert reverse.node_type() == pbn.LinearGaussianCPDType()
    assert reverse.delta() == -1
    assert type(reverse) == pbn.ChangeNodeType
def dyn_other_fit_bytes():
    """Return the pickled bytes of a fitted DynamicOtherBN that includes its CPDs."""
    variables = ["a", "b", "c", "d"]
    # Static nodes carry the lags 1 and 2; transition nodes carry lag 0.
    static_nodes = [f"{name}_t_{lag}" for name in variables for lag in range(1, 3)]
    transition_nodes = [f"{name}_t_0" for name in variables]

    static_types = [
        ("b_t_2", pbn.DiscreteFactorType()),
        ("b_t_1", pbn.DiscreteFactorType()),
        ("c_t_1", pbn.CKDEType()),
        ("d_t_1", pbn.LinearGaussianCPDType()),
    ]
    other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], static_types)
    static_cpd = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2)
    other_static.add_cpds([static_cpd])

    transition_types = [
        ("b_t_0", pbn.DiscreteFactorType()),
        ("c_t_0", pbn.CKDEType()),
        ("d_t_0", pbn.LinearGaussianCPDType()),
    ]
    other_transition = ConditionalOtherBN(transition_nodes,
                                          static_nodes,
                                          [("a_t_2", "d_t_0")],
                                          transition_types)
    transition_cpd = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5)
    other_transition.add_cpds([transition_cpd])

    assert other_static.type() == other_transition.type()

    dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition)

    # Training data: continuous columns, with "b" replaced by the discrete
    # column "B" of the second generated data set.
    training = util_test.generate_normal_data_indep(1000)
    discrete = util_test.generate_discrete_data_dependent(1000)
    training["b"] = discrete["B"]

    dyn_other.fit(training)
    dyn_other.include_cpd = True
    return pickle.dumps(dyn_other)
def test_holdout_local_score_null_spbn():
    """HoldoutLikelihood local scores on a semiparametric network with missing data."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])

    # Draw the row positions to blank out for each column (in a, b, c, d
    # order, right after seeding) and then write NaN into a copy of the data.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in ('a', 'b', 'c', 'd')}
    df_null = df.copy()
    for column, rows in null_rows.items():
        df_null.loc[df_null.index[rows], column] = np.nan

    hl = pbn.HoldoutLikelihood(df_null, 0.2, seed)

    def reference(factor_type, node, parents):
        # Reference score computed on the same train/test split used by hl.
        return numpy_local_score(factor_type,
                                 hl.training_data().to_pandas(),
                                 hl.test_data().to_pandas(),
                                 node, parents)

    # (factor type class, node, evidence passed to hl, evidence passed to the reference)
    checks = [
        (pbn.CKDEType, 'a', [], []),
        (pbn.LinearGaussianCPDType, 'b', ['a'], ['a']),
        (pbn.CKDEType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in checks:
        assert np.isclose(hl.local_score(spbn, node, scored_parents),
                          reference(type_cls(), node, reference_parents))

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(spbn, node) == hl.local_score(spbn, node, spbn.parents(node))
def test_node_type():
    """Node types start as unknown and can be reassigned with set_node_type."""
    names = ['a', 'b', 'c', 'd']
    spbn = SemiparametricBN(names)

    assert spbn.num_nodes() == 4
    assert spbn.num_arcs() == 0
    assert spbn.nodes() == ['a', 'b', 'c', 'd']

    # No node has a concrete factor type before one is set explicitly.
    assert all(spbn.node_type(node) == pbn.UnknownFactorType() for node in spbn.nodes())

    # Setting the type of 'b' twice: the last assignment wins.
    spbn.set_node_type('b', pbn.CKDEType())
    assert spbn.node_type('b') == pbn.CKDEType()
    spbn.set_node_type('b', pbn.LinearGaussianCPDType())
    assert spbn.node_type('b') == pbn.LinearGaussianCPDType()
def test_factor_type():
    """Factors of the same class share a type; different classes have different types."""

    def build_trio(factor_cls):
        # Three factors of the same class with zero, one and two evidence variables.
        return (factor_cls("a", []),
                factor_cls("b", ["a"]),
                factor_cls("c", ["b", "a"]))

    lg1, lg2, lg3 = build_trio(pbn.LinearGaussianCPD)
    assert lg1.type() == pbn.LinearGaussianCPDType()
    assert lg1.type() == lg2.type()
    assert lg1.type() == lg3.type()
    assert lg2.type() == lg3.type()

    c1, c2, c3 = build_trio(pbn.CKDE)
    assert c1.type() == pbn.CKDEType()
    assert c1.type() == c2.type()
    assert c1.type() == c3.type()
    assert c2.type() == c3.type()

    d1, d2, d3 = build_trio(pbn.DiscreteFactor)
    assert d1.type() == pbn.DiscreteFactorType()
    assert d1.type() == d2.type()
    assert d1.type() == d3.type()
    assert d2.type() == d3.type()

    # Types of different factor classes never compare equal.
    assert lg1.type() != c1.type()
    assert lg1.type() != d1.type()
    assert c1.type() != d1.type()
def test_serialization_factor_type(lg_type_bytes, ckde_type_bytes, discrete_type_bytes, new_type_bytes, other_type_bytes):
    """Unpickled factor types equal fresh instances and stay pairwise distinct."""
    fresh_lg = pbn.LinearGaussianCPDType()
    assert fresh_lg == pickle.loads(lg_type_bytes)

    fresh_ckde = pbn.CKDEType()
    assert pickle.loads(ckde_type_bytes) == fresh_ckde

    fresh_discrete = pbn.DiscreteFactorType()
    assert pickle.loads(discrete_type_bytes) == fresh_discrete

    fresh_new = NewType()
    assert pickle.loads(new_type_bytes) == fresh_new

    fresh_other = OtherType()
    assert pickle.loads(other_type_bytes) == fresh_other

    # Every pair of distinct factor types must compare unequal.
    assert fresh_lg != fresh_ckde
    assert fresh_lg != fresh_discrete
    assert fresh_lg != fresh_new
    assert fresh_lg != fresh_other

    assert fresh_ckde != fresh_discrete
    assert fresh_ckde != fresh_new
    assert fresh_ckde != fresh_other

    assert fresh_discrete != fresh_new
    assert fresh_discrete != fresh_other

    assert fresh_new != fresh_other
def other_partial_fit_bytes():
    """Return the pickled bytes of an OtherBN in which only the CPD of 'b' was added."""
    node_types = [
        ("b", pbn.LinearGaussianCPDType()),
        ("c", pbn.CKDEType()),
        ("d", pbn.DiscreteFactorType()),
    ]
    network = OtherBN(["a", "b", "c", "d"], [("a", "b")], node_types)

    network.add_cpds([LinearGaussianCPD("b", ["a"], [1, 2], 2)])

    # The flag is set before pickling; presumably it makes the custom
    # serialization include the CPDs -- confirm against the OtherBN definition.
    network.include_cpd = True
    return pickle.dumps(network)
def cond_other_partial_fit_bytes():
    """Return the pickled bytes of a ConditionalOtherBN in which only the CPD of 'd' was added."""
    node_types = [
        ("c", pbn.CKDEType()),
        ("d", pbn.LinearGaussianCPDType()),
    ]
    network = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], node_types)

    network.add_cpds([LinearGaussianCPD("d", [], [3], 1.5)])

    # The flag is set before pickling; presumably it makes the custom
    # serialization include the CPDs -- confirm against the class definition.
    network.include_cpd = True
    return pickle.dumps(network)
def test_cvl_local_score_gbn_null():
    """CVLikelihood local scores on a Gaussian network when the data contains NaN."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)

    # Draw the row positions to blank out for each column (in a, b, c, d
    # order, right after seeding) and then write NaN into a copy of the data.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in ('a', 'b', 'c', 'd')}
    df_null = df.copy()
    for column, rows in null_rows.items():
        df_null.loc[df_null.index[rows], column] = np.nan

    cvl = pbn.CVLikelihood(df_null, 10, seed)

    explicit_cases = [
        ('a', []),
        ('b', ['a']),
        ('c', ['a', 'b']),
        ('d', ['a', 'b', 'c']),
    ]
    for node, parents in explicit_cases:
        score = cvl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(), df_null, node, parents)
        assert np.isclose(score, expected)

    # Reordering the evidence must not change the score.
    ordered = cvl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = cvl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(gbn, node) == cvl.local_score(gbn, node, gbn.parents(node))
def test_cpd():
    """cpd() raises before fitting; after fit() every CPD has the node's type and is fitted."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = SemiparametricBN(arcs, [('d', pbn.CKDEType())])

    # Asking for a CPD before any was added is an error.
    with pytest.raises(ValueError) as ex:
        spbn.cpd('a')
    assert "not added" in str(ex.value)

    spbn.fit(df)

    # Only 'd' was declared as CKDE; the remaining nodes end up linear Gaussian.
    expected_types = [
        ('a', pbn.LinearGaussianCPDType()),
        ('b', pbn.LinearGaussianCPDType()),
        ('c', pbn.LinearGaussianCPDType()),
        ('d', pbn.CKDEType()),
    ]
    for node, expected_type in expected_types:
        assert spbn.cpd(node).type() == expected_type

    for node in ('a', 'b', 'c', 'd'):
        assert spbn.cpd(node).fitted()
def test_holdout_local_score_spbn():
    """HoldoutLikelihood local scores on a semiparametric network match the numpy reference."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])

    hl = pbn.HoldoutLikelihood(df, 0.2, seed)

    def reference(factor_type, node, parents):
        # Reference score computed on the same train/test split used by hl.
        return numpy_local_score(factor_type,
                                 hl.training_data().to_pandas(),
                                 hl.test_data().to_pandas(),
                                 node, parents)

    # (factor type class, node, evidence passed to hl, evidence passed to the reference)
    checks = [
        (pbn.CKDEType, 'a', [], []),
        (pbn.LinearGaussianCPDType, 'b', ['a'], ['a']),
        (pbn.CKDEType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in checks:
        assert np.isclose(hl.local_score(spbn, node, scored_parents),
                          reference(type_cls(), node, reference_parents))

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(spbn, node) == hl.local_score(spbn, node, spbn.parents(node))
def test_serialization_bn_model(gaussian_bytes, spbn_bytes, kde_bytes, discrete_bytes, genericbn_bytes, newbn_bytes, otherbn_bytes):
    """Unpickled (unfitted) network models keep their nodes, arcs, type and node types.

    Each ``*_bytes`` argument is the pickled form of one network model; the
    expected node set is {a, b, c, d} with the single arc a -> b.
    """
    expected_nodes = set(["a", "b", "c", "d"])
    expected_arcs = [("a", "b")]

    loaded_g = pickle.loads(gaussian_bytes)
    assert set(loaded_g.nodes()) == expected_nodes
    assert loaded_g.arcs() == expected_arcs
    assert loaded_g.type() == pbn.GaussianNetworkType()

    loaded_s = pickle.loads(spbn_bytes)
    assert set(loaded_s.nodes()) == expected_nodes
    assert loaded_s.arcs() == expected_arcs
    assert loaded_s.type() == pbn.SemiparametricBNType()
    assert loaded_s.node_types() == {
        'a': pbn.UnknownFactorType(),
        'b': pbn.CKDEType(),
        'c': pbn.UnknownFactorType(),
        'd': pbn.UnknownFactorType()
    }

    loaded_k = pickle.loads(kde_bytes)
    assert set(loaded_k.nodes()) == expected_nodes
    assert loaded_k.arcs() == expected_arcs
    assert loaded_k.type() == pbn.KDENetworkType()

    loaded_d = pickle.loads(discrete_bytes)
    assert set(loaded_d.nodes()) == expected_nodes
    assert loaded_d.arcs() == expected_arcs
    assert loaded_d.type() == pbn.DiscreteBNType()

    loaded_gen = pickle.loads(genericbn_bytes)
    assert set(loaded_gen.nodes()) == expected_nodes
    assert loaded_gen.arcs() == expected_arcs
    assert loaded_gen.type() == MyRestrictedGaussianNetworkType()

    loaded_nn = pickle.loads(newbn_bytes)
    # Check the nodes of the model that was just loaded (previously this
    # re-checked loaded_g, leaving loaded_nn's nodes unverified).
    assert set(loaded_nn.nodes()) == expected_nodes
    assert loaded_nn.arcs() == expected_arcs
    assert loaded_nn.type() == MyRestrictedGaussianNetworkType()

    loaded_o = pickle.loads(otherbn_bytes)
    # Same fix as above: verify loaded_o, not loaded_g.
    assert set(loaded_o.nodes()) == expected_nodes
    assert loaded_o.arcs() == expected_arcs
    assert loaded_o.type() == NonHomogeneousType()
    assert loaded_o.node_types() == {
        'a': pbn.UnknownFactorType(),
        'b': pbn.LinearGaussianCPDType(),
        'c': pbn.CKDEType(),
        'd': pbn.DiscreteFactorType()
    }
    assert loaded_o.extra_info == "extra"

    assert loaded_nn.type() != loaded_o.type()
def numpy_local_score(node_type, data, variable, evidence):
    """Reference cross-validated log-likelihood of one factor, computed with numpy/scipy.

    For every (train, test) pair produced by ``pbn.CrossValidation`` the factor
    of ``variable`` given ``evidence`` is estimated on the training rows and
    the log-likelihood of the test rows is accumulated.

    Parameters:
        node_type: factor type; only ``pbn.LinearGaussianCPDType()`` and
            ``pbn.CKDEType()`` contribute to the score, any other type leaves
            the result at 0.
        data: data set handed to ``pbn.CrossValidation``.
        variable: column name (``str``) or column position of the scored variable.
        evidence: list of column names (or positions, matching ``variable``).

    Returns:
        The log-likelihood summed over all folds.
    """
    # Second argument is presumably the number of folds -- confirm against
    # the pbn.CrossValidation signature.
    cv = pbn.CrossValidation(data, 10, seed)
    loglik = 0
    for train_df, test_df in cv:
        # Keep only the relevant columns and drop rows with missing values,
        # selecting by label or by position depending on the type of variable.
        if isinstance(variable, str):
            node_data = train_df.to_pandas().loc[:, [variable] + evidence].dropna()
            variable_data = node_data.loc[:, variable]
            evidence_data = node_data.loc[:, evidence]
            test_node_data = test_df.to_pandas().loc[:, [variable] + evidence].dropna()
            test_variable_data = test_node_data.loc[:, variable]
            test_evidence_data = test_node_data.loc[:, evidence]
        else:
            node_data = train_df.to_pandas().iloc[:, [variable] + evidence].dropna()
            # After the positional selection the variable is the first column.
            variable_data = node_data.iloc[:, 0]
            evidence_data = node_data.iloc[:, 1:]
            test_node_data = test_df.to_pandas().iloc[:, [variable] + evidence].dropna()
            test_variable_data = test_node_data.iloc[:, 0]
            test_evidence_data = test_node_data.iloc[:, 1:]
        if node_type == pbn.LinearGaussianCPDType():
            N = variable_data.shape[0]
            d = evidence_data.shape[1]
            # Least-squares regression with an intercept column of ones.
            linregress_data = np.column_stack(
                (np.ones(N), evidence_data.to_numpy()))
            (beta, res, _, _) = np.linalg.lstsq(linregress_data,
                                                variable_data.to_numpy(),
                                                rcond=None)
            # res is the sum of squared residuals returned by lstsq; dividing
            # by N - d - 1 gives the variance estimate.
            var = res / (N - d - 1)
            # Predicted mean of every test row: intercept + evidence . coefficients.
            means = beta[0] + np.sum(beta[1:] * test_evidence_data, axis=1)
            loglik += norm.logpdf(test_variable_data, means,
                                  np.sqrt(var)).sum()
        elif node_type == pbn.CKDEType():
            # Joint KDE over (variable, evidence); the bandwidth factor is
            # Scott's factor scaled by (4 / (d + 2)) ** (1 / (d + 4)).
            k_joint = gaussian_kde(
                node_data.to_numpy().T,
                bw_method=lambda s: np.power(4 / (s.d + 2), 1 /
                                             (s.d + 4)) * s.scotts_factor())
            if evidence:
                # Marginal KDE over the evidence reusing the joint's factor;
                # the conditional log-density is joint minus marginal.
                k_marg = gaussian_kde(evidence_data.to_numpy().T,
                                      bw_method=k_joint.covariance_factor())
                loglik += np.sum(
                    k_joint.logpdf(test_node_data.to_numpy().T) -
                    k_marg.logpdf(test_evidence_data.to_numpy().T))
            else:
                loglik += np.sum(k_joint.logpdf(test_node_data.to_numpy().T))
    return loglik
def test_serialization_unfitted_factor(lg_bytes, ckde_bytes, discrete_bytes, new_bytes, newbis_bytes):
    """Unpickled unfitted factors keep their variable, evidence, fitted state and type."""
    from pybnesian import GaussianNetwork

    def check_unfitted_c(factor):
        # Every pickled factor is for variable "c" with evidence {a, b} and unfitted.
        assert factor.variable() == "c"
        assert set(factor.evidence()) == set(["a", "b"])
        assert not factor.fitted()

    loaded_lg = pickle.loads(lg_bytes)
    check_unfitted_c(loaded_lg)
    assert loaded_lg.type() == pbn.LinearGaussianCPDType()

    loaded_ckde = pickle.loads(ckde_bytes)
    check_unfitted_c(loaded_ckde)
    assert loaded_ckde.type() == pbn.CKDEType()

    loaded_discrete = pickle.loads(discrete_bytes)
    check_unfitted_c(loaded_discrete)
    assert loaded_discrete.type() == pbn.DiscreteFactorType()

    loaded_new = pickle.loads(new_bytes)
    check_unfitted_c(loaded_new)
    assert type(loaded_new.type()) == NewType
    reference_new = NewFactor("a", [])
    assert loaded_new.type() == reference_new.type()

    # The unpickled type must create factors of the same Python class.
    dummy_network = GaussianNetwork(["a", "b", "c", "d"])
    created = loaded_new.type().new_factor(dummy_network, "a", [])
    assert type(created) == NewFactor

    loaded_newbis = pickle.loads(newbis_bytes)
    check_unfitted_c(loaded_newbis)
    assert type(loaded_newbis.type()) == NewType
    reference_newbis = NewFactorBis("a", [])
    assert loaded_newbis.type() == reference_newbis.type()
    created_bis = loaded_newbis.type().new_factor(dummy_network, "a", [])
    assert type(created_bis) == NewFactorBis

    # Types of different factor classes must differ ...
    assert loaded_lg.type() != loaded_ckde.type()
    assert loaded_lg.type() != loaded_discrete.type()
    assert loaded_lg.type() != loaded_new.type()
    assert loaded_ckde.type() != loaded_discrete.type()
    assert loaded_ckde.type() != loaded_new.type()
    assert loaded_discrete.type() != loaded_new.type()
    # ... while both custom factors report an equal type.
    assert loaded_newbis.type() == loaded_new.type()
def test_cvl_local_score_spbn():
    """CVLikelihood local scores on a semiparametric network match the numpy reference."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])

    cvl = pbn.CVLikelihood(df, 10, seed)

    # local_score uses the node type stored in the network.
    # (expected type class, node, evidence passed to cvl, evidence passed to the reference)
    network_type_checks = [
        (pbn.CKDEType, 'a', [], []),
        (pbn.LinearGaussianCPDType, 'b', ['a'], ['a']),
        (pbn.CKDEType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in network_type_checks:
        assert np.isclose(cvl.local_score(spbn, node, scored_parents),
                          numpy_local_score(type_cls(), df, node, reference_parents))

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(spbn, node) == cvl.local_score(spbn, node, spbn.parents(node))

    # local_score_node_type scores with an explicitly requested type, here the
    # opposite of the one stored in the network for each node.
    explicit_type_checks = [
        (pbn.LinearGaussianCPDType, 'a', [], []),
        (pbn.CKDEType, 'b', ['a'], ['a']),
        (pbn.LinearGaussianCPDType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.CKDEType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.CKDEType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in explicit_type_checks:
        assert np.isclose(cvl.local_score_node_type(spbn, type_cls(), node, scored_parents),
                          numpy_local_score(type_cls(), df, node, reference_parents))
def test_holdout_local_score_gbn():
    """HoldoutLikelihood local scores on a Gaussian network match the numpy reference."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)

    hl = pbn.HoldoutLikelihood(df, 0.2, seed)

    explicit_cases = [
        ('a', []),
        ('b', ['a']),
        ('c', ['a', 'b']),
        ('d', ['a', 'b', 'c']),
    ]
    for node, parents in explicit_cases:
        score = hl.local_score(gbn, node, parents)
        # Reference score computed on the same train/test split used by hl.
        expected = numpy_local_score(pbn.LinearGaussianCPDType(),
                                     hl.training_data().to_pandas(),
                                     hl.test_data().to_pandas(),
                                     node, parents)
        assert np.isclose(score, expected)

    # Reordering the evidence must not change the score.
    ordered = hl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = hl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(gbn, node) == hl.local_score(gbn, node, gbn.parents(node))
def dyn_other_partial_fit_bytes():
    """Return the pickled bytes of a DynamicOtherBN with one CPD added per sub-network."""
    variables = ["a", "b", "c", "d"]
    # Static nodes carry the lags 1 and 2; transition nodes carry lag 0.
    static_nodes = [f"{name}_t_{lag}" for name in variables for lag in range(1, 3)]
    transition_nodes = [f"{name}_t_0" for name in variables]

    static_types = [
        ("b_t_1", pbn.DiscreteFactorType()),
        ("c_t_1", pbn.CKDEType()),
        ("d_t_1", pbn.LinearGaussianCPDType()),
    ]
    other_static = OtherBN(static_nodes, [("a_t_2", "d_t_1")], static_types)
    static_cpd = LinearGaussianCPD("d_t_1", ["a_t_2"], [1, 2], 2)
    other_static.add_cpds([static_cpd])

    transition_types = [
        ("b_t_0", pbn.DiscreteFactorType()),
        ("c_t_0", pbn.CKDEType()),
        ("d_t_0", pbn.LinearGaussianCPDType()),
    ]
    other_transition = ConditionalOtherBN(transition_nodes,
                                          static_nodes,
                                          [("a_t_2", "d_t_0")],
                                          transition_types)
    transition_cpd = LinearGaussianCPD("d_t_0", ["a_t_2"], [3, 4], 1.5)
    other_transition.add_cpds([transition_cpd])

    assert other_static.type() == other_transition.type()

    # No fit() call here: only the two CPDs added above are present.
    dyn_other = DynamicOtherBN(variables, 2, other_static, other_transition)
    dyn_other.include_cpd = True
    return pickle.dumps(dyn_other)
def other_fit_bytes():
    """Return the pickled bytes of an OtherBN to which a CPD was added for every node."""
    node_types = [
        ("b", pbn.LinearGaussianCPDType()),
        ("c", pbn.CKDEType()),
        ("d", pbn.DiscreteFactorType()),
    ]
    network = OtherBN(["a", "b", "c", "d"], [("a", "b")], node_types)

    # 'a' and 'b' get linear Gaussian CPDs with explicit parameters.
    gaussian_a = LinearGaussianCPD("a", [], [0], 0.5)
    gaussian_b = LinearGaussianCPD("b", ["a"], [1, 2], 2)

    # 'c' is a CKDE fitted on 100 generated continuous instances.
    continuous = util_test.generate_normal_data_indep(100)
    kde_c = CKDE("c", [])
    kde_c.fit(continuous)

    # 'd' is a discrete factor; the generated column names are lower-cased
    # before fitting.
    discrete = util_test.generate_discrete_data_dependent(100)
    discrete.columns = discrete.columns.str.lower()
    discrete_d = DiscreteFactor("d", [])
    discrete_d.fit(discrete)

    network.add_cpds([gaussian_a, gaussian_b, kde_c, discrete_d])

    network.include_cpd = True
    return pickle.dumps(network)
def test_fit():
    """fit() creates CPDs that follow the current structure and node types.

    Checks that cpd() raises before fitting, that fit() produces linear
    Gaussian CPDs whose evidence equals the parents, that a structure change
    leaves the old CPD stale until fit() is called again, and that changing a
    node type invalidates its CPD until the next fit().
    """
    spbn = SemiparametricBN([('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')])

    with pytest.raises(ValueError) as ex:
        for n in spbn.nodes():
            spbn.cpd(n)
    assert "not added" in str(ex.value)

    spbn.fit(df)

    for n in spbn.nodes():
        cpd = spbn.cpd(n)
        assert cpd.type() == pbn.LinearGaussianCPDType()

        assert type(cpd) == pbn.LinearGaussianCPD
        assert cpd.variable() == n
        assert set(cpd.evidence()) == set(spbn.parents(n))

    spbn.fit(df)

    spbn.remove_arc('a', 'b')

    cpd_b = spbn.cpd('b')
    assert type(cpd_b) == pbn.LinearGaussianCPD
    # The CPD still has the evidence of the removed arc. evidence must be
    # called: comparing the bound method itself to a list is always unequal,
    # which made this assertion vacuous.
    assert cpd_b.evidence() != spbn.parents('b')

    # Refitting brings the CPD back in line with the structure.
    spbn.fit(df)
    cpd_b = spbn.cpd('b')
    assert type(cpd_b) == pbn.LinearGaussianCPD
    assert cpd_b.evidence() == spbn.parents('b')

    spbn.set_node_type('c', pbn.CKDEType())

    with pytest.raises(ValueError) as ex:
        spbn.cpd('c')
    assert "not added" in str(ex.value)

    spbn.fit(df)
    cpd_c = spbn.cpd('c')
    assert cpd_c.type() == spbn.node_type('c')
def test_mle_lg():
    """The linear Gaussian MLE must agree with the numpy reference fit."""
    mle = pbn.MLE(pbn.LinearGaussianCPDType())

    # (variable, evidence) pairs with a growing number of evidence variables.
    cases = [
        ("a", []),
        ("b", ["a"]),
        ("c", ["a", "b"]),
        ("d", ["a", "b", "c"]),
    ]
    for variable, evidence in cases:
        params = mle.estimate(df, variable, evidence)
        expected_beta, expected_var = numpy_fit_mle_lg(df, variable, evidence)
        assert np.all(np.isclose(params.beta, expected_beta))
        assert np.isclose(params.variance, expected_var)
def test_lists():
    """ArcOperatorSet accepts valid lists and rejects arcs with nodes missing from the graph."""
    gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'])
    bic = pbn.BIC(df)
    arc_op = pbn.ArcOperatorSet()

    # A valid configuration caches scores without raising.
    arc_op.set_arc_blacklist([("b", "a")])
    arc_op.set_arc_whitelist([("b", "c")])
    arc_op.set_max_indegree(3)
    arc_op.set_type_whitelist([("a", pbn.LinearGaussianCPDType())])
    arc_op.cache_scores(gbn, bic)

    # Node "e" is not in the network: first in the blacklist, then also in
    # the whitelist, cache_scores must fail with an IndexError.
    for install_list in (arc_op.set_arc_blacklist, arc_op.set_arc_whitelist):
        install_list([("e", "a")])
        with pytest.raises(IndexError) as ex:
            arc_op.cache_scores(gbn, bic)
        assert "not present in the graph" in str(ex.value)
def cond_otherbn_bytes():
    """Return the pickled bytes of an unfitted ConditionalOtherBN."""
    node_types = [
        ("b", pbn.LinearGaussianCPDType()),
        ("c", pbn.CKDEType()),
        ("d", pbn.DiscreteFactorType()),
    ]
    network = ConditionalOtherBN(["c", "d"], ["a", "b"], [("a", "c")], node_types)
    return pickle.dumps(network)
def data_default_node_type(self, dt):
    """Return the default factor type for a column of data type ``dt``.

    Only the float64 and float32 data types are accepted, and both map to a
    linear Gaussian factor type.

    Raises:
        ValueError: if ``dt`` is any other data type.
    """
    is_floating = dt.equals(pa.float64()) or dt.equals(pa.float32())
    if not is_floating:
        raise ValueError(
            "Data type not compatible with NonHomogeneousType")
    return pbn.LinearGaussianCPDType()
def test_serialization_fitted_dbn(dyn_gaussian_partial_fit_bytes, dyn_gaussian_fit_bytes, dyn_other_partial_fit_bytes, dyn_other_fit_bytes):
    """Unpickled dynamic networks keep their fitted state, node types and CPDs.

    Each argument is the pickled form of a dynamic Bayesian network that was
    either partially fitted (a single CPD added to each sub-network) or fully
    fitted before pickling.
    """

    def assert_lg_cpd(cpd, variable, evidence, beta, variance):
        # A linear Gaussian CPD must come back with exactly these parameters.
        assert cpd.variable() == variable
        assert cpd.evidence() == evidence
        assert list(cpd.beta) == beta
        assert cpd.variance == variance

    def assert_other_contents(dbn):
        # Node types and hand-made CPDs shared by both "Other" networks.
        static_bn = dbn.static_bn()
        transition_bn = dbn.transition_bn()
        assert static_bn.node_type("b_t_1") == pbn.DiscreteFactorType()
        assert static_bn.node_type("c_t_1") == pbn.CKDEType()
        assert static_bn.node_type("d_t_1") == pbn.LinearGaussianCPDType()

        assert transition_bn.node_type("b_t_0") == pbn.DiscreteFactorType()
        assert transition_bn.node_type("c_t_0") == pbn.CKDEType()
        assert transition_bn.node_type("d_t_0") == pbn.LinearGaussianCPDType()

        assert_lg_cpd(static_bn.cpd("d_t_1"), "d_t_1", ["a_t_2"], [1, 2], 2)
        assert_lg_cpd(transition_bn.cpd("d_t_0"), "d_t_0", ["a_t_2"], [3, 4], 1.5)

    # ####################
    # Gaussian partial fit
    # ####################
    loaded_partial = pickle.loads(dyn_gaussian_partial_fit_bytes)
    assert not loaded_partial.fitted()
    assert not loaded_partial.static_bn().fitted()
    assert not loaded_partial.transition_bn().fitted()
    assert_lg_cpd(loaded_partial.static_bn().cpd("d_t_1"),
                  "d_t_1", ["a_t_2"], [1, 2], 2)
    assert_lg_cpd(loaded_partial.transition_bn().cpd("b_t_0"),
                  "b_t_0", ["c_t_2"], [3, 4], 5)

    # ####################
    # Gaussian fit
    # ####################
    loaded_fitted = pickle.loads(dyn_gaussian_fit_bytes)
    assert loaded_fitted.fitted()
    assert loaded_fitted.static_bn().fitted()
    assert loaded_fitted.transition_bn().fitted()

    # ####################
    # Other partial fit
    # ####################
    loaded_partial = pickle.loads(dyn_other_partial_fit_bytes)
    assert not loaded_partial.fitted()
    assert not loaded_partial.static_bn().fitted()
    assert not loaded_partial.transition_bn().fitted()
    assert_other_contents(loaded_partial)

    # ####################
    # Other fit
    # ####################
    loaded_fitted = pickle.loads(dyn_other_fit_bytes)
    assert loaded_fitted.fitted()
    assert loaded_fitted.static_bn().fitted()
    assert loaded_fitted.transition_bn().fitted()
    # Inspect the fitted network itself. These checks used to be repeated on
    # loaded_partial, so the contents of the fitted network were never
    # verified. The expected CPD parameters are the ones the fixture adds by
    # hand before fitting: fit() only fits factors that are not fitted yet.
    assert_other_contents(loaded_fitted)
def lg_type_bytes():
    """Return the pickled bytes of a LinearGaussianCPDType instance."""
    return pickle.dumps(pbn.LinearGaussianCPDType())
def default_node_type(self):
    """Return a new LinearGaussianCPDType as the default node type."""
    linear_gaussian = pbn.LinearGaussianCPDType()
    return linear_gaussian
def test_mle_create():
    """MLE can be created for linear Gaussian factors but not for CKDE."""
    # Requesting an MLE estimator for CKDE must be rejected.
    with pytest.raises(ValueError) as ex:
        pbn.MLE(pbn.CKDEType())
    assert "MLE not available" in str(ex.value)

    # The linear Gaussian estimator is created without raising.
    pbn.MLE(pbn.LinearGaussianCPDType())
def test_cvl_local_score_null_spbn():
    """CVLikelihood local scores on a semiparametric network when the data contains NaN."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'), ('b', 'c'), ('b', 'd'), ('c', 'd')]
    spbn = pbn.SemiparametricBN(arcs, [('a', pbn.CKDEType()), ('c', pbn.CKDEType())])

    # Draw the row positions to blank out for each column (in a, b, c, d
    # order, right after seeding) and then write NaN into a copy of the data.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in ('a', 'b', 'c', 'd')}
    df_null = df.copy()
    for column, rows in null_rows.items():
        df_null.loc[df_null.index[rows], column] = np.nan

    cvl = pbn.CVLikelihood(df_null, 10, seed)

    # local_score uses the node type stored in the network.
    # (expected type class, node, evidence passed to cvl, evidence passed to the reference)
    network_type_checks = [
        (pbn.CKDEType, 'a', [], []),
        (pbn.LinearGaussianCPDType, 'b', ['a'], ['a']),
        (pbn.CKDEType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.LinearGaussianCPDType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in network_type_checks:
        assert np.isclose(cvl.local_score(spbn, node, scored_parents),
                          numpy_local_score(type_cls(), df_null, node, reference_parents))

    # Without an evidence argument the score equals the one obtained by
    # passing the node's parents in the network.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(spbn, node) == cvl.local_score(spbn, node, spbn.parents(node))

    # local_score_node_type scores with an explicitly requested type, here the
    # opposite of the one stored in the network for each node.
    explicit_type_checks = [
        (pbn.LinearGaussianCPDType, 'a', [], []),
        (pbn.CKDEType, 'b', ['a'], ['a']),
        (pbn.LinearGaussianCPDType, 'c', ['a', 'b'], ['a', 'b']),
        (pbn.CKDEType, 'd', ['a', 'b', 'c'], ['a', 'b', 'c']),
        (pbn.CKDEType, 'd', ['a', 'b', 'c'], ['b', 'c', 'a']),
    ]
    for type_cls, node, scored_parents, reference_parents in explicit_type_checks:
        assert np.isclose(cvl.local_score_node_type(spbn, type_cls(), node, scored_parents),
                          numpy_local_score(type_cls(), df_null, node, reference_parents))
def test_serialization_fitted_bn(gaussian_partial_fit_bytes, gaussian_fit_bytes, other_partial_fit_bytes, other_fit_bytes):
    """Unpickled networks keep their fitted state and the parameters of their CPDs.

    Each argument is the pickled form of a network that was either partially
    fitted (only the CPD of 'b' added) or given a CPD for every node.
    """
    # ####################
    # Gaussian partial fit
    # ####################
    loaded_partial = pickle.loads(gaussian_partial_fit_bytes)
    assert not loaded_partial.fitted()
    cpd = loaded_partial.cpd("b")
    assert cpd.variable() == "b"
    assert cpd.evidence() == ["a"]
    assert list(cpd.beta) == [1, 2]
    assert cpd.variance == 2

    # ####################
    # Gaussian fit
    # ####################
    loaded_fitted = pickle.loads(gaussian_fit_bytes)
    assert loaded_fitted.fitted()

    cpd_a = loaded_fitted.cpd("a")
    assert cpd_a.variable() == "a"
    assert cpd_a.evidence() == []
    assert cpd_a.beta == [0]
    assert cpd_a.variance == 0.5

    cpd_b = loaded_fitted.cpd("b")
    assert cpd_b.variable() == "b"
    assert cpd_b.evidence() == ["a"]
    assert list(cpd_b.beta) == [1, 2]
    assert cpd_b.variance == 2

    cpd_c = loaded_fitted.cpd("c")
    assert cpd_c.variable() == "c"
    assert cpd_c.evidence() == []
    assert cpd_c.beta == [2]
    assert cpd_c.variance == 1

    cpd_d = loaded_fitted.cpd("d")
    assert cpd_d.variable() == "d"
    assert cpd_d.evidence() == []
    assert cpd_d.beta == [3]
    assert cpd_d.variance == 1.5

    # ####################
    # OtherBN homogeneous partial fit
    # ####################
    loaded_other = pickle.loads(other_partial_fit_bytes)
    assert not loaded_other.fitted()
    # Read the CPD from the OtherBN that was just loaded. This used to query
    # loaded_partial (the Gaussian network), so the CPD of the partially
    # fitted OtherBN was never inspected.
    cpd = loaded_other.cpd("b")
    assert cpd.variable() == "b"
    assert cpd.evidence() == ["a"]
    assert list(cpd.beta) == [1, 2]
    assert cpd.variance == 2

    # ####################
    # OtherBN homogeneous fit
    # ####################
    loaded_other_fitted = pickle.loads(other_fit_bytes)
    assert loaded_other_fitted.fitted()

    cpd_a = loaded_other_fitted.cpd("a")
    assert cpd_a.variable() == "a"
    assert cpd_a.evidence() == []
    assert cpd_a.beta == [0]
    assert cpd_a.variance == 0.5
    assert cpd_a.type() == pbn.LinearGaussianCPDType()

    cpd_b = loaded_other_fitted.cpd("b")
    assert cpd_b.variable() == "b"
    assert cpd_b.evidence() == ["a"]
    assert list(cpd_b.beta) == [1, 2]
    assert cpd_b.variance == 2
    assert cpd_b.type() == pbn.LinearGaussianCPDType()

    cpd_c = loaded_other_fitted.cpd("c")
    assert cpd_c.variable() == "c"
    assert cpd_c.evidence() == []
    assert cpd_c.fitted()
    assert cpd_c.num_instances() == 100
    assert cpd_c.type() == pbn.CKDEType()

    cpd_d = loaded_other_fitted.cpd("d")
    assert cpd_d.variable() == "d"
    assert cpd_d.evidence() == []
    assert cpd_d.fitted()
    assert cpd_d.type() == pbn.DiscreteFactorType()