def test_cvl_local_score_gbn():
    """Compare CVLikelihood local scores on a Gaussian network with the reference.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)
    cvl = pbn.CVLikelihood(df, 10, seed)

    # Each node is scored with an explicitly supplied parent list.
    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = cvl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(), df, node, parents)
        assert np.isclose(actual, expected)

    # The order in which the parents are listed must not change the score.
    ordered = cvl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = cvl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in ('a', 'b', 'c', 'd'):
        assert cvl.local_score(gbn, node) == cvl.local_score(gbn, node, gbn.parents(node))
def test_bic_local_score_null():
    """Compare BIC local scores with the reference when the data contains NaNs.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file.
    """
    nodes = ['a', 'b', 'c', 'd']
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(nodes, arcs)

    # Draw the row positions to blank out for every column (a, b, c, d in that
    # order) before touching the data, so the random stream is consumed in a
    # fixed sequence.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in nodes}

    df_null = df.copy()
    for column in nodes:
        df_null.loc[df_null.index[null_rows[column]], column] = np.nan

    bic = pbn.BIC(df_null)

    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = bic.local_score(gbn, node, parents)
        expected = numpy_local_score(df_null, node, parents)
        assert np.isclose(actual, expected)

    # The reference computed with a permuted parent list must still match.
    actual = bic.local_score(gbn, 'd', ['a', 'b', 'c'])
    expected = numpy_local_score(df_null, 'd', ['b', 'c', 'a'])
    assert np.isclose(actual, expected)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in nodes:
        assert bic.local_score(gbn, node) == bic.local_score(gbn, node, gbn.parents(node))
def test_bic_score():
    """Check that the BIC score of a network equals the sum of its local scores."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)
    bic = pbn.BIC(df)

    total = bic.score(gbn)

    # One local score per node, each with its explicit parent list.
    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    local_scores = [bic.local_score(gbn, node, parents) for node, parents in cases]

    assert np.isclose(total, sum(local_scores))
def test_create_change_node():
    """Caching ChangeNodeTypeSet scores on a GaussianNetwork must raise ValueError."""
    model = pbn.GaussianNetwork(['a', 'b', 'c', 'd'])
    score = pbn.CVLikelihood(df)
    type_ops = pbn.ChangeNodeTypeSet()

    with pytest.raises(ValueError) as excinfo:
        type_ops.cache_scores(model, score)

    message = str(excinfo.value)
    assert "can only be used with non-homogeneous" in message
def test_nomax():
    """find_max returns None for a two-node network whose only arc is whitelisted."""
    model = pbn.GaussianNetwork(['a', 'b'])
    score = pbn.BIC(df)

    operators = pbn.ArcOperatorSet(whitelist=[("a", "b")])
    operators.cache_scores(model, score)

    best = operators.find_max(model)
    assert best is None
def test_apply():
    """Apply each operator type to a network and check the resulting structure."""

    def check_arc_cycle(model):
        # Add a->b, flip it to b->a, then remove it, checking the arcs after
        # every step.
        pbn.AddArc("a", "b", 1).apply(model)
        assert model.num_arcs() == 1
        assert model.has_arc('a', 'b')

        pbn.FlipArc("a", "b", 1).apply(model)
        assert model.num_arcs() == 1
        assert not model.has_arc('a', 'b')
        assert model.has_arc('b', 'a')

        pbn.RemoveArc("b", "a", 1).apply(model)
        assert model.num_arcs() == 0
        assert not model.has_arc('b', 'a')

    # Gaussian network: arc operators work, changing a node type does not.
    gbn = pbn.GaussianNetwork(['a', 'b', 'c', 'd'])
    assert gbn.num_arcs() == 0
    assert not gbn.has_arc('a', 'b')
    check_arc_cycle(gbn)

    change_type = pbn.ChangeNodeType("a", pbn.CKDEType(), 1)
    with pytest.raises(ValueError) as excinfo:
        change_type.apply(gbn)
    assert "Wrong factor type" in str(excinfo.value)

    # Semiparametric network: the node type change is applied as well.
    spbn = pbn.SemiparametricBN(['a', 'b', 'c', 'd'])
    assert spbn.num_arcs() == 0

    change_type = pbn.ChangeNodeType("a", pbn.CKDEType(), 1)
    assert (spbn.node_type('a') == pbn.UnknownFactorType())
    change_type.apply(spbn)
    assert (spbn.node_type('a') == pbn.CKDEType())

    assert not spbn.has_arc('a', 'b')
    check_arc_cycle(spbn)
def test_holdout_local_score_gbn_null():
    """Compare HoldoutLikelihood local scores with the reference on data with NaNs.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file, evaluated on the score's own training and
    test splits.
    """
    nodes = ['a', 'b', 'c', 'd']
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)

    # Draw the row positions to blank out for every column (a, b, c, d in that
    # order) before touching the data, so the random stream is consumed in a
    # fixed sequence.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in nodes}

    df_null = df.copy()
    for column in nodes:
        df_null.loc[df_null.index[null_rows[column]], column] = np.nan

    hl = pbn.HoldoutLikelihood(df_null, 0.2, seed)

    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = hl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(),
                                     hl.training_data().to_pandas(),
                                     hl.test_data().to_pandas(),
                                     node, parents)
        assert np.isclose(actual, expected)

    # The order in which the parents are listed must not change the score.
    ordered = hl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = hl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in nodes:
        assert hl.local_score(gbn, node) == hl.local_score(gbn, node, gbn.parents(node))
def test_bic_local_score():
    """Compare BIC local scores on a Gaussian network with the reference.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file.
    """
    nodes = ['a', 'b', 'c', 'd']
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(nodes, arcs)
    bic = pbn.BIC(df)

    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = bic.local_score(gbn, node, parents)
        expected = numpy_local_score(df, node, parents)
        assert np.isclose(actual, expected)

    # The reference computed with a permuted parent list must still match.
    actual = bic.local_score(gbn, 'd', ['a', 'b', 'c'])
    expected = numpy_local_score(df, 'd', ['b', 'c', 'a'])
    assert np.isclose(actual, expected)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in nodes:
        assert bic.local_score(gbn, node) == bic.local_score(gbn, node, gbn.parents(node))
def test_holdout_score():
    """Check that HoldoutLikelihood.score equals the sum of the local scores."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    hl = pbn.HoldoutLikelihood(df, 0.2, 0)

    # Gaussian network: local scores requested with explicit parent lists.
    gbn = pbn.GaussianNetwork(arcs)
    gbn_total = hl.score(gbn)
    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    gbn_locals = [hl.local_score(gbn, node, parents) for node, parents in cases]
    assert np.isclose(gbn_total, sum(gbn_locals))

    # Semiparametric network with CKDE nodes 'a' and 'c': local scores use
    # the parents stored in the model.
    node_types = [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]
    spbn = pbn.SemiparametricBN(arcs, node_types)
    spbn_total = hl.score(spbn)
    spbn_locals = [hl.local_score(spbn, node) for node in ('a', 'b', 'c', 'd')]
    assert np.isclose(spbn_total, sum(spbn_locals))
def test_cvl_local_score_gbn_null():
    """Compare CVLikelihood local scores with the reference on data with NaNs.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file.
    """
    nodes = ['a', 'b', 'c', 'd']
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)

    # Draw the row positions to blank out for every column (a, b, c, d in that
    # order) before touching the data, so the random stream is consumed in a
    # fixed sequence.
    np.random.seed(0)
    null_rows = {column: np.random.randint(0, SIZE, size=100) for column in nodes}

    df_null = df.copy()
    for column in nodes:
        df_null.loc[df_null.index[null_rows[column]], column] = np.nan

    cvl = pbn.CVLikelihood(df_null, 10, seed)

    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = cvl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(), df_null, node, parents)
        assert np.isclose(actual, expected)

    # The order in which the parents are listed must not change the score.
    ordered = cvl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = cvl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in nodes:
        assert cvl.local_score(gbn, node) == cvl.local_score(gbn, node, gbn.parents(node))
def test_cvl_score():
    """Check that CVLikelihood.score equals the sum of the local scores."""
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    cv = pbn.CVLikelihood(df, 10, 0)

    # Gaussian network: local scores requested with explicit parent lists.
    gbn = pbn.GaussianNetwork(arcs)
    gbn_total = cv.score(gbn)
    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    gbn_locals = [cv.local_score(gbn, node, parents) for node, parents in cases]
    assert np.isclose(gbn_total, sum(gbn_locals))

    # Semiparametric network with CKDE nodes 'a' and 'c': local scores use
    # the parents stored in the model.
    node_types = [('a', pbn.CKDEType()), ('c', pbn.CKDEType())]
    spbn = pbn.SemiparametricBN(arcs, node_types)
    spbn_total = cv.score(spbn)
    spbn_locals = [cv.local_score(spbn, node) for node in ('a', 'b', 'c', 'd')]
    assert np.isclose(spbn_total, sum(spbn_locals))
def test_check_max_score():
    """Check the delta of the best arc operator and its reverse once blacklisted."""
    gbn = pbn.GaussianNetwork(['c', 'd'])
    bic = pbn.BIC(df)

    operators = pbn.ArcOperatorSet()
    operators.cache_scores(gbn, bic)
    best = operators.find_max(gbn)

    # The reported delta must equal the local-score gain of giving 'd' the
    # parent 'c'.
    expected_delta = bic.local_score(gbn, 'd', ['c']) - bic.local_score(gbn, 'd')
    assert np.isclose(best.delta(), expected_delta)

    # With the best arc blacklisted, the next best operation is expected to be
    # the same arc in the reverse direction.
    operators.set_arc_blacklist([(best.source(), best.target())])
    operators.cache_scores(gbn, bic)
    second = operators.find_max(gbn)

    assert best.source() == second.target()
    assert best.target() == second.source()

    # Both operations must be of exactly the AddArc type.
    assert type(best) == type(second)
    assert type(best) == pbn.AddArc
def test_lists():
    """Exercise the ArcOperatorSet list setters, including arcs with unknown nodes."""
    model = pbn.GaussianNetwork(['a', 'b', 'c', 'd'])
    score = pbn.BIC(df)
    operators = pbn.ArcOperatorSet()

    # A configuration that only names nodes of the model caches without error.
    operators.set_arc_blacklist([("b", "a")])
    operators.set_arc_whitelist([("b", "c")])
    operators.set_max_indegree(3)
    operators.set_type_whitelist([("a", pbn.LinearGaussianCPDType())])
    operators.cache_scores(model, score)

    expected_message = "not present in the graph"

    # Node 'e' is not in the model: a blacklist naming it must be rejected.
    operators.set_arc_blacklist([("e", "a")])
    with pytest.raises(IndexError) as excinfo:
        operators.cache_scores(model, score)
    assert expected_message in str(excinfo.value)

    # The same holds when the unknown node appears in the whitelist.
    operators.set_arc_whitelist([("e", "a")])
    with pytest.raises(IndexError) as excinfo:
        operators.cache_scores(model, score)
    assert expected_message in str(excinfo.value)
def test_holdout_local_score_gbn():
    """Compare HoldoutLikelihood local scores on a Gaussian network with the reference.

    The reference values come from the ``numpy_local_score`` helper, which is
    defined elsewhere in this file, evaluated on the score's own training and
    test splits.
    """
    arcs = [('a', 'b'), ('a', 'c'), ('a', 'd'),
            ('b', 'c'), ('b', 'd'), ('c', 'd')]
    gbn = pbn.GaussianNetwork(arcs)
    hl = pbn.HoldoutLikelihood(df, 0.2, seed)

    cases = [('a', []),
             ('b', ['a']),
             ('c', ['a', 'b']),
             ('d', ['a', 'b', 'c'])]
    for node, parents in cases:
        actual = hl.local_score(gbn, node, parents)
        expected = numpy_local_score(pbn.LinearGaussianCPDType(),
                                     hl.training_data().to_pandas(),
                                     hl.test_data().to_pandas(),
                                     node, parents)
        assert np.isclose(actual, expected)

    # The order in which the parents are listed must not change the score.
    ordered = hl.local_score(gbn, 'd', ['a', 'b', 'c'])
    shuffled = hl.local_score(gbn, 'd', ['b', 'c', 'a'])
    assert np.isclose(ordered, shuffled)

    # Omitting the parents must be the same as passing the model's own parents.
    for node in ('a', 'b', 'c', 'd'):
        assert hl.local_score(gbn, node) == hl.local_score(gbn, node, gbn.parents(node))
def test_factor_defined_factor_type():
    """Check Python-defined Factor/FactorType subclasses: equality, str and new_factor."""

    # F_type deliberately does not override new_factor.
    class F_type(FactorType):
        def __init__(self):
            FactorType.__init__(self)

        def __str__(self):
            return "FType"

    class F(Factor):
        def __init__(self, variable, evidence):
            Factor.__init__(self, variable, evidence)

        def type(self):
            return F_type()

    f_a = F("a", [])
    f_b = F("b", ["a"])
    f_c = F("c", ["a", "b"])

    # Every F factor reports an equal type, whatever its variable/evidence.
    assert f_a.type() == f_b.type()
    assert f_a.type() == f_c.type()
    assert f_b.type() == f_c.type()
    assert str(f_a.type()) == str(f_b.type()) == str(f_c.type()) == "FType"

    dummy_network = pbn.GaussianNetwork(["a", "b", "c", "d"])

    # Without a new_factor override the call must fail with a RuntimeError.
    with pytest.raises(RuntimeError) as excinfo:
        f_a.type().new_factor(dummy_network, "d", ["a", "b", "c"])
    assert 'Tried to call pure virtual function "FactorType::new_factor"' in str(
        excinfo.value)

    # G_type overrides new_factor, so it can create new G factors.
    class G_type(FactorType):
        def __init__(self):
            FactorType.__init__(self)

        def new_factor(self, model, variable, evidence):
            return G(variable, evidence)

        def __str__(self):
            return "GType"

    class G(Factor):
        def __init__(self, variable, evidence):
            Factor.__init__(self, variable, evidence)

        def type(self):
            return G_type()

    g_a = G("a", [])
    g_b = G("b", ["a"])
    g_c = G("c", ["a", "b"])

    assert g_a.type() == g_b.type()
    assert g_a.type() == g_c.type()
    assert g_b.type() == g_c.type()

    # The two user-defined types must compare as different.
    assert f_a.type() != g_a.type()

    assert str(g_a.type()) == str(g_b.type()) == str(g_c.type()) == "GType"

    g_d = g_a.type().new_factor(dummy_network, "d", ["a", "b", "c"])
    assert g_a.type() == g_d.type()
    assert g_d.variable() == "d"
    assert g_d.evidence() == ["a", "b", "c"]
def test_hc_estimate():
    """Run GreedyHillClimbing with a BIC score on a Gaussian network.

    Every check is performed twice: on a network built directly from the data
    columns, and on a network that was built with two extra nodes ('e', 'f')
    which are removed again before learning.
    """
    bic = pbn.BIC(df)
    column_names = list(df.columns.values)
    start = pbn.GaussianNetwork(column_names)

    # Check algorithm with BN with nodes removed.
    column_names.insert(1, 'e')
    column_names.insert(3, 'f')
    start_removed_nodes = pbn.GaussianNetwork(column_names)
    start_removed_nodes.remove_node('e')
    start_removed_nodes.remove_node('f')

    arc_set = pbn.ArcOperatorSet()
    hc = pbn.GreedyHillClimbing()

    # A single iteration must add exactly one arc.
    res = hc.estimate(arc_set, bic, start, max_iters=1)
    assert res.num_arcs() == 1
    added_arc = res.arcs()[0]
    op_delta = bic.score(res) - bic.score(start)

    res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1)
    # Check the result of the removed-nodes run, not the previous result.
    assert res_removed.num_arcs() == 1
    added_arc_removed = res_removed.arcs()[0]
    # The same arc must be chosen, although its orientation may differ.
    assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1]
    assert np.isclose(op_delta,
                      bic.score(res_removed) - bic.score(start_removed_nodes))

    # BIC is score equivalent, so if we blacklist the added_arc, its reverse will be added.
    res = hc.estimate(arc_set, bic, start, max_iters=1, arc_blacklist=[added_arc])
    assert res.num_arcs() == 1
    reversed_arc = res.arcs()[0][::-1]
    assert added_arc == reversed_arc

    res_removed = hc.estimate(arc_set, bic, start_removed_nodes, max_iters=1,
                              arc_blacklist=[added_arc_removed])
    # Check the result of the removed-nodes run, not the previous result.
    assert res_removed.num_arcs() == 1
    reversed_arc_removed = res_removed.arcs()[0][::-1]
    assert added_arc_removed == reversed_arc_removed

    # The score gain of the first step equals the local-score gain of the
    # target node when the arc is added.
    assert np.isclose(
        op_delta,
        bic.local_score(res, added_arc[1], [added_arc[0]]) -
        bic.local_score(res, added_arc[1], []))
    assert np.isclose(
        op_delta,
        bic.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) -
        bic.local_score(res, added_arc_removed[1], []))

    # With epsilon above the best available delta no arc is added.
    res = hc.estimate(arc_set, bic, start, epsilon=(op_delta + 0.01))
    assert res.num_arcs() == start.num_arcs()

    res_removed = hc.estimate(arc_set, bic, start_removed_nodes,
                              epsilon=(op_delta + 0.01))
    assert res_removed.num_arcs() == start_removed_nodes.num_arcs()

    # Can't compare models because the arcs could be oriented in different direction,
    # leading to a different search path. Execute the code, just to check no error is given.
    res = hc.estimate(arc_set, bic, start, verbose=False)
    res_removed = hc.estimate(arc_set, bic, start_removed_nodes, verbose=False)
def test_hc_estimate_validation():
    """Run GreedyHillClimbing with a ValidatedLikelihood score on a Gaussian network.

    Every check is performed twice: on a network built directly from the data
    columns, and on a network that was built with two extra nodes ('e', 'f')
    which are removed again before learning.
    """
    column_names = list(df.columns.values)
    start = pbn.GaussianNetwork(column_names)

    column_names.insert(1, 'e')
    column_names.insert(4, 'f')
    start_removed_nodes = pbn.GaussianNetwork(column_names)
    start_removed_nodes.remove_node('e')
    start_removed_nodes.remove_node('f')

    vl = pbn.ValidatedLikelihood(df)
    arc_set = pbn.ArcOperatorSet()
    hc = pbn.GreedyHillClimbing()

    # A single iteration must add exactly one arc.
    res = hc.estimate(arc_set, vl, start, max_iters=1)
    assert res.num_arcs() == 1
    added_arc = res.arcs()[0]
    op_delta = vl.cv_lik.score(res) - vl.cv_lik.score(start)

    res_removed = hc.estimate(arc_set, vl, start_removed_nodes, max_iters=1)
    assert res_removed.num_arcs() == 1
    added_arc_removed = res_removed.arcs()[0]
    # The same arc must be chosen, although its orientation may differ.
    assert added_arc == added_arc_removed or added_arc == added_arc_removed[::-1]
    assert np.isclose(
        op_delta,
        vl.cv_lik.score(res_removed) - vl.cv_lik.score(start_removed_nodes))

    # The score gain of the first step equals the local-score gain of the
    # target node when the arc is added.
    assert np.isclose(
        op_delta,
        vl.cv_lik.local_score(res, added_arc[1], [added_arc[0]]) -
        vl.cv_lik.local_score(res, added_arc[1], []))
    assert np.isclose(
        op_delta,
        vl.cv_lik.local_score(res, added_arc_removed[1], [added_arc_removed[0]]) -
        vl.cv_lik.local_score(res, added_arc_removed[1], []))

    # CV is score equivalent for GBNs, so if we blacklist the added_arc, its reverse will be added.
    res = hc.estimate(arc_set, vl, start, max_iters=1, arc_blacklist=[added_arc])
    assert res.num_arcs() == 1
    reversed_arc = res.arcs()[0][::-1]
    assert added_arc == reversed_arc

    res_removed = hc.estimate(arc_set, vl, start_removed_nodes, max_iters=1,
                              arc_blacklist=[added_arc_removed])
    assert res_removed.num_arcs() == 1
    reversed_arc_removed = res_removed.arcs()[0][::-1]
    # Compare against the arc that was blacklisted for THIS network: the two
    # networks may have oriented the first arc differently (allowed above), so
    # comparing with reversed_arc of the other network could fail spuriously.
    assert added_arc_removed == reversed_arc_removed

    # With epsilon above the best available delta no arc is added.
    res = hc.estimate(arc_set, vl, start, epsilon=(op_delta + 0.01))
    assert res.num_arcs() == start.num_arcs()

    res_removed = hc.estimate(arc_set, vl, start_removed_nodes,
                              epsilon=(op_delta + 0.01))
    assert res_removed.num_arcs() == start_removed_nodes.num_arcs()

    # Can't compare models because the arcs could be oriented in different direction,
    # leading to a different search path. Execute the code, just to check no error is given.
    res = hc.estimate(arc_set, vl, start, verbose=False)
    res_removed = hc.estimate(arc_set, vl, start_removed_nodes, verbose=False)