def estimate(self):
    """
    Return the best-scoring `DAG` among all candidate structures.

    Every graph yielded by `self.all_dags()` is scored with
    `self.scoring_method.score`; the candidate with the maximal score is
    selected (on ties, the first one encountered wins). Only the network
    structure is estimated, no parametrization.

    Returns
    -------
    model: `DAG` instance
        A new `DAG` holding the nodes and edges of the maximal-score
        candidate, inserted in sorted order.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import ExhaustiveSearch
    >>> # create random data sample with 3 variables, where B and C are identical:
    >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
    >>> data['C'] = data['B']
    >>> est = ExhaustiveSearch(data)
    >>> best_model = est.estimate()
    >>> best_model
    <pgmpy.base.DAG.DAG object at 0x7f695c535470>
    >>> best_model.edges()
    [('B', 'C')]
    """
    candidates = self.all_dags()
    score_of = self.scoring_method.score
    winner = max(candidates, key=score_of)

    # Rebuild the winner with a deterministic (sorted) insertion order.
    result = DAG()
    ordered_nodes = sorted(winner.nodes())
    result.add_nodes_from(ordered_nodes)
    ordered_edges = sorted(winner.edges())
    result.add_edges_from(ordered_edges)
    return result
def estimate(
    self,
    tabu_length=100,
    max_indegree=2,
    black_list=None,
    epsilon=1e-4,
    max_iter=1e6,
    show_progress=True,
):
    """
    Greedy hill-climb search for a `DAG` structure, scored with `K2Score`.

    Starts from a graph containing `self.variables` and no edges. On every
    iteration the single best operation reported by
    `self._legal_operations` is applied, until the best achievable score
    change drops below `epsilon`, no legal operation is left, or `max_iter`
    iterations have been run.

    Parameters
    ----------
    tabu_length: int (default: 100)
        Maximum number of entries kept in the tabu list (a bounded
        `deque`); the oldest entries are dropped first.

    max_indegree: int (default: 2)
        Forwarded unchanged to `self._legal_operations`.
        NOTE(review): presumably the maximum number of parents per node --
        confirm against `_legal_operations`.

    black_list: iterable or None (default: None)
        Edges that should not appear in the graph. Converted to a `set` and
        forwarded to `self._legal_operations`; `None` means an empty set.

    epsilon: float (default: 1e-4)
        Exit condition. The search stops as soon as the best available
        score change is smaller than `epsilon`.

    max_iter: int (default: 1e6)
        Maximum number of iterations; converted with `int()`.

    show_progress: bool (default: True)
        If True, iterate with `trange` (progress bar) instead of `range`.

    Returns
    -------
    model: `DAG` instance
        The graph reached when the search stopped.
    """
    # Local K2 score of a node given its parents; evaluated by
    # `_legal_operations` for every candidate change on every iteration.
    score_fn = K2Score(data=self.data).local_score

    # Start from the graph with all variables and no edges.
    current_model = DAG()
    current_model.add_nodes_from(self.variables)

    black_list = set() if black_list is None else set(black_list)

    # Bounded memory of recent moves, consulted by `_legal_operations`.
    tabu_list = deque(maxlen=tabu_length)

    n_iterations = int(max_iter)
    iteration = trange(n_iterations) if show_progress else range(n_iterations)

    for _ in iteration:
        # `default` keeps `max` from raising ValueError when there is no
        # legal operation at all (e.g. a single variable, or every possible
        # edge black-listed); the search then simply terminates.
        best_operation, best_score_change = max(
            self._legal_operations(
                model=current_model,
                score=score_fn,
                tabu_list=tabu_list,
                max_indegree=max_indegree,
                black_list=black_list,
            ),
            key=lambda t: t[1],
            default=(None, None),
        )

        if best_operation is None or best_score_change < epsilon:
            break
        elif best_operation[0] == '+':
            current_model.add_edge(*best_operation[1])
            # Remember the inverse move (removing the edge just added).
            tabu_list.append(("-", best_operation[1]))
        elif best_operation[0] == '-':
            current_model.remove_edge(*best_operation[1])
            # Remember the inverse move (re-adding the edge just removed).
            tabu_list.append(("+", best_operation[1]))
        elif best_operation[0] == 'flip':
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)
            # The flip operation itself is what gets recorded here.
            tabu_list.append(best_operation)

    return current_model
def estimate(
    self, start=None, tabu_length=0, max_indegree=None, epsilon=1e-4, max_iter=1e6
):
    """
    Performs local hill climb search to estimate the `DAG` structure
    that has optimal score, according to the scoring method supplied in the
    constructor. Starts at model `start` and proceeds by step-by-step
    network modifications until a local maximum is reached. Only estimates
    network structure, no parametrization.

    Parameters
    ----------
    start: DAG instance
        The starting point for the local search. By default a completely
        disconnected network is used. If given, it must be a `DAG` over
        exactly the variables of the data set. Note that `start` is
        modified in place and is the object that gets returned.

    tabu_length: int (default: 0)
        If greater than zero, the last `tabu_length` graph modifications
        cannot be reversed during the search procedure. This serves to
        enforce a wider exploration of the search space. With the default
        of 0 no modifications are remembered.

    max_indegree: int or None
        If provided and unequal None, the procedure only searches among
        models where all nodes have at most `max_indegree` parents.
        Defaults to None.

    epsilon: float (default: 1e-4)
        Defines the exit condition. If the improvement in score is less
        than `epsilon`, the learned model is returned.

    max_iter: int (default: 1e6)
        The maximum number of iterations allowed. The model learned so far
        is returned once `max_iter` iterations have been performed.

    Returns
    -------
    model: `DAG` instance
        A `DAG` at a (local) score maximum.

    Raises
    ------
    ValueError
        If `start` is neither None nor a `DAG` whose nodes equal the
        variables of the data set.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.estimators import HillClimbSearch, BicScore
    >>> # create data sample with 9 random variables:
    >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 9)), columns=list('ABCDEFGHI'))
    >>> # add 10th dependent variable
    >>> data['J'] = data['A'] * data['B']
    >>> est = HillClimbSearch(data, scoring_method=BicScore(data))
    >>> best_model = est.estimate()
    >>> sorted(best_model.nodes())
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    >>> best_model.edges()
    [('B', 'J'), ('A', 'J')]
    >>> # search a model with restriction on the number of parents:
    >>> est.estimate(max_indegree=1).edges()
    [('J', 'A'), ('B', 'J')]
    """
    nodes = self.state_names.keys()
    if start is None:
        start = DAG()
        start.add_nodes_from(nodes)
    elif not isinstance(start, DAG) or set(start.nodes()) != set(nodes):
        raise ValueError(
            "'start' should be a DAG with the same variables as the data set, or 'None'."
        )

    tabu_list = []
    current_model = start

    # Strict `<` so that exactly `max_iter` iterations are run at most
    # (a `<=` comparison would allow one extra iteration). A plain counter
    # is used instead of `range` so non-integer limits keep working.
    iter_no = 0
    while iter_no < max_iter:
        iter_no += 1

        # Only operations with a strictly positive score delta qualify; on
        # ties the first one reported by `_legal_operations` is kept.
        best_score_delta = 0
        best_operation = None
        for operation, score_delta in self._legal_operations(
            current_model, tabu_list, max_indegree
        ):
            if score_delta > best_score_delta:
                best_operation = operation
                best_score_delta = score_delta

        if best_operation is None or best_score_delta < epsilon:
            break
        elif best_operation[0] == "+":
            current_model.add_edge(*best_operation[1])
            # Newest entry goes first; the slice keeps `tabu_length` items.
            tabu_list = ([("-", best_operation[1])] + tabu_list)[:tabu_length]
        elif best_operation[0] == "-":
            current_model.remove_edge(*best_operation[1])
            tabu_list = ([("+", best_operation[1])] + tabu_list)[:tabu_length]
        elif best_operation[0] == "flip":
            X, Y = best_operation[1]
            current_model.remove_edge(X, Y)
            current_model.add_edge(Y, X)
            tabu_list = ([best_operation] + tabu_list)[:tabu_length]

    return current_model
class TestDAGCreation(unittest.TestCase):
    """Tests for building a `DAG` and for its node/edge mutators."""

    # Edge fixtures shared by several tests.
    _CHAIN_EDGES = (("a", "b"), ("b", "c"))
    _TREE_EDGES = (
        ("A", "B"),
        ("B", "C"),
        ("B", "D"),
        ("D", "E"),
        ("D", "F"),
        ("A", "G"),
    )

    def setUp(self):
        self.graph = DAG()

    def tearDown(self):
        del self.graph

    # ----------------------------------------------------------- helpers

    def _edge_attrs(self, u, v):
        """Return the attribute mapping of edge (u, v).

        networkx 1.x exposes it through `.edge`, other versions via `.adj`.
        """
        if nx.__version__.startswith("1"):
            return self.graph.edge[u][v]
        return self.graph.adj[u][v]

    def _assert_parents(self, expected):
        """Check `predecessors(node)` for every (node, parents) pair."""
        for node, parents in expected:
            self.assertListEqual(list(self.graph.predecessors(node)), parents)

    # ------------------------------------------------------ construction

    def test_class_init_without_data(self):
        self.assertIsInstance(self.graph, DAG)

    def test_class_init_with_data_string(self):
        self.graph = DAG(list(self._CHAIN_EDGES))
        node_names = sorted(self.graph.nodes())
        self.assertListEqual(node_names, ["a", "b", "c"])
        edge_pairs = hf.recursive_sorted(self.graph.edges())
        self.assertListEqual(edge_pairs, [["a", "b"], ["b", "c"]])

    def test_init_with_cycle(self):
        cyclic_edge_lists = (
            [("a", "a")],
            [("a", "b"), ("b", "a")],
            [("a", "b"), ("b", "c"), ("c", "a")],
        )
        for edges in cyclic_edge_lists:
            with self.assertRaises(ValueError):
                DAG(edges)

    # ------------------------------------------------------------- nodes

    def test_add_node_string(self):
        self.graph.add_node("a")
        self.assertListEqual(list(self.graph.nodes()), ["a"])

    def test_add_node_nonstring(self):
        # Smoke test: adding a non-string node must not raise.
        self.graph.add_node(1)

    def test_add_nodes_from_string(self):
        names = ["a", "b", "c", "d"]
        self.graph.add_nodes_from(names)
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c", "d"])

    def test_add_nodes_from_non_string(self):
        # Smoke test: adding non-string nodes must not raise.
        self.graph.add_nodes_from([1, 2, 3, 4])

    def test_add_node_weight(self):
        self.graph.add_node("weighted_a", 0.3)
        self.assertEqual(self.graph.nodes["weighted_a"]["weight"], 0.3)

    def test_add_nodes_from_weight(self):
        self.graph.add_nodes_from(["weighted_b", "weighted_c"], [0.5, 0.6])
        for node, weight in (("weighted_b", 0.5), ("weighted_c", 0.6)):
            self.assertEqual(self.graph.nodes[node]["weight"], weight)

        # Nodes added without weights carry a weight of None.
        self.graph.add_nodes_from(["e", "f"])
        for node in ("e", "f"):
            self.assertEqual(self.graph.nodes[node]["weight"], None)

    # ------------------------------------------------------------- edges

    def test_add_edge_string(self):
        self.graph.add_edge("d", "e")
        self.assertListEqual(sorted(self.graph.nodes()), ["d", "e"])
        self.assertListEqual(list(self.graph.edges()), [("d", "e")])

        self.graph.add_nodes_from(["a", "b", "c"])
        self.graph.add_edge("a", "b")
        edge_pairs = hf.recursive_sorted(self.graph.edges())
        self.assertListEqual(edge_pairs, [["a", "b"], ["d", "e"]])

    def test_add_edge_nonstring(self):
        # Smoke test: adding an edge between non-string nodes must not raise.
        self.graph.add_edge(1, 2)

    def test_add_edges_from_string(self):
        self.graph.add_edges_from(list(self._CHAIN_EDGES))
        self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c"])
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["b", "c"]]
        )

        self.graph.add_nodes_from(["d", "e", "f"])
        self.graph.add_edges_from([("d", "e"), ("e", "f")])
        self.assertListEqual(
            sorted(self.graph.nodes()), ["a", "b", "c", "d", "e", "f"]
        )
        expected_edges = hf.recursive_sorted(
            [("a", "b"), ("b", "c"), ("d", "e"), ("e", "f")]
        )
        self.assertListEqual(
            hf.recursive_sorted(self.graph.edges()), expected_edges
        )

    def test_add_edges_from_nonstring(self):
        # Smoke test: adding edges between non-string nodes must not raise.
        self.graph.add_edges_from([(1, 2), (2, 3)])

    def test_add_edge_weight(self):
        self.graph.add_edge("a", "b", weight=0.3)
        self.assertEqual(self._edge_attrs("a", "b")["weight"], 0.3)

    def test_add_edges_from_weight(self):
        self.graph.add_edges_from([("b", "c"), ("c", "d")], weights=[0.5, 0.6])
        self.assertEqual(self._edge_attrs("b", "c")["weight"], 0.5)
        self.assertEqual(self._edge_attrs("c", "d")["weight"], 0.6)

        # Edges added without weights carry a weight of None.
        self.graph.add_edges_from([("e", "f")])
        self.assertEqual(self._edge_attrs("e", "f")["weight"], None)

    # ------------------------------------------------- parents / topology

    def test_update_node_parents_bm_constructor(self):
        self.graph = DAG(list(self._CHAIN_EDGES))
        self._assert_parents([("a", []), ("b", ["a"]), ("c", ["b"])])

    def test_update_node_parents(self):
        self.graph.add_nodes_from(["a", "b", "c"])
        self.graph.add_edges_from(list(self._CHAIN_EDGES))
        self._assert_parents([("a", []), ("b", ["a"]), ("c", ["b"])])

    def test_get_leaves(self):
        self.graph.add_edges_from(list(self._TREE_EDGES))
        leaves = sorted(self.graph.get_leaves())
        self.assertEqual(leaves, sorted(["C", "G", "E", "F"]))

    def test_get_roots(self):
        self.graph.add_edges_from(list(self._TREE_EDGES))
        self.assertEqual(["A"], self.graph.get_roots())

        self.graph.add_edge("H", "G")
        self.assertEqual(sorted(["A", "H"]), sorted(self.graph.get_roots()))
def pdag_to_dag(pdag):
    """Completes a PDAG to a DAG, without adding v-structures, if such a
    completion exists. If no faithful extension is possible, some fully
    oriented DAG that corresponds to the PDAG is returned and a warning is
    generated. This is a static method.

    The input is not modified; the algorithm works on a copy from which
    nodes are removed one at a time.

    Parameters
    ----------
    pdag: DAG
        A directed acyclic graph pattern, consisting in (acyclic) directed
        edges as well as "undirected" edges, represented as both-way edges
        between nodes.

    Returns
    -------
    dag: DAG
        A faithful orientation of pdag, if one exists. Otherwise any
        fully orientated DAG/BayesianModel with the structure of pdag.

    References
    ----------
    [1] Chickering, Learning Equivalence Classes of Bayesian-Network
        Structures, 2002; See page 454 (last paragraph) for the algorithm
        pdag_to_dag
        http://www.jmlr.org/papers/volume2/chickering02a/chickering02a.pdf
    [2] Dor & Tarsi, A simple algorithm to construct a consistent extension
        of a partially oriented graph, 1992,
        http://ftp.cs.ucla.edu/pub/stat_ser/r185-dor-tarsi.pdf

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from pgmpy.base import DAG
    >>> from pgmpy.estimators import ConstraintBasedEstimator
    >>> data = pd.DataFrame(np.random.randint(0, 4, size=(5000, 3)), columns=list('ABD'))
    >>> data['C'] = data['A'] - data['B']
    >>> data['D'] += data['A']
    >>> c = ConstraintBasedEstimator(data)
    >>> pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
    >>> pdag.edges()
    [('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')]
    >>> c.pdag_to_dag(pdag).edges()
    [('B', 'C'), ('A', 'D'), ('A', 'C')]

    >>> # pdag_to_dag is static:
    >>> pdag1 = DAG([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag1).edges()
    [('D', 'C'), ('C', 'B'), ('A', 'B'), ('A', 'D')]

    >>> # example of a pdag with no faithful extension:
    >>> pdag2 = DAG([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])
    >>> ConstraintBasedEstimator.pdag_to_dag(pdag2).edges()
    UserWarning: PDAG has no faithful extension (= no oriented DAG with the same v-structures as PDAG). Remaining undirected PDAG edges oriented arbitrarily.
    [('B', 'C'), ('A', 'B'), ('A', 'C')]
    """
    remaining = pdag.copy()
    dag = DAG()
    dag.add_nodes_from(remaining.nodes())

    # An edge without its reverse counterpart is already directed: keep it.
    for tail, head in remaining.edges():
        if remaining.has_edge(head, tail):
            continue
        dag.add_edge(tail, head)

    while remaining.number_of_nodes() > 0:
        # Look for a node that (1) has no directed outgoing edge and
        # (2) either has no undirected neighbors, or whose undirected
        # neighbors are adjacent to all of its other parents/neighbors.
        for node in remaining.nodes():
            children = set(remaining.successors(node))
            parents = set(remaining.predecessors(node))

            if children - parents:
                # At least one purely outgoing edge: not a sink candidate.
                continue

            undirected = children & parents
            if undirected and not all(
                remaining.has_edge(nbr, parent)
                for parent in parents
                for nbr in undirected
                if nbr != parent
            ):
                continue

            # Orient every remaining edge at this node towards it, then
            # drop the node from the working copy.
            for parent in remaining.predecessors(node):
                dag.add_edge(parent, node)
            remaining.remove_node(node)
            break
        else:
            # No suitable node left: give up on faithfulness and orient
            # whatever is still undirected in an arbitrary direction.
            warn(
                "PDAG has no faithful extension (= no oriented DAG with the "
                "same v-structures as PDAG). Remaining undirected PDAG edges "
                "oriented arbitrarily."
            )
            for tail, head in remaining.edges():
                if dag.has_edge(head, tail):
                    continue
                try:
                    dag.add_edge(tail, head)
                except ValueError:
                    # Best effort: skip edges the DAG refuses to accept.
                    pass
            break

    return dag