def estimate(self):
        """
        Estimates the `DAG` structure that fits best to the given data set,
        according to the scoring method supplied in the constructor.
        Exhaustively searches through all models. Only estimates network structure, no parametrization.

        Returns
        -------
        model: `DAG` instance
            A `DAG` with maximal score.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import ExhaustiveSearch
        >>> # create random data sample with 3 variables, where B and C are identical:
        >>> data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 2)), columns=list('AB'))
        >>> data['C'] = data['B']
        >>> est = ExhaustiveSearch(data)
        >>> best_model = est.estimate()
        >>> best_model
        <pgmpy.base.DAG.DAG object at 0x7f695c535470>
        >>> best_model.edges()
        [('B', 'C')]
        """

        best_dag = max(self.all_dags(), key=self.scoring_method.score)

        best_model = DAG()
        best_model.add_nodes_from(sorted(best_dag.nodes()))
        best_model.add_edges_from(sorted(best_dag.edges()))
        return best_model
예제 #2
0
    def estimate(self,
                 tabu_length=100,
                 max_indegree=2,
                 black_list=None,
                 epsilon=1e-4,
                 max_iter=1e6,
                 show_progress=True):

        # We will be using K2Score for this model
        score = K2Score(data=self.data)
        # Model gets the score for a node and its parents
        # This is used on every iteration for all possible changes
        # This is greddy and picks the best available option
        score_fn = score.local_score
        # Initialize a Starting DAG
        # PGMPY made a DAG class that adds some functionality to nx.DiGrpah
        start_dag = DAG()
        start_dag.add_nodes_from(self.variables)
        # Set the edges we do not want to have in the graph
        if black_list is None:
            black_list = set()
        else:
            black_list = set(black_list)

        # Just change Maxindegree to a certain number when doing the model

        # I think this is to keep track of the changes we already made to the model
        tabu_list = deque(maxlen=tabu_length)
        # Initialize a current model
        current_model = start_dag
        if show_progress:
            iteration = trange(int(max_iter))
        else:
            iteration = range(int(max_iter))
        for _ in iteration:
            # Get the best operations based on K2 score with self._legal_operations
            best_operation, best_score_change = max(self._legal_operations(
                model=current_model,
                score=score_fn,
                tabu_list=tabu_list,
                max_indegree=max_indegree,
                black_list=black_list,
            ),
                                                    key=lambda t: t[1])

            if best_score_change < epsilon:
                break
            elif best_operation[0] == '+':
                current_model.add_edge(*best_operation[1])
                tabu_list.append(("-", best_operation[1]))
            elif best_operation[0] == '-':
                current_model.remove_edge(*best_operation[1])
                tabu_list.append(("+", best_operation[1]))
            elif best_operation[0] == 'flip':
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list.append(best_operation)

        return current_model
예제 #3
0
def random_dag(number_of_nodes: int = 5,
               edge_density: float = 0.4,
               max_in_degree: int = 4) -> DAG:
    """Create a connected, random directed acyclic graph (DAG), with the given number of nodes,
    the given edge density, and with no node exceeding having too high in degree"""
    node_names = [f"X{i}" for i in range(number_of_nodes)]
    dag = DAG()

    # First make sure the dag is connected
    visited = list()
    unvisited = list(node_names)
    node = random.choice(unvisited)
    unvisited.remove(node)
    visited.append(node)
    dag.add_node(node)

    while unvisited:
        node = random.choice(unvisited)
        neighbor = random.choice(visited)
        if node_names.index(node) < node_names.index(
                neighbor) and dag.in_degree(neighbor) < max_in_degree:
            dag.add_edge(node, neighbor)
        elif node_names.index(neighbor) < node_names.index(node):
            dag.add_edge(neighbor, node)
        else:
            continue
        unvisited.remove(node)
        visited.append(node)

    # Then add edges until desired density is reached
    maximum_number_of_edges = number_of_nodes * (number_of_nodes - 1) / 2
    while dag.number_of_edges() < int(edge_density * maximum_number_of_edges):
        add_random_edge(dag, node_names)

    return dag
예제 #4
0
    def pdag2dag(self, edge_dict):
        pdag_edges = [(pi, n) for n, p in edge_dict.items() for pi in p]
        pdag = DAG(pdag_edges)
        dag_edges = ConstraintBasedEstimator.pdag_to_dag(pdag).edges()
        dag = dict([(n, set()) for n in range(len(edge_dict))])
        for e in dag_edges:
            dag[e[1]].add(e[0])

        return dag
 def test_markov_blanet(self):
     G = DAG([
         ("x", "y"),
         ("z", "y"),
         ("y", "w"),
         ("y", "v"),
         ("u", "w"),
         ("s", "v"),
         ("w", "t"),
         ("w", "m"),
         ("v", "n"),
         ("v", "q"),
     ])
     self.assertEqual(set(G.get_markov_blanket("y")),
                      set(["s", "w", "x", "u", "z", "v"]))
    def estimate(
        self, start=None, tabu_length=0, max_indegree=None, epsilon=1e-4, max_iter=1e6
    ):
        """
        Performs local hill climb search to estimates the `DAG` structure
        that has optimal score, according to the scoring method supplied in the constructor.
        Starts at model `start` and proceeds by step-by-step network modifications
        until a local maximum is reached. Only estimates network structure, no parametrization.

        Parameters
        ----------
        start: DAG instance
            The starting point for the local search. By default a completely disconnected network is used.

        tabu_length: int
            If provided, the last `tabu_length` graph modifications cannot be reversed
            during the search procedure. This serves to enforce a wider exploration
            of the search space. Default value: 100.

        max_indegree: int or None
            If provided and unequal None, the procedure only searches among models
            where all nodes have at most `max_indegree` parents. Defaults to None.

        epsilon: float (default: 1e-4)
            Defines the exit condition. If the improvement in score is less than `epsilon`,
            the learned model is returned.

        max_iter: int (default: 1e6)
            The maximum number of iterations allowed. Returns the learned model when the
            number of iterations is greater than `max_iter`.

        Returns
        -------
        model: `DAG` instance
            A `DAG` at a (local) score maximum.

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.estimators import HillClimbSearch, BicScore
        >>> # create data sample with 9 random variables:
        ... data = pd.DataFrame(np.random.randint(0, 5, size=(5000, 9)), columns=list('ABCDEFGHI'))
        >>> # add 10th dependent variable
        ... data['J'] = data['A'] * data['B']
        >>> est = HillClimbSearch(data, scoring_method=BicScore(data))
        >>> best_model = est.estimate()
        >>> sorted(best_model.nodes())
        ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
        >>> best_model.edges()
        [('B', 'J'), ('A', 'J')]
        >>> # search a model with restriction on the number of parents:
        >>> est.estimate(max_indegree=1).edges()
        [('J', 'A'), ('B', 'J')]
        """
        nodes = self.state_names.keys()
        if start is None:
            start = DAG()
            start.add_nodes_from(nodes)
        elif not isinstance(start, DAG) or not set(start.nodes()) == set(nodes):
            raise ValueError(
                "'start' should be a DAG with the same variables as the data set, or 'None'."
            )

        tabu_list = []
        current_model = start

        iter_no = 0
        while iter_no <= max_iter:
            iter_no += 1

            best_score_delta = 0
            best_operation = None

            for operation, score_delta in self._legal_operations(
                current_model, tabu_list, max_indegree
            ):
                if score_delta > best_score_delta:
                    best_operation = operation
                    best_score_delta = score_delta

            if best_operation is None or best_score_delta < epsilon:
                break
            elif best_operation[0] == "+":
                current_model.add_edge(*best_operation[1])
                tabu_list = ([("-", best_operation[1])] + tabu_list)[:tabu_length]
            elif best_operation[0] == "-":
                current_model.remove_edge(*best_operation[1])
                tabu_list = ([("+", best_operation[1])] + tabu_list)[:tabu_length]
            elif best_operation[0] == "flip":
                X, Y = best_operation[1]
                current_model.remove_edge(X, Y)
                current_model.add_edge(Y, X)
                tabu_list = ([best_operation] + tabu_list)[:tabu_length]

        return current_model
 def setUp(self):
     self.G = BayesianModel([("d", "g"), ("i", "g"), ("g", "l"),
                             ("i", "s")])
     self.G2 = DAG([("d", "g"), ("i", "g"), ("g", "l"), ("i", "s")])
예제 #8
0
 def setUp(self):
     self.graph = DAG()
     self.graph.add_edges_from([("X", "A"), ("A", "Y"), ("A", "B")])
예제 #9
0
 def test_class_init_with_data_string(self):
     self.graph = DAG([("a", "b"), ("b", "c")])
     self.assertListEqual(sorted(self.graph.nodes()), ["a", "b", "c"])
     self.assertListEqual(
         hf.recursive_sorted(self.graph.edges()), [["a", "b"], ["b", "c"]]
     )
예제 #10
0
 def setUp(self):
     self.graph = DAG()
     self.graph.add_edges_from([("diff", "grade"), ("intel", "grade")])
예제 #11
0
 def test_update_node_parents_bm_constructor(self):
     self.graph = DAG([("a", "b"), ("b", "c")])
     self.assertListEqual(list(self.graph.predecessors("a")), [])
     self.assertListEqual(list(self.graph.predecessors("b")), ["a"])
     self.assertListEqual(list(self.graph.predecessors("c")), ["b"])
예제 #12
0
 def setUp(self):
     self.graph = DAG()
예제 #13
0
    def pdag_to_dag(pdag):
        """Completes a PDAG to a DAG, without adding v-structures, if such a
        completion exists. If no faithful extension is possible, some fully
        oriented DAG that corresponds to the PDAG is returned and a warning is
        generated. This is a static method.

        Parameters
        ----------
        pdag: DAG
            A directed acyclic graph pattern, consisting in (acyclic) directed edges
            as well as "undirected" edges, represented as both-way edges between
            nodes.

        Returns
        -------
        dag: DAG
            A faithful orientation of pdag, if one exists. Otherwise any
            fully orientated DAG/BayesianModel with the structure of pdag.

        References
        ----------
        [1] Chickering, Learning Equivalence Classes of Bayesian-Network Structures,
            2002; See page 454 (last paragraph) for the algorithm pdag_to_dag
            http://www.jmlr.org/papers/volume2/chickering02a/chickering02a.pdf
        [2] Dor & Tarsi, A simple algorithm to construct a consistent extension
            of a partially oriented graph, 1992,
            http://ftp.cs.ucla.edu/pub/stat_ser/r185-dor-tarsi.pdf

        Examples
        --------
        >>> import pandas as pd
        >>> import numpy as np
        >>> from pgmpy.base import DAG
        >>> from pgmpy.estimators import ConstraintBasedEstimator
        >>> data = pd.DataFrame(np.random.randint(0, 4, size=(5000, 3)), columns=list('ABD'))
        >>> data['C'] = data['A'] - data['B']
        >>> data['D'] += data['A']
        >>> c = ConstraintBasedEstimator(data)
        >>> pdag = c.skeleton_to_pdag(*c.estimate_skeleton())
        >>> pdag.edges()
        [('B', 'C'), ('D', 'A'), ('A', 'D'), ('A', 'C')]
        >>> c.pdag_to_dag(pdag).edges()
        [('B', 'C'), ('A', 'D'), ('A', 'C')]

        >>> # pdag_to_dag is static:
        ... pdag1 = DAG([('A', 'B'), ('C', 'B'), ('C', 'D'), ('D', 'C'), ('D', 'A'), ('A', 'D')])
        >>> ConstraintBasedEstimator.pdag_to_dag(pdag1).edges()
        [('D', 'C'), ('C', 'B'), ('A', 'B'), ('A', 'D')]

        >>> # example of a pdag with no faithful extension:
        ... pdag2 = DAG([('A', 'B'), ('A', 'C'), ('B', 'C'), ('C', 'B')])
        >>> ConstraintBasedEstimator.pdag_to_dag(pdag2).edges()
        UserWarning: PDAG has no faithful extension (= no oriented DAG with the same v-structures as PDAG).
        Remaining undirected PDAG edges oriented arbitrarily.
        [('B', 'C'), ('A', 'B'), ('A', 'C')]
        """

        pdag = pdag.copy()
        dag = DAG()
        dag.add_nodes_from(pdag.nodes())

        # add already directed edges of pdag to dag
        for X, Y in pdag.edges():
            if not pdag.has_edge(Y, X):
                dag.add_edge(X, Y)

        while pdag.number_of_nodes() > 0:
            # find node with (1) no directed outgoing edges and
            #                (2) the set of undirected neighbors is either empty or
            #                    undirected neighbors + parents of X are a clique
            found = False
            for X in pdag.nodes():
                directed_outgoing_edges = set(pdag.successors(X)) - set(
                    pdag.predecessors(X))
                undirected_neighbors = set(pdag.successors(X)) & set(
                    pdag.predecessors(X))
                neighbors_are_clique = all((pdag.has_edge(Y, Z)
                                            for Z in pdag.predecessors(X)
                                            for Y in undirected_neighbors
                                            if not Y == Z))

                if not directed_outgoing_edges and (not undirected_neighbors
                                                    or neighbors_are_clique):
                    found = True
                    # add all edges of X as outgoing edges to dag
                    for Y in pdag.predecessors(X):
                        dag.add_edge(Y, X)

                    pdag.remove_node(X)
                    break

            if not found:
                warn(
                    "PDAG has no faithful extension (= no oriented DAG with the "
                    +
                    "same v-structures as PDAG). Remaining undirected PDAG edges "
                    + "oriented arbitrarily.")
                for X, Y in pdag.edges():
                    if not dag.has_edge(Y, X):
                        try:
                            dag.add_edge(X, Y)
                        except ValueError:
                            pass
                break

        return dag