Example #1
    def generator(num_nodes, seed, weight=None):
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list("".join(x) for x in product(
            string.ascii_lowercase, string.ascii_lowercase))[:num_nodes]
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        # one edge:
        sm.add_weighted_edges_from([("aa", "ab", weight)])
        return sm
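The helper above relies on `np`, `string`, `product` (from itertools) and `StructureModel` being imported in its module. A minimal usage sketch under that assumption (the causalnex import path is itself an assumption):

import string
from itertools import product

import numpy as np
from causalnex.structure import StructureModel  # import path is an assumption

# with the imports above, the helper runs as-is: build a 4-node graph
# with a single weighted edge "aa" -> "ab"
sm = generator(num_nodes=4, seed=42, weight=2.0)
print(sm.nodes)                  # e.g. ['ac', 'aa', 'ad', 'ab'] (shuffled order)
print(sm.edges(data="weight"))   # [('aa', 'ab', 2.0)]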
Example #2
    def test_baseline_probability_probit(self, graph, distribution):
        """ Test whether the probability is centered around 50% if no intercept is given"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_binary_data(
            graph,
            1000000,
            distribution=distribution,
            noise_scale=0.1,
            seed=10,
            intercept=False,
        )
        assert 0.45 < data[:, 0].mean() < 0.55
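The 0.45 to 0.55 band follows from the link function: with a single parentless node and intercept=False, the latent index is (assumed to be) zero-mean, and both the probit and logit links map zero to a success probability of exactly one half. A quick sanity check, independent of generate_binary_data:

from scipy.special import expit   # inverse logit link
from scipy.stats import norm      # Gaussian CDF, i.e. inverse probit link

print(norm.cdf(0.0))   # 0.5 under the probit link
print(expit(0.0))      # 0.5 under the logit link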
Example #3
    def test_intercept_probability_logit(self, graph, distribution):
        """ Test whether the probability is not centered around 50% when using an intercept"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_binary_data(
            graph,
            1000000,
            distribution=distribution,
            noise_scale=0.1,
            seed=10,
            intercept=True,
        )
        mean_prob = data[:, 0].mean()
        assert not np.isclose(mean_prob, 0.5, atol=0.05)
Example #4
    def test_intercept_probability(self, graph, distribution, n_categories):
        """ Test whether the class probabilities deviate from the uniform 1/n_categories baseline when using an intercept"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_categorical_dataframe(
            graph,
            1000000,
            distribution=distribution,
            n_categories=n_categories,
            noise_scale=0.1,
            seed=10,
            intercept=True,
        )
        assert not np.allclose(
            data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0)
Example #5
    def test_baseline_probability(self, graph, distribution, n_categories):
        """ Test whether the class probabilities are centered around 1/n_categories if no intercept is given"""
        graph = StructureModel()
        graph.add_nodes_from(["A"])
        data = generate_categorical_dataframe(
            graph,
            10000,
            distribution=distribution,
            n_categories=n_categories,
            noise_scale=1.0,
            seed=10,
            intercept=False,
        )
        # without intercept, the probabilities should be fairly uniform
        assert np.allclose(data.mean(axis=0),
                           1 / n_categories,
                           atol=0.01,
                           rtol=0)
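For the categorical generator the analogous baseline is 1 / n_categories per one-hot column; a short illustration with plain numpy, independent of generate_categorical_dataframe:

import numpy as np

# draw uniformly from k categories, one-hot encode, and check that each
# column mean is close to 1 / k
k = 4
rng = np.random.default_rng(10)
one_hot = np.eye(k)[rng.integers(0, k, size=100_000)]
print(one_hot.mean(axis=0))   # each entry is roughly 0.25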
Example #6
    def test_incorrect_weight_dist(self):
        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)

        sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

        with pytest.raises(ValueError, match="Unknown weight distribution"):
            _ = sem_generator(
                graph=sm,
                schema=None,
                default_type="continuous",
                distributions={"weight": "unknown"},
                noise_std=2.0,
                n_samples=1000,
                intercept=False,
                seed=10,
            )
Example #7
def from_pandas_lasso(
    X: pd.DataFrame,
    beta: float,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies
    between variables in data presented as a pandas dataframe, using lasso regularisation.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        beta: Constant that multiplies the lasso term.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if not non_numeric_cols.empty:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy_lasso(
        data.values,
        beta,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
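A minimal usage sketch for from_pandas_lasso on a two-column dataframe with a known linear relation; the hyperparameter values are illustrative, not recommendations:

import numpy as np
import pandas as pd

rng = np.random.default_rng(7)
x = rng.normal(size=1_000)
df = pd.DataFrame({"x": x, "y": 2.0 * x + 0.5 * rng.normal(size=1_000)})

# small lasso penalty, then threshold away weak edges
sm = from_pandas_lasso(df, beta=0.1, w_threshold=0.3)
print(sm.edges(data="weight"))   # expect an x -> y edge with weight near 2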
Example #8
def from_pandas(
    X: pd.DataFrame,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation is to minimise a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how far the
    graph is from being acyclic (smaller values == closer to acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(
        data.values,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
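In the cited NO TEARS paper the constraint function is h(W) = tr(exp(W ∘ W)) - d, where ∘ is the element-wise product and d is the number of nodes; it is zero exactly when W is the weighted adjacency matrix of a DAG. A small numpy sketch of that function, for illustration rather than the implementation used by from_numpy:

import numpy as np
from scipy.linalg import expm

def h(W: np.ndarray) -> float:
    """Acyclicity score from the NO TEARS paper: 0 iff W encodes a DAG."""
    d = W.shape[0]
    return float(np.trace(expm(W * W)) - d)

W_dag = np.array([[0.0, 1.5], [0.0, 0.0]])   # acyclic: only 0 -> 1
W_cyc = np.array([[0.0, 1.5], [0.7, 0.0]])   # cyclic: 0 <-> 1
print(h(W_dag))   # ~0.0
print(h(W_cyc))   # > 0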
Example #9
def from_pandas(X: pd.DataFrame,
                dist_type_schema: Dict[Union[str, int], str] = None,
                lasso_beta: float = 0.0,
                ridge_beta: float = 0.0,
                use_bias: bool = False,
                hidden_layer_units: Iterable[int] = None,
                max_iter: int = 100,
                w_threshold: float = None,
                tabu_edges: List[Tuple[str, str]] = None,
                tabu_parent_nodes: List[str] = None,
                tabu_child_nodes: List[str] = None,
                **kwargs) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation is to minimise a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how far the
    graph is from being acyclic (smaller values == closer to acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented.

        dist_type_schema: The dist type schema corresponding to the passed in data X.
        It maps the pandas column name in X to the string alias of a dist type.
        A list of alias names can be found in ``dist_type/__init__.py``.
        If None, assumes that all data in X is continuous.

        lasso_beta: Constant that multiplies the lasso term (l1 regularisation).
        NOTE when using nonlinearities, the l1 loss only applies to the dag_layer.

        use_bias: Whether to fit a bias parameter in the NOTEARS algorithm.

        ridge_beta: Constant that multiplies the ridge term (l2 regularisation).
        When using nonlinear layers use of this parameter is recommended.

        hidden_layer_units: An iterable whose length determines the number of hidden layers used,
        and whose values determine the number of nodes used for each layer, in order.

        w_threshold: fixed threshold for absolute edge weights.

        max_iter: max number of dual ascent steps during optimisation.

        tabu_edges: list of edges(from, to) not to be included in the graph.

        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.

        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

        **kwargs: additional arguments for NOTEARS MLP model

    Returns:
         StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """

    data = deepcopy(X)

    # if dist_type_schema is not None, convert dist_type_schema from cols to idx
    dist_type_schema = (dist_type_schema if dist_type_schema is None else {
        X.columns.get_loc(col): alias
        for col, alias in dist_type_schema.items()
    })

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".
            format(non_numeric_cols=non_numeric_cols))

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(X=data.values,
                   dist_type_schema=dist_type_schema,
                   lasso_beta=lasso_beta,
                   ridge_beta=ridge_beta,
                   use_bias=use_bias,
                   hidden_layer_units=hidden_layer_units,
                   w_threshold=w_threshold,
                   max_iter=max_iter,
                   tabu_edges=tabu_edges,
                   tabu_parent_nodes=tabu_parent_nodes,
                   tabu_child_nodes=tabu_child_nodes,
                   **kwargs)

    # set comprehension to ensure only unique dist types are extracted
    # NOTE: this prevents double-renaming caused by the same dist type used on expanded columns
    unique_dist_types = {node[1]["dist_type"] for node in g.nodes(data=True)}
    # use the dist types to update the idx_col mapping
    idx_col_expanded = deepcopy(idx_col)
    for dist_type in unique_dist_types:
        idx_col_expanded = dist_type.update_idx_col(idx_col_expanded)

    sm = StructureModel()
    # add expanded set of nodes
    sm.add_nodes_from(list(idx_col_expanded.values()))

    # recover the edge weights from g
    for u, v, edge_dict in g.edges.data(True):
        sm.add_edge(
            idx_col_expanded[u],
            idx_col_expanded[v],
            origin="learned",
            weight=edge_dict["weight"],
            mean_effect=edge_dict["mean_effect"],
        )

    # retrieve all graphs attrs
    for key, val in g.graph.items():
        sm.graph[key] = val

    # recover the node biases from g
    for node in g.nodes(data=True):
        node_name = idx_col_expanded[node[0]]
        sm.nodes[node_name]["bias"] = node[1]["bias"]

    # recover and preserve the node dist_types
    for node_data in g.nodes(data=True):
        node_name = idx_col_expanded[node_data[0]]
        sm.nodes[node_name]["dist_type"] = node_data[1]["dist_type"]

    # recover the collapsed model from g
    sm_collapsed = StructureModel()
    sm_collapsed.add_nodes_from(list(idx_col.values()))
    for u, v, edge_dict in g.graph["graph_collapsed"].edges.data(True):
        sm_collapsed.add_edge(
            idx_col[u],
            idx_col[v],
            origin="learned",
            weight=edge_dict["weight"],
        )
    sm.graph["graph_collapsed"] = sm_collapsed

    return sm
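A minimal usage sketch for this from_pandas with a mixed-type schema. The alias strings ("cont", "bin") are assumptions; the authoritative list lives in dist_type/__init__.py, as the docstring notes:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=2_000)
y = (x + rng.normal(size=2_000) > 0).astype(float)
df = pd.DataFrame({"x": x, "y": y})

sm = from_pandas(
    df,
    dist_type_schema={"x": "cont", "y": "bin"},   # aliases assumed, see dist_type/__init__.py
    lasso_beta=0.01,
    w_threshold=0.1,
)
print(sm.edges(data="weight"))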
Example #10
def generate_structure_dynamic(  # pylint: disable=too-many-arguments
    num_nodes: int,
    p: int,
    degree_intra: float,
    degree_inter: float,
    graph_type_intra: str = "erdos-renyi",
    graph_type_inter: str = "erdos-renyi",
    w_min_intra: float = 0.5,
    w_max_intra: float = 0.5,
    w_min_inter: float = 0.5,
    w_max_inter: float = 0.5,
    w_decay: float = 1.0,
) -> StructureModel:
    """
    Generates a dynamic DAG at random.

    Args:
        num_nodes: Number of nodes
        p: maximum lag to be considered in the structure
        degree_intra: expected degree on nodes from the current state
        degree_inter: expected degree on nodes from the lagged nodes
        graph_type_intra:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
            - full: constructs a fully-connected graph - degree has no effect
        graph_type_inter:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - full: connect all past nodes to all present nodes
        w_min_intra: minimum weight for intra-slice nodes
        w_max_intra: maximum weight for intra-slice nodes
        w_min_inter: minimum weight for inter-slice nodes
        w_max_inter: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay

    Raises:
        ValueError: if graph type unknown or `num_nodes < 2`

    Returns:
        StructureModel containing all simulated nodes and edges (intra- and inter-slice)
    """
    sm_intra = generate_structure(
        num_nodes=num_nodes,
        degree=degree_intra,
        graph_type=graph_type_intra,
        w_min=w_min_intra,
        w_max=w_max_intra,
    )
    sm_inter = _generate_inter_structure(
        num_nodes=num_nodes,
        p=p,
        degree=degree_inter,
        graph_type=graph_type_inter,
        w_min=w_min_inter,
        w_max=w_max_inter,
        w_decay=w_decay,
    )
    res = StructureModel()
    res.add_nodes_from(sm_inter.nodes)
    res.add_nodes_from([f"{u}_lag0" for u in sm_intra.nodes])
    res.add_weighted_edges_from(sm_inter.edges.data("weight"))
    res.add_weighted_edges_from([(f"{u}_lag0", f"{v}_lag0", w)
                                 for u, v, w in sm_intra.edges.data("weight")])
    return res
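A minimal usage sketch for generate_structure_dynamic. The "_lag0" suffix for current-slice nodes is visible in the code above; the corresponding lagged names (e.g. "0_lag1") are assumed to follow the same convention inside _generate_inter_structure:

# a 5-node dynamic DAG with up to 2 lags
g = generate_structure_dynamic(num_nodes=5, p=2, degree_intra=2, degree_inter=1)
print(sorted(g.nodes))        # names like "0_lag0", "0_lag1", ... (lagged naming assumed)
print(g.number_of_edges())    # intra- plus inter-slice edges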
Example #11
    def test_mixed_type_independence(self, seed, n_categories,
                                     weight_distribution,
                                     intercept_distribution):
        """
        Test whether the relation is accurate, implicitly tests sequence of
        nodes.
        """
        np.random.seed(seed)

        sm = StructureModel()
        nodes = list(str(x) for x in range(6))
        np.random.shuffle(nodes)
        sm.add_nodes_from(nodes)
        # binary -> categorical
        sm.add_weighted_edges_from([("0", "1", 10)])
        # binary -> continuous
        sm.add_weighted_edges_from([("2", "4", None)])
        # binary -> count
        sm.add_weighted_edges_from([("2", "6", 100)])

        schema = {
            "0": "binary",
            "1": "categorical:{}".format(n_categories),
            "2": "binary",
            "4": "continuous",
            "5": "categorical:{}".format(n_categories),
            "6": "count",
        }

        df = sem_generator(
            graph=sm,
            schema=schema,
            default_type="continuous",
            distributions={
                "weight": weight_distribution,
                "intercept": intercept_distribution,
                "count": 0.05,
            },
            noise_std=2,
            n_samples=100000,
            intercept=True,
            seed=seed,
        )

        atol = 0.05  # 5% difference between joint & factored!
        # 1. dependent links
        # 0 -> 1 (we look at the class with the highest deviation from uniform
        # to avoid small values)
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "1_{}".format(c))
        assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)
        # 2 -> 4
        assert not np.isclose(
            df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol)
        # binary on count
        assert not np.isclose(
            df.loc[df["2"] == 0, "6"].mean(),
            df.loc[df["2"] == 1, "6"].mean(),
            rtol=0,
            atol=atol,
        )

        tol = 0.15  # relative tolerance of +/- 15% of the factored probability
        # 2. independent links
        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "0",
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # binary
        joint_proba, factored_proba = calculate_proba(df, "0", "2")
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # categorical
        c, _ = max(
            [(c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
             for c in range(n_categories)],
            key=operator.itemgetter(1),
        )
        d, _ = max(
            [(d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories))
             for d in range(n_categories)],
            key=operator.itemgetter(1),
        )
        joint_proba, factored_proba = calculate_proba(df, "1_{}".format(d),
                                                      "5_{}".format(c))
        assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

        # continuous
        # for gaussian distributions, zero variance is equivalent to independence
        assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol)
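calculate_proba is not shown in this example. A plausible sketch of what such a helper computes for two 0/1 indicator columns, matching how the test compares a joint probability against its factored counterpart:

def calculate_proba(df, col_a, col_b):
    """Return P(a=1, b=1) and P(a=1) * P(b=1) for 0/1 columns (sketch)."""
    p_a = (df[col_a] == 1).mean()
    p_b = (df[col_b] == 1).mean()
    joint_proba = ((df[col_a] == 1) & (df[col_b] == 1)).mean()
    factored_proba = p_a * p_b
    return joint_proba, factored_proba

Under independence the two values are close; under a strong dependence (e.g. the weighted edge "0" -> "1") they differ by more than the chosen tolerance.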