def generator(num_nodes, seed, weight=None):
    np.random.seed(seed)

    sm = StructureModel()
    nodes = list(
        "".join(x) for x in product(string.ascii_lowercase, string.ascii_lowercase)
    )[:num_nodes]
    np.random.shuffle(nodes)
    sm.add_nodes_from(nodes)

    # one edge:
    sm.add_weighted_edges_from([("aa", "ab", weight)])
    return sm
def test_baseline_probability_probit(self, graph, distribution):
    """Test whether the probability is centered around 50% if no intercept is given"""
    graph = StructureModel()
    graph.add_nodes_from(["A"])
    data = generate_binary_data(
        graph,
        1000000,
        distribution=distribution,
        noise_scale=0.1,
        seed=10,
        intercept=False,
    )
    assert 0.45 < data[:, 0].mean() < 0.55
def test_intercept_probability_logit(self, graph, distribution):
    """Test whether the probability is not centered around 50% when using an intercept"""
    graph = StructureModel()
    graph.add_nodes_from(["A"])
    data = generate_binary_data(
        graph,
        1000000,
        distribution=distribution,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )
    mean_prob = data[:, 0].mean()
    assert not np.isclose(mean_prob, 0.5, atol=0.05)
def test_intercept_probability(self, graph, distribution, n_categories):
    """Test whether the class probabilities deviate from 1 / n_categories when using an intercept"""
    graph = StructureModel()
    graph.add_nodes_from(["A"])
    data = generate_categorical_dataframe(
        graph,
        1000000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )
    assert not np.allclose(data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0)
def test_baseline_probability(self, graph, distribution, n_categories):
    """Test whether the class probabilities are centered around 1 / n_categories if no intercept is given"""
    graph = StructureModel()
    graph.add_nodes_from(["A"])
    data = generate_categorical_dataframe(
        graph,
        10000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=1.0,
        seed=10,
        intercept=False,
    )
    # without an intercept, the class probabilities should be fairly uniform
    assert np.allclose(data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0)
def test_incorrect_weight_dist(self):
    sm = StructureModel()
    nodes = list(str(x) for x in range(6))
    np.random.shuffle(nodes)
    sm.add_nodes_from(nodes)

    sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

    with pytest.raises(ValueError, match="Unknown weight distribution"):
        _ = sem_generator(
            graph=sm,
            schema=None,
            default_type="continuous",
            distributions={"weight": "unknown"},
            noise_std=2.0,
            n_samples=1000,
            intercept=False,
            seed=10,
        )
def from_pandas_lasso(
    X: pd.DataFrame,
    beta: float,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure with lasso regularisation
    describing conditional dependencies between variables in data presented as a pandas dataframe.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        beta: Constant that multiplies the lasso term.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """
    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if not non_numeric_cols.empty:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".format(
                non_numeric_cols=non_numeric_cols
            )
        )

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy_lasso(
        data.values,
        beta,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
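# Illustrative usage sketch (not part of the library source): learn a sparse
# structure from a small synthetic DataFrame. The column names, sample size,
# and the beta / w_threshold values below are arbitrary example choices.
def _example_from_pandas_lasso_usage():
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(42)
    x = rng.normal(size=500)
    demo_df = pd.DataFrame(
        {
            "x": x,
            "y": 1.5 * x + rng.normal(scale=0.5, size=500),  # y depends on x
            "z": rng.normal(size=500),  # independent noise column
        }
    )
    # a larger beta shrinks more edge weights towards zero
    sm = from_pandas_lasso(demo_df, beta=0.1, w_threshold=0.1)
    return list(sm.edges(data="weight"))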
def from_pandas(
    X: pd.DataFrame,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies
    between variables in data presented as a pandas dataframe.

    The optimisation is to minimise a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how
    acyclic the graph is (less == more acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges (from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """
    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".format(
                non_numeric_cols=non_numeric_cols
            )
        )

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(
        data.values,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(
        [(idx_col[u], idx_col[v], w) for u, v, w in g.edges.data("weight")],
        origin="learned",
    )

    return sm
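# Illustrative usage sketch (not part of the library source): the unregularised
# variant follows the same call pattern, and the search can be constrained with
# tabu lists. Column names and thresholds here are arbitrary example choices.
def _example_from_pandas_usage():
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(0)
    a = rng.normal(size=500)
    demo_df = pd.DataFrame({"a": a, "b": 2.0 * a + rng.normal(size=500)})
    # forbid any edge pointing into "a", so "a" can only act as a parent
    sm = from_pandas(demo_df, w_threshold=0.2, tabu_child_nodes=["a"])
    return list(sm.edges(data="weight"))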
def from_pandas(
    X: pd.DataFrame,
    dist_type_schema: Dict[Union[str, int], str] = None,
    lasso_beta: float = 0.0,
    ridge_beta: float = 0.0,
    use_bias: bool = False,
    hidden_layer_units: Iterable[int] = None,
    max_iter: int = 100,
    w_threshold: float = None,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
    **kwargs
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies
    between variables in data presented as a pandas dataframe.

    The optimisation is to minimise a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how
    acyclic the graph is (less == more acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented.
        dist_type_schema: The dist type schema corresponding to the passed in data X.
            It maps the pandas column name in X to the string alias of a dist type.
            A list of alias names can be found in ``dist_type/__init__.py``.
            If None, assumes that all data in X is continuous.
        lasso_beta: Constant that multiplies the lasso term (l1 regularisation).
            NOTE when using nonlinearities, the l1 loss only applies to the dag_layer.
        use_bias: Whether to fit a bias parameter in the NOTEARS algorithm.
        ridge_beta: Constant that multiplies the ridge term (l2 regularisation).
            When using nonlinear layers, use of this parameter is recommended.
        hidden_layer_units: An iterable whose length determines the number of layers used,
            and whose values determine the number of nodes used for each layer in order.
        w_threshold: fixed threshold for absolute edge weights.
        max_iter: max number of dual ascent steps during optimisation.
        tabu_edges: list of edges (from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.
        **kwargs: additional arguments for NOTEARS MLP model

    Returns:
        StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X does not contain data.
    """
    data = deepcopy(X)

    # if dist_type_schema is not None, convert dist_type_schema from cols to idx
    dist_type_schema = (
        dist_type_schema
        if dist_type_schema is None
        else {X.columns.get_loc(col): alias for col, alias in dist_type_schema.items()}
    )

    non_numeric_cols = data.select_dtypes(exclude="number").columns

    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".format(
                non_numeric_cols=non_numeric_cols
            )
        )

    col_idx = {c: i for i, c in enumerate(data.columns)}
    idx_col = {i: c for c, i in col_idx.items()}

    if tabu_edges:
        tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]

    g = from_numpy(
        X=data.values,
        dist_type_schema=dist_type_schema,
        lasso_beta=lasso_beta,
        ridge_beta=ridge_beta,
        use_bias=use_bias,
        hidden_layer_units=hidden_layer_units,
        w_threshold=w_threshold,
        max_iter=max_iter,
        tabu_edges=tabu_edges,
        tabu_parent_nodes=tabu_parent_nodes,
        tabu_child_nodes=tabu_child_nodes,
        **kwargs
    )

    # set comprehension to ensure only unique dist types are extracted
    # NOTE: this prevents double-renaming caused by the same dist type used on expanded columns
    unique_dist_types = {node[1]["dist_type"] for node in g.nodes(data=True)}

    # use the dist types to update the idx_col mapping
    idx_col_expanded = deepcopy(idx_col)
    for dist_type in unique_dist_types:
        idx_col_expanded = dist_type.update_idx_col(idx_col_expanded)

    sm = StructureModel()

    # add expanded set of nodes
    sm.add_nodes_from(list(idx_col_expanded.values()))

    # recover the edge weights from g
    for u, v, edge_dict in g.edges.data(True):
        sm.add_edge(
            idx_col_expanded[u],
            idx_col_expanded[v],
            origin="learned",
            weight=edge_dict["weight"],
            mean_effect=edge_dict["mean_effect"],
        )

    # retrieve all graph attrs
    for key, val in g.graph.items():
        sm.graph[key] = val

    # recover the node biases from g
    for node in g.nodes(data=True):
        node_name = idx_col_expanded[node[0]]
        sm.nodes[node_name]["bias"] = node[1]["bias"]

    # recover and preserve the node dist_types
    for node_data in g.nodes(data=True):
        node_name = idx_col_expanded[node_data[0]]
        sm.nodes[node_name]["dist_type"] = node_data[1]["dist_type"]

    # recover the collapsed model from g
    sm_collapsed = StructureModel()
    sm_collapsed.add_nodes_from(list(idx_col.values()))
    for u, v, edge_dict in g.graph["graph_collapsed"].edges.data(True):
        sm_collapsed.add_edge(
            idx_col[u],
            idx_col[v],
            origin="learned",
            weight=edge_dict["weight"],
        )
    sm.graph["graph_collapsed"] = sm_collapsed

    return sm
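# Illustrative usage sketch (not part of the library source): this variant accepts
# a per-column dist_type_schema and optional hidden layers for nonlinear relationships.
# The schema aliases used here ("cont", "bin") are assumptions based on commonly
# registered aliases; check ``dist_type/__init__.py`` for the aliases available in
# your version.
def _example_from_pandas_pytorch_usage():
    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(7)
    x = rng.normal(size=500)
    demo_df = pd.DataFrame(
        {
            "x": x,
            "flag": (x + rng.normal(scale=0.5, size=500) > 0).astype(int),
        }
    )
    sm = from_pandas(
        demo_df,
        dist_type_schema={"x": "cont", "flag": "bin"},  # assumed alias names
        hidden_layer_units=[8],  # a single nonlinear hidden layer
        ridge_beta=0.01,
        w_threshold=0.1,
    )
    # the collapsed graph maps expanded columns back to the original ones
    return list(sm.graph["graph_collapsed"].edges(data="weight"))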
def generate_structure_dynamic(  # pylint: disable=too-many-arguments
    num_nodes: int,
    p: int,
    degree_intra: float,
    degree_inter: float,
    graph_type_intra: str = "erdos-renyi",
    graph_type_inter: str = "erdos-renyi",
    w_min_intra: float = 0.5,
    w_max_intra: float = 0.5,
    w_min_inter: float = 0.5,
    w_max_inter: float = 0.5,
    w_decay: float = 1.0,
) -> StructureModel:
    """
    Generates a dynamic DAG at random.

    Args:
        num_nodes: Number of nodes
        p: maximum lag to be considered in the structure
        degree_intra: expected degree on nodes from the current state
        degree_inter: expected degree on nodes from the lagged nodes
        graph_type_intra:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
            - full: constructs a fully-connected graph - degree has no effect
        graph_type_inter:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - full: connect all past nodes to all present nodes
        w_min_intra: minimum weight for intra-slice nodes
        w_max_intra: maximum weight for intra-slice nodes
        w_min_inter: minimum weight for inter-slice nodes
        w_max_inter: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay

    Raises:
        ValueError: if graph type unknown or `num_nodes < 2`

    Returns:
        StructureModel containing all simulated nodes and edges (intra- and inter-slice)
    """
    sm_intra = generate_structure(
        num_nodes=num_nodes,
        degree=degree_intra,
        graph_type=graph_type_intra,
        w_min=w_min_intra,
        w_max=w_max_intra,
    )
    sm_inter = _generate_inter_structure(
        num_nodes=num_nodes,
        p=p,
        degree=degree_inter,
        graph_type=graph_type_inter,
        w_min=w_min_inter,
        w_max=w_max_inter,
        w_decay=w_decay,
    )
    res = StructureModel()
    res.add_nodes_from(sm_inter.nodes)
    res.add_nodes_from([f"{u}_lag0" for u in sm_intra.nodes])
    res.add_weighted_edges_from(sm_inter.edges.data("weight"))
    res.add_weighted_edges_from(
        [(f"{u}_lag0", f"{v}_lag0", w) for u, v, w in sm_intra.edges.data("weight")]
    )
    return res
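# Illustrative usage sketch (not part of the library source): generate a small
# dynamic DAG with 5 nodes and 2 lags, then separate out the current-slice nodes
# (suffixed "_lag0" above). The degree and weight values are arbitrary choices.
def _example_generate_structure_dynamic_usage():
    g = generate_structure_dynamic(
        num_nodes=5,
        p=2,
        degree_intra=2,
        degree_inter=1,
        w_min_intra=0.1,
        w_max_intra=0.9,
        w_min_inter=0.1,
        w_max_inter=0.9,
        w_decay=1.5,  # inter-slice weights shrink for more distant lags
    )
    lag0_nodes = [n for n in g.nodes if n.endswith("_lag0")]
    return lag0_nodes, list(g.edges(data="weight"))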
def test_mixed_type_independence(
    self, seed, n_categories, weight_distribution, intercept_distribution
):
    """
    Test whether the relation is accurate, implicitly tests sequence of nodes.
    """
    np.random.seed(seed)

    sm = StructureModel()
    nodes = list(str(x) for x in range(6))
    np.random.shuffle(nodes)
    sm.add_nodes_from(nodes)
    # binary -> categorical
    sm.add_weighted_edges_from([("0", "1", 10)])
    # binary -> continuous
    sm.add_weighted_edges_from([("2", "4", None)])
    # binary -> count
    sm.add_weighted_edges_from([("2", "6", 100)])

    schema = {
        "0": "binary",
        "1": "categorical:{}".format(n_categories),
        "2": "binary",
        "4": "continuous",
        "5": "categorical:{}".format(n_categories),
        "6": "count",
    }

    df = sem_generator(
        graph=sm,
        schema=schema,
        default_type="continuous",
        distributions={
            "weight": weight_distribution,
            "intercept": intercept_distribution,
            "count": 0.05,
        },
        noise_std=2,
        n_samples=100000,
        intercept=True,
        seed=seed,
    )

    atol = 0.05  # at most 5% difference between joint & factored probabilities

    # 1. dependent links
    # 0 -> 1 (we look at the class with the highest deviation from uniform
    # to avoid small values)
    c, _ = max(
        [
            (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
            for c in range(n_categories)
        ],
        key=operator.itemgetter(1),
    )
    joint_proba, factored_proba = calculate_proba(df, "0", "1_{}".format(c))
    assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)

    # 2 -> 4
    assert not np.isclose(
        df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol
    )

    # binary on count
    assert not np.isclose(
        df.loc[df["2"] == 0, "6"].mean(),
        df.loc[df["2"] == 1, "6"].mean(),
        rtol=0,
        atol=atol,
    )

    tol = 0.15  # relative tolerance of +/- 15%

    # 2. independent links
    # categorical
    c, _ = max(
        [
            (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
            for c in range(n_categories)
        ],
        key=operator.itemgetter(1),
    )
    joint_proba, factored_proba = calculate_proba(df, "0", "5_{}".format(c))
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # binary
    joint_proba, factored_proba = calculate_proba(df, "0", "2")
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # categorical
    c, _ = max(
        [
            (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories))
            for c in range(n_categories)
        ],
        key=operator.itemgetter(1),
    )
    d, _ = max(
        [
            (d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories))
            for d in range(n_categories)
        ],
        key=operator.itemgetter(1),
    )
    joint_proba, factored_proba = calculate_proba(
        df, "1_{}".format(d), "5_{}".format(c)
    )
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # continuous
    # for gaussian distributions, zero correlation is equivalent to independence
    assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol)
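# The test above relies on a calculate_proba helper that is not shown here. A
# minimal sketch of what such a helper plausibly computes, assuming both columns
# are 0/1 indicators: the joint probability P(a=1, b=1) and the factored
# probability P(a=1) * P(b=1), which should roughly agree when the columns are
# independent. This is an assumption about the helper, not its actual source.
def calculate_proba_sketch(df, col_a, col_b):
    joint_proba = (df[col_a] * df[col_b]).mean()  # P(a=1, b=1)
    factored_proba = df[col_a].mean() * df[col_b].mean()  # P(a=1) * P(b=1)
    return joint_proba, factored_proba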