def test_auc_node_with_no_parents(self):
    """AUC can be computed for a node that has no parents in the structure."""
    # Two regimes, each listed with label 0 first and label 1 second:
    #   a, b in {0, 1} -> c == 0 appears once per (a, b), c == 1 ~1000 times
    #   a, b in {2, 3} -> c == 0 ~1000 times per (a, b), c == 1 appears once
    rows = []
    for values, rare_label in ((range(0, 2), 0), (range(2, 4), 1)):
        for label in (0, 1):
            for a in values:
                for b in values:
                    repeats = 1 if label == rare_label else a * 10 + b * 10 + 1000
                    rows.extend([a, b, label] for _ in range(repeats))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)

    # "a" only has outgoing edges (a -> c), i.e. it has no parents.
    _, auc = roc_auc(network, train, "a")
    assert math.isclose(auc, 0.5, abs_tol=0.01)
def test_auc_of_accurate_predictions(self):
    """AUC of accurate predictions should be 1."""
    # Equal class (c) weighting so that a high ROC is expected:
    #   a, b in {0, 1} -> c is almost always 1 (a single c == 0 row per pair)
    #   a, b in {2, 3} -> c is almost always 0 (a single c == 1 row per pair)
    rows = []
    for values, rare_label in ((range(0, 2), 0), (range(2, 4), 1)):
        for label in (0, 1):
            for a in values:
                for b in values:
                    repeats = 1 if label == rare_label else a * 10 + b * 10 + 1000
                    rows.extend([a, b, label] for _ in range(repeats))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)

    # Evaluate on the same data the network was fitted on.
    _, auc = roc_auc(network, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_auc_with_missing_state_in_test(self):
    """AUC is still computed correctly when the test set lacks one of the target states."""
    # Equal class (c) weighting so that a high ROC is expected:
    #   a, b in {0, 1} -> c is almost always 1 (a single c == 0 row per pair)
    #   a, b in {2, 3} -> c is almost always 0 (a single c == 1 row per pair)
    rows = []
    for values, rare_label in ((range(0, 2), 0), (range(2, 4), 1)):
        for label in (0, 1):
            for a in values:
                for b in values:
                    repeats = 1 if label == rare_label else a * 10 + b * 10 + 1000
                    rows.extend([a, b, label] for _ in range(repeats))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # The test set keeps only the c == 1 rows, so state 0 never occurs in it.
    positives_only = train[train["c"] == 1]
    assert len(positives_only["c"].unique()) == 1

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)

    _, auc = roc_auc(network, positives_only, "c")
    assert math.isclose(auc, 1, abs_tol=0.01)
def test_roc_of_accurate_predictions(self):
    """TPR should always be better than FPR for accurate predictions."""
    # Equal class (c) weighting so that a high ROC is expected. Unlike the
    # AUC tests, the minority label appears 10 times per (a, b) pair:
    #   a, b in {0, 1} -> c is mostly 1; a, b in {2, 3} -> c is mostly 0
    rows = []
    for values, rare_label in ((range(0, 2), 0), (range(2, 4), 1)):
        for label in (0, 1):
            for a in values:
                for b in values:
                    repeats = 10 if label == rare_label else a * 10 + b * 10 + 1000
                    rows.extend([a, b, label] for _ in range(repeats))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)

    roc, _ = roc_auc(network, train, "c")

    # Points whose TPR is exactly 0.0 or 1.0 are skipped by the check.
    for fpr, tpr in roc:
        if tpr in [0.0, 1.0]:
            continue
        assert tpr > fpr
def test_roc_of_random_has_unit_gradient(self):
    """The ROC curve for random predictions should be a line from (0,0) to (1,1)."""
    grid = [(a, b) for a in range(3) for b in range(3)]

    # Regardless of a or b, c=1 is always more likely, by varying amounts
    # (to create multiple threshold points in the ROC curve).
    rows = [[a, b, 0] for a, b in grid]
    for a, b in grid:
        rows.extend([a, b, 1] for _ in range(a * 1000 + b * 1000 + 1000))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)
    assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

    # Labels in the test set are coin flips, unrelated to a and b.
    # NOTE(review): the `random` module is not seeded here, so the test data
    # differs between runs - consider seeding if this test proves flaky.
    test = pd.DataFrame(
        [[a, b, random.randint(0, 1)] for a, b in grid for _ in range(1000)],
        columns=["a", "b", "c"],
    )

    roc, _ = roc_auc(network, test, "c")
    assert len(roc) > 3
    for fpr, tpr in roc:
        assert math.isclose(fpr, tpr, abs_tol=0.03)
def test_auc_of_incorrect_close_to_zero(self):
    """The AUC of incorrect predictions should be close to zero."""
    grid = [(a, b) for a in range(3) for b in range(3)]

    # Regardless of a or b, c=1 is always more likely, by varying amounts
    # (to create multiple threshold points in the ROC curve).
    train_rows = [[a, b, 0] for a, b in grid]
    for a, b in grid:
        train_rows.extend([a, b, 1] for _ in range(a * 1000 + b * 1000 + 1000))
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)
    assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

    # In test, c=0 is always more likely (the opposite of train).
    test_rows = [[a, b, 0] for a, b in grid for _ in range(1000)]
    test_rows.extend([a, b, 1] for a, b in grid)
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    _, auc = roc_auc(network, test, "c")
    assert math.isclose(auc, 0, abs_tol=0.001)
def test_roc_of_incorrect_has_fpr_lt_tpr(self):
    """The ROC of incorrect predictions should have FPR greater than TPR.

    NOTE(review): the test name says "fpr_lt_tpr", but the assertion below
    checks ``fpr > tpr`` for every interior point of the curve.
    """
    grid = [(a, b) for a in range(3) for b in range(3)]

    # Regardless of a or b, c=1 is always more likely, by varying amounts
    # (to create multiple threshold points in the ROC curve).
    train_rows = [[a, b, 0] for a, b in grid]
    for a, b in grid:
        train_rows.extend([a, b, 1] for _ in range(a * 1000 + b * 1000 + 1000))
    train = pd.DataFrame(train_rows, columns=["a", "b", "c"])

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)
    assert np.allclose(network.cpds["c"].loc[1].values, 1, atol=0.02)

    # In test, c=0 is always more likely (the opposite of train).
    test_rows = [[a, b, 0] for a, b in grid for _ in range(1000)]
    test_rows.extend([a, b, 1] for a, b in grid)
    test = pd.DataFrame(test_rows, columns=["a", "b", "c"])

    roc, _ = roc_auc(network, test, "c")
    assert len(roc) > 3

    # Points whose TPR is exactly 0.0 or 1.0 are skipped by the check.
    for fpr, tpr in roc:
        if tpr in [0.0, 1.0]:
            continue
        assert fpr > tpr
def test_auc_for_nonnumeric_features(self):
    """AUC of accurate predictions should be 1 even after remapping numbers to strings."""
    # Equal class (c) weighting so that a high ROC is expected:
    #   a, b in {0, 1} -> c is almost always 1 (a single c == 0 row per pair)
    #   a, b in {2, 3} -> c is almost always 0 (a single c == 1 row per pair)
    rows = []
    for values, rare_label in ((range(0, 2), 0), (range(2, 4), 1)):
        for label in (0, 1):
            for a in values:
                for b in values:
                    repeats = 1 if label == rare_label else a * 10 + b * 10 + 1000
                    rows.extend([a, b, label] for _ in range(repeats))
    train = pd.DataFrame(rows, columns=["a", "b", "c"])

    # Replace the numeric target states with string labels.
    label_names = {0: "f", 1: "g"}
    train["c"] = train["c"].map(label_names)

    structure = StructureModel()
    structure.add_weighted_edges_from([("a", "c", 1), ("b", "c", 1)])

    network = BayesianNetwork(structure)
    network.fit_node_states(train)
    network.fit_cpds(train)

    _, auc = roc_auc(network, train, "c")
    assert math.isclose(auc, 1, abs_tol=0.001)
def test_number_of_nodes(self, num_nodes):
    """Length of each row in generated data equals num_nodes."""
    # Chain 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain_edges = [(src, src + 1, 1) for src in range(num_nodes - 1)]
    chain = StructureModel()
    chain.add_weighted_edges_from(chain_edges)

    samples = generate_binary_data(chain, 100, seed=10)

    for row in samples:
        assert len(row) == num_nodes
def test_number_of_columns(self, num_nodes, n_categories):
    """The generated dataframe has num_nodes * n_categories columns."""
    # Chain 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain_edges = [(src, src + 1, 1) for src in range(num_nodes - 1)]
    chain = StructureModel()
    chain.add_weighted_edges_from(chain_edges)

    frame = generate_categorical_dataframe(
        chain, 100, seed=10, n_categories=n_categories
    )

    _, n_columns = frame.shape[0], frame.shape[1]
    assert n_columns == (num_nodes * n_categories)
def generator(num_nodes, seed, weight=None):
    """Build a StructureModel with shuffled two-letter node names and a single edge.

    Args:
        num_nodes: how many names to take from the sequence "aa", "ab", ..., "zz".
        seed: seed for numpy's global random state, which drives the shuffle.
        weight: weight stored on the single edge "aa" -> "ab".

    Returns:
        StructureModel holding the shuffled nodes plus the edge "aa" -> "ab".
    """
    np.random.seed(seed)

    letters = string.ascii_lowercase
    # Same ordering as the cartesian product: the second letter varies fastest.
    names = [first + second for first in letters for second in letters]
    names = names[:num_nodes]
    np.random.shuffle(names)  # in place, so node insertion order depends on the seed

    model = StructureModel()
    model.add_nodes_from(names)
    # one edge:
    model.add_weighted_edges_from([("aa", "ab", weight)])
    return model
def test_incorrect_weight_dist(self):
    """sem_generator raises ValueError when given an unknown weight distribution."""
    node_names = [str(number) for number in range(6)]
    np.random.shuffle(node_names)

    model = StructureModel()
    model.add_nodes_from(node_names)
    model.add_weighted_edges_from([("0", "1", None), ("2", "4", None)])

    generator_kwargs = dict(
        graph=model,
        schema=None,
        default_type="continuous",
        distributions={"weight": "unknown"},
        noise_std=2.0,
        n_samples=1000,
        intercept=False,
        seed=10,
    )
    with pytest.raises(ValueError, match="Unknown weight distribution"):
        _ = sem_generator(**generator_kwargs)
def from_pandas_lasso(
    X: pd.DataFrame,
    beta: float,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure with lasso regularisation
    describing conditional dependencies between variables in data presented as a pandas dataframe.

    Column names are translated to positional indices, the work is delegated to
    `from_numpy_lasso`, and the learned edges are translated back to column names.

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Args:
        X: input data.
        beta: Constant that multiplies the lasso term.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X has any non-numeric column. `from_numpy_lasso` may
            raise as well (presumably when X does not contain data - confirm
            against its documentation).
    """
    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns
    if len(non_numeric_cols) > 0:
        raise ValueError(
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}".format(
                non_numeric_cols=non_numeric_cols
            )
        )

    # The numpy routine is handed positional indices, so keep both directions
    # of the column-name <-> position mapping.
    col_idx = {name: position for position, name in enumerate(data.columns)}
    idx_col = {position: name for name, position in col_idx.items()}

    def positions_of(names):
        """Translate column names into their positional indices."""
        return [col_idx[name] for name in names]

    # None / empty tabu arguments are forwarded untouched.
    if tabu_edges:
        tabu_edges = [(col_idx[parent], col_idx[child]) for parent, child in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = positions_of(tabu_parent_nodes)
    if tabu_child_nodes:
        tabu_child_nodes = positions_of(tabu_child_nodes)

    learned = from_numpy_lasso(
        data.values,
        beta,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    # Rebuild the graph keyed by column name instead of position.
    named_edges = [
        (idx_col[src], idx_col[dst], weight)
        for src, dst, weight in learned.edges.data("weight")
    ]
    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(named_edges, origin="learned")
    return sm
def from_pandas(
    X: pd.DataFrame,
    max_iter: int = 100,
    h_tol: float = 1e-8,
    w_threshold: float = 0.0,
    tabu_edges: List[Tuple[str, str]] = None,
    tabu_parent_nodes: List[str] = None,
    tabu_child_nodes: List[str] = None,
) -> StructureModel:
    """
    Learn the `StructureModel`, the graph structure describing conditional dependencies between variables
    in data presented as a pandas dataframe.

    The optimisation is to minimise a score function :math:`F(W)` over the graph's
    weighted adjacency matrix, :math:`W`, subject to a constraint function :math:`h(W)`,
    where :math:`h(W) == 0` characterises an acyclic graph.
    :math:`h(W) > 0` is a continuous, differentiable function that encapsulates how acyclic the graph is
    (less == more acyclic).
    Full details of this approach to structure learning are provided in the publication:

    Based on DAGs with NO TEARS.
    @inproceedings{zheng2018dags,
        author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.},
        booktitle = {Advances in Neural Information Processing Systems},
        title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}},
        year = {2018},
        codebase = {https://github.com/xunzheng/notears}
    }

    Column names are translated to positional indices, the work is delegated to
    `from_numpy`, and the learned edges are translated back to column names.

    Args:
        X: input data.
        max_iter: max number of dual ascent steps during optimisation.
        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
        w_threshold: fixed threshold for absolute edge weights.
        tabu_edges: list of edges(from, to) not to be included in the graph.
        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
        tabu_child_nodes: list of nodes banned from being a child of any other nodes.

    Returns:
        StructureModel: graph of conditional dependencies between data variables.

    Raises:
        ValueError: If X has any non-numeric column. `from_numpy` may raise
            as well (presumably when X does not contain data - confirm
            against its documentation).
    """
    data = deepcopy(X)

    non_numeric_cols = data.select_dtypes(exclude="number").columns
    if not non_numeric_cols.empty:
        message = (
            "All columns must have numeric data. "
            "Consider mapping the following columns to int {non_numeric_cols}"
        )
        raise ValueError(message.format(non_numeric_cols=non_numeric_cols))

    # The numpy routine is handed positional indices, so keep both directions
    # of the column-name <-> position mapping.
    col_idx = {}
    for position, name in enumerate(data.columns):
        col_idx[name] = position
    idx_col = dict(zip(col_idx.values(), col_idx.keys()))

    # None / empty tabu arguments are forwarded untouched.
    if tabu_edges:
        tabu_edges = [(col_idx[parent], col_idx[child]) for parent, child in tabu_edges]
    if tabu_parent_nodes:
        tabu_parent_nodes = [col_idx[name] for name in tabu_parent_nodes]
    if tabu_child_nodes:
        tabu_child_nodes = [col_idx[name] for name in tabu_child_nodes]

    learned = from_numpy(
        data.values,
        max_iter,
        h_tol,
        w_threshold,
        tabu_edges,
        tabu_parent_nodes,
        tabu_child_nodes,
    )

    # Rebuild the graph keyed by column name instead of position.
    named_edges = []
    for src, dst, weight in learned.edges.data("weight"):
        named_edges.append((idx_col[src], idx_col[dst], weight))

    sm = StructureModel()
    sm.add_nodes_from(data.columns)
    sm.add_weighted_edges_from(named_edges, origin="learned")
    return sm
def generate_structure_dynamic(  # pylint: disable=too-many-arguments
    num_nodes: int,
    p: int,
    degree_intra: float,
    degree_inter: float,
    graph_type_intra: str = "erdos-renyi",
    graph_type_inter: str = "erdos-renyi",
    w_min_intra: float = 0.5,
    w_max_intra: float = 0.5,
    w_min_inter: float = 0.5,
    w_max_inter: float = 0.5,
    w_decay: float = 1.0,
) -> StructureModel:
    """
    Generates a dynamic DAG at random.

    The intra-slice structure comes from `generate_structure` and the
    inter-slice structure from `_generate_inter_structure`; both are merged
    into one graph, with every intra-slice node renamed to ``<node>_lag0``.

    Args:
        num_nodes: Number of nodes
        p: maximum lag to be considered in the structure
        degree_intra: expected degree on nodes from the current state
        degree_inter: expected degree on nodes from the lagged nodes
        graph_type_intra:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
            - full: constructs a fully-connected graph - degree has no effect
        graph_type_inter:
            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
            - full: connect all past nodes to all present nodes
        w_min_intra: minimum weight for intra-slice nodes
        w_max_intra: maximum weight for intra-slice nodes
        w_min_inter: minimum weight for inter-slice nodes
        w_max_inter: maximum weight for inter-slice nodes
        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay

    Raises:
        ValueError: if graph type unknown or `num_nodes < 2`

    Returns:
        StructureModel containing all simulated nodes and edges (intra- and inter-slice)
    """
    intra_slice = generate_structure(
        num_nodes=num_nodes,
        degree=degree_intra,
        graph_type=graph_type_intra,
        w_min=w_min_intra,
        w_max=w_max_intra,
    )
    inter_slice = _generate_inter_structure(
        num_nodes=num_nodes,
        p=p,
        degree=degree_inter,
        graph_type=graph_type_inter,
        w_min=w_min_inter,
        w_max=w_max_inter,
        w_decay=w_decay,
    )

    # Intra-slice nodes and edges are relabelled with the "_lag0" suffix.
    lag0_nodes = [f"{u}_lag0" for u in intra_slice.nodes]
    lag0_edges = [
        (f"{u}_lag0", f"{v}_lag0", w) for u, v, w in intra_slice.edges.data("weight")
    ]

    combined = StructureModel()
    # Nodes first (inter-slice, then lag-0), then edges in the same order.
    combined.add_nodes_from(inter_slice.nodes)
    combined.add_nodes_from(lag0_nodes)
    combined.add_weighted_edges_from(inter_slice.edges.data("weight"))
    combined.add_weighted_edges_from(lag0_edges)
    return combined
def graph():
    """Return a StructureModel chain 0 -> 1 -> ... -> 5 with every edge weight set to 1."""
    chain = StructureModel()
    chain.add_weighted_edges_from([(src, src + 1, 1) for src in range(5)])
    return chain
def test_mixed_type_independence(self, seed, n_categories, weight_distribution, intercept_distribution):
    """
    Test whether the relation is accurate, implicitly tests sequence of nodes.

    Connected node pairs must look dependent in the generated data, while
    unconnected pairs must look independent.
    """
    np.random.seed(seed)

    def most_skewed_class(column_template):
        """Index of the category whose column mean is furthest from 1 / n_categories.

        Picking the class with the highest deviation from uniform is meant to
        avoid small values; on ties the lowest index wins.
        """
        return max(
            range(n_categories),
            key=lambda k: np.abs(df[column_template.format(k)].mean() - 1 / n_categories),
        )

    node_names = [str(number) for number in range(6)]
    np.random.shuffle(node_names)

    sm = StructureModel()
    sm.add_nodes_from(node_names)
    sm.add_weighted_edges_from(
        [
            ("0", "1", 10),  # binary -> categorical
            ("2", "4", None),  # binary -> continuous
            ("2", "6", 100),  # binary -> count
        ]
    )

    categorical_type = "categorical:{}".format(n_categories)
    schema = {
        "0": "binary",
        "1": categorical_type,
        "2": "binary",
        "4": "continuous",
        "5": categorical_type,
        "6": "count",
    }

    df = sem_generator(
        graph=sm,
        schema=schema,
        default_type="continuous",
        distributions={
            "weight": weight_distribution,
            "intercept": intercept_distribution,
            "count": 0.05,
        },
        noise_std=2,
        n_samples=100000,
        intercept=True,
        seed=seed,
    )

    atol = 0.05  # 5% difference between joint & factored!

    # 1. dependent links
    # 0 -> 1
    c = most_skewed_class("1_{}")
    joint_proba, factored_proba = calculate_proba(df, "0", "1_{}".format(c))
    assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol)

    # 2 -> 4
    overall_mean = df["4"].mean()
    conditional_mean = df["4"][df["2"] == 1].mean()
    assert not np.isclose(overall_mean, conditional_mean, rtol=0, atol=atol)

    # binary on count
    count_when_off = df.loc[df["2"] == 0, "6"].mean()
    count_when_on = df.loc[df["2"] == 1, "6"].mean()
    assert not np.isclose(count_when_off, count_when_on, rtol=0, atol=atol)

    tol = 0.15  # relative tolerance of +- 15% for the independence checks

    # 2. independent links
    # categorical
    # NOTE(review): the class index is picked from node "1" columns but then
    # applied to node "5" columns - confirm whether "5_{}" was intended.
    c = most_skewed_class("1_{}")
    joint_proba, factored_proba = calculate_proba(df, "0", "5_{}".format(c))
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # binary
    joint_proba, factored_proba = calculate_proba(df, "0", "2")
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # categorical
    c = most_skewed_class("1_{}")
    d = most_skewed_class("5_{}")
    # NOTE(review): c (picked from node "1") indexes a node "5" column and
    # d (picked from node "5") indexes a node "1" column - this looks swapped.
    joint_proba, factored_proba = calculate_proba(df, "1_{}".format(d), "5_{}".format(c))
    assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)

    # continuous
    # for gaussian distributions, zero correlation is equivalent to independence
    correlation = df[["3", "4"]].corr().values[0, 1]
    assert np.isclose(correlation, 0, atol=tol)