def test_bad_distribution_type(self):
    """An unknown ``distribution`` (not "probit"/"normal"/"logit") raises ValueError."""
    graph_type = "erdos-renyi"
    degree = 4
    d_nodes = 10
    sm = generate_structure(d_nodes, degree, graph_type)
    # The generator must refuse unrecognised binary distributions explicitly.
    with pytest.raises(ValueError, match="Unknown binary distribution"):
        generate_binary_data(sm, distribution="invalid", n_samples=10, seed=10)
def test_f1score_generated_binary(self):
    """Binary structure learned should have a good F1 score.

    Generates binary data from a known random structure, fits a structure
    model with a binary dist-type schema, and checks that the recovered
    edge set overlaps the true edge set with F1 > 0.8.
    """
    np.random.seed(10)
    sm = generate_structure(5, 2.0)
    df = generate_binary_data(sm, 1000, intercept=False, noise_scale=0.1, seed=10)
    # Every column is binary, so the whole schema is "bin".
    dist_type_schema = {i: "bin" for i in range(df.shape[1])}
    sm_fitted = from_numpy(
        df,
        dist_type_schema=dist_type_schema,
        lasso_beta=0.1,
        ridge_beta=0.0,
        w_threshold=0.1,
        use_bias=False,
    )
    right_edges = sm.edges

    # Precision: fraction of predicted edges that are in the true graph.
    # Recall: fraction of true edges that were predicted.
    n_predictions_made = len(sm_fitted.edges)
    n_correct_predictions = len(set(sm_fitted.edges).intersection(set(right_edges)))
    n_relevant_predictions = len(right_edges)

    precision = n_correct_predictions / n_predictions_made
    recall = n_correct_predictions / n_relevant_predictions
    f1_score = 2 * (precision * recall) / (precision + recall)

    assert f1_score > 0.8
def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel):
    """The DataFrame wrapper must produce exactly the same values as the ndarray API."""
    # Identical arguments for both entry points so the outputs are comparable.
    common_kwargs = dict(
        noise_scale=noise_std,
        seed=seed,
        intercept=intercept,
        kernel=kernel,
    )
    data = generate_binary_data(graph, 100, distribution, **common_kwargs)
    df = generate_binary_dataframe(graph, 100, distribution, **common_kwargs)
    # Reorder the frame columns to the graph's node order before comparing.
    assert np.array_equal(data, df[list(graph.nodes())].values)
def test_number_of_nodes(self, num_nodes):
    """Each generated sample (row) has exactly ``num_nodes`` entries."""
    graph = StructureModel()
    # Build a simple chain 0 -> 1 -> ... -> num_nodes-1, all weights 1.
    chain = []
    for src in range(num_nodes - 1):
        chain.append((src, src + 1, 1))
    graph.add_weighted_edges_from(chain)
    data = generate_binary_data(graph, 100, seed=10)
    for row in data:
        assert len(row) == num_nodes
def test_returns_ndarray(self, distribution):
    """Return value is a numpy ndarray for every supported distribution."""
    sm = generate_structure(10, 4, "erdos-renyi")
    result = generate_binary_data(sm, distribution=distribution, n_samples=10)
    assert isinstance(result, np.ndarray)
def test_intercept(self, distribution):
    """Adding an intercept shifts the mean of the generated binary feature."""
    graph = StructureModel()
    graph.add_node("123")
    # Same seed and zero noise: the only difference between the two runs
    # is the intercept, so any mean shift is attributable to it.
    without_intercept = generate_binary_data(
        graph, 100000, distribution, noise_scale=0, seed=10, intercept=False
    )
    with_intercept = generate_binary_data(
        graph, 100000, distribution, noise_scale=0, seed=10, intercept=True
    )
    mean_without = without_intercept[:, 0].mean()
    mean_with = with_intercept[:, 0].mean()
    assert not np.isclose(mean_without, mean_with)
def test_baseline_probability_probit(self, graph, distribution): """Test whether probability centered around 50% if no intercept given""" graph = StructureModel() graph.add_nodes_from(["A"]) data = generate_binary_data( graph, 1000000, distribution=distribution, noise_scale=0.1, seed=10, intercept=False, ) assert 0.45 < data[:, 0].mean() < 0.55
def test_intercept_probability_logit(self, graph, distribution):
    """With an intercept, the marginal probability moves away from 50%."""
    # NOTE(review): the `graph` fixture argument is immediately shadowed by a
    # fresh single-node model — presumably kept only for parametrize/fixture
    # signature compatibility; confirm before removing.
    graph = StructureModel()
    graph.add_nodes_from(["A"])
    samples = generate_binary_data(
        graph,
        1000000,
        distribution=distribution,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )
    mean_prob = samples[:, 0].mean()
    assert not np.isclose(mean_prob, 0.5, atol=0.05)
def test_order_is_correct(self, graph_gen, num_nodes, seed):
    """
    Check that the column order of the generated data matches ``sm.nodes``,
    which in turn is the order of the adjacency matrix.

    Graphs with degree in {0, 1} are built as permutations of the identity;
    edge values are always 100 and the noise is 1, so we expect
    ``edge_from`` < ``edge_to`` in absolute value almost every time.
    """
    sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=None)
    nodes = sm.nodes()
    # Map each node name to its column index in the generated data.
    col_index = {node: ix for ix, node in enumerate(sm.nodes())}
    data = generate_binary_data(
        sm,
        n_samples=10000,
        distribution="normal",
        seed=seed,
        noise_scale=0.1,
        intercept=False,
    )

    tol = 0.15

    # With no intercept the parent's mean probability is 0.5, which gives the
    # maximum possible std for a binary feature (std = p(1-p)); therefore any
    # child must have a strictly lower std.
    assert data[:, col_index["aa"]].std() > data[:, col_index["ab"]].std()

    for node in nodes:
        if node == "aa":
            continue
        joint_proba, factored_proba = calculate_proba(
            data, col_index["aa"], col_index[node]
        )
        if node == "ab":
            # "aa" -> "ab" is the only link, so independence must fail here...
            assert not np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)
        else:
            # ...and hold everywhere else.
            assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0)
def test_number_of_samples(self, num_samples, graph):
    """Number of generated rows equals ``num_samples``."""
    samples = generate_binary_data(graph, num_samples, "logit", 1, seed=10)
    assert len(samples) == num_samples