def test_intercept(self, distribution, noise_scale):
    """
    Generate single-node data with and without an intercept and compare.

    The two samples share every setting except ``intercept``; their means
    must differ while their standard deviations agree to within 1%.
    """
    graph = StructureModel()
    graph.add_node("123")

    # Settings common to both draws; only the intercept flag varies.
    shared_settings = {
        "n_samples": 100000,
        "distribution": distribution,
        "noise_scale": noise_scale,
        "seed": 10,
    }
    data_noint = generate_continuous_data(graph, intercept=False, **shared_settings)
    data_intercept = generate_continuous_data(graph, intercept=True, **shared_settings)

    column_noint = data_noint[:, 0]
    column_intercept = data_intercept[:, 0]

    means_match = np.isclose(column_noint.mean(), column_intercept.mean())
    assert not means_match

    spreads_match = np.isclose(column_noint.std(), column_intercept.std(), rtol=0.01)
    assert spreads_match
def test_bad_distribution_type(self):
    """
    An unrecognised ``distribution`` value must raise ``ValueError``.

    The original docstring lists the accepted values as "gaussian",
    "normal", "student-t", "exponential" and "gumbel".
    """
    # Structure built from 10 nodes, degree 4, "erdos-renyi" graph type.
    sm = generate_structure(10, 4, "erdos-renyi")

    with pytest.raises(ValueError, match="Unknown continuous distribution"):
        generate_continuous_data(sm, distribution="invalid", n_samples=10, seed=10)
def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel):
    """
    Check that the dataframe wrapper yields the same values as the array API.

    Both generators are called with identical arguments; the dataframe
    columns, selected in ``graph.nodes()`` order, must equal the ndarray.
    """
    positional = (graph, 1000, distribution)
    options = {
        "noise_scale": noise_std,
        "seed": seed,
        "intercept": intercept,
        "kernel": kernel,
    }

    data = generate_continuous_data(*positional, **options)
    df = generate_continuous_dataframe(*positional, **options)

    node_columns = list(graph.nodes())
    frame_values = df[node_columns].values
    assert np.array_equal(data, frame_values)
def test_returns_ndarray(self, distribution):
    """The generator returns a ``numpy.ndarray`` for the given distribution."""
    # Structure built from 10 nodes, degree 4, "erdos-renyi" graph type.
    sm = generate_structure(10, 4, "erdos-renyi")

    generated = generate_continuous_data(sm, distribution=distribution, n_samples=10)

    assert isinstance(generated, np.ndarray)
def test_number_of_samples(self, num_samples, graph):
    """The number of generated rows equals the requested ``num_samples``."""
    generated = generate_continuous_data(graph, num_samples, "gaussian", 1, seed=10)
    row_count = len(generated)
    assert row_count == num_samples
def test_number_of_nodes(self, num_nodes):
    """Every generated row has exactly ``num_nodes`` entries."""
    # Chain graph 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain_edges = [(src, src + 1, 1) for src in range(num_nodes - 1)]
    graph = StructureModel()
    graph.add_weighted_edges_from(chain_edges)

    data = generate_continuous_data(graph, 100, seed=10)

    for row in data:
        assert len(row) == num_nodes
def test_linear_gumbel_parent_dist(self, graph):
    """
    Anderson-Darling check of the first generated column against a
    right-skewed Gumbel distribution ("gumbel_r").

    The statistic must be below the critical value listed for the
    significance level 5.
    """
    samples = generate_continuous_data(
        graph,
        distribution="gumbel",
        noise_scale=1,
        n_samples=100000,
        seed=10,
    )

    statistic, critical_values, significance_levels = anderson(samples[:, 0], "gumbel_r")

    # Locate the critical value paired with the significance level 5.
    level_position = list(significance_levels).index(5)
    assert statistic < critical_values[level_position]
def test_linear_studentt_parent_dist(self, graph):
    """
    Kolmogorov-Smirnov test of the first generated "student-t" column
    against a t-distribution with 3 degrees of freedom.
    """
    np.random.seed(10)

    samples = generate_continuous_data(
        graph,
        distribution="student-t",
        noise_scale=1,
        n_samples=100000,
        seed=10,
    )
    first_column = samples[:, 0]

    _ks_statistic, p_val = stats.kstest(first_column, "t", args=[3])

    # NOTE(review): a p-value below 0.01 means the KS test rejects the
    # hypothesis that the sample follows t(3), which looks at odds with the
    # intent described in the docstring. Presumably the generator uses a
    # different degree of freedom; confirm against the generator before
    # changing this assertion.
    assert p_val < 0.01
def test_order_is_correct(self, graph_gen, num_nodes, seed):
    """
    Check that the data columns follow the node order of `sm.nodes`, which in
    turn is the same order as the adjacency matrix.

    The graph comes from `graph_gen` with edge weight 100 and the noise scale
    is 1, so a node fed by an edge should vary far more than its parent.
    Presumably the generated graph contains the edge "aa" -> "ab" and leaves
    every other node unconnected to "aa" (`graph_gen` is defined outside this
    block; confirm there).
    """
    sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=100)
    nodes = sm.nodes()
    # Map each node name to its column position in the generated array.
    node_seq = {name: position for position, name in enumerate(sm.nodes())}

    data = generate_continuous_data(
        sm,
        n_samples=10000,
        distribution="normal",
        seed=seed,
        noise_scale=1.0,
        intercept=False,
    )

    def correlation_with_aa(other):
        """Pearson correlation between the "aa" column and ``other``'s column."""
        column_pair = data[:, [node_seq["aa"], node_seq[other]]]
        return np.corrcoef(column_pair.T)[0, 1]

    assert data[:, node_seq["aa"]].std() < data[:, node_seq["ab"]].std()

    tol = 0.15
    # For jointly Gaussian variables, zero correlation is equivalent to
    # independence, so only "ab" should correlate with "aa".
    for node in nodes:
        if node == "aa":
            continue

        uncorrelated = np.isclose(correlation_with_aa(node), 0, atol=tol)
        if node == "ab":
            assert not uncorrelated
        else:
            assert uncorrelated
def test_linear_gauss_parent_dist(self, graph):
    """
    Anderson-Darling check of the first generated column against a normal
    distribution ("norm").

    The statistic must be below the critical value listed for the
    significance level 5.
    """
    samples = generate_continuous_data(graph, 1000000, "gaussian", 1, seed=10)

    statistic, critical_values, significance_levels = anderson(samples[:, 0], "norm")

    # Locate the critical value paired with the significance level 5.
    level_position = list(significance_levels).index(5)
    assert statistic < critical_values[level_position]