def test_intercept(self, distribution, n_categories):
    """
    Enabling the intercept must shift every column mean.

    A single-node graph is sampled twice with the same seed, once with
    ``intercept=False`` and once with ``intercept=True``. The test passes
    only if no column mean of the two frames agrees within an absolute
    tolerance of 0.05.
    """
    single_node = StructureModel()
    single_node.add_node("A")

    def _column_means(with_intercept):
        # Identical arguments for both draws; only the intercept flag varies.
        frame = generate_categorical_dataframe(
            single_node,
            100000,
            distribution,
            noise_scale=0.1,
            n_categories=n_categories,
            seed=10,
            intercept=with_intercept,
        )
        return frame.mean(axis=0)

    means_without = _column_means(False)
    means_with = _column_means(True)

    # rtol=0 makes this a purely absolute comparison.
    unchanged = np.isclose(means_with, means_without, atol=0.05, rtol=0)
    assert np.all(~unchanged)
def test_intercept(self, distribution, n_categories, noise_scale):
    """
    Enabling the intercept must shift most of the column means.

    A single-node graph is sampled twice with the same seed and noise scale,
    first without and then with an intercept. Fewer than half of the
    ``n_categories`` column means may agree within an absolute tolerance
    of 0.05.
    """
    single_node = StructureModel()
    single_node.add_node("A")

    common = {
        "noise_scale": noise_scale,
        "n_categories": n_categories,
        "seed": 10,
    }
    # Draw order is preserved: intercept=False first, then intercept=True.
    means_without, means_with = (
        generate_categorical_dataframe(
            single_node, 100000, distribution, intercept=flag, **common
        ).mean(axis=0)
        for flag in (False, True)
    )

    # NOTE: the more categories there are, the closer to 1.0 the chance that
    # at least one category looks the same with and without the intercept,
    # so only a majority of the columns is required to differ.
    similar_mask = np.isclose(means_with, means_without, atol=0.05, rtol=0)
    num_similar = similar_mask.sum()
    assert num_similar < n_categories / 2
def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel, n_categories):
    """
    Two identically-seeded generator calls must produce equal data.

    The first result is compared, as a whole, against the second result's
    values taken in ``<node>_<category>`` column order (nodes in graph order,
    categories ascending).
    """
    # NOTE(review): both sides come from generate_categorical_dataframe;
    # presumably one was meant to be the non-dataframe generator - confirm.
    generation_kwargs = {
        "noise_scale": noise_std,
        "seed": seed,
        "intercept": intercept,
        "kernel": kernel,
        "n_categories": n_categories,
    }
    reference = generate_categorical_dataframe(
        graph, 100, distribution, **generation_kwargs
    )
    frame = generate_categorical_dataframe(
        graph, 100, distribution, **generation_kwargs
    )

    expected_columns = [
        f"{node}_{cat}"
        for node in graph.nodes()
        for cat in range(n_categories)
    ]
    reordered_values = frame[expected_columns].values
    assert np.array_equal(reference, reordered_values)
def test_bad_distribution_type(self):
    """
    An unrecognised distribution name must be rejected.

    Passing ``distribution="invalid"`` has to raise ``ValueError`` with a
    message matching "Unknown categorical distribution". (The accepted names
    are presumably "probit", "normal", "logit" and "gumbel" - confirm against
    the generator.)
    """
    d_nodes = 10
    degree = 4
    graph_type = "erdos-renyi"
    sm = generate_structure(d_nodes, degree, graph_type)

    expected_message = "Unknown categorical distribution"
    with pytest.raises(ValueError, match=expected_message):
        generate_categorical_dataframe(
            sm,
            distribution="invalid",
            n_samples=10,
            seed=10,
        )
def test_independence(self, graph_gen, seed, num_nodes, n_categories, distribution):
    """
    Check that the generated relations are accurate; this implicitly tests
    the ordering of the nodes as well.

    For every node other than "aa", the pair of probabilities returned by
    ``calculate_proba`` for columns "aa_0" and "<node>_0" is compared with a
    5% relative tolerance. Only "ab" is expected to differ; for every other
    node the two values must agree.
    """
    sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=None)
    node_names = sm.nodes()
    samples = generate_categorical_dataframe(
        sm,
        n_samples=100000,
        distribution=distribution,
        n_categories=n_categories,
        seed=seed,
        noise_scale=1,
        intercept=False,
    )

    relative_tol = 0.05
    # "aa" is the reference node, so it is never compared against itself.
    for name in (candidate for candidate in node_names if candidate != "aa"):
        # presumably (joint, product-of-marginals) - confirm in calculate_proba
        joint, factored = calculate_proba(samples, "aa_0", name + "_0")
        probabilities_agree = np.isclose(
            joint, factored, rtol=relative_tol, atol=0
        )
        if name == "ab":
            # The one node expected to depend on "aa".
            assert not probabilities_agree
        else:
            assert probabilities_agree
def test_number_of_samples(self, num_samples, graph):
    """The generated data must contain exactly ``num_samples`` rows."""
    # The third and fourth arguments are passed positionally, exactly as the
    # generator's signature orders them.
    generated = generate_categorical_dataframe(
        graph,
        num_samples,
        "logit",
        1,
        seed=10,
    )
    row_count = len(generated)
    assert row_count == num_samples
def test_returns_dataframe(self, distribution):
    """The return value is a pandas DataFrame - checked for each distribution"""
    d_nodes = 10
    degree = 4
    graph_type = "erdos-renyi"
    sm = generate_structure(d_nodes, degree, graph_type)

    result = generate_categorical_dataframe(
        sm,
        distribution=distribution,
        n_samples=10,
    )
    assert isinstance(result, pd.DataFrame)
def test_number_of_columns(self, num_nodes, n_categories):
    """The generated frame has ``num_nodes * n_categories`` columns."""
    # Chain graph 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
    chain_edges = [
        (source, target, 1)
        for source, target in zip(range(num_nodes - 1), range(1, num_nodes))
    ]
    graph = StructureModel()
    graph.add_weighted_edges_from(chain_edges)

    data = generate_categorical_dataframe(
        graph,
        100,
        seed=10,
        n_categories=n_categories,
    )
    expected_columns = num_nodes * n_categories
    _, actual_columns = data.shape
    assert actual_columns == expected_columns
def test_intercept_probability(self, graph, distribution, n_categories):
    """
    With an intercept, the category frequencies must not all be uniform.

    One million samples are drawn for a single-node graph; the column means
    must not all lie within 0.01 (absolute) of ``1 / n_categories``.
    """
    # The incoming ``graph`` argument is replaced by a fresh single-node model.
    graph = StructureModel()
    graph.add_nodes_from(["A"])

    samples = generate_categorical_dataframe(
        graph,
        1000000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=0.1,
        seed=10,
        intercept=True,
    )

    uniform_share = 1 / n_categories
    observed_share = samples.mean(axis=0)
    looks_uniform = np.allclose(observed_share, uniform_share, atol=0.01, rtol=0)
    assert not looks_uniform
def test_baseline_probability(self, graph, distribution, n_categories):
    """
    Without an intercept, the category frequencies must be close to uniform.

    10000 samples are drawn for a single-node graph; every column mean must
    lie within 0.01 (absolute) of ``1 / n_categories``.
    """
    # The incoming ``graph`` argument is replaced by a fresh single-node model.
    graph = StructureModel()
    graph.add_nodes_from(["A"])

    samples = generate_categorical_dataframe(
        graph,
        10000,
        distribution=distribution,
        n_categories=n_categories,
        noise_scale=1.0,
        seed=10,
        intercept=False,
    )

    # No intercept means no category is favoured, so each share should sit
    # near the uniform value.
    uniform_share = 1 / n_categories
    observed_share = samples.mean(axis=0)
    assert np.allclose(observed_share, uniform_share, atol=0.01, rtol=0)