Пример #1
0
    def test_intercept(self, distribution, n_categories):
        """Check that enabling the intercept shifts every category mean.

        Two datasets are generated for a single-node graph with identical
        settings and seed, differing only in ``intercept``. The test passes
        only if no per-column mean of one dataset lies within an absolute
        tolerance of 0.05 of the corresponding mean of the other.
        """
        single_node = StructureModel()
        single_node.add_node("A")

        # Everything except ``intercept`` is shared between the two draws.
        shared_kwargs = {
            "noise_scale": 0.1,
            "n_categories": n_categories,
            "seed": 10,
        }
        without_intercept = generate_categorical_dataframe(
            single_node, 100000, distribution, intercept=False, **shared_kwargs
        )
        with_intercept = generate_categorical_dataframe(
            single_node, 100000, distribution, intercept=True, **shared_kwargs
        )

        # rtol=0 makes this a purely absolute comparison of the means.
        means_match = np.isclose(
            with_intercept.mean(axis=0),
            without_intercept.mean(axis=0),
            atol=0.05,
            rtol=0,
        )
        assert not np.any(means_match)
Пример #2
0
    def test_intercept(self, distribution, n_categories, noise_scale):
        """Check that enabling the intercept shifts most category means.

        Two datasets are generated for a single-node graph with identical
        settings and seed, differing only in ``intercept``. Fewer than half
        of the ``n_categories`` per-column means may agree within an absolute
        tolerance of 0.05.
        """
        single_node = StructureModel()
        single_node.add_node("A")

        # Everything except ``intercept`` is shared between the two draws.
        shared_kwargs = {
            "noise_scale": noise_scale,
            "n_categories": n_categories,
            "seed": 10,
        }
        without_intercept = generate_categorical_dataframe(
            single_node, 100000, distribution, intercept=False, **shared_kwargs
        )
        with_intercept = generate_categorical_dataframe(
            single_node, 100000, distribution, intercept=True, **shared_kwargs
        )

        # NOTE: the more categories there are, the more likely it becomes that
        # at least one of them ends up with (nearly) the same mean with and
        # without the intercept, so only a majority is required to differ.
        means_match = np.isclose(
            with_intercept.mean(axis=0),
            without_intercept.mean(axis=0),
            atol=0.05,
            rtol=0,
        )
        similar_count = means_match.sum()
        assert similar_count < n_categories / 2
Пример #3
0
    def test_dataframe(self, graph, distribution, noise_std, intercept, seed,
                       kernel, n_categories):
        """
        Two identically configured generator calls must yield equal values.

        The second result is indexed by the expected ``"<node>_<category>"``
        column names before the comparison.
        """
        # NOTE(review): both calls go through generate_categorical_dataframe;
        # if this was meant to compare a wrapper against an underlying
        # array generator, confirm which function the first call should use.
        generator_kwargs = {
            "noise_scale": noise_std,
            "seed": seed,
            "intercept": intercept,
            "kernel": kernel,
            "n_categories": n_categories,
        }
        data = generate_categorical_dataframe(
            graph, 100, distribution, **generator_kwargs
        )
        df = generate_categorical_dataframe(
            graph, 100, distribution, **generator_kwargs
        )

        # One column per (node, category) pair, nodes in graph order.
        expected_columns = [
            f"{node}_{cat}"
            for node in graph.nodes()
            for cat in range(n_categories)
        ]
        assert np.array_equal(data, df[expected_columns].values)
Пример #4
0
 def test_bad_distribution_type(self):
     """An unrecognised ``distribution`` name must raise ``ValueError``."""
     d_nodes, degree, graph_type = 10, 4, "erdos-renyi"
     sm = generate_structure(d_nodes, degree, graph_type)

     # The error message is matched as a regular expression by pytest.
     expected_message = "Unknown categorical distribution"
     with pytest.raises(ValueError, match=expected_message):
         generate_categorical_dataframe(
             sm,
             n_samples=10,
             distribution="invalid",
             seed=10,
         )
Пример #5
0
    def test_independence(self, graph_gen, seed, num_nodes, n_categories,
                          distribution):
        """
        Check which nodes are statistically dependent on node "aa".

        For every node other than "aa", the joint and factored probabilities
        returned by ``calculate_proba`` for columns "aa_0" and "<node>_0" are
        compared with a relative tolerance of 5%. Only "ab" is expected to
        differ (dependent); all other nodes are expected to agree
        (independent). This implicitly tests the sequence of nodes as well.
        """
        sm = graph_gen(num_nodes=num_nodes, seed=seed, weight=None)

        df = generate_categorical_dataframe(
            sm,
            n_samples=100000,
            distribution=distribution,
            n_categories=n_categories,
            seed=seed,
            noise_scale=1,
            intercept=False,
        )

        relative_tol = 0.05

        # "aa" is the reference node, so it is never compared with itself.
        other_nodes = [name for name in sm.nodes() if name != "aa"]
        for other in other_nodes:
            joint, factored = calculate_proba(df, "aa_0", other + "_0")
            probabilities_agree = np.isclose(
                joint, factored, rtol=relative_tol, atol=0
            )
            if other == "ab":
                # dependent link: joint must differ from the factored product
                assert not probabilities_agree
            else:
                # independent link
                assert probabilities_agree
Пример #6
0
 def test_number_of_samples(self, num_samples, graph):
     """The generated data must contain exactly ``num_samples`` rows."""
     generated = generate_categorical_dataframe(
         graph, num_samples, "logit", 1, seed=10
     )
     row_count = len(generated)
     assert row_count == num_samples
Пример #7
0
 def test_returns_dataframe(self, distribution):
     """The return value is a ``pd.DataFrame`` for the given distribution."""
     d_nodes, degree, graph_type = 10, 4, "erdos-renyi"
     sm = generate_structure(d_nodes, degree, graph_type)
     result = generate_categorical_dataframe(
         sm,
         n_samples=10,
         distribution=distribution,
     )
     assert isinstance(result, pd.DataFrame)
Пример #8
0
    def test_number_of_columns(self, num_nodes, n_categories):
        """The output has ``num_nodes * n_categories`` columns."""
        # Chain graph 0 -> 1 -> ... -> num_nodes - 1, every edge with weight 1.
        chain_edges = [(src, src + 1, 1) for src in range(num_nodes - 1)]
        chain = StructureModel()
        chain.add_weighted_edges_from(chain_edges)

        data = generate_categorical_dataframe(
            chain, 100, seed=10, n_categories=n_categories
        )
        expected_columns = num_nodes * n_categories
        assert data.shape[1] == expected_columns
Пример #9
0
 def test_intercept_probability(self, graph, distribution, n_categories):
     """With an intercept, category means must not all sit at 1 / n_categories."""
     # The ``graph`` argument is not used; a fresh single-node graph is built.
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_categorical_dataframe(
         single_node,
         1000000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=0.1,
         seed=10,
         intercept=True,
     )
     uniform_share = 1 / n_categories
     category_means = data.mean(axis=0)
     # rtol=0 makes this a purely absolute comparison.
     assert not np.allclose(category_means, uniform_share, atol=0.01, rtol=0)
Пример #10
0
 def test_baseline_probability(self, graph, distribution, n_categories):
     """Without an intercept, every category mean is close to 1 / n_categories."""
     # The ``graph`` argument is not used; a fresh single-node graph is built.
     single_node = StructureModel()
     single_node.add_nodes_from(["A"])
     data = generate_categorical_dataframe(
         single_node,
         10000,
         distribution=distribution,
         n_categories=n_categories,
         noise_scale=1.0,
         seed=10,
         intercept=False,
     )
     # With no intercept the categories should be roughly equally likely,
     # so each mean must lie within 0.01 (absolute) of the uniform share.
     uniform_share = 1 / n_categories
     category_means = data.mean(axis=0)
     assert np.allclose(category_means, uniform_share, atol=0.01, rtol=0)