Example #1
    def test_conditional_entropy_1d_condition(self) -> None:
        # Draw a sample from a three-dimensional Gaussian distribution
        rng = np.random.default_rng(4)
        cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
        data = rng.multivariate_normal([0, 0, 0], cov, size=1500)

        marginal = estimate_entropy(data, cond=data[:, 2])
        multidim = estimate_entropy(data[:, :2],
                                    cond=data[:, 2],
                                    multidim=True)

        # By the chain rule of entropy, H(X|Y) = H(X,Y) - H(Y)
        def expected(ix: int, iy: int) -> float:
            joint = 0.5 * math.log(
                (2 * math.pi * math.e)**2 *
                (cov[ix, ix] * cov[iy, iy] - cov[ix, iy]**2))
            single = 0.5 * math.log(2 * math.pi * math.e * cov[iy, iy])
            return joint - single

        self.assertAlmostEqual(marginal[0], expected(0, 2), delta=0.04)
        self.assertAlmostEqual(marginal[1], expected(1, 2), delta=0.09)
        self.assertLess(marginal[2], -4.5)

        expected_multi = 0.5 * (
            math.log(np.linalg.det(2 * math.pi * math.e * cov)) -
            math.log(2 * math.pi * math.e * 1))
        self.assertAlmostEqual(multidim, expected_multi, delta=0.1)
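
Note: the reference values follow from the closed-form differential entropy of a
d-dimensional Gaussian, H = 0.5 * log((2*pi*e)^d * det(cov)), combined with the
chain rule quoted in the test. A minimal standalone sketch (the gaussian_entropy
helper is illustrative, not part of the library under test):

import math
import numpy as np

def gaussian_entropy(cov: np.ndarray) -> float:
    # H = 0.5 * log((2*pi*e)^d * det(cov)) for a d-dimensional Gaussian
    cov = np.atleast_2d(cov)
    d = cov.shape[0]
    return 0.5 * math.log((2 * math.pi * math.e)**d * np.linalg.det(cov))

cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
# Chain rule: H(X0 | X2) = H(X0, X2) - H(X2), matching expected(0, 2) above
h_cond = (gaussian_entropy(cov[np.ix_([0, 2], [0, 2])])
          - gaussian_entropy(cov[np.ix_([2], [2])]))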
Example #2
    def test_drop_nan_leaves_too_few_observations(self) -> None:
        data = [(np.nan, 2), (np.nan, 4), (5, np.nan), (7, np.nan), (9, 10),
                (11, 12)]
        cond = [(np.nan, 2), (3, 4), (5, np.nan), (7, 8), (9, 10), (11, 12)]

        # When multidim=False, there are three observations left
        # When multidim=True, there are just two
        for (multidim, k, should_throw) in [(False, 3, True),
                                            (False, 2, False), (True, 2, True),
                                            (True, 1, False)]:
            if should_throw:
                with self.assertRaises(ValueError) as cm:
                    estimate_entropy(data,
                                     cond=cond,
                                     multidim=multidim,
                                     k=k,
                                     drop_nan=True)
                self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG,
                                 f"multidim={multidim}, k={k}")
            else:
                try:
                    estimate_entropy(data,
                                     cond=cond,
                                     multidim=multidim,
                                     k=k,
                                     drop_nan=True)
                except Exception:
                    self.fail(
                        f"Exception occurred; multidim={multidim}, k={k}")
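
Note: the row counts in the comments can be verified directly. A minimal sketch,
assuming drop_nan discards an observation whenever the variable value or any
component of the condition is NaN (checked per column when multidim=False and
across all columns when multidim=True):

import numpy as np

data = np.asarray([(np.nan, 2), (np.nan, 4), (5, np.nan), (7, np.nan), (9, 10),
                   (11, 12)], dtype=float)
cond = np.asarray([(np.nan, 2), (3, 4), (5, np.nan), (7, 8), (9, 10), (11, 12)],
                  dtype=float)

valid_cond = ~np.isnan(cond).any(axis=1)
per_column = [int((~np.isnan(data[:, i]) & valid_cond).sum()) for i in range(2)]
joint = int((~np.isnan(data).any(axis=1) & valid_cond).sum())
print(per_column, joint)  # [3, 3] and 2; k must stay strictly below these counts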
Example #3
    def test_pandas_dataframe(self) -> None:
        rng = np.random.default_rng(1)
        data = pd.DataFrame({
            "N": rng.normal(0.0, 1.0, size=500),
            "Unif": rng.uniform(0.0, 0.5, size=500),
            "Exp": rng.exponential(1 / 2.0, size=500)
        })

        marginal = estimate_entropy(data)  # type: pd.DataFrame
        multidim = estimate_entropy(data, multidim=True)

        # multidim=False results in a DataFrame
        self.assertIsInstance(marginal, pd.DataFrame)
        self.assertEqual(marginal.shape, (1, 3))
        self.assertAlmostEqual(marginal.loc[0, "N"],
                               0.5 * math.log(2 * math.pi * math.e),
                               delta=0.04)
        self.assertAlmostEqual(marginal.loc[0, "Unif"],
                               math.log(0.5),
                               delta=0.03)
        self.assertAlmostEqual(marginal.loc[0, "Exp"],
                               1.0 - math.log(2.0),
                               delta=0.07)

        # multidim=True results in a NumPy scalar
        # There is no reference value; the check just guards against regressions
        self.assertEqual(multidim.shape, ())
        self.assertAlmostEqual(multidim.item(), 1.22, delta=0.02)
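
Note: the reference values are the textbook differential entropies in nats.
Since rng.exponential takes the scale parameter 1/lambda, scale 1/2 means
rate lambda = 2:

import math

h_normal = 0.5 * math.log(2 * math.pi * math.e)  # N(0, 1): ~1.419
h_unif = math.log(0.5 - 0.0)                     # U(a, b): log(b - a), ~-0.693
h_exp = 1.0 - math.log(2.0)                      # Exp(rate=2): 1 - log(rate), ~0.307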
Example #4
    def test_conditional_entropy_with_independent_condition(self) -> None:
        rng = np.random.default_rng(6)
        data = rng.normal(0.0, 1.0, size=1200)
        cond = rng.uniform(0.0, 1.0, size=1200)

        uncond_result = estimate_entropy(data)
        cond_result = estimate_entropy(data, cond=cond)

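        # For independent X and Y, H(X|Y) = H(X) - I(X;Y) = H(X)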
        self.assertAlmostEqual(cond_result, uncond_result, delta=0.05)
Example #5
    def test_mi_as_sum_of_entropies(self) -> None:
        # Make up another distribution
        rng = np.random.default_rng(1)
        x = rng.chisquare(5, size=8000)
        y = rng.gamma(x, scale=1.0, size=x.shape)

        # We should have I(X;Y) = H(X) + H(Y) - H(X,Y)
        mi = estimate_mi(y, x)
        xy = np.column_stack((x, y))
        marginal = estimate_entropy(xy)
        joint = estimate_entropy(xy, multidim=True)

        self.assertAlmostEqual(np.sum(marginal) - joint, mi, delta=0.02)
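
Note: the identity in the comment follows from the definition
I(X;Y) = H(X) - H(X|Y) together with the chain rule H(X|Y) = H(X,Y) - H(Y),
which is exercised on its own in the next example:

I(X;Y) = H(X) - (H(X,Y) - H(Y)) = H(X) + H(Y) - H(X,Y)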
Example #6
    def test_mi_as_conditional_entropy_difference(self) -> None:
        # Make up some kind of distribution
        rng = np.random.default_rng(0)
        x = rng.gamma(shape=2.0, scale=1.0, size=2000)
        y = rng.normal(x, scale=1.0, size=x.shape)

        # We should have I(X;Y) = H(X) - H(X|Y)
        mi = estimate_mi(y, x)
        ent_x = estimate_entropy(x)
        cond_ent = estimate_entropy(x, cond=y)

        self.assertAlmostEqual(ent_x - cond_ent, mi, delta=0.02)
Example #7
    def test_conditional_entropy_with_mask(self) -> None:
        # After masking, the remaining observations of data and cond are identical,
        # giving a conditional entropy of -inf in theory
        # The masked-out observations, however, follow very different distributions
        rng = np.random.default_rng(7)
        unif = rng.uniform(0, 1, size=1000)
        data = np.concatenate((unif, rng.beta(2, 3, size=400)))
        cond = np.concatenate((unif, rng.normal(0, 1, size=400)))
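        # In the mask, True means "keep this observation"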
        mask = np.concatenate((np.full(1000, True), np.full(400, False)))

        unmasked = estimate_entropy(data, cond=cond)
        masked = estimate_entropy(data, cond=cond, mask=mask)

        self.assertLess(masked, unmasked - 1)
        self.assertLess(masked, -5)
Example #8
    def test_conditional_entropy_nd_condition(self) -> None:
        # Draw a sample from a three-dimensional Gaussian distribution
        rng = np.random.default_rng(5)
        cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
        data = rng.multivariate_normal([0, 0, 0], cov, size=1000)

        marginal = estimate_entropy(data, cond=data)
        multidim = estimate_entropy(data, cond=data, multidim=True)

        # In theory all of the entropies are -inf; in practice,
        # the estimates are just strongly negative
        self.assertLess(marginal[0], -0.5)
        self.assertLess(marginal[1], -0.5)
        self.assertLess(marginal[2], -0.5)
        self.assertLess(multidim, -1.5)
Example #9
    def test_entropy_bias(self) -> None:
        rng = np.random.default_rng(0)
        x = rng.normal(size=20_000)

        h_1 = estimate_entropy(x, k=1)
        h_100 = estimate_entropy(x, k=100)

        # Small k has positive bias, large k has negative bias
        expected = 0.5 * log(2 * pi * e)
        self.assertGreater(h_1, expected + 0.005)
        self.assertLess(h_100, expected - 0.005)

        # Still, both are reasonably close, and large k is closer
        self.assertAlmostEqual(h_1, expected, delta=0.04)
        self.assertAlmostEqual(h_100, expected, delta=0.01)
Example #10
    def test_single_dimensional_variable_as_list(self) -> None:
        rng = np.random.default_rng(0)
        x = [rng.uniform(0, 2) for _ in range(400)]

        result = estimate_entropy(x)

        self.assertEqual(result.shape, ())
        self.assertAlmostEqual(result, math.log(2 - 0), delta=0.01)
Example #11
    def test_nans_must_be_masked(self) -> None:
        rng = np.random.default_rng(3)
        data = rng.normal(0.0, 1.0, size=(800, 2))
        data[0:10, 0] = np.nan
        data[5:15, 1] = np.nan

        # Without masking, the NaNs are rejected
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(data)
        self.assertEqual(str(cm.exception), NANS_LEFT_MSG)

        # With masking, a correct result is produced
        mask = np.full(800, True)
        mask[0:15] = False

        result = estimate_entropy(data, mask=mask)
        expected = 0.5 * math.log(2 * math.pi * math.e)
        self.assertAlmostEqual(result[0], expected, delta=0.03)
        self.assertAlmostEqual(result[1], expected, delta=0.03)
Example #12
    def test_drop_nan_cond(self) -> None:
        # Independent condition
        rng = np.random.default_rng(10)
        data = rng.uniform(size=1000)
        cond = rng.uniform(size=1000)
        cond[:10] = np.nan

        result = estimate_entropy(data, cond=cond, drop_nan=True)

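        # H(U(0, 1)) = log(1 - 0) = 0, and an independent condition
        # should not change the estimate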
        self.assertAlmostEqual(result, 0.0, delta=0.04)
Example #13
    def test_pandas_series(self) -> None:
        rng = np.random.default_rng(2)
        data = pd.Series(rng.normal(0.0, 1.0, size=500), name="N")

        result = estimate_entropy(data)  # type: pd.DataFrame

        self.assertIsInstance(result, pd.DataFrame)
        self.assertEqual(result.shape, (1, 1))
        self.assertAlmostEqual(result.loc[0, "N"],
                               0.5 * math.log(2 * math.pi * math.e),
                               delta=0.02)
Example #14
    def test_drop_nan_separate_vars(self) -> None:
        rng = np.random.default_rng(8)
        data = np.column_stack(
            (rng.uniform(0, 2, size=2000), rng.uniform(0, 3, size=2000)))
        data[:1000, 0] = np.nan
        data[1000:, 1] = np.nan
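        # Each column still has 1000 valid observations after dropping NaNs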

        result = estimate_entropy(data, drop_nan=True)

        self.assertAlmostEqual(result[0], math.log(2), delta=0.04)
        self.assertAlmostEqual(result[1], math.log(3), delta=0.04)
Example #15
    def test_conditional_entropy_bias(self) -> None:
        # This is especially interesting, as the estimation errors
        # might not cancel out in the chain rule
        # Use the 3D Gaussian distribution seen in the driver test
        rng = np.random.default_rng(0)
        cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
        data = rng.multivariate_normal([0, 0, 0], cov, size=20_000)

        h_5 = estimate_entropy(data[:, :2],
                               cond=data[:, 2],
                               multidim=True,
                               k=5)
        h_50 = estimate_entropy(data[:, :2],
                                cond=data[:, 2],
                                multidim=True,
                                k=50)

        # Again, large k appears to have more negative bias
        expected = 0.5 * (log(np.linalg.det(2 * pi * e * cov)) -
                          log(2 * pi * e))
        self.assertAlmostEqual(h_5, expected, delta=0.005)
        self.assertAlmostEqual(h_50, expected, delta=0.03)
Example #16
    def test_drop_nan_multidim(self) -> None:
        rng = np.random.default_rng(9)
        cov = np.asarray([[1, 0.6], [0.6, 2]])
        data = rng.multivariate_normal([0, 0], cov, size=1000)
        data[:50, 0] = np.nan
        data[950:, 1] = np.nan

        result = estimate_entropy(data, multidim=True, drop_nan=True)

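        # For the 2D Gaussian, H(X, Y) = 0.5 * log((2*pi*e)^2 * det(cov))
        #                              = log(2*pi*e) + 0.5 * log(2 - 0.6**2)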
        self.assertAlmostEqual(result,
                               math.log(2 * math.pi * math.e) +
                               0.5 * math.log(2 - 0.6**2),
                               delta=0.05)
Example #17
    def test_multidim_interpretation(self) -> None:
        # Generate a two-dimensional Gaussian variable
        rng = np.random.default_rng(1)
        cov = np.asarray([[1, 0.6], [0.6, 2]])
        data = rng.multivariate_normal([0, 0], cov, size=1500)

        marginal = estimate_entropy(data)
        multidim = estimate_entropy(data, multidim=True)

        # If multidim=False, we get marginal entropies
        self.assertEqual(marginal.shape, (2, ))
        self.assertAlmostEqual(marginal[0],
                               0.5 * math.log(2 * math.pi * math.e * 1),
                               delta=0.03)
        self.assertAlmostEqual(marginal[1],
                               0.5 * math.log(2 * math.pi * math.e * 2),
                               delta=0.03)

        # If multidim=True, we get the combined entropy
        self.assertEqual(multidim.shape, ())
        self.assertAlmostEqual(multidim.item(),
                               math.log(2 * math.pi * math.e) +
                               0.5 * math.log(2 - 0.6**2),
                               delta=0.04)
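
Note: as a sanity check on the two reference values, joint entropy is
subadditive: H(X,Y) = H(X) + H(Y) - I(X;Y) <= H(X) + H(Y), with equality only
under independence. For this covariance matrix:

import math

marginal_sum = (0.5 * math.log(2 * math.pi * math.e * 1) +
                0.5 * math.log(2 * math.pi * math.e * 2))
joint = math.log(2 * math.pi * math.e) + 0.5 * math.log(2 - 0.6**2)
assert joint < marginal_sum  # the gap, ~0.099 nats, equals I(X;Y)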
Example #18
    def test_drop_nan_cond_multidim(self) -> None:
        # See test_conditional_entropy_1d_condition
        rng = np.random.default_rng(11)
        cov = np.asarray([[1, 0.6, 0.3], [0.6, 2, 0.1], [0.3, 0.1, 1]])
        data = rng.multivariate_normal([0, 0, 0], cov, size=1500)
        data[10:20, 0] = np.nan
        data[20:30, 1] = np.nan
        data[25:40, 2] = np.nan

        result = estimate_entropy(data[:, :2],
                                  cond=data[:, 2],
                                  multidim=True,
                                  drop_nan=True)

        expected = 0.5 * (math.log(np.linalg.det(2 * math.pi * math.e * cov)) -
                          math.log(2 * math.pi * math.e * 1))
        self.assertAlmostEqual(result, expected, delta=0.1)
Example #19
    def test_mask_is_not_boolean(self) -> None:
        with self.assertRaises(TypeError) as cm:
            estimate_entropy(np.zeros(5), mask=[1, 2, 3, 4, 5])
        self.assertEqual(str(cm.exception), INVALID_MASK_TYPE_MSG)
Example #20
    def test_cond_has_wrong_dimension(self) -> None:
        for dim in [(), (20, 2, 1)]:
            with self.subTest(dim=dim):
                with self.assertRaises(ValueError) as cm:
                    estimate_entropy(np.zeros(20), cond=np.zeros(dim))
                self.assertEqual(str(cm.exception), COND_WRONG_DIMENSION_MSG)
Example #21
    def test_cond_must_have_same_length_as_x(self) -> None:
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(np.zeros(5), cond=np.zeros(7))
        self.assertEqual(str(cm.exception), X_COND_DIFFERENT_LENGTH_MSG)
Example #22
    def test_mask_leaves_too_few_observations(self) -> None:
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(np.zeros(5),
                             mask=[False, False, False, True, True])
        self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG)
Example #23
    def test_mask_has_wrong_dimension(self) -> None:
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(np.zeros((5, 2)), mask=np.full((5, 2), True))
        self.assertEqual(str(cm.exception), MASK_WRONG_DIMENSION_MSG)
Example #24
    def test_input_shorter_than_k(self) -> None:
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(np.zeros(3), k=3)
        self.assertEqual(str(cm.exception), K_TOO_LARGE_MSG)
Example #25
    def test_k_must_be_integer(self) -> None:
        with self.assertRaises(TypeError):
            estimate_entropy(np.zeros(20), k=2.71828)  # type: ignore
Example #26
    def test_k_must_be_positive(self) -> None:
        for k in [-2, 0]:
            with self.subTest(k=k):
                with self.assertRaises(ValueError) as cm:
                    estimate_entropy(np.zeros(20), k=k)
                self.assertEqual(str(cm.exception), K_NEGATIVE_MSG)
Example #27
    def test_mask_has_wrong_size(self) -> None:
        with self.assertRaises(ValueError) as cm:
            estimate_entropy(np.zeros(5), mask=[True, False])
        self.assertEqual(str(cm.exception), INVALID_MASK_LENGTH_MSG)