Example #1
    def conditional_entropy(self, col_a, col_b, n_samples=1000):
        """ Conditional entropy, H(A|B), of a given b.

        Implementation notes
        --------------------
        Uses Monte Carlo integration, at least for the joint entropy component.

        Parameters
        ----------
        col_a : indexer
            The name of the first column
        col_b : indexer
            The name of the second column
        n_samples : int
            The number of samples to use for the Monte Carlo approximation
            (ignored if the columns are categorical).

        Returns
        -------
        h_c : float
            The conditional entropy of `col_a` given `col_b`.
        """
        col_idxs = [
            self._converters['col2idx'][col_a],
            self._converters['col2idx'][col_b]
        ]
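        # H(A|B) = H(A, B) - H(B); the joint term is a Monte Carlo estimate.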
        h_ab = mu.joint_entropy(self._models, col_idxs, n_samples)
        h_b = self.entropy(col_b, n_samples)
        h_c = h_ab - h_b

        return h_c
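
For reference, the method relies on the identity H(A|B) = H(A, B) - H(B), with the joint term estimated by Monte Carlo. The sketch below re-derives the same estimate outside the engine, assuming numpy and scipy in place of `mu.joint_entropy`, and checks it against a bivariate Gaussian whose conditional entropy has a closed form:

    import numpy as np
    from scipy import stats

    # Toy check of H(A|B) = H(A, B) - H(B) via Monte Carlo on a
    # bivariate Gaussian with correlation rho (closed form known).
    rho = 0.6
    joint = stats.multivariate_normal([0.0, 0.0], [[1.0, rho], [rho, 1.0]])
    marg_b = stats.norm(0.0, 1.0)

    xy = joint.rvs(size=100_000, random_state=1)

    h_ab = -joint.logpdf(xy).mean()        # H(A, B) ~= -E[log p(a, b)]
    h_b = -marg_b.logpdf(xy[:, 1]).mean()  # H(B)    ~= -E[log p(b)]
    h_cond = h_ab - h_b

    # Exact value: H(A|B) = 0.5 * log(2 * pi * e * (1 - rho**2))
    exact = 0.5 * np.log(2 * np.pi * np.e * (1.0 - rho**2))
    print(h_cond, exact)  # the two should agree to ~2 decimal places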
Example #2
    def mutual_information(self,
                           col_a,
                           col_b,
                           normed=True,
                           linfoot=False,
                           n_samples=1000):
        """ The mutual information, I(A, B), between two columns.

        Parameters
        ----------
        col_a : indexer
            The name of the first column
        col_b : indexer
            The name of the second column
        normed : bool
            If True, the mutual information, I, is normed according to the
            symmetric uncertainty, U = 2*I(A, B)/(H(A) + H(B)).
        linfoot : bool
            If True, Linfoot's information correlation,
            r = (1 - exp(-2*I(A, B)))**0.5, is returned instead; this
            overrides `normed`.
        n_samples : int
            The number of samples to use for the Monte Carlo approximation
            (ignored if the columns are categorical).

        Returns
        -------
        mi : float
            The mutual information between `col_a` and `col_b`.
        """

        if linfoot:
            normed = False

        idx_a = self._converters['col2idx'][col_a]
        idx_b = self._converters['col2idx'][col_b]

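        # Columns assigned to different views are independent under the
        # model, so only models that place `col_a` and `col_b` in the same
        # view contribute nonzero mutual information.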
        models = []
        for model in self._models:
            if model['col_assignment'][idx_a] == \
                    model['col_assignment'][idx_b]:
                models.append(model)

        if len(models) == 0:
            mi = 0.0
        else:
            h_a = self.entropy(col_a, n_samples=n_samples)
            h_b = self.entropy(col_b, n_samples=n_samples)
            h_ab = mu.joint_entropy(models, [idx_a, idx_b], n_samples)
            mi = h_a + h_b - h_ab

            # XXX: Differential entropy can be negative. Here we prevent
            # negative mutual information.
            mi = max(mi, 0.)
            if normed:
                # normalize using symmetric uncertainty
                mi = 2. * mi / (h_a + h_b)

        if linfoot:
            mi = (1. - exp(-2 * mi))**.5

        return mi
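
To make the post-processing at the end of the method concrete, here is a toy computation of the three reported quantities; the entropy values (in nats) are made up for illustration, not engine output:

    from math import exp, sqrt

    # Hypothetical entropies in nats (illustrative values only).
    h_a, h_b, h_ab = 1.42, 1.42, 2.33

    mi = max(h_a + h_b - h_ab, 0.0)       # I(A, B) = H(A) + H(B) - H(A, B)
    u = 2.0 * mi / (h_a + h_b)            # symmetric uncertainty, in [0, 1]
    linfoot = sqrt(1.0 - exp(-2.0 * mi))  # Linfoot correlation, in [0, 1]

    print(mi, u, linfoot)  # 0.51, ~0.36, ~0.80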