Exemplo n.º 1
0
    def getMaxVariationOfInformation_slow(self, otherCover, N_nodes=None):
        """Return maximum variation of information.

        Note that the time complexity is O(N*M), where N is the number
        of communities in `self` and M is the number of communities in
        `otherCover`.
        """

        def plogp(prob):
            # Shannon term -p*log2(p), with the 0*log(0) == 0 convention.
            if prob == 0:
                return 0.0
            return -prob*np.log2(prob)

        # Determine the node count from the union of all communities
        # in this cover when the caller did not supply it.
        if N_nodes is None:
            N_nodes = len(set().union(*self.comm)) if self.comm else 0
        # Float so the divisions below are true divisions under Python 2.
        Nf = float(N_nodes)

        result = 1.0
        # Evaluate both directions: X given Y, then Y given X.
        directions = ((self.comm, otherCover.comm),
                      (otherCover.comm, self.comm))
        for XF, YF in directions:
            normalized = []
            for X_k in XF:
                px = len(X_k)/Nf
                H_X = ith.entropy_X([px, 1-px])
                # H(X_k) is an upper bound on H(X_k|Y_l); it remains the
                # answer whenever no candidate Y_l passes the test below.
                best = H_X
                for Y_l in YF:
                    n_both = len(X_k.intersection(Y_l))
                    n_union = len(X_k.union(Y_l))
                    h_agree = plogp(n_both/Nf) + plogp((Nf - n_union)/Nf)
                    h_disagree = (plogp((len(X_k) - n_both)/Nf)
                                  + plogp((len(Y_l) - n_both)/Nf))
                    if h_agree <= h_disagree:
                        # Y_l rejected: it disagrees with X_k more than
                        # it agrees.
                        continue
                    py = len(Y_l)/Nf
                    H_Y = ith.entropy_X([py, 1-py])
                    best = min(best, h_agree + h_disagree - H_Y)
                normalized.append(0 if H_X == 0 else best/H_X)

            result -= 0.5*sum(normalized)/len(XF)

        return result
Exemplo n.º 2
0
    def getMaxVariationOfInformation_slow(self, otherCover, N_nodes=None):
        """Return maximum variation of information.

        Note that the time complexity is O(N*M), where N is the number
        of communities in `self` and M is the number of communities in
        `otherCover`.
        """

        def entropy_term(p):
            # -p*log2(p) with the convention 0*log2(0) == 0.
            return -p*np.log2(p) if p else 0.0

        # If the node count was not given, count the distinct nodes
        # that appear in this cover's communities.
        if N_nodes is None:
            seen = set()
            for community in self.comm:
                seen.update(community)
            N_nodes = len(seen)
        Nf = float(N_nodes)  # force true division under Python 2

        ret_val = 1.0
        for XF, YF in ((self.comm, otherCover.comm),
                       (otherCover.comm, self.comm)):
            ratios = []
            for X_k in XF:
                p_x = len(X_k)/Nf
                H_X = ith.entropy_X([p_x, 1-p_x])
                # Start from H(X_k), the largest possible conditional
                # entropy; it is kept when every Y_l is rejected.
                H_min = H_X
                for Y_l in YF:
                    overlap = len(X_k.intersection(Y_l))
                    same = (entropy_term(overlap/Nf)
                            + entropy_term((Nf - len(X_k.union(Y_l)))/Nf))
                    diff = (entropy_term((len(X_k) - overlap)/Nf)
                            + entropy_term((len(Y_l) - overlap)/Nf))
                    # Accept Y_l only when agreement dominates.
                    if same > diff:
                        p_y = len(Y_l)/Nf
                        H_Y = ith.entropy_X([p_y, 1-p_y])
                        H_min = min(H_min, same + diff - H_Y)
                ratios.append(H_min/H_X if H_X else 0)

            ret_val -= 0.5*sum(ratios)/len(XF)

        return ret_val
Exemplo n.º 3
0
 def entropy(self):
     """Entropy of the node partition (computed once, then cached)."""
     # Lazily compute and memoize on first access.
     if not hasattr(self, '_entropy'):
         sizes = np.array(self.C_sizes, dtype=float)
         self._entropy = ith.entropy_X(sizes/sizes.sum())
     return self._entropy
Exemplo n.º 4
0
 def entropy(self):
     """Entropy of the node partition (computed once, then cached)."""
     try:
         # Fast path: the value was already computed.
         return self._entropy
     except AttributeError:
         total = float(sum(self.C_sizes))
         self._entropy = ith.entropy_X(np.array(self.C_sizes)/total)
         return self._entropy
Exemplo n.º 5
0
    def test_entropy_X(self):
        # Known distributions and their exact entropies (in bits),
        # compared at 10 decimal places to sidestep float noise.
        cases = [([0.5, 0.5], 1.0),
                 ([0.2, 0.2, 0.2, 0.2, 0.2], -np.log2(0.2)),
                 ([0.1, 0.2, 0.7], 1.1567796494470395)]
        for P, expected in cases:
            self.assertEqual("%.10f" % it.entropy_X(P),
                             "%.10f" % expected)

        # A deterministic distribution has zero entropy.
        self.assertEqual(it.entropy_X([1, 0]), 0)
Exemplo n.º 6
0
    def getMaxVariationOfInformation(self, otherCover):
        """Return maximum variation of information.

        The maximum variation of information compares two community
        structures whose communities may overlap. The definition comes
        from Appendix B of

          Andrea Lancichinetti, Santo Fortunato, Janos Kertesz (2009)
          `Detecting the overlapping and hierarchical community
          structure of complex networks'.

        Parameters
        ----------
        otherCover : NodeCover object
           The other community structure to compare with.

        Return
        ------
        mv : float
           The maximum variation of information.

        Notes
        -----
        Time complexity is roughly O(N)+O(M^2), where N is the total
        number of nodes and M is the largest number of communities for
        one node.

        If one community structure consists of only one community
        covering all nodes, this measure is always 0.5 (unless of
        course the other one is identical, in which case 1.0 is
        returned.)
        """
        def plogp(prob):
            # Shannon term -p*log2(p) with the 0*log(0) == 0 convention.
            return 0.0 if prob == 0 else -prob*np.log2(prob)

        # Total node count; float so divisions below are true divisions.
        Nf = float(max(self.N_nodes, otherCover.N_nodes))

        # Bipartite community network: only overlapping community pairs
        # are linked, and the link weight is the intersection size.
        commNet = self._getOverlapNetwork(otherCover)
        Nc_A = len(self)
        Nc_B = len(otherCover)

        # Community sizes indexed by community ID: cover A first, then
        # cover B, matching the IDs used in commNet.
        comm_sizes = self.getCommunitySizes() + otherCover.getCommunitySizes()

        ids_A = xrange(Nc_A)
        ids_B = xrange(Nc_A, Nc_A + Nc_B)

        result = 1.0
        # Process both directions: A against B, then B against A. The
        # candidate Y_l communities come straight from commNet, which
        # only ever links across the two covers.
        for IX in (ids_A, ids_B):
            H_norm = []
            for X_k in IX:
                px = comm_sizes[X_k]/Nf
                H_X = ith.entropy_X([px, 1-px])
                # H(X_k) is the maximum of H(X_k|Y_l); it is the final
                # value when no overlapping Y_l is accepted.
                H_min = H_X
                for Y_l in commNet[X_k]:
                    n_cut = commNet[X_k][Y_l]
                    n_union = comm_sizes[X_k] + comm_sizes[Y_l] - n_cut
                    hP_same = (plogp(n_cut/Nf)
                               + plogp((Nf - n_union)/Nf))
                    hP_diff = (plogp((comm_sizes[X_k] - n_cut)/Nf)
                               + plogp((comm_sizes[Y_l] - n_cut)/Nf))
                    if hP_same <= hP_diff:
                        # Reject Y_l: disagreement dominates agreement.
                        continue
                    py = comm_sizes[Y_l]/Nf
                    H_Y = ith.entropy_X([py, 1-py])
                    H_min = min(H_min, hP_same + hP_diff - H_Y)
                H_norm.append(0 if H_X == 0 else H_min/H_X)

            result -= 0.5*sum(H_norm)/len(IX)

        return result
Exemplo n.º 7
0
    def getMaxVariationOfInformation(self, otherCover):
        """Return maximum variation of information.

        Compares two node covers (community structures with possibly
        overlapping communities) following Appendix B of

          Andrea Lancichinetti, Santo Fortunato, Janos Kertesz (2009)
          `Detecting the overlapping and hierarchical community
          structure of complex networks'.

        Parameters
        ----------
        otherCover : NodeCover object
           The other community structure to compare with.

        Return
        ------
        mv : float
           The maximum variation of information.

        Notes
        -----
        Time complexity is roughly O(N)+O(M^2), where N is the total
        number of nodes and M is the largest number of communities for
        one node.

        If one community structure consists of only one community
        covering all nodes, this measure is always 0.5 (unless of
        course the other one is identical, in which case 1.0 is
        returned.)
        """
        def h(p):
            # -p*log2(p); by convention 0*log2(0) == 0.
            if p == 0:
                return 0.0
            return -p*np.log2(p)

        Nf = float(max(self.N_nodes, otherCover.N_nodes))

        # Bipartite network between the two covers' communities; edge
        # weights hold pairwise intersection sizes, so only genuinely
        # overlapping pairs are ever visited below.
        commNet = self._getOverlapNetwork(otherCover)
        Nc_A = len(self)
        Nc_B = len(otherCover)

        # Sizes for every community, indexed by the same IDs commNet
        # uses: 0..Nc_A-1 for this cover, Nc_A..Nc_A+Nc_B-1 for the
        # other.
        comm_sizes = self.getCommunitySizes() + otherCover.getCommunitySizes()

        ret_val = 1.0
        # First pass: this cover's communities; second pass: the other
        # cover's. Candidates always come from the opposite side via
        # commNet.
        for id_range in (xrange(Nc_A), xrange(Nc_A, Nc_A + Nc_B)):
            scores = []
            for X_k in id_range:
                px = comm_sizes[X_k]/Nf
                H_X = ith.entropy_X([px, 1-px])
                # Initialize to H(X_k): the maximum possible conditional
                # entropy, used when every candidate is rejected.
                best = H_X
                for Y_l in commNet[X_k]:
                    cut = commNet[X_k][Y_l]
                    union = comm_sizes[X_k] + comm_sizes[Y_l] - cut
                    same = h(cut/Nf) + h((Nf - union)/Nf)
                    diff = (h((comm_sizes[X_k] - cut)/Nf)
                            + h((comm_sizes[Y_l] - cut)/Nf))
                    # Only accept candidates whose agreement exceeds
                    # their disagreement.
                    if same > diff:
                        py = comm_sizes[Y_l]/Nf
                        H_Y = ith.entropy_X([py, 1-py])
                        best = min(best, same + diff - H_Y)
                scores.append(best/H_X if H_X else 0)

            ret_val -= 0.5*sum(scores)/len(id_range)

        return ret_val