def getMaxVariationOfInformation_slow(self, otherCover, N_nodes=None):
    """Return maximum variation of information.

    Note that the time complexity is O(N*M), where N is the number of
    communities in `self` and M is the number of communities in
    `otherCover`.
    """
    def neg_p_log2_p(p):
        if p == 0:
            return 0.0
        return -p * np.log2(p)

    # Infer the node count from the union of all communities when the
    # caller did not supply it.
    if N_nodes is None:
        union = set()
        for community in self.comm:
            union.update(community)
        N_nodes = len(union)
        del union
    Nf = float(N_nodes)  # float so the ratios below use true division

    result = 1.0
    pairings = [(self.comm, otherCover.comm),
                (otherCover.comm, self.comm)]
    for XF, YF in pairings:
        normalized = []
        for X_k in XF:
            # Find min over Y_l in YF of H(X_k|Y_l). H_min starts at
            # H(X_k), the maximum possible value, which is kept if no
            # Y_l passes the acceptance test below.
            px = len(X_k) / Nf
            H_X = ith.entropy_X([px, 1 - px])
            H_min = H_X
            for Y_l in YF:
                cut = len(X_k.intersection(Y_l))
                hP_same = (neg_p_log2_p(cut / Nf)
                           + neg_p_log2_p((Nf - len(X_k.union(Y_l))) / Nf))
                hP_diff = (neg_p_log2_p((len(X_k) - cut) / Nf)
                           + neg_p_log2_p((len(Y_l) - cut) / Nf))
                if hP_same > hP_diff:
                    py = len(Y_l) / Nf
                    H_Y = ith.entropy_X([py, 1 - py])
                    candidate = hP_same + hP_diff - H_Y
                    if candidate < H_min:
                        H_min = candidate
            normalized.append(0 if H_X == 0 else H_min / H_X)
        result -= 0.5 * sum(normalized) / len(XF)
    return result
def entropy(self):
    """Entropy of the node partition."""
    try:
        # EAFP: return the cached value if it was computed before.
        return self._entropy
    except AttributeError:
        sizes = self.C_sizes
        probabilities = np.array(sizes) / float(sum(sizes))
        self._entropy = ith.entropy_X(probabilities)
        return self._entropy
def test_entropy_X(self):
    # Uniform distribution over two outcomes: exactly one bit.
    dist = [0.5, 0.5]
    self.assertEqual("%.10f" % it.entropy_X(dist), "%.10f" % 1.0)
    # Uniform distribution over five outcomes: log2(5) bits.
    dist = [0.2, 0.2, 0.2, 0.2, 0.2]
    self.assertEqual("%.10f" % it.entropy_X(dist), "%.10f" % -np.log2(0.2))
    # A skewed distribution against a known reference value.
    dist = [0.1, 0.2, 0.7]
    self.assertEqual("%.10f" % it.entropy_X(dist),
                     "%.10f" % 1.1567796494470395)
    # A deterministic outcome carries zero entropy.
    self.assertEqual(it.entropy_X([1, 0]), 0)
def getMaxVariationOfInformation(self, otherCover):
    """Return maximum variation of information.

    The maximum variation of information can be used to compare two
    families with overlapping node set. The definition comes from
    Appendix B of Andrea Lancichinetti, Santo Fortunato, Janos
    Kertesz (2009) `Detecting the overlapping and hierarchical
    community structure of complex networks'.

    Parameters
    ----------
    otherCover : NodeCover object
        The other community structure to compare with. The total node
        count used in the calculation is the larger of `self.N_nodes`
        and `otherCover.N_nodes`; note that nodes that do not belong
        to any community are not accounted for.

    Return
    ------
    mv : float
        The maximum variation of information.

    Notes
    -----
    Time complexity is roughly O(N)+O(M^2), where N is the total
    number of nodes and M is largest number of communities for one
    node.

    If one community structure consists of only one community
    covering all nodes, this measure is always 0.5 (unless of course
    the other one is identical, in which case 1.0 is returned.)
    """
    def p_log2_p(p):
        return (0.0 if p == 0 else -p*np.log2(p))

    # Find out the number of nodes.
    Nf = float(max(self.N_nodes, otherCover.N_nodes))

    # Construct the bipartite community network: commNet[X][Y] is the
    # size of the intersection of communities X and Y.
    commNet = self._getOverlapNetwork(otherCover)
    Nc_A = len(self)
    Nc_B = len(otherCover)

    # List of community sizes, indexed by community ID in commNet.
    comm_sizes = self.getCommunitySizes() + otherCover.getCommunitySizes()

    ret_val = 1.0
    for IX, IY in [(xrange(Nc_A), xrange(Nc_A, Nc_A+Nc_B)),
                   (xrange(Nc_A, Nc_A+Nc_B), xrange(Nc_A))]:
        # IX contains the IDs of first communities (first node
        # cover), IY the IDs of the second communities. These are
        # flipped on the second iteration to calculate the same
        # thing the other way around.
        H_norm = []
        for X_k in IX:
            # Calculate the entropies H(X_k|Y_l) for all Y_l in IY
            # and find the minimum. H_min is initialized to H(X_k),
            # which is the maximum value and will be the final value
            # if no Y_l is accepted. Only the communities that
            # actually overlap X_k (neighbors in commNet) need to be
            # inspected.
            px = comm_sizes[X_k]/Nf
            H_min = H_X = ith.entropy_X([px, 1-px])
            for Y_l in commNet[X_k]:
                cut_size = commNet[X_k][Y_l]
                union_size = comm_sizes[X_k] + comm_sizes[Y_l] - cut_size
                hP_same = (p_log2_p(cut_size/Nf)
                           + p_log2_p((Nf - union_size)/Nf))
                hP_diff = (p_log2_p((comm_sizes[X_k] - cut_size)/Nf)
                           + p_log2_p((comm_sizes[Y_l] - cut_size)/Nf))
                if (hP_same > hP_diff):
                    py = comm_sizes[Y_l]/Nf
                    H_Y = ith.entropy_X([py, 1-py])
                    H_min = min(H_min, hP_same + hP_diff - H_Y)
            H_norm.append((0 if H_X == 0 else H_min/H_X))
        ret_val -= 0.5*sum(H_norm)/len(IX)
    return ret_val