def v_measure(self, beta=1):
    """Computes Rosenberg and Hirschberg's V-measure (EMNLP '07), which
    ranges between 0 and 1 (1 is best). The beta parameter can be used to
    weight homogeneity or completeness; the default is the balanced
    harmonic mean, and beta > 1 favors homogeneity."""
    h_c = entropy_of_multinomial(self.gold_sizes.values())
    h_k = entropy_of_multinomial(
        [sum(table.values()) for table in self.by_test.values()])
    if h_c == 0:
        homo = 1
    else:
        h_c_given_k = self.conditional_entropy_gold_given_test()
        homo = 1 - h_c_given_k / h_c
    if h_k == 0:
        comp = 1
    else:
        h_k_given_c = conditional_entropy_Y_Given_X(
            dict(self.as_confusion_items()))
        comp = 1 - h_k_given_c / h_k
    # weighted harmonic mean of homogeneity and completeness
    return fscore(homo, comp, beta)
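# For intuition, a minimal self-contained sketch of the same computation on a
# plain confusion matrix (rows = gold classes, columns = induced clusters).
# The helper names here are hypothetical, not the Probably module's API, and
# the exact beta convention of fscore above isn't shown; this sketch uses the
# V-measure paper's V_beta = (1 + beta) * h * c / (beta * h + c), which is
# the balanced harmonic mean at beta = 1.
from math import log

def _entropy(counts):
    """Shannon entropy (in nats) of a sequence of nonnegative counts."""
    total = float(sum(counts))
    return -sum(c / total * log(c / total) for c in counts if c > 0)

def v_measure_sketch(confusion, beta=1.0):
    n = float(sum(sum(row) for row in confusion))
    h_c = _entropy([sum(row) for row in confusion])        # H(C)
    h_k = _entropy([sum(col) for col in zip(*confusion)])  # H(K)
    # H(C|K): entropy of the gold labels inside each induced cluster,
    # weighted by cluster size; H(K|C) is the symmetric quantity.
    h_c_given_k = sum(sum(col) / n * _entropy(col) for col in zip(*confusion))
    h_k_given_c = sum(sum(row) / n * _entropy(row) for row in confusion)
    homo = 1.0 if h_c == 0 else 1.0 - h_c_given_k / h_c
    comp = 1.0 if h_k == 0 else 1.0 - h_k_given_c / h_k
    if beta * homo + comp == 0:
        return 0.0
    return (1 + beta) * homo * comp / (beta * homo + comp)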
def normalized_vi(self):
    """Calculates NVI (Reichart and Rappoport '09), which is VI/H(C):
    variation of information normalized by the entropy of the true
    clustering. This metric is 0 for a perfect clustering and 1 for the
    single-cluster clustering; 'reasonable' clusterings score in between."""
    hc = entropy_of_multinomial(self.gold_sizes.values())
    if hc == 0:
        return 0
    return self.variation_of_information() / hc
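# variation_of_information itself isn't shown here; it computes
# VI(C, K) = H(C|K) + H(K|C). A sketch of NVI under the same assumptions,
# reusing the hypothetical _entropy helper from the v_measure sketch above:
def nvi_sketch(confusion):
    n = float(sum(sum(row) for row in confusion))
    h_c = _entropy([sum(row) for row in confusion])
    h_c_given_k = sum(sum(col) / n * _entropy(col) for col in zip(*confusion))
    h_k_given_c = sum(sum(row) / n * _entropy(row) for row in confusion)
    vi = h_c_given_k + h_k_given_c
    # For the single-cluster clustering, H(C|K) = H(C) and H(K|C) = 0,
    # so VI / H(C) = 1, matching the docstring above.
    return 0.0 if h_c == 0 else vi / h_c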
def normalized_mutual_information(self):
    """Normalized mutual information (Strehl and Ghosh, JMLR '02, "Cluster
    Ensembles"), eq. 2: mutual information normalized by the square root
    of the product of the two entropies. The value is between 0 and 1,
    and is 1 for identical clusterings."""
    h_gold = entropy_of_multinomial(self.gold_sizes.values())
    h_test = entropy_of_multinomial(
        [sum(table.values()) for table in self.by_test.values()])
    denom = sqrt(h_gold * h_test)
    if denom == 0:
        if h_gold == 0:
            # gold clustering is entirely uninformative,
            # so anything we do is good
            return 1
        else:
            # induced clustering is entirely uninformative
            return 0
    return self.mutual_information() / denom
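# A matching sketch of NMI on a plain confusion matrix, again with the
# hypothetical _entropy helper from above, using I(C; K) = H(C) - H(C|K):
from math import sqrt

def nmi_sketch(confusion):
    n = float(sum(sum(row) for row in confusion))
    h_c = _entropy([sum(row) for row in confusion])
    h_k = _entropy([sum(col) for col in zip(*confusion)])
    h_c_given_k = sum(sum(col) / n * _entropy(col) for col in zip(*confusion))
    mi = h_c - h_c_given_k
    denom = sqrt(h_c * h_k)
    if denom == 0:
        # same convention as the method above: an uninformative gold
        # clustering scores 1, an uninformative induced clustering scores 0
        return 1.0 if h_c == 0 else 0.0
    return mi / denom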
def entropy(self):
    """Entropy of the multinomial given by this table's counts."""
    from Probably import entropy_of_multinomial
    return entropy_of_multinomial(self.values())
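# Quick sanity check of the sketches on a perfect 2x2 confusion matrix,
# where each metric should hit its ideal value:
if __name__ == '__main__':
    perfect = [[2, 0], [0, 2]]  # two gold classes, each in its own cluster
    print(v_measure_sketch(perfect))  # 1.0
    print(nvi_sketch(perfect))        # 0.0 (0 is best for NVI)
    print(nmi_sketch(perfect))        # 1.0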