def fowlkes_mallows(y_true, y_pred, contingency=None, PBV=None):
    """Fowlkes-Mallows index: geometric mean of pair-based precision and recall.

    Computed as sqrt(tp^2 / ((tp+fp)*(tp+fn))). Missing intermediates
    (contingency table, pair-based values) are derived on demand.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    denominator = (tp + fp) * (tp + fn)
    return np.sqrt((tp * tp) / denominator)
def recall(y_true, y_pred, contingency=None, PBV=None):
    """Pair-based recall: TP / (TP + FN)."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    # Only true positives and false negatives are needed here.
    tp = PBV[0]
    fn = PBV[2]
    return tp / (tp + fn)
def adjusted_mutual_information(y_true, y_pred, contingency=None, a_i=None,
                                b_j=None, mi=None, real_clustering_entropy=None,
                                predicted_clustering_entropy=None):
    """Adjusted Mutual Information: (MI - E[MI]) / (max(MI) - E[MI]).

    max(MI) is taken as sqrt(H(true) * H(pred)) (the "sqrt" normalization).
    Missing intermediates are computed on demand so callers can share them.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        # BUG FIX: original assigned the function object itself
        # (`utility.compute_b_j` without the call).
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    # Removed unused J_a_i / I_b_j intermediates (dead code in the original).
    N = np.sum(contingency)
    emi = expected_mutual_information(contingency, N)
    maxmi = np.sqrt(real_clustering_entropy * predicted_clustering_entropy)
    return (mi - emi) / (maxmi - emi)
def jaccard_index(y_true, y_pred, contingency=None, PBV=None):
    """Pair-based Jaccard index: TP / (TP + FP + FN)."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, _ = PBV
    denominator = tp + fp + fn
    return tp / denominator
def q2(y_true, y_pred, contingency=None, cond_entrop=None, a_i=None, b_j=None):
    # Normalized information-theoretic criterion; appears to implement the Q2
    # measure (Dom-style Q0 normalized between its min and max) — TODO confirm
    # against the source paper.
    # Returns (maxq0 - q0) / (maxq0 - minq0), so higher is better (1 = best).
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if cond_entrop is None:
        cond_entrop = cond_entropy(y_true, y_pred, contingency)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    # J: number of predicted clusters; N: total number of samples.
    J = np.ma.size(b_j)
    N = np.sum(a_i)
    # Model-cost term: log of multiset coefficients C(a_i + J - 1, J - 1),
    # one per real cluster; masked log turns log(0) into 0.
    combi = comb(a_i + J - 1, J - 1)
    log = np.ma.log(combi)
    log = log.filled(0)
    # Q0 = H(true | pred) + (1/N) * sum(log-costs)
    q0 = cond_entrop + (np.sum(log) / N)
    # Same cost term computed from the predicted-cluster marginals b_j,
    # used for the lower bound of Q0.
    logmin = comb(b_j + J - 1, J - 1)
    logmin = np.ma.log(logmin)
    logmin = logmin.filled(0)
    # Upper bound: entropy of the real clustering plus log(J).
    maxq0 = sklearn.metrics.cluster.entropy(y_true) + np.log(J)
    minq0 = np.sum(logmin) / N
    return (maxq0 - q0) / (maxq0 - minq0)
def v_measure(y_true, y_pred, beta, mutual_information=None, contingency=None,
              predicted_clustering_entropy=None, real_clustering_entropy=None,
              homog=None, compl=None):
    """V-measure: weighted harmonic mean of homogeneity and completeness.

    Enhancement that includes a beta parameter; optional, since no paper used
    the v-measure criterion with beta != 1.
    Returns 0 when both homogeneity and completeness are 0 (degenerate case).
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        # BUG FIX: use sklearn.metrics.cluster.mutual_info_score with the
        # contingency passed by keyword (it is keyword-only in sklearn),
        # consistent with homogeneity()/completness() in this file.
        mutual_information = sklearn.metrics.cluster.mutual_info_score(
            y_true, y_pred, contingency=contingency)
    if predicted_clustering_entropy is None:
        # BUG FIX: was sklearn.metrics.entropy, which does not exist;
        # the entropy helper lives in sklearn.metrics.cluster.
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if homog is None:
        homog = homogeneity(y_true, y_pred, contingency, mutual_information,
                            real_clustering_entropy)
    if compl is None:
        compl = completness(y_true, y_pred, contingency, mutual_information,
                            predicted_clustering_entropy)
    if homog + compl == 0:
        return 0
    return ((1 + beta) * homog * compl) / (beta * homog + compl)
def accuracy(y_true, y_pred, contingency=None, PBV=None):
    """Pair-based accuracy: (TP + TN) / all pairs.

    Note: identical to the Rand Index criterion in the pair-counting context.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    correct = tp + tn
    total = correct + fp + fn
    return correct / total
def ep_ratio(y_true, y_pred, contingency=None, a_i=None, entr=None, pur=None):
    """Entropy/Purity ratio: conditional entropy divided by purity."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    entr = cond_entropy(y_true, y_pred, contingency, a_i) if entr is None else entr
    pur = purity(y_true, y_pred, contingency) if pur is None else pur
    return entr / pur
def false_alarm_rate(y_true, y_pred, contingency=None, PBV=None):
    """Pair-based false alarm rate: FP / (FP + FN), 0 when there are no errors."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    # BUG FIX: was `PBV == None`; identity comparison (`is None`) is the
    # correct idiom and matches every other function in this file.
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    _, fp, fn, _ = PBV
    if fp + fn == 0:
        return 0
    return fp / (fp + fn)
def balanced_accuracy(y_true, y_pred, contingency=None, PBV=None):
    """Pair-based balanced accuracy: mean of sensitivity and specificity."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    return 0.5 * sensitivity + 0.5 * specificity
def goodness(y_true, y_pred, contingency=None, PBV=None, pre=None, rec=None):
    """Goodness criterion: arithmetic mean of pair-based precision and recall."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    if pre is None:
        pre = precision(y_true, y_pred, contingency, PBV)
    if rec is None:
        rec = recall(y_true, y_pred, contingency, PBV)
    return 0.5 * (pre + rec)
def clustering_error(y_true, y_pred, contingency=None, PBV=None):
    """Deprecated placeholder: Clustering Error is redundant with Accuracy.

    Note: it might be different to consider CE and Accuracy, especially with
    subtle variations like micro/macro/weighted averaging; however, in the
    data-pairs context, CE is always equal to 1 - Accuracy.

    BUG FIX: the original computed the contingency table and pair-based
    values here even though they were never used; that dead work is removed.
    The parameters are kept for signature compatibility with the other
    criteria. Returns None.
    """
    print(
        "'Clustering Error' is redundant criterion ; Please use Accuracy criterion instead, and compute 1 - Accuracy"
    )
def completness(y_true, y_pred, contingency=None, mutual_information=None,
                predicted_clustering_entropy=None):
    """Completeness: MI(true, pred) / H(pred)."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        # BUG FIX: the original passed a single tuple
        # `((y_true, y_pred, contingency))` instead of three separate
        # arguments, which raises a TypeError in sklearn.
        mutual_information = sklearn.metrics.cluster.mutual_info_score(
            y_true, y_pred, contingency)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    return mutual_information / predicted_clustering_entropy
def homogeneity(y_true, y_pred, contingency=None, mutual_information=None,
                real_clustering_entropy=None):
    """Homogeneity: MI(true, pred) / H(true)."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        mutual_information = sklearn.metrics.cluster.mutual_info_score(
            y_true, y_pred, contingency)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    ratio = mutual_information / real_clustering_entropy
    return ratio
def test_entropy():
    """Smoke-test cond_entropy: 4-cluster KMeans on the wine dataset."""
    wine = sklearn.datasets.load_wine()
    model = sklearn.cluster.KMeans(4)
    model.fit(wine['data'])
    predicted = model.predict(wine['data'])
    cont = utility.compute_contingency(wine['target'], predicted)
    ce = new_metrics.cond_entropy(wine['target'], predicted, cont)
    print("true", wine['target'])
    print("predicted", predicted)
    print("entropy", ce)
def cond_entropy(y_true, y_pred, contingency=None, a_i=None):
    """Conditional entropy of the real labels given the predicted clusters.

    Averages each predicted cluster's entropy, based on how its members are
    spread over the real clusters. Must be distinguished from plain entropy.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    n_cols = np.size(contingency, axis=1)
    row_totals = np.repeat(a_i, n_cols, axis=1)
    ratios = np.true_divide(contingency, row_totals)
    # Masked log so that 0-count cells contribute 0 instead of -inf.
    logs = np.ma.log(ratios).filled(0)
    total = np.ma.sum(a_i)
    return -np.sum(contingency * logs) / total
def mutual_information(y_true, y_pred, contingency=None, a_i=None, b_j=None):
    """Mutual information between the real and predicted clusterings (nats).

    MI = (1/N) * sum_ij n_ij * log(N * n_ij / (a_i * b_j)), with masked logs
    so that empty cells contribute 0.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        # BUG FIX: original assigned the function object itself
        # (`utility.compute_b_j` without the call).
        b_j = utility.compute_b_j(contingency)
    # Broadcast the marginals to the contingency table's shape.
    J_a_i = np.repeat(a_i, np.ma.size(b_j), axis=1)
    I_b_j = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)
    tmp = np.true_divide(N * contingency, J_a_i * I_b_j)
    log_value = np.ma.log(tmp)
    log_value = log_value.filled(0)
    return np.sum(contingency * log_value) / N
def test_get_metrics():
    """Smoke-test main.get_metrics: 4-cluster KMeans on breast-cancer data."""
    dataset = sklearn.datasets.load_breast_cancer()
    model = sklearn.cluster.KMeans(4)
    model.fit(dataset['data'])
    true = dataset['target']
    pred = model.predict(dataset['data'])
    print("H(reel)", sklearn.metrics.cluster.entropy(true))
    print("H(pred)", sklearn.metrics.cluster.entropy(pred))
    MI = sklearn.metrics.cluster.mutual_info_score(true, pred)
    print("MI", MI)
    cont = utility.compute_contingency(true, pred)
    rep = main.get_metrics(true, pred)
    for name in rep:
        print(name, "\t", rep[name])
    print(cont)
def f_beta_score(y_true, y_pred, beta, contingency=None, PBV=None, pre=None,
                 rec=None):
    """Pair-based F-beta score; returns 0 when precision * recall is 0."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    if pre is None:
        pre = precision(y_true, y_pred, contingency, PBV)
    if rec is None:
        rec = recall(y_true, y_pred, contingency, PBV)
    if pre * rec == 0:
        return 0
    beta2 = np.power(beta, 2)
    numerator = (1 + beta2) * pre * rec
    denominator = beta2 * pre + rec
    return numerator / denominator
def normalized_mutual_information(y_true, y_pred, contingency=None, a_i=None,
                                  b_j=None, mi=None,
                                  real_clustering_entropy=None,
                                  predicted_clustering_entropy=None):
    """NMI with sqrt normalization: MI / sqrt(H(true) * H(pred))."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        # BUG FIX: original assigned the function object itself
        # (`utility.compute_b_j` without the call).
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    return mi / np.sqrt(real_clustering_entropy * predicted_clustering_entropy)
def variation_of_information(y_true, y_pred, contingency=None, a_i=None,
                             b_j=None):
    """Variation of Information: H(true|pred) + H(pred|true), in nats.

    VI = -(1/N) * sum_ij n_ij * (log(n_ij / a_i) + log(n_ij / b_j)),
    with masked logs so that empty cells contribute 0.
    """
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    # Broadcast both marginals to the contingency table's shape.
    row_totals = np.repeat(a_i, np.ma.size(b_j), axis=1)
    col_totals = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)
    log_rows = np.ma.log(np.true_divide(contingency, row_totals)).filled(0)
    log_cols = np.ma.log(np.true_divide(contingency, col_totals)).filled(0)
    return -np.sum(contingency * (log_rows + log_cols)) / N
def test_mutual_information_cond_entropy():
    """Cross-check cond_entropy against two sklearn-derived computations.

    Verifies CE three ways: the project implementation, CE = H(label) - MI,
    and CE = H(label) * (1 - homogeneity).
    """
    wine = sklearn.datasets.load_wine()
    var2 = sklearn.cluster.KMeans(4)
    var2.fit(wine['data'])
    pred2 = var2.predict(wine['data'])
    cont = utility.compute_contingency(wine['target'], pred2)
    print("cont", cont)
    # Our own implementation
    CE = new_metrics.cond_entropy(wine['target'], pred2)
    print("found CE : ", CE)
    # Derived from CE = H(label) - MI
    MI = sklearn.metrics.cluster.mutual_info_score(wine['target'], pred2)
    H_label = sklearn.metrics.cluster.entropy(wine['target'])
    sub = H_label - MI
    print("CI from H_label - MI : ", sub)
    # Derived from homogeneity = 1 - [CE / H(label)]
    # BUG FIX: removed the unused (and misnamed) `H_cluster` local, which
    # recomputed the entropy of the *labels* despite its name.
    homogeneity = sklearn.metrics.cluster.homogeneity_score(
        wine['target'], pred2)
    ce_complet = H_label * (1 - homogeneity)
    print("CI from homogeneity : ", ce_complet)
def compute_prediction_metrics(y_true, y_pred):
    """Compute the score of every criterion in CRITERION_LIST for one prediction.

    Shared intermediates (contingency table, marginals a_i/b_j, entropies,
    pair-based values) are computed once and passed to every metric.
    Returns a dict mapping criterion short-name to its value; prints an error
    if the dict's keys do not match CRITERION_LIST.
    """
    # Values shared by several criterion formulas
    contingency = utility.compute_contingency(y_true, y_pred)
    a_i = utility.compute_a_i(contingency)
    b_j = utility.compute_b_j(contingency)
    true_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    PBV = utility.pair_based_values(contingency)
    # Getting all criterion values
    mi = converted_metrics.mutual_information(y_true, y_pred, contingency,
                                              a_i, b_j)
    ari = converted_metrics.adjusted_rand_index(y_true, y_pred, contingency,
                                                PBV, a_i, b_j)
    ami = converted_metrics.adjusted_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi, true_clustering_entropy,
        predicted_clustering_entropy)
    # BUG FIX: the original stored the homogeneity() result in `compl` and
    # the completness() result in `homog` (with the entropies swapped to
    # match); the numeric values came out right but the code was misleading.
    homog = converted_metrics.homogeneity(y_true, y_pred, contingency, mi,
                                          true_clustering_entropy)
    compl = converted_metrics.completness(y_true, y_pred, contingency, mi,
                                          predicted_clustering_entropy)
    vmeasure = converted_metrics.v_measure(y_true, y_pred, 1, mi, contingency,
                                           predicted_clustering_entropy,
                                           true_clustering_entropy, homog,
                                           compl)
    entropy = new_metrics.cond_entropy(y_true, y_pred, contingency, a_i)
    # accuracy not computed: it equals the Rand Index in pair-counting context
    precision = converted_metrics.precision(y_true, y_pred, contingency, PBV)
    recall = converted_metrics.recall(y_true, y_pred, contingency, PBV)
    falsealarm = converted_metrics.false_alarm_rate(y_true, y_pred,
                                                    contingency, PBV)
    fm = converted_metrics.fowlkes_mallows(y_true, y_pred, contingency, PBV)
    f1 = converted_metrics.f_beta_score(y_true, y_pred, 1, contingency, PBV,
                                        precision, recall)
    purity = new_metrics.purity(y_true, y_pred, contingency)
    inversed_purity = new_metrics.inversed_purity(y_true, y_pred, contingency)
    epratio = new_metrics.ep_ratio(y_true, y_pred, contingency, a_i, entropy,
                                   purity)
    jaccard = converted_metrics.jaccard_index(y_true, y_pred, contingency, PBV)
    nmi = converted_metrics.normalized_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi, true_clustering_entropy,
        predicted_clustering_entropy)
    ri = new_metrics.rand_index(y_true, y_pred, contingency, PBV)
    vi = new_metrics.variation_of_information(y_true, y_pred, contingency,
                                              a_i, b_j)
    # clustering error not calculated: always equal to 1 - accuracy
    goodness = converted_metrics.goodness(y_true, y_pred, contingency, PBV)
    bal_accuracy = converted_metrics.balanced_accuracy(y_true, y_pred,
                                                       contingency, PBV)
    q2 = new_metrics.q2(y_true, y_pred, contingency, entropy, a_i, b_j)
    metrics_dictionnary = {
        "mi": mi,
        "ari": ari,
        "ami": ami,
        "compl": compl,
        "homog": homog,
        "vmeasure": vmeasure,
        "entropy": entropy,
        "precision": precision,
        "recall": recall,
        "falsealarm": falsealarm,
        "fm": fm,
        "f1": f1,
        "purity": purity,
        "inv_purity": inversed_purity,
        "epratio": epratio,
        "jaccard": jaccard,
        "nmi": nmi,
        "ri": ri,
        "vi": vi,
        "goodness": goodness,
        "balacc": bal_accuracy,
        "q2": q2
    }
    if set(metrics_dictionnary.keys()) != CRITERION_LIST:
        print(
            "ERROR : One or several criterion are not computed in main.compute_prediction_metrics"
        )
    return metrics_dictionnary
def inversed_purity(y_true, y_pred, contingency=None):
    """Inverse purity: the Purity definition with the predicted and real
    cluster roles swapped (contingency table transposed)."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    swapped = np.transpose(contingency)
    return purity(y_pred, y_true, swapped)
def purity(y_true, y_pred, contingency=None):
    """Purity: sum of each row's maximum count, divided by the sample count."""
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    row_maxima = np.max(contingency, axis=1)
    n_samples = np.ma.size(y_true)
    return np.sum(row_maxima) / n_samples