import numpy as np
import sklearn.metrics.cluster
from scipy.special import comb  # comb() as used below; assumed to come from scipy.special

import utility


def q2(y_true, y_pred, contingency=None, cond_entrop=None, a_i=None, b_j=None):
    # Q2 criterion: Q0 (conditional entropy plus a coding cost) rescaled so that
    # its worst value (maxq0) maps to 0 and its best value (minq0) maps to 1.
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if cond_entrop is None:
        cond_entrop = cond_entropy(y_true, y_pred, contingency)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    J = np.ma.size(b_j)
    N = np.sum(a_i)
    combi = comb(a_i + J - 1, J - 1)
    log = np.ma.log(combi)
    log = log.filled(0)
    q0 = cond_entrop + (np.sum(log) / N)
    logmin = comb(b_j + J - 1, J - 1)
    logmin = np.ma.log(logmin)
    logmin = logmin.filled(0)
    maxq0 = sklearn.metrics.cluster.entropy(y_true) + np.log(J)
    minq0 = np.sum(logmin) / N
    return (maxq0 - q0) / (maxq0 - minq0)
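
# A minimal usage sketch for q2: higher is better, and a prediction identical to
# the ground truth (up to relabelling) should reach 1. The toy labels are
# hypothetical, and the default arguments assume the project's utility helpers
# are importable.
def _q2_example():
    y_true = np.array([0, 0, 1, 1, 2, 2])
    y_pred = np.array([2, 2, 0, 0, 1, 1])  # same partition, clusters relabelled
    print(q2(y_true, y_pred))  # should print 1.0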
def adjusted_mutual_information(y_true, y_pred, contingency=None, a_i=None, b_j=None,
                                mi=None, real_clustering_entropy=None,
                                predicted_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    N = np.sum(contingency)
    # expected_mutual_information is defined elsewhere in the project.
    emi = expected_mutual_information(contingency, N)
    # Geometric-mean upper bound on the mutual information.
    maxmi = np.sqrt(real_clustering_entropy * predicted_clustering_entropy)
    return (mi - emi) / (maxmi - emi)
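
# A hedged cross-check for adjusted_mutual_information: with the geometric-mean
# upper bound used above it should agree with sklearn's AMI computed with
# average_method='geometric', provided the utility helpers and the project's
# expected_mutual_information follow the standard definitions. Toy labels only.
def _ami_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 0, 1, 1, 2, 2, 2])
    y_pred = np.array([0, 0, 1, 1, 1, 2, 2, 2])
    print(adjusted_mutual_information(y_true, y_pred),
          sklearn.metrics.adjusted_mutual_info_score(y_true, y_pred,
                                                     average_method='geometric'))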
def cond_entropy(y_true, y_pred, contingency=None, a_i=None):
    # Average entropy of the predicted clusters, measuring how the members of
    # each predicted cluster are spread over the real clusters.
    # Not to be confused with the (unconditional) clustering entropy.
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    J_a_i = np.repeat(a_i, np.size(contingency, axis=1), axis=1)
    log_value = np.true_divide(contingency, J_a_i)
    log_value = np.ma.log(log_value)  # masked log, required to deal with zeros
    log_value = log_value.filled(0)
    N = np.ma.sum(a_i)
    return -np.sum(contingency * log_value) / N
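
# A small sanity check for cond_entropy, assuming (as the comment above describes)
# that the contingency rows index predicted clusters and its columns index real
# classes. Under that convention the result is H(true | pred), which must equal
# H(true) - MI(true, pred). The table and marginals are built by hand so the
# check does not depend on the utility helpers; the labels are toy data.
def _cond_entropy_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 1, 1, 2, 2, 2])
    y_pred = np.array([0, 0, 0, 1, 1, 2, 2])
    contingency = np.array([[np.sum((y_pred == k) & (y_true == c))
                             for c in np.unique(y_true)]
                            for k in np.unique(y_pred)], dtype=float)
    a_i = contingency.sum(axis=1, keepdims=True)  # predicted-cluster sizes, shape (K, 1)
    print(cond_entropy(y_true, y_pred, contingency, a_i),
          sklearn.metrics.cluster.entropy(y_true)
          - sklearn.metrics.mutual_info_score(y_true, y_pred))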
def mutual_information(y_true, y_pred, contingency=None, a_i=None, b_j=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    # Broadcast the marginals to the shape of the contingency table.
    J_a_i = np.repeat(a_i, np.ma.size(b_j), axis=1)
    I_b_j = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)
    tmp = np.true_divide(N * contingency, J_a_i * I_b_j)
    log_value = np.ma.log(tmp)  # masked log, required to deal with zeros
    log_value = log_value.filled(0)
    return np.sum(contingency * log_value) / N
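
# A self-contained sanity check for mutual_information. It builds the contingency
# table and marginals by hand, assuming a_i / b_j are the row / column sums kept
# as 2-D (I, 1) / (1, J) arrays (which is what the np.repeat broadcasting above
# expects), and compares with sklearn's mutual_info_score (also in nats).
def _mutual_information_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 0, 1, 1, 2, 2, 2])
    y_pred = np.array([0, 0, 1, 1, 1, 2, 2, 0])
    contingency = np.array([[np.sum((y_true == c) & (y_pred == k))
                             for k in np.unique(y_pred)]
                            for c in np.unique(y_true)], dtype=float)
    a_i = contingency.sum(axis=1, keepdims=True)  # shape (I, 1)
    b_j = contingency.sum(axis=0, keepdims=True)  # shape (1, J)
    print(mutual_information(y_true, y_pred, contingency, a_i, b_j),
          sklearn.metrics.mutual_info_score(y_true, y_pred))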
def normalized_mutual_information(y_true, y_pred, contingency=None, a_i=None, b_j=None,
                                  mi=None, real_clustering_entropy=None,
                                  predicted_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    # Mutual information normalised by the geometric mean of the two entropies.
    return mi / np.sqrt(real_clustering_entropy * predicted_clustering_entropy)
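
# A hedged comparison for normalized_mutual_information: with the geometric-mean
# normalisation used above it should match sklearn's normalized_mutual_info_score
# with average_method='geometric'. The default arguments assume the project's
# utility helpers are importable; the labels are toy data.
def _nmi_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 1, 1, 2, 2])
    y_pred = np.array([0, 0, 0, 1, 2, 2])
    print(normalized_mutual_information(y_true, y_pred),
          sklearn.metrics.normalized_mutual_info_score(y_true, y_pred,
                                                       average_method='geometric'))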
def adjusted_rand_index(y_true, y_pred, contingency=None, PBV=None, a_i=None, b_j=None):
    # PBV is accepted for interface consistency with the other pair-based metrics,
    # but this formulation only needs the contingency table and its marginals.
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    N = np.sum(a_i)
    # Note: ri, eri and maxri all omit the common division by comb(N, 2);
    # it cancels out in the adjusted index.
    ri = np.sum(comb(contingency, 2))
    eri = (np.sum(comb(a_i, 2)) * np.sum(comb(b_j, 2))) / comb(N, 2)
    maxri = 0.5 * (np.sum(comb(a_i, 2)) + np.sum(comb(b_j, 2)))
    return (ri - eri) / (maxri - eri)
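
# A hedged cross-check for adjusted_rand_index against sklearn's
# adjusted_rand_score. The default arguments assume the project's utility
# helpers are importable; the labels are toy data.
def _ari_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 1, 1, 2, 2])
    y_pred = np.array([0, 0, 1, 2, 2, 2])
    print(adjusted_rand_index(y_true, y_pred),
          sklearn.metrics.adjusted_rand_score(y_true, y_pred))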
def variation_of_information(y_true, y_pred, contingency=None, a_i=None, b_j=None):
    # VI = H(true | pred) + H(pred | true); 0 for identical partitions.
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    J_a_i = np.repeat(a_i, np.ma.size(b_j), axis=1)
    I_b_j = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)
    log1 = np.true_divide(contingency, J_a_i)
    log1 = np.ma.log(log1)  # masked log, required to deal with zeros
    log1 = log1.filled(0)
    log2 = np.true_divide(contingency, I_b_j)
    log2 = np.ma.log(log2)
    log2 = log2.filled(0)
    return -np.sum(contingency * (log1 + log2)) / N
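
# A self-contained identity check for variation_of_information: with a hand-built
# contingency table and marginals (same shape assumptions as the mutual
# information check above), VI must equal H(true) + H(pred) - 2 * MI, all in nats.
def _variation_of_information_example():
    import sklearn.metrics
    y_true = np.array([0, 0, 1, 1, 1, 2, 2])
    y_pred = np.array([0, 1, 1, 1, 2, 2, 2])
    contingency = np.array([[np.sum((y_true == c) & (y_pred == k))
                             for k in np.unique(y_pred)]
                            for c in np.unique(y_true)], dtype=float)
    a_i = contingency.sum(axis=1, keepdims=True)
    b_j = contingency.sum(axis=0, keepdims=True)
    print(variation_of_information(y_true, y_pred, contingency, a_i, b_j),
          sklearn.metrics.cluster.entropy(y_true)
          + sklearn.metrics.cluster.entropy(y_pred)
          - 2 * sklearn.metrics.mutual_info_score(y_true, y_pred))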
import sklearn.metrics.cluster

import converted_metrics
import new_metrics
import utility


def compute_prediction_metrics(y_true, y_pred):
    # Compute the score of every criterion listed in CRITERION_LIST
    # (CRITERION_LIST is defined at module level, not shown here).
    # First compute the values that appear in several criterion formulas.
    contingency = utility.compute_contingency(y_true, y_pred)
    a_i = utility.compute_a_i(contingency)
    b_j = utility.compute_b_j(contingency)
    true_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    PBV = utility.pair_based_values(contingency)

    # Compute every criterion value.
    mi = converted_metrics.mutual_information(y_true, y_pred, contingency, a_i, b_j)
    ari = converted_metrics.adjusted_rand_index(y_true, y_pred, contingency, PBV, a_i, b_j)
    ami = converted_metrics.adjusted_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi,
        true_clustering_entropy, predicted_clustering_entropy)
    compl = converted_metrics.homogeneity(y_true, y_pred, contingency, mi,
                                          predicted_clustering_entropy)
    homog = converted_metrics.completness(y_true, y_pred, contingency, mi,
                                          true_clustering_entropy)
    vmeasure = converted_metrics.v_measure(y_true, y_pred, 1, mi, contingency,
                                           predicted_clustering_entropy,
                                           true_clustering_entropy, homog, compl)
    entropy = new_metrics.cond_entropy(y_true, y_pred, contingency, a_i)
    # accuracy = converted_metrics.accuracy(y_true, y_pred, contingency, PBV)
    # not computed: equal to the ARI in the pair-based-value context
    precision = converted_metrics.precision(y_true, y_pred, contingency, PBV)
    recall = converted_metrics.recall(y_true, y_pred, contingency, PBV)
    falsealarm = converted_metrics.false_alarm_rate(y_true, y_pred, contingency, PBV)
    fm = converted_metrics.fowlkes_mallows(y_true, y_pred, contingency, PBV)
    f1 = converted_metrics.f_beta_score(y_true, y_pred, 1, contingency, PBV, precision, recall)
    purity = new_metrics.purity(y_true, y_pred, contingency)
    inversed_purity = new_metrics.inversed_purity(y_true, y_pred, contingency)
    epratio = new_metrics.ep_ratio(y_true, y_pred, contingency, a_i, entropy, purity)
    jaccard = converted_metrics.jaccard_index(y_true, y_pred, contingency, PBV)
    nmi = converted_metrics.normalized_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi,
        true_clustering_entropy, predicted_clustering_entropy)
    ri = new_metrics.rand_index(y_true, y_pred, contingency, PBV)
    vi = new_metrics.variation_of_information(y_true, y_pred, contingency, a_i, b_j)
    # The clustering error is not computed: it is always equal to 1 - accuracy.
    goodness = converted_metrics.goodness(y_true, y_pred, contingency, PBV)
    bal_accuracy = converted_metrics.balanced_accuracy(y_true, y_pred, contingency, PBV)
    q2 = new_metrics.q2(y_true, y_pred, contingency, entropy, a_i, b_j)

    metrics_dictionnary = {
        "mi": mi, "ari": ari, "ami": ami, "compl": compl, "homog": homog,
        "vmeasure": vmeasure, "entropy": entropy, "precision": precision,
        "recall": recall, "falsealarm": falsealarm, "fm": fm, "f1": f1,
        "purity": purity, "inv_purity": inversed_purity, "epratio": epratio,
        "jaccard": jaccard, "nmi": nmi, "ri": ri, "vi": vi,
        "goodness": goodness, "balacc": bal_accuracy, "q2": q2
    }
    if set(metrics_dictionnary.keys()) != CRITERION_LIST:
        print("ERROR: one or several criteria are not computed in "
              "main.compute_prediction_metrics")
    return metrics_dictionnary
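
# A minimal usage sketch for compute_prediction_metrics, assuming the
# converted_metrics, new_metrics and utility modules (and CRITERION_LIST) are
# available as in the rest of the project. The labels are toy data.
def _compute_prediction_metrics_example():
    import numpy as np
    y_true = np.array([0, 0, 1, 1, 2, 2])
    y_pred = np.array([0, 0, 1, 2, 2, 2])
    scores = compute_prediction_metrics(y_true, y_pred)
    print(scores["ari"], scores["nmi"], scores["vmeasure"])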