Example #1
def fowlkes_mallows(y_true, y_pred, contingency=None, PBV=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    return np.sqrt((tp * tp) / ((tp + fp) * (tp + fn)))
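All pair-counting criteria in these examples rely on utility.pair_based_values, whose implementation is not shown. A minimal sketch of the assumed behaviour, taking rows of the contingency table as real clusters (pair_based_values_sketch is a hypothetical stand-in, not the actual helper):

import numpy as np
from scipy.special import comb

def pair_based_values_sketch(contingency):
    # tp: point pairs grouped together in both clusterings
    n = np.sum(contingency)
    sum_squares = np.sum(contingency ** 2)
    tp = (sum_squares - n) / 2
    # fn: pairs together in the real clustering only; fp: in the prediction only
    row_sums = np.sum(contingency, axis=1)
    col_sums = np.sum(contingency, axis=0)
    fn = (np.sum(row_sums ** 2) - sum_squares) / 2
    fp = (np.sum(col_sums ** 2) - sum_squares) / 2
    # tn: pairs separated in both clusterings
    tn = comb(n, 2) - tp - fp - fn
    return tp, fp, fn, tn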
Example #2
def recall(y_true, y_pred, contingency=None, PBV=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, _, fn, _ = PBV
    return tp / (tp + fn)
Example #3
def adjusted_mutual_information(y_true,
                                y_pred,
                                contingency=None,
                                a_i=None,
                                b_j=None,
                                mi=None,
                                real_clustering_entropy=None,
                                predicted_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)

    N = np.sum(contingency)

    emi = expected_mutual_information(contingency, N)
    maxmi = np.sqrt(real_clustering_entropy * predicted_clustering_entropy)

    return (mi - emi) / (maxmi - emi)
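Since maxmi above is the geometric mean sqrt(H(true) * H(pred)), the result can be sanity-checked against sklearn's AMI with the matching normalization. A hedged check on toy labels:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# average_method='geometric' uses the same sqrt(H(U) * H(V)) upper bound
print(sklearn.metrics.adjusted_mutual_info_score(y_true, y_pred,
                                                 average_method='geometric'))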
Example #4
def jaccard_index(y_true, y_pred, contingency=None, PBV=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    return tp / (tp + fp + fn)
Example #5
def q2(y_true, y_pred, contingency=None, cond_entrop=None, a_i=None, b_j=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if cond_entrop is None:
        cond_entrop = cond_entropy(y_true, y_pred, contingency)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)

    J = np.ma.size(b_j)
    N = np.sum(a_i)

    combi = comb(a_i + J - 1, J - 1)
    log = np.ma.log(combi)
    log = log.filled(0)

    q0 = cond_entrop + (np.sum(log) / N)

    logmin = comb(b_j + J - 1, J - 1)
    logmin = np.ma.log(logmin)
    logmin = logmin.filled(0)

    maxq0 = sklearn.metrics.cluster.entropy(y_true) + np.log(J)
    minq0 = np.sum(logmin) / N

    return (maxq0 - q0) / (maxq0 - minq0)
Example #6
def v_measure(y_true,
              y_pred,
              beta,
              mutual_information=None,
              contingency=None,
              predicted_clustering_entropy=None,
              real_clustering_entropy=None,
              homog=None,
              compl=None):
    # Enhancement that includes a beta parameter; optional, since no paper uses the v-measure criterion with beta != 1
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        mutual_information = sklearn.metrics.mutual_info_score(
            y_true, y_pred, contingency=contingency)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if homog is None:
        homog = homogeneity(y_true, y_pred, contingency, mutual_information,
                            real_clustering_entropy)
    if compl is None:
        compl = completness(y_true, y_pred, contingency, mutual_information,
                            predicted_clustering_entropy)
    if homog + compl == 0:
        return 0
    return ((1 + beta) * homog * compl) / (beta * homog + compl)
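For beta != 1 the value can be compared against sklearn, whose v-measure accepts the same weighting. A sanity-check sketch:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# sklearn applies the same ((1 + beta) * h * c) / (beta * h + c) weighting
print(sklearn.metrics.v_measure_score(y_true, y_pred, beta=0.5))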
Example #7
def accuracy(y_true, y_pred, contingency=None, PBV=None):
    # Note : Identical to the Rand Index criterion in the pair-counting context
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    return (tp + tn) / (tp + tn + fp + fn)
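As the note says, pair-counting accuracy coincides with the Rand index, so sklearn.metrics.rand_score (available in scikit-learn >= 0.24) gives an independent reference:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# Rand index = (tp + tn) / (tp + tn + fp + fn) over all point pairs
print(sklearn.metrics.rand_score(y_true, y_pred))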
Example #8
def ep_ratio(y_true, y_pred, contingency=None, a_i=None, entr=None, pur=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if entr is None:
        entr = cond_entropy(y_true, y_pred, contingency, a_i)
    if pur is None:
        pur = purity(y_true, y_pred, contingency)
    return entr / pur
Example #9
def false_alarm_rate(y_true, y_pred, contingency=None, PBV=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    _, fp, fn, _ = PBV
    if fp + fn == 0:
        return 0
    return fp / (fp + fn)
Example #10
def balanced_accuracy(y_true, y_pred, contingency=None, PBV=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    tp, fp, fn, tn = PBV
    a = tp / (tp + fn)
    b = tn / (tn + fp)
    return 0.5 * a + 0.5 * b
Example #11
def goodness(y_true, y_pred, contingency=None, PBV=None, pre=None, rec=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    if pre is None:
        pre = precision(y_true, y_pred, contingency, PBV)
    if rec is None:
        rec = recall(y_true, y_pred, contingency, PBV)
    return 0.5 * (pre + rec)
Example #12
def clustering_error(y_true, y_pred, contingency=None, PBV=None):
    # Note : CE and Accuracy may differ in general, especially under subtle variations like micro/macro/weighted averaging.
    # However, in the data-pairs context, CE is always equal to 1 - Accuracy
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    print(
        "'Clustering Error' is a redundant criterion; please use the Accuracy criterion instead and compute 1 - Accuracy"
    )
Example #13
def completness(y_true,
                y_pred,
                contingency=None,
                mutual_information=None,
                predicted_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        mutual_information = sklearn.metrics.cluster.mutual_info_score(
            y_true, y_pred, contingency=contingency)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    return mutual_information / predicted_clustering_entropy
Example #14
def homogeneity(y_true,
                y_pred,
                contingency=None,
                mutual_information=None,
                real_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if mutual_information is None:
        mutual_information = sklearn.metrics.cluster.mutual_info_score(
            y_true, y_pred, contingency=contingency)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    return mutual_information / real_clustering_entropy
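Both ratios have direct sklearn counterparts, which makes them easy to validate. A hedged cross-check:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# homogeneity = MI / H(true), completeness = MI / H(pred)
print(sklearn.metrics.homogeneity_score(y_true, y_pred))
print(sklearn.metrics.completeness_score(y_true, y_pred))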
Example #15
def test_entropy():
    wine = sklearn.datasets.load_wine()
    var2 = sklearn.cluster.KMeans(4)
    var2.fit(wine['data'])
    pred2 = var2.predict(wine['data'])

    contingency = utility.compute_contingency(wine['target'], pred2)

    entropy = new_metrics.cond_entropy(wine['target'], pred2, contingency)

    print("true", wine['target'])
    print("predicted", pred2)
    print("entropy", entropy)
Example #16
def cond_entropy(y_true, y_pred, contingency=None, a_i=None):
    # Compute the average entropy of the predicted clusters, based on how each cluster's points are spread over the real clusters
    # Must be distinguished from plain (unconditional) entropy
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)

    J_a_i = np.repeat(a_i, np.size(contingency, axis=1), axis=1)
    log_value = np.true_divide(contingency, J_a_i)
    log_value = np.ma.log(log_value)  # Required to deal with 0's
    log_value = log_value.filled(0)
    N = np.ma.sum(a_i)
    return -np.sum(contingency * log_value) / N
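Example #22 below checks this implementation through the chain rule CE = H(label) - MI; the same identity can be evaluated standalone with sklearn reference values:

import numpy as np
import sklearn.metrics

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 0, 1, 2, 2, 2])

# Chain rule: H(true | pred) = H(true) - MI(true, pred)
mi = sklearn.metrics.mutual_info_score(y_true, y_pred)
h_true = sklearn.metrics.cluster.entropy(y_true)
print(h_true - mi)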
Example #17
def mutual_information(y_true, y_pred, contingency=None, a_i=None, b_j=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    J_a_i = np.repeat(a_i, np.ma.size(b_j), axis=1)
    I_b_j = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)

    tmp = np.true_divide(N * contingency, J_a_i * I_b_j)
    log_value = np.ma.log(tmp)
    log_value = log_value.filled(0)

    return np.sum(contingency * log_value) / N
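The sum above is the natural-log mutual information, so sklearn.metrics.mutual_info_score should return the same value. A quick hedged check:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# Same quantity: sum_ij (n_ij / N) * log(N * n_ij / (a_i * b_j))
print(sklearn.metrics.mutual_info_score(y_true, y_pred))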
Example #18
def test_get_metrics():
    dataset = sklearn.datasets.load_breast_cancer()
    var2 = sklearn.cluster.KMeans(4)
    var2.fit(dataset['data'])
    true = dataset['target']
    pred = var2.predict(dataset['data'])
    print("H(reel)", sklearn.metrics.cluster.entropy(true))
    print("H(pred)", sklearn.metrics.cluster.entropy(pred))
    MI = sklearn.metrics.cluster.mutual_info_score(true, pred)
    print("MI", MI)
    cont = utility.compute_contingency(true, pred)
    rep = main.get_metrics(true, pred)
    for a in rep:
        print(a, "\t", rep[a])

    print(cont)
Example #19
def f_beta_score(y_true,
                 y_pred,
                 beta,
                 contingency=None,
                 PBV=None,
                 pre=None,
                 rec=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if PBV is None:
        PBV = utility.pair_based_values(contingency)
    if pre is None:
        pre = precision(y_true, y_pred, contingency, PBV)
    if rec is None:
        rec = recall(y_true, y_pred, contingency, PBV)
    beta2 = np.power(beta, 2)
    if pre * rec == 0:
        return 0
    return ((1 + beta2) * pre * rec) / (beta2 * pre + rec)
Example #20
def normalized_mutual_information(y_true,
                                  y_pred,
                                  contingency=None,
                                  a_i=None,
                                  b_j=None,
                                  mi=None,
                                  real_clustering_entropy=None,
                                  predicted_clustering_entropy=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    if mi is None:
        mi = mutual_information(y_true, y_pred, contingency, a_i, b_j)
    if real_clustering_entropy is None:
        real_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    if predicted_clustering_entropy is None:
        predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    return mi / np.sqrt(real_clustering_entropy * predicted_clustering_entropy)
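The sqrt(H(true) * H(pred)) denominator corresponds to sklearn's geometric averaging, which allows a direct comparison. A hedged check:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# Geometric normalization matches the denominator used above
print(sklearn.metrics.normalized_mutual_info_score(y_true, y_pred,
                                                   average_method='geometric'))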
Example #21
def variation_of_information(y_true,
                             y_pred,
                             contingency=None,
                             a_i=None,
                             b_j=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    if a_i is None:
        a_i = utility.compute_a_i(contingency)
    if b_j is None:
        b_j = utility.compute_b_j(contingency)
    J_a_i = np.repeat(a_i, np.ma.size(b_j), axis=1)
    I_b_j = np.repeat(b_j, np.ma.size(a_i), axis=0)
    N = np.sum(a_i)

    log1 = np.true_divide(contingency, J_a_i)
    log1 = np.ma.log(log1)
    log1 = log1.filled(0)
    log2 = np.true_divide(contingency, I_b_j)
    log2 = np.ma.log(log2)
    log2 = log2.filled(0)

    return -np.sum(contingency * (log1 + log2)) / N
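VI also satisfies VI = H(true) + H(pred) - 2 * MI, which gives an independent consistency check using sklearn reference values:

import sklearn.metrics

y_true = [0, 0, 1, 1, 2, 2]
y_pred = [0, 0, 1, 2, 2, 2]

# VI = H(true) + H(pred) - 2 * MI
h_true = sklearn.metrics.cluster.entropy(y_true)
h_pred = sklearn.metrics.cluster.entropy(y_pred)
mi = sklearn.metrics.mutual_info_score(y_true, y_pred)
print(h_true + h_pred - 2 * mi)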
Example #22
def test_mutual_information_cond_entropy():
    wine = sklearn.datasets.load_wine()
    var2 = sklearn.cluster.KMeans(4)
    var2.fit(wine['data'])
    pred2 = var2.predict(wine['data'])
    cont = utility.compute_contingency(wine['target'], pred2)
    print("cont", cont)

    # Own implementation
    CE = new_metrics.cond_entropy(wine['target'], pred2)
    print("found CE : ", CE)

    # Computed from CE = H(label) - MI
    MI = sklearn.metrics.cluster.mutual_info_score(wine['target'], pred2)
    H_label = sklearn.metrics.cluster.entropy(wine['target'])
    sub = H_label - MI
    print("CI from H_label - MI : ", sub)

    # Computed from hom = 1 - [CE / H(label)]
    homogeneity = sklearn.metrics.cluster.homogeneity_score(
        wine['target'], pred2)
    ce_complet = H_label * (1 - homogeneity)
    print("CI from homogeneity : ", ce_complet)
Example #23
def compute_prediction_metrics(y_true, y_pred):
    # Compute the score of each criterion present in CRITERION_LIST

    # Pre-compute values that appear in the formulas of several criteria
    contingency = utility.compute_contingency(y_true, y_pred)
    a_i = utility.compute_a_i(contingency)
    b_j = utility.compute_b_j(contingency)
    true_clustering_entropy = sklearn.metrics.cluster.entropy(y_true)
    predicted_clustering_entropy = sklearn.metrics.cluster.entropy(y_pred)
    PBV = utility.pair_based_values(contingency)

    # Getting all criterion values
    mi = converted_metrics.mutual_information(y_true, y_pred, contingency, a_i,
                                              b_j)
    ari = converted_metrics.adjusted_rand_index(y_true, y_pred, contingency,
                                                PBV, a_i, b_j)
    ami = converted_metrics.adjusted_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi, true_clustering_entropy,
        predicted_clustering_entropy)
    homog = converted_metrics.homogeneity(y_true, y_pred, contingency, mi,
                                          true_clustering_entropy)
    compl = converted_metrics.completness(y_true, y_pred, contingency, mi,
                                          predicted_clustering_entropy)
    vmeasure = converted_metrics.v_measure(y_true, y_pred, 1, mi, contingency,
                                           predicted_clustering_entropy,
                                           true_clustering_entropy, homog,
                                           compl)
    entropy = new_metrics.cond_entropy(y_true, y_pred, contingency, a_i)
    # accuracy = converted_metrics.accuracy(y_true, y_pred, contingency, PBV) : equal to the Rand Index in the pair-counting context
    precision = converted_metrics.precision(y_true, y_pred, contingency, PBV)
    recall = converted_metrics.recall(y_true, y_pred, contingency, PBV)
    falsealarm = converted_metrics.false_alarm_rate(y_true, y_pred,
                                                    contingency, PBV)
    fm = converted_metrics.fowlkes_mallows(y_true, y_pred, contingency, PBV)
    f1 = converted_metrics.f_beta_score(y_true, y_pred, 1, contingency, PBV,
                                        precision, recall)
    purity = new_metrics.purity(y_true, y_pred, contingency)
    inversed_purity = new_metrics.inversed_purity(y_true, y_pred, contingency)
    epratio = new_metrics.ep_ratio(y_true, y_pred, contingency, a_i, entropy,
                                   purity)
    jaccard = converted_metrics.jaccard_index(y_true, y_pred, contingency, PBV)
    nmi = converted_metrics.normalized_mutual_information(
        y_true, y_pred, contingency, a_i, b_j, mi, true_clustering_entropy,
        predicted_clustering_entropy)
    ri = new_metrics.rand_index(y_true, y_pred, contingency, PBV)
    vi = new_metrics.variation_of_information(y_true, y_pred, contingency, a_i,
                                              b_j)
    # clustering error not calculated : always equal to 1 - accuracy
    goodness = converted_metrics.goodness(y_true, y_pred, contingency, PBV)
    bal_accuracy = converted_metrics.balanced_accuracy(y_true, y_pred,
                                                       contingency, PBV)
    q2 = new_metrics.q2(y_true, y_pred, contingency, entropy, a_i, b_j)

    metrics_dictionnary = {
        "mi": mi,
        "ari": ari,
        "ami": ami,
        "compl": compl,
        "homog": homog,
        "vmeasure": vmeasure,
        "entropy": entropy,
        "precision": precision,
        "recall": recall,
        "falsealarm": falsealarm,
        "fm": fm,
        "f1": f1,
        "purity": purity,
        "inv_purity": inversed_purity,
        "epratio": epratio,
        "jaccard": jaccard,
        "nmi": nmi,
        "ri": ri,
        "vi": vi,
        "goodness": goodness,
        "balacc": bal_accuracy,
        "q2": q2
    }

    if set(metrics_dictionnary.keys()) != CRITERION_LIST:
        print(
            "ERROR : One or several criteria are not computed in main.compute_prediction_metrics"
        )

    return metrics_dictionnary
Example #24
def inversed_purity(y_true, y_pred, contingency=None):
    # Note : Simply the definition of Purity with the roles of the predicted and the real clusters swapped
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    return purity(y_pred, y_true, np.transpose(contingency))
Example #25
def purity(y_true, y_pred, contingency=None):
    if contingency is None:
        contingency = utility.compute_contingency(y_true, y_pred)
    max_i = np.max(contingency, axis=1)
    return np.sum(max_i) / np.ma.size(y_true)
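A tiny worked example of the same computation: for contingency [[5, 1], [2, 4]] over N = 12 points, the row maxima are 5 and 4, so purity = (5 + 4) / 12 = 0.75. A hand-check sketch:

import numpy as np

contingency = np.array([[5, 1],
                        [2, 4]])
max_i = np.max(contingency, axis=1)          # best match per row: [5, 4]
print(np.sum(max_i) / np.sum(contingency))   # 9 / 12 = 0.75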