Example #1
File: __init__.py  Project: BioXiao/cgat
def adjustedMutualInformation(cluster1, cluster2):
    '''
    Using the scikit-learn algorithms, calculate the adjusted mutual
    information for two clusterings.  Assume cluster1 is the
    reference/ground truth clustering.
    The adjusted MI accounts for higher scores by chance, particularly
    in the case where a larger number of clusters leads to a higher MI.

    AMI(U, V) = [MI(U, V) - E(MI(U, V))] / [max(H(U), H(V)) - E(MI(U, V))]
    where E(MI(U, V)) is the expected mutual information given the
    number of clusters and H(U), H(V) are the entropies of clusterings
    U and V.
    '''

    cont = contingency(cluster1, cluster2)
    mi = mutualInformation(cluster1, cluster2)
    sample_size = float(sum([len(cluster1[x]) for x in cluster1.keys()]))

    # Given the number of samples, what is the expected number
    # of overlaps that would occur by chance?
    emi = supervised.expected_mutual_information(cont, sample_size)

    # calculate the entropy for each clustering
    h_clust1, h_clust2 = entropy(cluster1), entropy(cluster2)

    # bug: entropy will return -0 in some instances; make sure this is
    # actually 0, otherwise ami comes out as NaN instead of 0.0
    if abs(h_clust1) == 0.0:
        h_clust1 = 0.0
    if abs(h_clust2) == 0.0:
        h_clust2 = 0.0

    ami = (mi - emi) / (max(h_clust1, h_clust2) - emi)

    # guard against a zero denominator producing NaN
    if np.isnan(ami):
        ami = np.nan_to_num(ami)

    return ami
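
The helpers contingency, mutualInformation and entropy, together with supervised (scikit-learn's supervised clustering-metrics module), are defined or imported elsewhere in the cgat __init__.py, so the snippet above is not self-contained. For a quick cross-check, here is a minimal sketch, assuming cluster1/cluster2 are dicts mapping a cluster label to a list of member ids (as the sample_size line implies), that computes the same quantity from flat label vectors with scikit-learn's public adjusted_mutual_info_score; clusters_to_labels and ami_from_cluster_dicts are illustrative names, not part of cgat:

from sklearn.metrics import adjusted_mutual_info_score


def clusters_to_labels(clustering):
    # map each member id to the label of the cluster that contains it
    return {member: label
            for label, members in clustering.items()
            for member in members}


def ami_from_cluster_dicts(cluster1, cluster2):
    labels1 = clusters_to_labels(cluster1)
    labels2 = clusters_to_labels(cluster2)
    # order the items consistently so the two label vectors line up
    items = sorted(labels1)
    true_labels = [labels1[i] for i in items]
    pred_labels = [labels2[i] for i in items]
    return adjusted_mutual_info_score(true_labels, pred_labels)


# toy example: two clusterings of the same six items
ref = {"a": ["s1", "s2", "s3"], "b": ["s4", "s5", "s6"]}
alt = {"x": ["s1", "s2"], "y": ["s3", "s4", "s5", "s6"]}
print(ami_from_cluster_dicts(ref, alt))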
Example #2
def _ami(ab_cts, average_method='arithmetic'):
    """Adjusted mutual information between two discrete categorical random variables
    based on counts observed and provided in ab_cts.

    Code adapted directly from the scikit-learn AMI implementation to
    accommodate having counts/contingency table instead of rows/instances:
    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_mutual_info_score.html

    Parameters
    ----------
    ab_cts : np.ndarray [len(a_classes) x len(b_classes)]
        Counts for each combination of classes in random variables a and b
        organized in a rectangular array.
    average_method : str
        See sklearn documentation for details

    Returns
    -------
    ami : float
        Adjusted mutual information score for variables a and b"""
    # marginal frequencies of each class in a and b
    a_freq = np.sum(ab_cts, axis=1)
    a_freq = a_freq / np.sum(a_freq)
    b_freq = np.sum(ab_cts, axis=0)
    b_freq = b_freq / np.sum(b_freq)
    n_samples = np.sum(ab_cts)

    # MI of the two clusterings; the contingency table is the joint
    # count distribution [a_classes x b_classes]
    mi = mutual_info_score(None, None, contingency=ab_cts)
    # expected value of the mutual information under chance
    emi = expected_mutual_information(ab_cts, n_samples)
    # entropy of each marginal distribution
    h_true, h_pred = _entropy(a_freq), _entropy(b_freq)
    normalizer = _generalized_average(h_true, h_pred, average_method)
    denominator = normalizer - emi

    # avoid dividing by a (near-)zero denominator while preserving its sign
    if denominator < 0:
        denominator = min(denominator, -np.finfo('float64').eps)
    else:
        denominator = max(denominator, np.finfo('float64').eps)
    ami = (mi - emi) / denominator
    return ami
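
Note that mutual_info_score is public in sklearn.metrics, while expected_mutual_information and _generalized_average are private scikit-learn helpers whose module paths have moved between releases, and _entropy is presumably a helper defined alongside _ami, so those imports are not shown here. A short usage sketch, assuming _ami is importable from whatever module defines it (my_module below is a hypothetical name): build the contingency table with scikit-learn's contingency_matrix, and the result should agree with the public adjusted_mutual_info_score computed from the raw labels.

import numpy as np
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics.cluster import contingency_matrix

from my_module import _ami  # hypothetical home of the snippet above

# two labellings of the same 8 samples
a = np.array([0, 0, 0, 1, 1, 1, 2, 2])
b = np.array([0, 0, 1, 1, 1, 2, 2, 2])

# joint count table [len(a_classes) x len(b_classes)], the input _ami expects
ab_cts = contingency_matrix(a, b)

print(_ami(ab_cts, average_method='arithmetic'))
# should agree with sklearn computed directly from the labels
print(adjusted_mutual_info_score(a, b, average_method='arithmetic'))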