Exemplo n.º 1
0
def mixed_estimator(C, Y, estimator, G=None, max_number_partitions=None, sorting=False):
    """
    The mixed estimator for mutual information of Mandros et al. KDD'2020.

    Discretizes each continuous column of C with equal-frequency binning,
    keeping for each column the bin count that maximizes the score.
    `estimator` is the mutual information estimator of choice, G is the set
    of already discrete variables. If sorting=True, performs an initial sort
    of the columns of C on their marginal mutual information using two bins.

    Returns a tuple (best score found, discretized version of C).
    """
    if isinstance(C, (pd.Series, pd.DataFrame)):
        C = C.to_numpy()

    if isinstance(Y, (pd.Series, pd.DataFrame)):
        Y = Y.to_numpy()

    number_of_attributes_in_C = number_of_columns(C)
    num_samples = len(C)

    # Default number of partitions grows logarithmically with sample size.
    if max_number_partitions is None:
        max_number_partitions = math.ceil(math.log10(num_samples))

    # Optional initial ordering of columns by marginal score with two bins.
    # NOTE(review): the original comment claimed decreasing order, but
    # sorted(..., key=itemgetter(0)) is ascending; behavior kept as-is —
    # confirm the intended direction.
    if sorting and number_of_attributes_in_C > 1:
        generator = sort_generator(
            estimator=estimator,
            G=G,
            Y=Y,
            X=C,
        )
        sorted_attributes = sorted(generator, key=itemgetter(0))
        sorted_column_indices = [row[1] for row in sorted_attributes]
        C = C[:, sorted_column_indices]

    discrete_C = None
    best_score = float("-inf")
    for i in range(number_of_attributes_in_C):
        generator = max_generator(
            estimator=estimator,
            G=G,
            Y=Y,
            X=get_column(C, i),
            max_number_partitions=max_number_partitions,
        )

        # Best (score, discretization) over all candidate bin counts for column i.
        top_score, top_discretized_attribute = max(generator, key=itemgetter(0))

        if top_score > best_score:
            best_score = top_score
            discrete_C = append_two_arrays(discrete_C, top_discretized_attribute)
        else:
            # Column does not improve the score: append a constant (all-zero)
            # column so the output keeps one column per input attribute.
            discrete_C = append_two_arrays(discrete_C, np.zeros((num_samples, 1)))

    # Bug fix: return the best score found, not the last iteration's
    # top_score — best_score is tracked for exactly this purpose.
    return best_score, discrete_C
Exemplo n.º 2
0
def fraction_of_information_plugin(X,
                                   Y,
                                   with_cross_tab=False,
                                   contingency_table=None,
                                   entropy_Y=None):
    """
    The plugin estimator for the fraction of information F(X;Y) = I(X;Y) / H(Y).

    It can be computed either using crosstab from pandas, or with numpy.
    A precomputed contingency table and entropy of Y can be provided if
    they are available.
    """

    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.to_numpy()

    if isinstance(Y, (pd.Series, pd.DataFrame)):
        Y = Y.to_numpy()

    # Bug fix: compare against None with `is not None` — `!= None` on a
    # numpy array / DataFrame does an element-wise comparison whose truth
    # value is ambiguous and raises ValueError.
    if with_cross_tab or contingency_table is not None:
        return fraction_of_information_from_cross_tab(
            X, Y, contingency_table=contingency_table, entropy_Y=entropy_Y)

    entropyX = entropy_plugin(X)

    if entropy_Y is None:
        entropy_Y = entropy_plugin(Y)
    dataXY = append_two_arrays(X, Y)
    entropyXY = entropy_plugin(dataXY)
    # F(X;Y) = I(X;Y) / H(Y), with I(X;Y) = H(X) + H(Y) - H(X,Y).
    return (entropyX + entropy_Y - entropyXY) / entropy_Y
Exemplo n.º 3
0
def test_mixed_fraction_information_permutation_another():
    """Tests mixed_estimator with the permutation fraction-of-information
    estimator on a duplicated column: the second (redundant) copy of X
    should be discretized to an all-zero column."""
    # input: X duplicated so the second copy carries no extra information
    X = np.arange(8)
    Y = np.array([1, 1, 2, 2, 3, 3, 4, 4])

    error = 1e-10

    # do
    res = mixed_estimator(append_two_arrays(X, X), Y, fraction_of_information_permutation, max_number_partitions=4)

    # assert: expected score, first column binned into 4 bins,
    # redundant second column collapsed to constant zeros
    assert res[0] == approx(0.4285714285714287, rel=error)
    assert (np.array_equal(res[1][:, 0], [0, 0, 1, 1, 2, 2, 3, 3]))
    assert (np.array_equal(res[1][:, 1], [0, 0, 0, 0, 0, 0, 0, 0]))
Exemplo n.º 4
0
def mutual_information_plugin(X,
                              Y,
                              with_cross_tab=False,
                              contingency_table=None):
    """
    The plugin estimator for mutual information I(X;Y) between two attribute
    sets X and Y.

    It can be computed either using crosstab from pandas, or with numpy.
    A precomputed contingency table can be provided if it is available.
    """
    if isinstance(X, (pd.Series, pd.DataFrame)):
        X = X.to_numpy()

    if isinstance(Y, (pd.Series, pd.DataFrame)):
        Y = Y.to_numpy()

    # Bug fix: compare against None with `is not None` — `!= None` on a
    # numpy array / DataFrame does an element-wise comparison whose truth
    # value is ambiguous and raises ValueError.
    if with_cross_tab or contingency_table is not None:
        return mutual_information_from_cross_tab(
            X, Y, contingency_table=contingency_table)

    entropyX = entropy_plugin(X)
    entropy_Y = entropy_plugin(Y)
    dataXY = append_two_arrays(X, Y)
    entropyXY = entropy_plugin(dataXY)
    # I(X;Y) = H(X) + H(Y) - H(X,Y)
    return entropyX + entropy_Y - entropyXY
Exemplo n.º 5
0
def sort_generator(estimator, G, Y, X):
    """Yield a (score, column_index) pair for every column of X.

    Each column is discretized into two equal-frequency bins, joined with
    the already-discrete attributes G, and scored against Y with the given
    estimator."""
    num_columns = number_of_columns(X)
    for column_index in range(num_columns):
        two_bin_column = pd.qcut(
            get_column(X, column_index), 2, labels=False, duplicates='drop')
        candidate = append_two_arrays(two_bin_column, G)
        yield estimator(candidate, Y), column_index
Exemplo n.º 6
0
def max_generator(estimator, G, Y, X, max_number_partitions):
    """Yield a (score, discretized X) pair for every candidate bin count
    from 2 up to max_number_partitions.

    X is discretized with equal-frequency binning, joined with the
    already-discrete attributes G, and scored against Y with the given
    estimator."""
    num_bins = 2
    while num_bins <= max_number_partitions:
        binned = pd.qcut(X, num_bins, labels=False, duplicates='drop')
        joined = append_two_arrays(binned, G)
        yield estimator(joined, Y), binned
        num_bins += 1