예제 #1
0
def test_mutual_info_classif_mixed():
    """Check MI ranking and the effect of ``n_neighbors`` on mixed features.

    The design matrix holds two continuous columns and one discrete column:

    * column 0 is uniform noise,
    * column 1 is column 0 plus independent uniform noise,
    * column 2 is uniform noise that gets binarized at 0.5 *after* the
      target has been derived from its continuous values.

    The target thresholds ``0.5 * X[:, 0] + X[:, 2]`` at 0.5, and the test
    expects the features to rank 2, 0, 1 by decreasing mutual information.
    """
    random_gen = check_random_state(0)
    X = random_gen.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be built before column 2 is turned into a 0/1 feature.
    linear_score = 0.5 * X[:, 0] + X[:, 2]
    y = (linear_score > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    # Arguments common to every estimation below.
    shared_kwargs = dict(discrete_features=[2], random_state=0)

    baseline = mutual_info_classif(X, y, n_neighbors=3, **shared_kwargs)
    assert_array_equal(np.argsort(-baseline), [2, 0, 1])

    for k in (5, 7, 9):
        current = mutual_info_classif(X, y, n_neighbors=k, **shared_kwargs)
        # Continuous features: the estimate is expected to be larger with
        # more neighbors than with the n_neighbors=3 baseline.
        assert current[0] > baseline[0]
        assert current[1] > baseline[1]
        # Discrete feature: n_neighbors must have no effect, so the value
        # has to be exactly the same.
        assert current[2] == baseline[2]
예제 #2
0
def test_mutual_info_classif_discrete():
    """Check the MI-based feature ranking on a tiny all-discrete dataset.

    With every feature treated as discrete, the expected ranking by
    decreasing mutual information is column 0, then column 2, then column 1.
    """
    samples = [[0, 0, 0], [1, 1, 0], [2, 0, 1], [2, 0, 1], [2, 0, 1]]
    labels = [0, 1, 2, 2, 1]
    X = np.array(samples)
    y = np.array(labels)

    scores = mutual_info_classif(X, y, discrete_features=True)

    # Sorting the negated scores yields feature indices from most to least
    # informative.
    ranking = np.argsort(-scores)
    expected_ranking = np.array([0, 2, 1])
    assert_array_equal(ranking, expected_ranking)
예제 #3
0
def test_mutual_info_classif_mixed():
    """Check the MI ranking for two continuous features and one discrete one.

    Column 1 is column 0 plus independent uniform noise. The target
    thresholds ``0.5 * X[:, 0] + X[:, 2]`` at 0.5, and column 2 is binarized
    at 0.5 only after the target has been computed. The expected ranking by
    decreasing mutual information is 2, 0, 1.
    """
    # Use a local generator instead of np.random.seed(0): it produces the
    # same sample stream without mutating the process-global RNG state that
    # other tests may rely on.
    rng = np.random.RandomState(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be derived before column 2 is turned into a 0/1 feature.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], random_state=0)
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
def test_mutual_info_classif_mixed():
    """Check the MI ranking for two continuous features and one discrete one.

    Column 1 is column 0 plus independent uniform noise. The target
    thresholds ``0.5 * X[:, 0] + X[:, 2]`` at 0.5, and column 2 is binarized
    at 0.5 only after the target has been computed. The expected ranking by
    decreasing mutual information is 2, 0, 1.
    """
    # A local RandomState(0) yields the same samples as seeding the global
    # generator with 0, but leaves the global NumPy RNG untouched so the test
    # has no side effect on whatever runs after it.
    rng = np.random.RandomState(0)
    X = rng.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # The target must be derived before column 2 is turned into a 0/1 feature.
    y = ((0.5 * X[:, 0] + X[:, 2]) > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    mi = mutual_info_classif(X, y, discrete_features=[2], random_state=0)
    assert_array_equal(np.argsort(-mi), [2, 0, 1])
예제 #5
0
def test_mutual_info_classif_discrete():
    """Check the MI-based feature ranking on a tiny all-discrete dataset.

    The five samples are described column by column below. With every
    feature treated as discrete, the expected ranking by decreasing mutual
    information is column 0, then column 2, then column 1.
    """
    first_feature = [0, 1, 2, 2, 2]
    second_feature = [0, 1, 0, 0, 0]
    third_feature = [0, 0, 1, 1, 1]
    X = np.column_stack([first_feature, second_feature, third_feature])
    y = np.array([0, 1, 2, 2, 1])

    mi_scores = mutual_info_classif(X, y, discrete_features=True)

    # argsort of the negated scores lists features from most to least
    # informative.
    assert_array_equal(np.argsort(-mi_scores), np.array([0, 2, 1]))
예제 #6
0
def test_mutual_info_classif_mixed():
    """Check MI ranking and the effect of ``n_neighbors`` on mixed features.

    Two columns are continuous (column 1 is column 0 plus independent uniform
    noise) and column 2 is discrete: it is binarized at 0.5 after the target
    has been derived from ``0.5 * X[:, 0] + X[:, 2]``. The expected ranking
    by decreasing mutual information is 2, 0, 1.
    """
    generator = check_random_state(0)
    X = generator.rand(1000, 3)
    X[:, 1] += X[:, 0]
    # Build the target first; column 2 only becomes a 0/1 feature afterwards.
    combined = 0.5 * X[:, 0] + X[:, 2]
    y = (combined > 0.5).astype(int)
    X[:, 2] = X[:, 2] > 0.5

    def estimate(neighbors):
        # Every estimation shares the same discrete mask and seed.
        return mutual_info_classif(X, y, discrete_features=[2],
                                   n_neighbors=neighbors, random_state=0)

    reference = estimate(3)
    assert_array_equal(np.argsort(-reference), [2, 0, 1])

    for neighbors in (5, 7, 9):
        candidate = estimate(neighbors)
        # Continuous features are expected to get a higher MI estimate with
        # more neighbors than with the n_neighbors=3 reference.
        assert_greater(candidate[0], reference[0])
        assert_greater(candidate[1], reference[1])
        # The discrete feature must be unaffected by n_neighbors.
        assert_equal(candidate[2], reference[2])
예제 #7
0
def get_mutual_information(inputs, targets, token2idx, stop_words=None,
                           mask_token=None):
    """Compute per-token mutual information with the targets.

    The inputs are converted to a sparse count matrix with one column per
    entry of ``token2idx``; raw counts are used as-is (no TF-IDF
    reweighting). The scores of the mask token and of any known stop word
    are forced to zero before a small constant is added to every score.

    Parameters
    ----------
    inputs
        Data accepted by ``convert_X_to_ijv_format``; ``inputs.shape[0]``
        gives the number of rows of the count matrix.
    targets
        Class labels passed to ``mutual_info_classif``.
    token2idx : mapping
        Maps each token to its column index in the count matrix.
    stop_words : iterable, optional
        Tokens whose score is zeroed. Tokens missing from ``token2idx`` are
        ignored. ``None`` (the default) means no stop words.
    mask_token : optional
        Token whose score is zeroed. ``None`` (the default) means there is
        no mask token.

    Returns
    -------
    The mutual information scores, one per column, each offset by ``1e-9``.

    Raises
    ------
    KeyError
        If ``mask_token`` is given but is not a key of ``token2idx``.
    """
    # Build the sparse count matrix in CSC format from (value, row, col)
    # triplets.
    values, row_idx, col_idx = convert_X_to_ijv_format(inputs)
    counts = csc_matrix((values, (row_idx, col_idx)),
                        shape=(inputs.shape[0], len(token2idx)))

    mi = mutual_info_classif(counts, targets)

    # Both arguments default to None; skip them instead of crashing on
    # token2idx[None] or on iterating over None.
    if mask_token is not None:
        mi[token2idx[mask_token]] = 0.0
    if stop_words is not None:
        for stop_word in stop_words:
            if stop_word in token2idx:
                mi[token2idx[stop_word]] = 0.0

    # Reported before the offset below is applied.
    print('Maximum mutual information:', np.max(mi))
    print('Minimum mutual information:', np.min(mi))

    # Small offset so no score is exactly zero -- presumably to keep
    # downstream logs/divisions well-defined; confirm against callers.
    mi += 1e-9
    return mi