def test_paired_distances():
    """ Test the pairwise_distance helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))
    for metric, func in PAIRED_DISTANCES.iteritems():
        S = paired_distances(X, Y, metric=metric)
        S2 = func(X, Y)
        assert_array_almost_equal(S, S2)
        if metric in PAIRWISE_DISTANCE_FUNCTIONS:
            # Check the the pairwise_distances implementation
            # gives the same value
            distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
            distances = np.diag(distances)
            assert_array_almost_equal(distances, S)

    # Test that a value error is raised when the lengths of X and Y should not
    # differ
    Y = rng.random_sample((3, 4))
    assert_raises(ValueError, paired_distances, X, Y)
Exemplo n.º 2
0
def test_agglomerative_clustering():
    """
    Check that we obtain the correct number of clusters with
    agglomerative clustering.
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             memory=mkdtemp(),
                                             linkage=linkage)
        clustering.fit(X)
        labels = clustering.labels_
        assert_true(np.size(np.unique(labels)) == 10)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        np.testing.assert_array_equal(clustering.labels_, labels)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that we raise a TypeError on dense matrices
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity.toarray(),
                                         affinity="manhattan",
                                         linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=np.ones(
                                                 (n_samples, n_samples)),
                                             affinity=affinity,
                                             linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              affinity=affinity,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)
Exemplo n.º 3
0
def test_agglomerative_clustering():
    """
    Check that we obtain the correct number of clusters with
    agglomerative clustering.
    """
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=np.bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(
                n_clusters=10, connectivity=connectivity,
                memory=tempdir,
                linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert_true(np.size(np.unique(labels)) == 10)
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert_true(np.size(np.unique(clustering.labels_)) == 10)
        # Check that we raise a TypeError on dense matrices
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(
                connectivity.toarray()[:10, :10]),
            linkage=linkage)
        assert_raises(ValueError, clustering.fit, X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward")
    assert_raises(ValueError, clustering.fit, X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10,
            connectivity=None,
            affinity=affinity,
            linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
                                                         clustering.labels_),
                            1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
Exemplo n.º 4
0
    assert_array_almost_equal(K1, K2)


def test_pairwise_kernels_filter_param():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    K = rbf_kernel(X, Y, gamma=0.1)
    params = {"gamma": 0.1, "blabla": ":)"}
    K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params)
    assert_array_almost_equal(K, K2)

    assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params)


@pytest.mark.parametrize('metric, func', PAIRED_DISTANCES.items())
def test_paired_distances(metric, func):
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric=metric)
    S2 = func(X, Y)
    assert_array_almost_equal(S, S2)
    S3 = func(csr_matrix(X), csr_matrix(Y))
    assert_array_almost_equal(S, S3)
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        # Check the pairwise_distances implementation
Exemplo n.º 5
0
    assert_array_almost_equal(K1, K2)


def test_pairwise_kernels_filter_param():
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((2, 4))
    K = rbf_kernel(X, Y, gamma=0.1)
    params = {"gamma": 0.1, "blabla": ":)"}
    K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params)
    assert_array_almost_equal(K, K2)

    assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params)


@pytest.mark.parametrize('metric, func', PAIRED_DISTANCES.items())
def test_paired_distances(metric, func):
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((5, 4))

    S = paired_distances(X, Y, metric=metric)
    S2 = func(X, Y)
    assert_array_almost_equal(S, S2)
    S3 = func(csr_matrix(X), csr_matrix(Y))
    assert_array_almost_equal(S, S3)
    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
        # Check the pairwise_distances implementation
from scipy import stats

from ..embed import select_dimension, AdjacencySpectralEmbed
from ..utils import import_graph, fit_plug_in_variance_estimator
from ..align import SignFlips
from ..align import SeedlessProcrustes
from .base import BaseInference
from sklearn.utils import check_array
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.metrics.pairwise import PAIRED_DISTANCES
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
from hyppo.ksample import KSample
from hyppo._utils import gaussian

_VALID_DISTANCES = list(PAIRED_DISTANCES.keys())
_VALID_KERNELS = list(PAIRWISE_KERNEL_FUNCTIONS.keys())
_VALID_KERNELS.append("gaussian")  # can use hyppo's medial gaussian kernel too
_VALID_METRICS = _VALID_DISTANCES + _VALID_KERNELS

_VALID_TESTS = ["cca", "dcorr", "hhg", "rv", "hsic", "mgc"]


class LatentDistributionTest(BaseInference):
    """Two-sample hypothesis test for the problem of determining whether two random
    dot product graphs have the same distributions of latent positions.

    This test can operate on two graphs where there is no known matching
    between the vertices of the two graphs, or even when the number of vertices
    is different. Currently, testing is only supported for undirected graphs.
Exemplo n.º 7
0
def test_agglomerative_clustering():
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)
        # test caching
        try:
            tempdir = mkdtemp()
            clustering = AgglomerativeClustering(
                n_clusters=10,
                connectivity=connectivity,
                memory=tempdir,
                linkage=linkage,
            )
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)
        # Turn caching off now
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering.labels_, labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10
        # Check that we raise a TypeError on dense matrices
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
            linkage=linkage,
        )
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        metric="manhattan",
        linkage="ward",
    )
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for metric in PAIRED_DISTANCES.keys():
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            metric=metric,
            linkage="complete",
        )
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(n_clusters=10,
                                              connectivity=None,
                                              metric=metric,
                                              linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(
            normalized_mutual_info_score(clustering2.labels_,
                                         clustering.labels_), 1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity,
        metric="precomputed",
        linkage="complete",
    )
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)