Exemplo n.º 1
0
def test_on_trivial_input(inp):
    """Check both gap estimators on a trivial input.

    ``inp`` is unpacked as ``(n_points_per_cluster, n_clusters, dim, pts)``.
    After fitting on ``pts``, ``FirstSimpleGap`` and ``FirstHistogramGap``
    (both with default parameters) must each report ``n_clusters_`` equal to
    the expected ``n_clusters``.

    NOTE(review): the input is presumably a single point forming a single
    cluster, at an arbitrary location -- confirm against the fixture/strategy
    that generates ``inp``.
    """
    _, expected_n_clusters, _, points = inp

    simple_gap = FirstSimpleGap().fit(points)
    assert simple_gap.n_clusters_ == expected_n_clusters

    histogram_gap = FirstHistogramGap().fit(points)
    assert histogram_gap.n_clusters_ == expected_n_clusters
Exemplo n.º 2
0
def test_max_fraction_clusters(inp, max_frac):
    """Check that both gap estimators honour the ``max_fraction`` setting.

    The upper bound on the number of clusters is
    ``floor(max_frac * n_points_per_cluster * n_clusters)`` (the product of
    the last two factors is presumably the total number of points in
    ``pts``). After ``fit_predict``, the ``n_clusters_`` attribute of
    ``FirstSimpleGap`` and of ``FirstHistogramGap`` must not exceed it.
    """
    n_points_per_cluster, n_clusters, _, points = inp
    # Same multiplication order as before so the float result is unchanged.
    cluster_cap = np.floor(max_frac * n_points_per_cluster * n_clusters)

    simple_gap = FirstSimpleGap(max_fraction=max_frac)
    simple_gap.fit_predict(points)  # labels are irrelevant for this check
    assert simple_gap.n_clusters_ <= cluster_cap

    histogram_gap = FirstHistogramGap(max_fraction=max_frac)
    histogram_gap.fit_predict(points)
    assert histogram_gap.n_clusters_ <= cluster_cap
Exemplo n.º 3
0
def test_firsthistogramgap(inp):
    """Check ``FirstHistogramGap`` predictions on synthetic clustered data.

    ``inp`` is unpacked as ``(n_points_per_cluster, n_clusters, _, pts)``.
    With single linkage on euclidean distances and the settings below, the
    predicted labels must contain exactly ``n_clusters`` distinct values, and
    every label must occur ``n_points_per_cluster`` times.
    """
    n_points_per_cluster, n_clusters, _, points = inp

    settings = {
        'freq_threshold': 0,
        'max_fraction': 1.,
        'n_bins_start': 5,
        'affinity': 'euclidean',
        'memory': None,
        'linkage': 'single',
    }
    labels = FirstHistogramGap(**settings).fit_predict(points)

    distinct_labels, cluster_sizes = np.unique(labels, return_counts=True)
    # The number of predicted clusters must match the synthetic clusters.
    assert len(distinct_labels) == n_clusters
    # Every predicted cluster must hold the expected number of points.
    assert_almost_equal(cluster_sizes, n_points_per_cluster)
Exemplo n.º 4
0
def test_precomputed_distances(inp):
    """Verify that clustering a precomputed distance matrix gives the same
    partition as clustering the points the matrix was computed from.

    ``inp`` is unpacked as a 4-tuple whose last element ``pts`` is the point
    cloud; the other entries are not needed here. The two label vectors are
    compared as partitions (sets of index sets), so the check does not depend
    on which integer label each run assigns to a given cluster.
    """
    _, _, _, pts = inp

    # Settings shared by both estimators; only ``affinity`` differs.
    common = dict(freq_threshold=0,
                  max_fraction=1.,
                  n_bins_start=5,
                  memory=None,
                  linkage='single')

    dist_matrix = distance_matrix(pts, pts, p=2)
    fh_matrix = FirstHistogramGap(affinity='precomputed', **common)
    preds_mat = fh_matrix.fit_predict(dist_matrix)

    fh = FirstHistogramGap(affinity='euclidean', **common)
    preds = fh.fit_predict(pts)

    def get_partition_from_preds(labels):
        """From a vector of predictions (labels), get a set of frozensets,
        where each frozenset represents a cluster and holds the row indices
        of its elements."""
        labels = np.asarray(labels)
        # Use the labels found in *this* vector, not those of another run;
        # otherwise clusters carrying different label values are dropped.
        return {frozenset(np.where(labels == c)[0])
                for c in np.unique(labels)}

    assert get_partition_from_preds(preds) == \
        get_partition_from_preds(preds_mat)
Exemplo n.º 5
0
def test_precomputed_distances(inp):
    """Check that precomputed and on-the-fly distances give the same labels.

    A euclidean (``p=2``) distance matrix is built from ``pts`` and clustered
    with ``affinity='precomputed'``; the raw points are clustered with
    ``affinity='euclidean'``. All other settings are identical, and the two
    label vectors are compared element-wise.
    """
    _, _, _, points = inp
    pairwise = distance_matrix(points, points, p=2)

    # Settings common to both estimators; only ``affinity`` differs.
    shared = {
        'freq_threshold': 0,
        'max_fraction': None,
        'n_bins_start': 5,
        'memory': None,
        'linkage': 'single',
    }

    from_matrix = FirstHistogramGap(affinity='precomputed', **shared)
    labels_from_matrix = from_matrix.fit_predict(pairwise)

    from_points = FirstHistogramGap(affinity='euclidean', **shared)
    labels_from_points = from_points.fit_predict(points)

    assert_almost_equal(labels_from_points, labels_from_matrix)