def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert silhouette > 0
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert silhouette > 0
    assert silhouette_metric > 0
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert silhouette > 0


def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert_equal(
        silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))
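
For context, the checks above rely on scikit-learn's test helpers. A minimal,
self-contained sketch of the same encoding-invariance property (assuming only a
standard scikit-learn installation):

# Hedged sketch: silhouette_score only cares about which samples share a
# label, so any injective relabeling (here 2*y + 10) leaves it unchanged.
import numpy as np
from sklearn import datasets
from sklearn.metrics import silhouette_score

X, y = datasets.load_iris(return_X_y=True)
assert np.isclose(silhouette_score(X, y * 2 + 10), silhouette_score(X, y))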
Example #3
def main():
    args, atom_indices, project, project_root = parse_cmdline()

    # load all of the data from disk
    xyzlist, sampled_frames = load_trajs(project, os.path.dirname(args.project_yaml),
                                         atom_indices, args.stride, args.fraction)
    assignments = io.loadh(args.assignments, 'arr_0')
    # pick only the assignments that had their xyz data loaded
    assignments = np.concatenate([assignments[i, sampled_frames[i]] for i in range(len(sampled_frames))])

    # make sure we didn't mess up the subsampling and get nonsense data
    assert not np.any(assignments < 0), ('assignments negative? stride/sampling probably '
                                         'messed up. did you use a different stride than you clustered with?')
    # assert np.all(np.unique(assignments) == np.arange(np.max(assignments) + 1)), \
    #     "assignments don't go from 0 to max. did you use a different stride than you clustered with?"

    n_real_atoms = len(atom_indices)
    n_padded_atoms = xyzlist.shape[2]
    assert n_padded_atoms >= n_real_atoms

    pairwise = calculate_pairwise_rmsd(xyzlist, n_real_atoms)

    print('computing silhouette...')
    score = silhouette_score(pairwise, assignments, metric='precomputed')
    print('silhouette score: %f' % score)

    path = os.path.join(args.output, 'silhouette.dat')
    print('saving results to flat text file (append): %s...' % path)
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    with open(path, 'a') as f:
        f.write('%f\n' % score)
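
Stripped of the project-specific helpers (parse_cmdline, load_trajs,
calculate_pairwise_rmsd), the core call above only needs a square, symmetric
distance matrix. A hedged, self-contained sketch with synthetic stand-ins:

# The distance matrix here is synthetic, standing in for the script's
# pairwise RMSD matrix.
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score

rng = np.random.RandomState(0)
features = rng.rand(20, 6)                 # stand-in for flattened coordinates
assignments = rng.randint(0, 3, size=20)   # stand-in for state assignments
pairwise = pairwise_distances(features)    # square, symmetric, zero diagonal
print('silhouette score: %f' % silhouette_score(pairwise, assignments,
                                                metric='precomputed'))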
Example #4
def calculate_scores(self):
    x, c, labels = self.x, self.c, self.labels
    self.v_measure = v_measure_score(c, labels)
    self.complete = completeness_score(c, labels)
    self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
    self.adjusted_rand = adjusted_rand_score(c, labels)
    self.silhouette = silhouette_score(x, c)
    self.purity, self.partial_purity = self.__purity__()
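
Taken out of the class, the same battery of scores needs only sklearn.metrics.
A sketch on toy data; treating c as the cluster assignments and labels as the
ground truth is an assumption based on how silhouette_score(x, c) is called:

import numpy as np
from sklearn.metrics import (adjusted_mutual_info_score, adjusted_rand_score,
                             completeness_score, silhouette_score,
                             v_measure_score)

x = np.array([[0.0], [0.1], [5.0], [5.1]])
c = np.array([0, 0, 1, 1])        # cluster assignments (assumed role)
labels = np.array([1, 1, 0, 0])   # ground truth under a different encoding
print(v_measure_score(c, labels), completeness_score(c, labels))
print(adjusted_mutual_info_score(c, labels), adjusted_rand_score(c, labels))
print(silhouette_score(x, c))     # the only score that needs the features x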
Example #5
def test_no_nan():
    # Assert Silhouette Coefficient != nan when there is 1 sample in a class.
    # This tests for the condition that caused issue 960.
    # Note that there is only one sample in cluster 0. This used to cause the
    # silhouette_score to return nan (see bug #960).
    labels = np.array([1, 0, 1, 1, 1])
    # The distance matrix doesn't actually matter.
    D = np.random.RandomState(0).rand(len(labels), len(labels))
    silhouette = silhouette_score(D, labels, metric='precomputed')
    assert_false(np.isnan(silhouette))
Example #6
def my_kmeans(feature_vector, no_of_centers=8):
    start = time()
    km = KMeans(n_clusters=no_of_centers).fit(feature_vector)
    end = time()
    labels = km.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    # 'option' and 'cluster_entropy' are presumably module-level in the original script
    print('The number of non-noise clusters is {} with n_clusters = {}'.format(
        n_clusters, no_of_centers))
    print('Time taken to finish: {} seconds'.format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric='euclidean')))
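
A common extension of this helper is to sweep n_clusters and keep the
silhouette-maximizing value. A sketch under stated assumptions (make_blobs and
the variable names are illustrative, not from the original script):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
scores = {}
for k in range(2, 8):   # silhouette needs 2 <= n_labels <= n_samples - 1
    labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
    scores[k] = silhouette_score(X, labels, metric='euclidean')
best_k = max(scores, key=scores.get)
print('best k = {} (score = {:.3f})'.format(best_k, scores[best_k]))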
Example #7
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert_greater(score_precomputed, 0)
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert_almost_equal(score_precomputed, score_euclidean)

        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean,
                                score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D, y, metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X, y, metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert_greater(score_precomputed, 0)
        assert_greater(score_euclidean, 0)
        assert_almost_equal(score_euclidean, score_precomputed)

        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean, score_dense_with_sampling)
Example #8
def my_agg_clustering(feature_vector, no_of_centers, metric_name):
    start = time()
    ag_c = AgglomerativeClustering(n_clusters=no_of_centers, affinity=metric_name).fit(feature_vector)
    end = time()
    labels = ag_c.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('The number of non-noise clusters is {} with n_clusters = {} and metric = {}'.format(
        n_clusters, no_of_centers, metric_name))
    print('Time taken to finish: {} seconds'.format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric=metric_name)))
Example #9
def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1
    lower = [5.58,
             7.00, 6.50,
             7.08, 7.00, 3.83,
             4.83, 5.08, 8.17, 5.83,
             2.17, 5.75, 6.67, 6.92, 4.92,
             6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
             3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
             2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
             6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
             5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
             4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92]
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    D += D.T

    names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR', 'USA',
             'USS', 'YUG', 'ZAI']

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22,
                 'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33,
                 'YUG': .26, 'IND': -.04}
    score1 = .28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02,
                 'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44,
                 'YUG': .31, 'CHI': .31}
    score2 = .33

    for labels, expected, score in [(labels1, expected1, score1),
                                    (labels2, expected2, score2)]:
        expected = [expected[name] for name in names]
        # we check to 2dp because that's what's in the paper
        assert silhouette_samples(D, np.array(labels),
                                  metric='precomputed') == pytest.approx(expected,
                                                                         abs=1e-2)
        assert silhouette_score(D, np.array(labels),
                                metric='precomputed') == pytest.approx(score,
                                                                       abs=1e-2)
Example #10
def my_dbscan(feature_vector, metric_name, eps=None, minpts=None):
    start = time()
    if eps is None and minpts is None:
        db = DBSCAN(metric=metric_name).fit(feature_vector)
    elif minpts is None:
        db = DBSCAN(eps=eps, metric=metric_name).fit(feature_vector)
    elif eps is None:
        db = DBSCAN(min_samples=minpts, metric=metric_name).fit(feature_vector)
    else:
        db = DBSCAN(eps=eps, min_samples=minpts, metric=metric_name).fit(feature_vector)
    end = time()
    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # ignoring noise if present
    print('The number of non-noise clusters is {} with metric = {}'.format(n_clusters, metric_name))
    print('Time taken to finish: {} seconds'.format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(
            silhouette_score(feature_vector, labels, metric=metric_name)))
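
As a design note, the if/elif ladder above can be collapsed by passing only the
parameters that were actually supplied and keeping DBSCAN's defaults otherwise
(a sketch, behavior-equivalent under the same inputs):

# Build kwargs only for the parameters the caller provided.
params = {k: v for k, v in (('eps', eps), ('min_samples', minpts))
          if v is not None}
db = DBSCAN(metric=metric_name, **params).fit(feature_vector)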
Example #11
def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]

    silhouette = silhouette_score(X, labels)
    assert_false(np.isnan(silhouette))
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
Example #12
def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert_equal(
        silhouette_score(list(X), list(y)), silhouette_score(X, y))
Example #13
-1 represents noise

Increasing eps means more points are included in a cluster. This makes clusters
larger, but may also cause multiple clusters to merge into one.
Increasing min_samples means fewer points become core points, and more points
are labeled as noise.

The eps parameter is in some sense the more important one, because it determines
what it means for two points to be "close".
Setting eps very small means no points are core samples, which may cause all
points to be labeled as noise.
Setting eps very large may cause all points to form a single cluster.

min_samples mainly determines whether points in sparse regions are labeled as
outliers or form their own cluster.
If you increase min_samples, any cluster with fewer than min_samples samples
will now be labeled as noise.
min_samples therefore determines the minimum cluster size.
'''
print(clusters)
print(len(set(clusters)))

if len(set(clusters)) > 1:
    print('{} {} {}'.format(eps, min_samples, silhouette_score(X, clusters)))
# 0.5 5 -0.12276159423271887
# 0.7 5 0.3593629426203677

'''
If every point is labeled -1 (all noise), silhouette_score raises an exception,
because 1 < n_labels does not hold:

def check_number_of_labels(n_labels, n_samples):
    if not 1 < n_labels < n_samples:
        raise ValueError("Number of labels is %d. Valid values are 2 "
                         "to n_samples - 1 (inclusive)" % n_labels)
'''
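
Putting the two notes together, a hedged sketch of a small eps / min_samples
sweep that skips degenerate labelings (all noise, or a single cluster) before
calling silhouette_score; make_moons and the grid values are illustrative:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import silhouette_score

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
for eps in (0.1, 0.3, 0.5):
    for min_samples in (3, 5, 10):
        clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        if len(set(clusters)) > 1:   # guard against the ValueError above
            print('{} {} {}'.format(eps, min_samples,
                                    silhouette_score(X, clusters)))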