예제 #1
0
def test_silhouette():
    """Exercise ``silhouette_score`` on iris using the ground-truth labels.

    Covers a precomputed distance matrix, raw features, sub-sampling with a
    fixed seed, and distances computed from a sparse feature matrix.
    """
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    half = int(X.shape[0] / 2)

    # The true labels are used, so the coefficient is expected to be positive.
    D = pairwise_distances(X, metric='euclidean')
    score_precomputed = silhouette_score(D, y, metric='precomputed')
    assert score_precomputed > 0

    # Letting silhouette_score compute the distances must give the same value.
    score_features = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(score_precomputed, score_features)

    # Sub-sampling half of the rows with the same seed on both input forms.
    sampled_precomputed = silhouette_score(D, y, metric='precomputed',
                                           sample_size=half, random_state=0)
    sampled_features = silhouette_score(X, y, metric='euclidean',
                                        sample_size=half, random_state=0)
    assert sampled_precomputed > 0
    assert sampled_features > 0
    assert_almost_equal(sampled_features, sampled_precomputed)

    # Distances derived from a sparse copy of the features.
    D_sparse = pairwise_distances(csr_matrix(X), metric='euclidean')
    assert silhouette_score(D_sparse, y, metric='precomputed') > 0
예제 #2
0
def test3(isKmeans):
    """Cluster the rows of the ``data3Test`` CSV into 16 groups and report.

    Every non-empty line after the header is split on commas and becomes one
    sample.  The samples are clustered, the labels and their silhouette score
    are printed, the matching chart helper is called and the labels are
    handed to ``compareResult``.

    Args:
        isKmeans: truthy to cluster with ``KMeans``; otherwise Ward-linkage
            ``AgglomerativeClustering`` is used.
    """
    n_clusters = 16
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data3Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                data.append(line.split(','))

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimators and chart helpers coerce them.
    data = np.array(data)
    print(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)

        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)

        compareResult(cluster_labels)
    else:
        ward = AgglomerativeClustering(n_clusters,
                                       affinity='euclidean',
                                       linkage='ward')
        ward.fit(data)

        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)

        compareResult(ward.labels_)
예제 #3
0
def test3(isKmeans):
    """Cluster the rows of the ``data3Test`` CSV into 16 groups and report.

    Every non-empty line after the header is split on commas and becomes one
    sample.  The samples are clustered, the labels and their silhouette score
    are printed, the matching chart helper is called and the labels are
    handed to ``compareResult``.

    Args:
        isKmeans: truthy to cluster with ``KMeans``; otherwise Ward-linkage
            ``AgglomerativeClustering`` is used.
    """
    n_clusters = 16
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data3Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                data.append(line.split(','))

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimators and chart helpers coerce them.
    data = np.array(data)
    print(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)

        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)

        compareResult(cluster_labels)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)

        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)

        compareResult(ward.labels_)
예제 #4
0
def test2(isKmeans):
    """Cluster the two-column ``data2Test`` CSV into 15 groups and report.

    Every non-empty line after the header must hold exactly two
    comma-separated fields (anything else raises ``ValueError`` on
    unpacking).  The labels and their silhouette score are printed and the
    matching chart helper is called.

    Args:
        isKmeans: truthy to cluster with ``KMeans``; otherwise Ward-linkage
            ``AgglomerativeClustering`` is used.
    """
    n_clusters = 15
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data2Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                value1, value2 = line.split(',')
                data.append([value1, value2])

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimators and chart helpers coerce them.
    data = np.array(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)

        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
    else:
        ward = AgglomerativeClustering(n_clusters,
                                       affinity='euclidean',
                                       linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
예제 #5
0
파일: metrics.py 프로젝트: luamzi/metrics
    def silhouette_score(self,
                         data,
                         labels,
                         metric='euclidean',
                         sample_size=None,
                         random_state=None,
                         **kwds):
        """Compute the mean Silhouette Coefficient of all samples.

        The Silhouette Coefficient is calculated using the mean intra-cluster
        distance (``a``) and the mean nearest-cluster distance (``b``) for
        each sample.  The Silhouette Coefficient for a sample is
        ``(b - a) / max(a, b)``.  To clarify, ``b`` is the distance between a
        sample and the nearest cluster that the sample is not a part of.
        Note that the Silhouette Coefficient is only defined if the number of
        labels is ``2 <= n_labels <= n_samples - 1``.

        This function returns the mean Silhouette Coefficient over all
        samples.  To obtain the values for each sample, use
        :func:`silhouette_samples`.

        The best value is 1 and the worst value is -1.  Values near 0
        indicate overlapping clusters.  Negative values generally indicate
        that a sample has been assigned to the wrong cluster, as a different
        cluster is more similar.

        Read more in the :ref:`User Guide <silhouette_coefficient>`.

        Args:
            data : array [n_samples_a, n_samples_a] if metric == "precomputed",
                or [n_samples_a, n_features] otherwise
                Array of pairwise distances between samples, or a feature
                array.
            labels : array, shape = [n_samples]
                Predicted labels for each sample.
            metric : string, or callable
                The metric to use when calculating distance between instances
                in a feature array.  If metric is a string, it must be one of
                the options allowed by :func:`metrics.pairwise.pairwise_distances
                <sklearn.metrics.pairwise.pairwise_distances>`.  If data is
                the distance array itself, use ``metric="precomputed"``.
            sample_size : int or None
                The size of the sample to use when computing the Silhouette
                Coefficient on a random subset of the data.
                If ``sample_size is None``, no sampling is used.
            random_state : int, RandomState instance or None, optional (default=None)
                The generator used to randomly select a subset of samples.
                If int, random_state is the seed used by the random number
                generator; if RandomState instance, random_state is the
                random number generator; if None, the random number generator
                is the RandomState instance used by `np.random`.  Used when
                ``sample_size is not None``.
            **kwds : optional keyword parameters
                Any further parameters are passed directly to the distance
                function.  If using a scipy.spatial.distance metric, the
                parameters are still metric dependent.  See the scipy docs
                for usage examples.

        Returns:
            silhouette : float
                Mean Silhouette Coefficient for all samples.
        """
        # Forward the options by keyword: scikit-learn declares everything
        # after ``labels`` keyword-only, so positional forwarding breaks on
        # current releases while keywords work on old and new ones alike.
        return unsupervised.silhouette_score(data, labels,
                                             metric=metric,
                                             sample_size=sample_size,
                                             random_state=random_state,
                                             **kwds)
예제 #6
0
def test1():
    """Cluster the two-column ``data1Test`` CSV into 100 K-means groups.

    Every non-empty line after the header must hold exactly two
    comma-separated fields (anything else raises ``ValueError`` on
    unpacking).  The labels and a sampled silhouette score are printed and
    the K-means chart helper is called.
    """
    n_clusters = 100
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data1Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                value1, value2 = line.split(',')
                data.append([value1, value2])

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimator and chart helper coerce them.
    data = np.array(data)

    clusterer = KMeans(n_clusters)
    cluster_labels = clusterer.fit_predict(data)

    print(cluster_labels)

    # The score is computed on a 5000-sample random subset, not on all rows.
    print(silhouette_score(data,
                           cluster_labels,
                           sample_size=5000,
                           metric='euclidean'))
    showChartKmeans(clusterer, False, data, n_clusters)
예제 #7
0
def test2(isKmeans):
    """Cluster the two-column ``data2Test`` CSV into 15 groups and report.

    Every non-empty line after the header must hold exactly two
    comma-separated fields (anything else raises ``ValueError`` on
    unpacking).  The labels and their silhouette score are printed and the
    matching chart helper is called.

    Args:
        isKmeans: truthy to cluster with ``KMeans``; otherwise Ward-linkage
            ``AgglomerativeClustering`` is used.
    """
    n_clusters = 15
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data2Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                value1, value2 = line.split(',')
                data.append([value1, value2])

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimators and chart helpers coerce them.
    data = np.array(data)

    if isKmeans:
        clusterer = KMeans(n_clusters)
        cluster_labels = clusterer.fit_predict(data)

        print(cluster_labels)
        print(silhouette_score(data, cluster_labels, metric='euclidean'))
        showChartKmeans(clusterer, False, data, n_clusters)
    else:
        ward = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward')
        ward.fit(data)
        print(ward.labels_)
        print(silhouette_score(data, ward.labels_, metric='euclidean'))
        showChartHierarchical(ward, False, data, n_clusters)
예제 #8
0
def test_no_nan():
    """Regression test for issue 960: a one-sample cluster must not give nan.

    Cluster 0 contains a single sample, which used to make
    ``silhouette_score`` return nan.
    """
    labels = np.array([1, 0, 1, 1, 1])
    n_samples = len(labels)
    # The distance values are irrelevant here; only the label layout matters.
    rng = np.random.RandomState(0)
    D = rng.rand(n_samples, n_samples)
    score = silhouette_score(D, labels, metric='precomputed')
    assert_false(np.isnan(score))
예제 #9
0
def test1():
    """Cluster the two-column ``data1Test`` CSV into 100 K-means groups.

    Every non-empty line after the header must hold exactly two
    comma-separated fields (anything else raises ``ValueError`` on
    unpacking).  The labels and a sampled silhouette score are printed and
    the K-means chart helper is called.
    """
    n_clusters = 100
    data = []

    # Text mode and the ``next()`` built-in work on both Python 2 and 3;
    # ``file.next()`` and comma-splitting of bytes lines are Python 2 only.
    with open(data1Test, 'r') as csvfile:
        next(csvfile)  # skip header
        for line in csvfile:
            line = line.strip()
            if line:
                value1, value2 = line.split(',')
                data.append([value1, value2])

    # NOTE(review): the fields are still strings here (no numeric conversion
    # is done); confirm the estimator and chart helper coerce them.
    data = np.array(data)

    clusterer = KMeans(n_clusters)
    cluster_labels = clusterer.fit_predict(data)

    print(cluster_labels)

    # The score is computed on a 5000-sample random subset, not on all rows.
    print(silhouette_score(data, cluster_labels, sample_size=5000, metric='euclidean'))
    showChartKmeans(clusterer, False, data, n_clusters)
예제 #10
0
 def silhouette_example(dset='CB1', neuropil='Optic_Glomeruli', seed=0, clusterer='kmedoids'):
     """Print the mean silhouette score of each cached clustering, per ``k``.

     The cached clusterings for ``dset``/``neuropil`` are filtered down to the
     given ``seed`` (matched against the ``repeat`` column) and ``clusterer``;
     exactly one clustering per ``k`` is expected.
     """
     print('Reading dataset')
     dataset_dir = neuropil_dir(dset, neuropil)
     X = ExpressionDataset(dataset_dir).Xarray()

     print('Reading clusterings')
     clusterings = read_clusterings_cache(dset=dset, neuropil=neuropil)
     # Keep a single repeat/clusterer combination (adjust as needed).
     selector = 'repeat==%d and clusterer=="%s"' % (seed, clusterer)
     clusterings = clusterings.query(selector)

     # One silhouette per cluster count.
     for k, group in clusterings.groupby('k'):
         print('Computing mean silhoutte for k=%d' % k)
         assert len(group) == 1
         # sklearn expects labels to be in [0...k-1]
         raw_labels = group['labels'].iloc[0]
         labels = np.array(relabel_to_0k(raw_labels))
         score = silhouette_score(X, labels, metric='euclidean', random_state=0)  # sample_size=1000
         print(k, score)
예제 #11
0
파일: kmeans.py 프로젝트: yemode2k/studio
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                                     seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit one K-means model per candidate k and keep the best silhouette.

    For every ``k`` in ``n_clusters_list`` a model is fitted on the features
    resolved from ``input_cols``, its mean silhouette score is recorded and a
    two-panel figure (per-cluster silhouette profile and a PCA scatter) is
    rendered to Markdown.  The model with the highest score labels the rows.

    Args:
        table: input table; the prediction column is written to a copy.
        input_cols: feature columns, resolved through ``check_col_type``.
        n_clusters_list: candidate cluster counts, tried in order.
        prediction_col: name of the output column holding the cluster labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to
            ``_kmeans_samples_plot``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` plus ``prediction_col``)
        and ``'model'`` (best k, best centers, best fitted model, input
        columns and the Markdown report under ``'_repr_brtc_'``).
    """
    feature_names, features = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(table)
    inputarr = features

    # At most two principal components; a single feature yields one.
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            # Boolean indexing returns a copy, so the in-place sort leaves
            # ``samples`` untouched.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))

            y_lower = y_upper

            # With a single component the same axis is used for x and y.
            if pca2.shape[1] == 1:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 0][predict == i], color=color)
            else:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        # Red vertical line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        if pca2.shape[1] == 1:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 0], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 1st feature")
        else:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

        plt.tight_layout()
        imagek = plt2MD(plt)
        # Close the figure instead of only clearing it: ``plt.subplots``
        # opens a new figure on every iteration, so clearing alone leaves
        # one open figure behind per candidate k.
        plt.close(fig)
        images.append(imagek)

    # ``np.argmax`` returns the first maximum, so ties go to the earliest k.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_sse = best_model.inertia_

    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)

    # Silhouette score per candidate k, evenly spaced and labelled with k.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    # Release the summary figure as well so no figure outlives this call.
    plt.close()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhoutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
예제 #12
0
파일: kmeans.py 프로젝트: shovsj/studio
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    """Fit one K-means model per candidate k and keep the best silhouette.

    For every ``k`` in ``n_clusters_list`` a model is fitted on
    ``table[input_cols]``, its mean silhouette score is recorded and a
    two-panel figure (per-cluster silhouette profile and a 2-component PCA
    scatter) is rendered to Markdown.  The model with the highest score
    labels the rows.

    Args:
        table: input table; the prediction column is written to a copy.
        input_cols: feature columns selected from ``table``.
        n_clusters_list: candidate cluster counts (each validated > 1),
            tried in order.
        prediction_col: name of the output column holding the cluster labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs, algorithm:
            forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` plus ``prediction_col``)
        and ``'model'`` (best k, best centers, best fitted model, input
        columns and the Markdown report under ``'_repr_brtc_'``).
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            # Boolean indexing returns a copy, so the in-place sort leaves
            # ``samples`` untouched.
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        # Red vertical line marks the mean silhouette score for this k.
        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        # Close the figure instead of only clearing it: ``plt.subplots``
        # opens a new figure on every iteration, so clearing alone leaves
        # one open figure behind per candidate k.
        plt.close(fig)
        images.append(imagek)

    # ``np.argmax`` returns the first maximum, so ties go to the earliest k.
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k, evenly spaced and labelled with k.
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    # Release the summary figure as well so no figure outlives this call.
    plt.close()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
예제 #13
0
            zip(combinations(unique_labels, 2), values):

            indices_a = np.where(labels == label_a)[0]
            inter_dist[indices_a] = np.minimum(values_a, inter_dist[indices_a])
            del indices_a
            indices_b = np.where(labels == label_b)[0]
            inter_dist[indices_b] = np.minimum(values_b, inter_dist[indices_b])
            del indices_b
    return inter_dist

if __name__ == '__main__':
    # Timing comparison: scikit-learn's silhouette_score against the block
    # implementation, serial and with two jobs, on 10000 random samples
    # spread evenly over 100 labels.
    import time

    # Public import path; ``sklearn.metrics.cluster.unsupervised`` is a
    # private module that current scikit-learn releases no longer provide.
    from sklearn.metrics import silhouette_score

    np.random.seed(0)
    X = np.random.random((10000, 100))
    y = np.repeat(np.arange(100), 100)
    t0 = time.time()
    s = silhouette_score(X, y)
    t = time.time() - t0
    # print() with one pre-formatted string prints the same on Python 2 and 3.
    print('Scikit silhouette (%fs): %f' % (t, s))
    t0 = time.time()
    s = silhouette_score_block(X, y)
    t = time.time() - t0
    print('Block silhouette (%fs): %f' % (t, s))
    t0 = time.time()
    s = silhouette_score_block(X, y, n_jobs=2)
    t = time.time() - t0
    print('Block silhouette parallel (%fs): %f' % (t, s))
예제 #14
0
파일: snippet.py 프로젝트: szabo92/gistable
        indices_a = np.where(labels == label_a)[0]
        inter_dist[indices_a] = np.minimum(values_a, inter_dist[indices_a])
        del indices_a
        indices_b = np.where(labels == label_b)[0]
        inter_dist[indices_b] = np.minimum(values_b, inter_dist[indices_b])
        del indices_b
    return inter_dist


if __name__ == '__main__':
    # Timing comparison: scikit-learn's silhouette_score against the block
    # implementation, serial and with two jobs, on 10000 random samples
    # spread evenly over 100 labels.
    import time

    # Public import path; ``sklearn.metrics.cluster.unsupervised`` is a
    # private module that current scikit-learn releases no longer provide.
    from sklearn.metrics import silhouette_score

    np.random.seed(0)
    X = np.random.random((10000, 100))
    y = np.repeat(np.arange(100), 100)
    t0 = time.time()
    s = silhouette_score(X, y)
    t = time.time() - t0
    # print() with one pre-formatted string prints the same on Python 2 and 3.
    print('Scikit silhouette (%fs): %f' % (t, s))
    t0 = time.time()
    s = silhouette_score_block(X, y)
    t = time.time() - t0
    print('Block silhouette (%fs): %f' % (t, s))
    t0 = time.time()
    s = silhouette_score_block(X, y, n_jobs=2)
    t = time.time() - t0
    print('Block silhouette parallel (%fs): %f' % (t, s))
예제 #15
0
def test_non_encoded_labels():
    """The score must not change when every label is shifted by a constant."""
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target
    shifted_score = silhouette_score(features, targets + 10)
    plain_score = silhouette_score(features, targets)
    assert_equal(shifted_score, plain_score)
예제 #16
0
def test_non_numpy_labels():
    """Plain Python lists must give the same score as the numpy arrays."""
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target
    list_score = silhouette_score(list(features), list(targets))
    array_score = silhouette_score(features, targets)
    assert_equal(list_score, array_score)
예제 #17
0
# Scatter of the raw instances on a fixed 0-10 window.
plt.xlim([0, 10])
plt.ylim([0, 10])
plt.title('Instances')
plt.scatter(x1, x2)

# One colour/marker pair per cluster index (eight entries each).
colors = list('bgrcmykb')
markers = list('osDv^p*+')

clusters = [2, 3, 4, 5, 8]
sc_scores = []
# Panels are numbered from 2 upwards in the 3x2 grid, one per cluster count.
for subplot_counter, t in enumerate(clusters, start=2):
    plt.subplot(3, 2, subplot_counter)
    kmeans_model = KMeans(n_clusters=t).fit(X)

    # Draw every instance with the colour and marker of its assigned cluster.
    for point_idx, cluster_id in enumerate(kmeans_model.labels_):
        plt.plot(x1[point_idx], x2[point_idx],
                 color=colors[cluster_id],
                 marker=markers[cluster_id],
                 ls='None')

    plt.xlim([0, 10])
    plt.ylim([0, 10])
    sc_score = silhouette_score(X, kmeans_model.labels_, metric='euclidean')
    sc_scores.append(sc_score)
    plt.title('K=%s, silhouette Coefficient=%0.03f' % (t, sc_score))

# Separate figure: silhouette coefficient against the number of clusters.
plt.figure()
plt.plot(clusters, sc_scores, '*-')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient Score')

plt.show()