Example #1
def Judge_1(inputfile, n):
    import pandas as pd
    import numpy as np
    from sklearn.metrics.cluster import silhouette_score
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
    import matplotlib.pyplot as plt
    import mglearn
    X = pd.read_csv(inputfile)
    # Scale the data to zero mean and unit variance
    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)
    fig, axes = plt.subplots(1, 4, figsize=(15, 3), subplot_kw={'xticks': (), 'yticks': ()})
    # Algorithms to compare
    algorithms = [KMeans(n_clusters=n), AgglomerativeClustering(n_clusters=n), DBSCAN()]
    # Create a random cluster assignment as a reference
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(X))
    axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters, cmap=mglearn.cm3, s=60)
    axes[0].set_title('Random assignment:{:.2f}'.format(silhouette_score(X_scaled, random_clusters)))
    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters, cmap=mglearn.cm3, s=60)
        ax.set_title('{}:{:.2f}'.format(algorithm.__class__.__name__, silhouette_score(X_scaled, clusters)))
    plt.show()
    return True
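A minimal invocation sketch (the CSV name below is hypothetical; the file is assumed to hold two numeric feature columns so the scatter plots are meaningful):

Judge_1('two_moons_features.csv', n=2)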
Example #2
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
Example #3
def fg1():
    import numpy as np
    import matplotlib.pyplot as plt
    import mglearn
    from sklearn.metrics.cluster import adjusted_mutual_info_score, silhouette_score
    from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
    from sklearn.datasets import make_moons
    from sklearn.preprocessing import StandardScaler

    X,y = make_moons(n_samples=500,noise=0.07,random_state=0)

    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    fig,axes = plt.subplots(1,4,figsize=(15,3),
                            subplot_kw={'xticks':(),'yticks':()})

    algorithms = [KMeans(n_clusters=2),AgglomerativeClustering(n_clusters=2),DBSCAN()]

    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0,high=2,size=len(X))

    axes[0].scatter(X_scaled[:,0],X_scaled[:,1],c=random_clusters,cmap=mglearn.cm3,s=60)
    axes[0].set_title("Random assignment - ARI: {:.2f}".format(
        silhouette_score(X_scaled,random_clusters)))

    for ax, algorithm in zip(axes[1:],algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:,0],X_scaled[:,1],c=clusters,cmap=mglearn.cm3,s=30)
        ax.set_title("{} - ARI: {:.2f}".format(algorithm.__class__.__name__,
                                               silhouette_score(X_scaled,clusters)))

    plt.show()
Example #4
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert (silhouette > 0)
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D,
                                  y,
                                  metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X,
                                         y,
                                         metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert (silhouette > 0)
    assert (silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert (silhouette > 0)
Example #5
def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert silhouette_score(X, labels * 2 + 10) == silhouette_score(X, labels)
    assert_array_equal(silhouette_samples(X, labels * 2 + 10),
                       silhouette_samples(X, labels))
Example #6
def test_non_encoded_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    labels = dataset.target
    assert_equal(
        silhouette_score(X, labels * 2 + 10), silhouette_score(X, labels))
    assert_array_equal(
        silhouette_samples(X, labels * 2 + 10), silhouette_samples(X, labels))
Example #7
    def apply(self):
        k_range = range(self.k_min, self.k_max)
        if isinstance(self.model, Simlr):
            # Uses a generator expression and np.fromiter for memory efficiency.
            # predicted_labels is also built here, since it is indexed below.
            predicted_labels = [KMeans(k).fit_predict(
                self.model.set_params(n_clusters=k).fit_predict(self.matrix)) for k in k_range]
            go = (silhouette_score(X=self.matrix, labels=labels) for labels in predicted_labels)
            silhouette_scores = np.fromiter(go, dtype=float, count=len(k_range))
        else:
            predicted_labels = [self.model.set_params(n_clusters=k).fit_predict(self.matrix)
                                for k in k_range]
            silhouette_scores = [silhouette_score(X=self.matrix, labels=labels, metric=self.metric)
                                 for labels in predicted_labels]

        # Keep the labelling with the best silhouette score
        max_index = np.argmax(silhouette_scores)
        self.results = predicted_labels[max_index]
Example #8
def test_silhouette_score_integer_precomputed():
    """Check that silhouette_score works for precomputed metrics that are integers.

    Non-regression test for #22107.
    """
    result = silhouette_score([[0, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1],
                              metric="precomputed")
    assert result == pytest.approx(1 / 6)

    # non-zero on diagonal for ints raises an error
    with pytest.raises(ValueError, match="contains non-zero"):
        silhouette_score([[1, 1, 2], [1, 0, 1], [2, 1, 0]], [0, 0, 1],
                         metric="precomputed")
Example #9
def evaluate_algorithms_with_silhouette_coefficient():
    # Scale the data to zero mean and unit variance
    from sklearn.datasets import make_moons
    # `seed` is assumed to be defined at module level
    X_moons, y_moons = make_moons(n_samples=200, noise=0.05, random_state=seed)

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(X_moons)
    X_scaled = scaler.transform(X_moons)

    fig, axes = plt.subplots(1,
                             4,
                             figsize=(10, 5),
                             subplot_kw={
                                 'xticks': (),
                                 'yticks': ()
                             })

    # Create a random cluster assignment as a reference
    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(X_moons))

    axes[0].scatter(X_scaled[:, 0],
                    X_scaled[:, 1],
                    c=random_clusters,
                    cmap=mglearn.cm3,
                    s=60)
    from sklearn.metrics.cluster import silhouette_score
    axes[0].set_title('Random assignment: {:.2f}'.format(
        silhouette_score(X_scaled, random_clusters)))

    from sklearn.cluster import KMeans
    from sklearn.cluster import DBSCAN
    from sklearn.cluster import AgglomerativeClustering
    algorithms = [
        KMeans(n_clusters=2),
        AgglomerativeClustering(n_clusters=2),
        DBSCAN()
    ]
    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(X_scaled)
        ax.scatter(X_scaled[:, 0],
                   X_scaled[:, 1],
                   c=clusters,
                   cmap=mglearn.cm3,
                   s=60)
        ax.set_title('{} : {:.2f}'.format(algorithm.__class__.__name__,
                                          silhouette_score(X_scaled,
                                                           clusters)))
    plt.suptitle("Figure 3-40: Evaluating algorithms on the two_moons dataset with the silhouette score")
Example #10
def cluster(train, val, type, number_of_clusters, plot_folder, classes):
    # todo this should be a class
    if type == "spectral_clustering":
        clustering_model = SpectralClustering(n_clusters=number_of_clusters,
                                              assign_labels="discretize",
                                              random_state=0).fit(
                                                  train["data"])
    elif type == "kmeans":
        clustering_model = KMeans(n_clusters=number_of_clusters,
                                  random_state=0).fit(train["data"])
    else:
        raise NotImplementedError
    # compute metrics
    accuracies = {}
    random_array = np.random.randint(9, size=train["labels"].shape)
    centroids = find_centroids(number_of_clusters, train,
                               clustering_model.labels_)
    test_classifications = cluster_test(val, centroids)
    visualize_clustering(train, clustering_model.labels_, type + "_training",
                         plot_folder, number_of_clusters, centroids)
    visualize_clustering(val, np.asarray(test_classifications),
                         type + "_validation", plot_folder, number_of_clusters,
                         centroids)

    accuracies["random_score"] = homogeneity_score(train["labels"],
                                                   random_array)
    accuracies["v_measure_score"] = v_measure_score(train["labels"],
                                                    clustering_model.labels_)
    accuracies["homogeneity_score"] = homogeneity_score(
        train["labels"], clustering_model.labels_)
    accuracies["completeness_score"] = completeness_score(
        train["labels"], clustering_model.labels_)
    accuracies["silhouette_score"] = silhouette_score(train["data"],
                                                      clustering_model.labels_)
    accuracies["purity_score"], accuracies[
        "contingency_matrix"] = purity_score(train["labels"],
                                             clustering_model.labels_)

    accuracies["v_measure_score_test"] = v_measure_score(
        val["labels"], test_classifications)
    accuracies["homogeneity_score_test"] = homogeneity_score(
        val["labels"], test_classifications)
    accuracies["completeness_score_test"] = completeness_score(
        val["labels"], test_classifications)
    accuracies["silhouette_score_test"] = silhouette_score(
        val["data"], test_classifications)
    accuracies["purity_score_test"], accuracies[
        "contingency_matrix_test"] = purity_score(val["labels"],
                                                  test_classifications)
    return accuracies
Example #11
def main(data, clustering, reduce_dims=True, outpath=None, verbose=True):
    if verbose:
        print("Reading the data...")
        print("Creating the indexes...")

    ind = data[:, 0]
    data = data[:, 1]

    if verbose:
        print('Read {0} rows of data...'.format(ind.shape[0]))

    X, secs = build_and_clean(data, outpath=outpath, verbose=verbose)
    if verbose:
        print("Complete data build in {:0.3f} seconds".format(secs))
        print("New data has {0},{1} dimension".format(X.shape[0], X.shape[1]))
        print("Starting Clustering...")

    if reduce_dims:
        (X, var), secs = feature_reduction(X, 5000, verbose=verbose)
        if verbose:
            print("Complete data build in {:0.3f} seconds".format(secs))
            print("Explained variance of the SVD : {}".format(var))
            print("New data has {0},{1} dimension".format(
                X.shape[0], X.shape[1]))

    clustering.fit(X)
    pred_lbl = clustering.predict(X)
    if verbose:
        print("Finished Clustering with score {:0.3f}".format(
            clustering.inertia_))
        print("Computing Calinski_Harabaz Score")
    # print(calinski_harabaz_score(X.toarray(),pred_lbl))
    if verbose:
        print("Computing Silhouette Score")
    print(silhouette_score(X, pred_lbl, sample_size=10000))
Example #12
def cluster(csv, k):

    data = pd.read_csv(csv)
    # X Features
    X = np.array(data.drop(['botname'], axis=1))
    X = scale(X)
    # Choose the number of clusters, centroid initialisation and iteration count;
    # a fixed random_state seed makes the results reproducible
    clustering = KMeans(n_clusters=k,
                        init='k-means++',
                        n_init=10,
                        random_state=6)

    clustering.fit(X)

    X_scaled = X

    result = clustering.fit_predict(X)

    data['Cluster'] = result
    data = data.sort_values(['Cluster'])

    data.to_csv(r"C:\Users\Ronald Scheffler\.spyder-py3\clusterresult" +
                str(k) + ".csv")

    print(silhouette_score(X_scaled, result))
Example #13
    def _extract_best_optics(self, clusterer):
        max_score = -inf
        best_pred = None

        # Traverse epsilon to detect the best cut
        for my_eps in arange(0.01, 0.5, 0.01):
            pred = cluster_optics_dbscan(
                    reachability=clusterer.reachability_,
                    core_distances=clusterer.core_distances_,
                    ordering=clusterer.ordering_, eps=my_eps)

            if not len(unique(pred)) in (1, len(self.data)):
                score = silhouette_score(X=self.data,
                                         labels=pred,
                                         metric=self.distance_metric,
                                         random_state=13712)

                if score > max_score:
                    max_score = score
                    best_pred = pred

        if best_pred is not None:
            return self._process_noise_as_singletons(best_pred)
        else:
            # All outputs are either one cluster or n clusters
            return self._process_noise_as_singletons(pred)
Example #14
def get_clustering_metrics(train_data,
                           cluster_labels,
                           ground_truth_labels=None):
    clustering_metric_dict = dict({})
    clustering_metric_dict['silhouette_score'] = silhouette_score(
        train_data, cluster_labels, random_state=42)
    clustering_metric_dict[
        'calinski_harabasz_score'] = calinski_harabasz_score(
            train_data, cluster_labels)
    clustering_metric_dict['davies_bouldin_score'] = davies_bouldin_score(
        train_data, cluster_labels)

    if ground_truth_labels is not None:
        clustering_metric_dict['v_measure_score'] = v_measure_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'fowlkes_mallows_score'] = fowlkes_mallows_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['homogeneity_score'] = homogeneity_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict[
            'normalized_mutual_info_score'] = normalized_mutual_info_score(
                ground_truth_labels, cluster_labels)
        clustering_metric_dict['adjusted_rand_score'] = adjusted_rand_score(
            ground_truth_labels, cluster_labels)
        clustering_metric_dict['completeness_score'] = completeness_score(
            ground_truth_labels, cluster_labels)

    return clustering_metric_dict
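A minimal usage sketch for get_clustering_metrics; the make_blobs data, cluster count, and random_state below are illustrative assumptions, not part of the original:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=300, centers=3, random_state=42)
pred = KMeans(n_clusters=3, random_state=42).fit_predict(X)
print(get_clustering_metrics(X, pred, ground_truth_labels=y))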
Example #15
def get_single_linkage(dataframe):
    dists = pdist(dataframe)
    Z = single(dists)
    # (score, k, labels); labels stays None if no valid clustering is found
    best_score = (-1, 2, None)
    last_score = -1
    non_improving_iter = 0
    k = 2
    while non_improving_iter < 10:
        labels = fcluster(Z, k, criterion='maxclust')

        if len(np.unique(labels)) > 1:
            res = silhouette_score(dataframe, labels)

            if res > last_score:
                non_improving_iter = 0
            else:
                non_improving_iter += 1

            if res > best_score[0]:
                best_score = (res, k, labels)

            last_score = res
        k += 1

    return best_score[2]
Example #16
def main():
    args, atom_indices, project, project_root = parse_cmdline()

    # load all of the data from disk
    xyzlist, sampled_frames = load_trajs(project, os.path.dirname(args.project_yaml),
                                       atom_indices, args.stride, args.fraction)
    assignments = io.loadh(args.assignments, 'arr_0')
    # pick only the assignments that had their xyz data loaded
    assignments = np.concatenate([assignments[i, sampled_frames[i]] for i in range(len(sampled_frames))])

    # make sure we didn't mess up the subsampling and get nonsense data
    assert not np.any(assignments < 0), 'assignments negative? stride/sampling may be messed up. did you use a different stride than you clustered with?'
    #assert np.all(np.unique(assignments) == np.arange(np.max(assignments)+1)), 'assignments do not go from 0 to max. did you use a different stride than you clustered with?'

    n_real_atoms = len(atom_indices)
    n_padded_atoms = xyzlist.shape[2]
    assert n_padded_atoms >= n_real_atoms

    pairwise = calculate_pairwise_rmsd(xyzlist, n_real_atoms)

    print('computing silhouette...')
    score = silhouette_score(pairwise, assignments, metric='precomputed')
    print('silhouette score: %f' % score)

    path = os.path.join(args.output, 'silhouette.dat')
    print('saving results to flat text file (append): %s...' % path)
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    with open(path, 'a') as f:
        f.write('%f\n' % score)
Example #17
    def _cluster_ispherical_kmeans(self,
                                   init: str = "k-means++"):
        """
        Employ spherical k-means on L2 normalised directional data points in an
        iterative manner to select the best k according to intrinsic clustering
        evaluation measures.

        Parameters
        ----------
        init: str
            The initialisation method - "random" or "k-means++"

        """
        max_sil = -inf
        best_pred = None
        # Note that the k-means++ initialiser may still be using Euclidean
        # distances internally.
        for ik in range(2, len(self.data) - 1):
            skm = SphericalKMeans(n_clusters=ik, init=init, n_init=500,
                                  random_state=13712, normalize=False)
            pred = skm.fit_predict(self.data)
            score = silhouette_score(X=self.data,
                                     metric=self.distance_metric,
                                     labels=pred,
                                     random_state=13712)
            if score > max_sil:
                max_sil = score
                best_pred = pred

        return best_pred
Example #18
def cluster(csv):

    data = pd.read_csv(csv)
    # X Features
    X = np.array(data.drop(['botname'], axis=1))
    #print(X)

    X = scale(X)

    # Choose the number of clusters; a fixed random_state seed makes the results reproducible
    clustering = MeanShift()

    clustering.fit(X)
    #    print(X_scaled)
    X_scaled = X
    #print(X_scaled)

    result = clustering.fit_predict(X)

    data['Cluster'] = result
    data = data.sort_values(['Cluster'])

    data.to_csv(r"C:\Users\Ronald Scheffler\.spyder-py3\meanshiftresult.csv")
    # Evaluation:
    # Silhouette score?
    print(silhouette_score(X_scaled, result))
    print(data)
    # Class prediction for the training set
    from sklearn.model_selection import train_test_split
    X = np.array(data.drop(['botname'], axis=1))
    y = data['Cluster']  # classes?
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    print(X_test)
    print(y)
Example #19
    def no_label_metrics(input_feature,
                         assigned_label,
                         print_metric,
                         metric='euclidean'):
        """  https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation """
        no_label_metrics = {}
        no_label_metrics['silhouette_score'] = \
            cluster_metric.silhouette_score(input_feature,
                                            assigned_label,
                                            metric=metric)
        # (calinski_harabaz_score was renamed calinski_harabasz_score in scikit-learn)
        no_label_metrics['calinski_score'] = \
            cluster_metric.calinski_harabasz_score(input_feature,
                                                   assigned_label)
        # no_label_metrics['davie_bouldin_score'] = \
        #     cluster_metric.davies_bouldin_score(input_feature,
        #                                         assigned_label)
        if print_metric:
            print('Metrics without true labels')
            print("silhouette score: % s" %
                  no_label_metrics['silhouette_score'])
            print("calinski score: % s" % no_label_metrics['calinski_score'])
            # print("davie bouldin score: % s"
            #       % no_label_metrics['davie_bouldin_score'])

        return no_label_metrics
Example #20
def cluster_number_study(n=50):
    """ Check out some basic cluster metrics for different cluster sizes. """

    fnamecsv = './AL_pchange_vars.csv'
    df = pd.read_csv(fnamecsv)
    variables = df.to_numpy()[:, 1:].astype(float)  # as_matrix() was removed in pandas 1.0
    for j in range(len(variables[0, :])):  # ugly way of looping over columns
        variables[:, j] = (variables[:, j] - np.mean(variables[:, j])) / np.std(variables[:, j])

    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50).fit(variables)  # n_jobs was removed from KMeans in scikit-learn 1.0
        y = silhouette_score(variables, k.labels_)
        scores.append((i, y))

    with open('cluster_vs_silhouette.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
    print(scores)

    scores = []
    for i in (2 + np.array(range(n))):
        k = KMeans(n_clusters=i, n_init=50).fit(variables)
        #y = silhouette_score(variables,k.labels_)
        y = calinski_harabasz_score(variables, k.labels_)
        scores.append((i, y))

    with open('cluster_vs_calharabaz.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
Example #21
    def calculate_scores(self):
        x, c, labels = self.x, self.c, self.labels
        self.v_measure = v_measure_score(c, labels)
        self.complete = completeness_score(c, labels)
        self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
        self.adjusted_rand = adjusted_rand_score(c, labels)
        # note: silhouette is scored against the true classes c, not the predicted labels
        self.silhouette = silhouette_score(x, c)
        self.purity, self.partial_purity = self.__purity__()
Example #22
    def calculate_scores(self):
        x, c, labels = self.x, self.c, self.labels
        self.v_measure = v_measure_score(c, labels)
        self.complete = completeness_score(c, labels)
        self.adjusted_mutual = adjusted_mutual_info_score(c, labels)
        self.adjusted_rand = adjusted_rand_score(c, labels)
        self.silhouette = silhouette_score(x, c)
        self.purity, self.partial_purity = self.__purity__()
Example #23
    def _check_silhouette(self, dataset, transformed):
        expected = KMeans().fit_predict(dataset)
        got = KMeans().fit_predict(transformed)

        if type(dataset) is not np.ndarray:
            dataset = dataset.toarray()
        if type(expected) is not np.ndarray:
            expected = expected.toarray()
        if type(got) is not np.ndarray:
            got = got.toarray()

        print("Silhouette Index: expected:",
              silhouette_score(dataset, expected), "got:",
              silhouette_score(dataset, got))
        print("Calinski-Harabaz Index: expected:",
              calinski_harabaz_score(dataset, expected), "got:",
              calinski_harabaz_score(dataset, got))
Example #24
def test_correct_labelsize():
    # Assert 1 < n_labels < n_samples
    dataset = datasets.load_iris()
    X = dataset.data

    # n_labels = n_samples
    y = np.arange(X.shape[0])
    err_msg = (r'Number of labels is %d\. Valid values are 2 '
               r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)

    # n_labels = 1
    y = np.zeros(X.shape[0])
    err_msg = (r'Number of labels is %d\. Valid values are 2 '
               r'to n_samples - 1 \(inclusive\)' % len(np.unique(y)))
    with pytest.raises(ValueError, match=err_msg):
        silhouette_score(X, y)
Example #25
def evaluation(X_selected, X_test, n_clusters, y):
    """
    This function calculates NMI, Silhouette, Davies-Bouldin, Calinski-Harabasz
    and Purity of the clustering results

    Input
    -----
    X_selected: {numpy array}, shape (n_samples, n_selected_features)
            training data on the selected features
    X_test: {numpy array}, shape (n_samples_test, n_selected_features)
            test data on the selected features
    n_clusters: {int}
            number of clusters
    y: {numpy array}, shape (n_samples_test,)
            true labels

    Output
    ------
    nmi: {float}
        Normalized Mutual Information
    sil, db, ch: {float}
        Silhouette, Davies-Bouldin and Calinski-Harabasz scores
    pur: {float}
        Purity
    """
    # note: the precompute_distances and n_jobs arguments were removed from KMeans in scikit-learn 1.0
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                     tol=0.0001, verbose=0, random_state=None, copy_x=True)

    k_means.fit(X_selected)
    y_predict = k_means.predict(X_test)
    
    # calculate NMI
    nmi = normalized_mutual_info_score(y, y_predict, average_method='arithmetic')

    # calculate Silhouette score
    try:
        sil = silhouette_score(X_test, y_predict, metric='euclidean')
    except ValueError:
        sil = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Silhouette score requires predictions in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra=LOGGER_EXTRA_OBJECT)

    # calculate Davies Bouldin 
    try:
        db = davies_bouldin_score(X_test, y_predict)
    except ValueError:
        db = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Davies Bouldin score requires predictions in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra=LOGGER_EXTRA_OBJECT)

    # calculate Calinski Harabasz score
    try:
        ch = calinski_harabasz_score(X_test, y_predict)
    except ValueError:
        ch = float('nan')
        app_logger.warning('K-means labels are {0}; but y_predict are: {1}. Calinski Harabasz score requires predictions in 2 or more clusters.'.format(np.unique(k_means.labels_), np.unique(y_predict)), extra=LOGGER_EXTRA_OBJECT)

    # calculate Purity
    pur = purity(y, y_predict)

    return nmi, sil, db, ch, pur

Example #26
def silTest():
    """
    Silhouette coefficient demo
    """
    x, y = make_moons(n_samples=200, noise=0.05, random_state=0)
    scaler = StandardScaler()
    scaler.fit(x)
    x_scaled = scaler.transform(x)

    fig, axes = plt.subplots(1,
                             4,
                             figsize=(15, 3),
                             subplot_kw={
                                 'xticks': (),
                                 'yticks': ()
                             })
    algorithms = [
        KMeans(n_clusters=2),
        AgglomerativeClustering(n_clusters=2),
        DBSCAN()
    ]

    random_state = np.random.RandomState(seed=0)
    random_clusters = random_state.randint(low=0, high=2, size=len(x))

    axes[0].scatter(x_scaled[:, 0],
                    x_scaled[:, 1],
                    c=random_clusters,
                    cmap=mglearn.cm3,
                    s=60)
    axes[0].set_title("Random assignment - ARI: {:.2f}".format(
        silhouette_score(x_scaled, random_clusters)))

    for ax, algorithm in zip(axes[1:], algorithms):
        clusters = algorithm.fit_predict(x_scaled)
        ax.scatter(x_scaled[:, 0],
                   x_scaled[:, 1],
                   c=clusters,
                   cmap=mglearn.cm3,
                   s=60)
        ax.set_title("{} - ARI: {:.2f}".format(
            algorithm.__class__.__name__, silhouette_score(x_scaled,
                                                           clusters)))
    plt.show()
Example #27
def in70():
    from sklearn.datasets import make_moons
    from sklearn.metrics.cluster import silhouette_score
    x, y = make_moons(n_samples=200, noise=0.05, random_state=0)
    from sklearn.preprocessing import StandardScaler
    std = StandardScaler()
    std.fit(x)
    x_scaled = std.transform(x)

    from sklearn.cluster import KMeans

    from sklearn.cluster import AgglomerativeClustering

    from sklearn.cluster import DBSCAN

    fig, axer = plt.subplots(1, 3, figsize=(15, 3))
    axer[0].scatter(x_scaled[:, 0],
                    x_scaled[:, 1],
                    c=KMeans().fit_predict(x_scaled),
                    cmap=mglearn.cm2,
                    s=60)
    axer[0].set_title('KMeans:{}'.format(
        silhouette_score(x_scaled,
                         KMeans().fit_predict(x_scaled))))

    axer[1].scatter(x_scaled[:, 0],
                    x_scaled[:, 1],
                    c=AgglomerativeClustering().fit_predict(x_scaled),
                    cmap=mglearn.cm2,
                    s=60)
    axer[1].set_title('AgglomerativeClustering:{}'.format(
        silhouette_score(x_scaled,
                         AgglomerativeClustering().fit_predict(x_scaled))))

    axer[2].scatter(x_scaled[:, 0],
                    x_scaled[:, 1],
                    c=DBSCAN().fit_predict(x_scaled),
                    cmap=mglearn.cm2,
                    s=60)
    axer[2].set_title('DBSCAN:{}'.format(
        silhouette_score(x_scaled,
                         DBSCAN().fit_predict(x_scaled))))
    plt.legend(['feature 0', 'feature 1'])
    plt.show()
Example #28
def test_no_nan():
    # Assert Silhouette Coefficient != nan when there is 1 sample in a class.
    # This tests for the condition that caused issue 960.
    # Note that there is only one sample in cluster 0. This used to cause the
    # silhouette_score to return nan (see bug #960).
    labels = np.array([1, 0, 1, 1, 1])
    # The distance matrix doesn't actually matter.
    D = np.random.RandomState(0).rand(len(labels), len(labels))
    silhouette = silhouette_score(D, labels, metric='precomputed')
    assert_false(np.isnan(silhouette))
Example #29
File: mining.py  Project: quirozxc/ti-vort
def _print_clusteringMetrics(_kMean, _X):
    metrics = [['Clustering K-Means', 'Collected data'],
               ['Inertia', _kMean.inertia_],
               ['Entropy', entropy(_kMean.labels_)],
               ['Silhouette Score', silhouette_score(_X, _kMean.labels_, random_state=0)],
               ['Calinski-Harabasz Score', calinski_harabasz_score(_X, _kMean.labels_)], ]

    print('\nData Mining - Clustering K-Means - <VORT>', '\n')
    print(_kMean, '\n')
    print(look(metrics))
Example #30
def test_no_nan():
    # Assert Silhouette Coefficient != nan when there is 1 sample in a class.
    # This tests for the condition that caused issue 960.
    # Note that there is only one sample in cluster 0. This used to cause the
    # silhouette_score to return nan (see bug #960).
    labels = np.array([1, 0, 1, 1, 1])
    # The distance matrix doesn't actually matter.
    D = np.random.RandomState(0).rand(len(labels), len(labels))
    silhouette = silhouette_score(D, labels, metric='precomputed')
    assert_false(np.isnan(silhouette))
Example #31
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert score_precomputed > 0
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert score_euclidean == pytest.approx(score_precomputed)

        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D,
                                             y,
                                             metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X,
                                           y,
                                           metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert score_precomputed > 0
        assert score_euclidean > 0
        assert score_euclidean == pytest.approx(score_precomputed)

        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_with_sampling)
Example #32
File: si.py  Project: ariaaay/CategoryEval
    def calc_si(
        self,
        representations: np.ndarray,
        category_labels: List[int],
        metric: str = 'cosine',
    ):
        """
        Compute the silhouette score for the given category assignment.
        """
        print('Computing silhouette scores...')

        # metric is keyword-only in recent scikit-learn releases
        return silhouette_score(representations, category_labels, metric=metric)
Example #33
def _clustering_metrics(labels, X, digits):
    if X is None:
        SIL = None
        DB = None
        CH = None
    else:
        SIL = round(silhouette_score(X, labels), digits)
        DB = round(davies_bouldin_score(X, labels), digits)
        CH = round(calinski_harabasz_score(X, labels), digits)

    return SIL, DB, CH
Example #34
def get_clustering_cluster_output(df_arr, clustering_method, clustering_options, titles, bow):
    clusters = get_clusters(df_arr, clustering_method, clustering_options)

    cluster_info_df = None
    if clusters is not None and titles is not None and bow is not None:
        cluster_info_df = get_cluster_info_df(10, clusters, titles, bow)

    cluster_info_score = None
    if clusters is not None and np.unique(clusters).size > 1:
        cluster_info_score = "Silhouette Score: %.2f" % silhouette_score(df_arr.values, clusters)

    return misc.generate_datatable(cluster_info_df, "cluster_info", 1000, "600px"), cluster_info_score
Example #35
def my_kmeans(feature_vector, no_of_centers=8):
    start = time()
    km = KMeans(n_clusters=no_of_centers).fit(feature_vector)
    end = time()
    labels = km.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('The no of non noisy clusters is {} with no of centers = {}'.format(n_clusters, no_of_centers))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(silhouette_score(feature_vector, labels, metric='euclidean')))
Example #36
    def get_score(self, name="None"):
        self.pred = self.cluster.labels_
        self.pred = np.where(self.pred > 1000, -1, self.pred)

        self.class_ = np.unique(self.pred)
        score = {}
        # silhouette is computed on the 1-D predictions treated as a feature column
        score1 = silhouette_score(self.pred.reshape(-1, 1), self.labels)
        score2 = metrics.adjusted_rand_score(self.pred, self.labels)

        score["silhouette"] = score1
        score["adjusted_rand"] = score2
        return score
Example #37
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert_greater(score_precomputed, 0)
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert_almost_equal(score_precomputed, score_euclidean)

        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean,
                                score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D, y, metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X, y, metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert_greater(score_precomputed, 0)
        assert_greater(score_euclidean, 0)
        assert_almost_equal(score_euclidean, score_precomputed)

        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert_almost_equal(score_euclidean, score_dense_with_sampling)
Example #38
def my_agg_clustering(feature_vector, no_of_centers, metric_name):
    start = time()
    ag_c = AgglomerativeClustering(n_clusters=no_of_centers, affinity=metric_name).fit(feature_vector)
    end = time()
    labels = ag_c.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print('The no of non noisy clusters is {} with no of centers = {} with metric = {}'.format(
        n_clusters, no_of_centers, metric_name))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(silhouette_score(feature_vector, labels, metric=metric_name)))
Example #39
def test_silhouette_paper_example():
    # Explicitly check per-sample results against Rousseeuw (1987)
    # Data from Table 1
    lower = [5.58,
             7.00, 6.50,
             7.08, 7.00, 3.83,
             4.83, 5.08, 8.17, 5.83,
             2.17, 5.75, 6.67, 6.92, 4.92,
             6.42, 5.00, 5.58, 6.00, 4.67, 6.42,
             3.42, 5.50, 6.42, 6.42, 5.00, 3.92, 6.17,
             2.50, 4.92, 6.25, 7.33, 4.50, 2.25, 6.33, 2.75,
             6.08, 6.67, 4.25, 2.67, 6.00, 6.17, 6.17, 6.92, 6.17,
             5.25, 6.83, 4.50, 3.75, 5.75, 5.42, 6.08, 5.83, 6.67, 3.67,
             4.75, 3.00, 6.08, 6.67, 5.00, 5.58, 4.83, 6.17, 5.67, 6.50, 6.92]
    D = np.zeros((12, 12))
    D[np.tril_indices(12, -1)] = lower
    D += D.T

    names = ['BEL', 'BRA', 'CHI', 'CUB', 'EGY', 'FRA', 'IND', 'ISR', 'USA',
             'USS', 'YUG', 'ZAI']

    # Data from Figure 2
    labels1 = [1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 1]
    expected1 = {'USA': .43, 'BEL': .39, 'FRA': .35, 'ISR': .30, 'BRA': .22,
                 'EGY': .20, 'ZAI': .19, 'CUB': .40, 'USS': .34, 'CHI': .33,
                 'YUG': .26, 'IND': -.04}
    score1 = .28

    # Data from Figure 3
    labels2 = [1, 2, 3, 3, 1, 1, 2, 1, 1, 3, 3, 2]
    expected2 = {'USA': .47, 'FRA': .44, 'BEL': .42, 'ISR': .37, 'EGY': .02,
                 'ZAI': .28, 'BRA': .25, 'IND': .17, 'CUB': .48, 'USS': .44,
                 'YUG': .31, 'CHI': .31}
    score2 = .33

    for labels, expected, score in [(labels1, expected1, score1),
                                    (labels2, expected2, score2)]:
        expected = [expected[name] for name in names]
        # we check to 2dp because that's what's in the paper
        assert silhouette_samples(D, np.array(labels),
                                  metric='precomputed') == pytest.approx(expected, abs=1e-2)
        assert silhouette_score(D, np.array(labels),
                                metric='precomputed') == pytest.approx(score, abs=1e-2)
Example #40
def my_dbscan(feature_vector, metric_name, eps=None, minpts=None):
    start = time()
    if eps is None and minpts is None:
        db = DBSCAN(metric=metric_name).fit(feature_vector)
    elif minpts is None:
        db = DBSCAN(eps=eps, metric=metric_name).fit(feature_vector)
    elif eps is None:
        db = DBSCAN(min_samples=minpts, metric=metric_name).fit(feature_vector)
    else:
        db = DBSCAN(eps=eps, min_samples=minpts, metric=metric_name).fit(feature_vector)
    end = time()
    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # ignoring noise if present
    print('The no of non noisy clusters is {} with metric = {}'.format(n_clusters, metric_name))
    print("Time taken to finish {} seconds".format(end - start))
    if option == 1:
        cluster_entropy(labels)
    else:
        print('The silhouette score is {}'.format(silhouette_score(feature_vector, labels, metric=metric_name)))
Example #41
def test_cluster_size_1():
    # Assert Silhouette Coefficient == 0 when there is 1 sample in a cluster
    # (cluster 0). We also test the case where there are identical samples
    # as the only members of a cluster (cluster 2). To our knowledge, this case
    # is not discussed in reference material, and we choose for it a sample
    # score of 1.
    X = [[0.], [1.], [1.], [2.], [3.], [3.]]
    labels = np.array([0, 1, 1, 1, 2, 2])

    # Cluster 0: 1 sample -> score of 0 by Rousseeuw's convention
    # Cluster 1: intra-cluster = [.5, .5, 1]
    #            inter-cluster = [1, 1, 1]
    #            silhouette    = [.5, .5, 0]
    # Cluster 2: intra-cluster = [0, 0]
    #            inter-cluster = [arbitrary, arbitrary]
    #            silhouette    = [1., 1.]

    silhouette = silhouette_score(X, labels)
    assert_false(np.isnan(silhouette))
    ss = silhouette_samples(X, labels)
    assert_array_equal(ss, [0, .5, .5, 0, 1, 1])
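For reference, a minimal sketch that reproduces the per-sample values from the comments above by computing the mean intra-cluster distance a(i) and the mean nearest-cluster distance b(i) directly (same toy data; numpy and scipy are assumed):

import numpy as np
from scipy.spatial.distance import cdist

X = np.array([[0.], [1.], [1.], [2.], [3.], [3.]])
labels = np.array([0, 1, 1, 1, 2, 2])
D = cdist(X, X)

for i in range(len(X)):
    same = labels == labels[i]
    n_same = same.sum() - 1
    if n_same == 0:
        print(i, 0.0)  # singleton cluster -> score 0 by convention
        continue
    a = D[i, same].sum() / n_same  # mean intra-cluster distance
    b = min(D[i, labels == k].mean()  # mean distance to the nearest other cluster
            for k in set(labels) if k != labels[i])
    print(i, (b - a) / max(a, b))  # prints 0.0, 0.5, 0.5, 0.0, 1.0, 1.0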
Example #42
File: example1.py  Project: jzm17173/Learn
-1 denotes noise

Increasing eps means more points are included in a cluster. This makes clusters larger, but it can also cause several clusters to merge into one.
Increasing min_samples means fewer points qualify as core points, and more points are labeled as noise.

The eps parameter is in some sense more important, because it determines what it means for points to be "close".
Setting eps very small means that no point is a core sample, which can cause all points to be labeled as noise.
Setting eps very large can cause all points to form a single cluster.

min_samples mainly determines whether points in sparse regions are labeled as outliers or form their own clusters.
If you increase min_samples, any cluster that contains fewer than min_samples samples is now labeled as noise.
min_samples therefore determines the minimum cluster size.
'''
print(clusters)
print(len(set(clusters)))

if len(set(clusters)) > 1:
    print('{} {} {}'.format(eps, min_samples, silhouette_score(X, clusters)))
# 0.5 5 -0.12276159423271887
# 0.7 5 0.3593629426203677

'''
If every label is -1, an exception is raised,
because 1 < n_labels does not hold:
def check_number_of_labels(n_labels, n_samples):
    if not 1 < n_labels < n_samples:
        raise ValueError("Number of labels is %d. Valid values are 2 "
                         "to n_samples - 1 (inclusive)" % n_labels)

'''
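A minimal sketch of the eps/min_samples sweep the notes above describe (the make_moons data and the parameter grid are illustrative assumptions):

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

X, _ = make_moons(n_samples=200, noise=0.05, random_state=0)
X = StandardScaler().fit_transform(X)

for eps in (0.3, 0.5, 0.7):
    for min_samples in (3, 5, 10):
        clusters = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        # silhouette_score needs at least 2 distinct labels
        if len(set(clusters)) > 1:
            print('{} {} {}'.format(eps, min_samples, silhouette_score(X, clusters)))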
Example #43
def test_non_numpy_labels():
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    assert_equal(
        silhouette_score(list(X), list(y)), silhouette_score(X, y))