Example #1
def test_calinski_harabaz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.zeros(10))

    # Assert message when all points are in different clusters
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all the cluster means are equal
    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                            [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabaz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
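For reference, the expected value above is just the Calinski-Harabasz definition evaluated by hand: with n = 40 samples in k = 4 clusters, the between-cluster dispersion is tr(B) = 4 * 45 = 180 and the within-cluster dispersion is tr(W) = 4 * 5 = 20, so CH = (180 / (4 - 1)) / (20 / (40 - 4)) = 45 * (40 - 4) / (5 * (4 - 1)) = 108. A minimal sketch that recomputes this from the test's X and labels with plain NumPy (not part of the original test):

import numpy as np

X = np.array([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
             [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5, dtype=float)
labels = np.array([0] * 10 + [1] * 10 + [2] * 10 + [3] * 10)

n, k = len(X), len(np.unique(labels))
overall_mean = X.mean(axis=0)

between = within = 0.0
for c in np.unique(labels):
    cluster = X[labels == c]
    mean_c = cluster.mean(axis=0)
    between += len(cluster) * np.sum((mean_c - overall_mean) ** 2)  # tr(B) term
    within += np.sum((cluster - mean_c) ** 2)                       # tr(W) term

print((between / (k - 1)) / (within / (n - k)))  # 108.0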
Example #2
def test_calinski_harabaz_score():
    rng = np.random.RandomState(seed=0)

    # Assert message when there is only one label
    assert_raise_message(ValueError, "Number of labels is",
                         calinski_harabaz_score, rng.rand(10, 2), np.zeros(10))

    # Assert message when all points are in different clusters
    assert_raise_message(ValueError,
                         "Number of labels is", calinski_harabaz_score,
                         rng.rand(10, 2), np.arange(10))

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all the cluster means are equal
    assert_equal(
        0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                   [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 + [[0, 4], [1, 3]] * 5 +
         [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert_almost_equal(calinski_harabaz_score(X, labels),
                        45 * (40 - 4) / (5 * (4 - 1)))
Example #3
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import calinski_harabaz_score

def cluster_number_study(n=50):
    """ Check out some basic cluster metrics for different cluster sizes. """

    fnamecsv = './AL_pchange_vars.csv'
    df = pd.read_csv(fnamecsv)
    variables = df.values[:, 1:].astype(float)  # .as_matrix() was removed from pandas
    for j in range(variables.shape[1]):  # standardize each column
        variables[:, j] = ((variables[:, j] - np.mean(variables[:, j]))
                           / np.std(variables[:, j]))

    scores = []
    for i in range(2, n + 2):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        y = silhouette_score(variables, k.labels_)
        scores.append((i, y))

    with open('cluster_vs_silhouette.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
    print(scores)

    scores = []
    for i in range(2, n + 2):
        k = KMeans(n_clusters=i, n_init=50, n_jobs=3).fit(variables)
        #y = silhouette_score(variables,k.labels_)
        y = calinski_harabaz_score(variables, k.labels_)
        scores.append((i, y))

    with open('cluster_vs_calharabaz.txt', 'w') as f:
        for s in scores:
            f.write(str(s[0]) + "\t" + str(s[1]) + "\n")
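The two files written above can then be scanned for the best-scoring cluster count. A minimal sketch, assuming the 'k<TAB>score' layout used by cluster_number_study and the common pick-the-maximum heuristic (not part of the original code):

def best_k_from_file(fname):
    """Return the k whose score is highest in a 'k<TAB>score' file."""
    with open(fname) as f:
        pairs = [line.split('\t') for line in f if line.strip()]
    return int(max(pairs, key=lambda p: float(p[1]))[0])

print('best k (silhouette):', best_k_from_file('cluster_vs_silhouette.txt'))
print('best k (CH):', best_k_from_file('cluster_vs_calharabaz.txt'))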
Example #4
    def no_label_metrics(input_feature,
                         assigned_label,
                         print_metric,
                         metric='euclidean'):
        """  https://scikit-learn.org/stable/modules/clustering.html#clustering-evaluation """
        no_label_metrics = {}
        no_label_metrics['silhouette_score'] = \
            cluster_metric.silhouette_score(input_feature,
                                            assigned_label,
                                            metric=metric)
        no_label_metrics['calinski_score'] = \
            cluster_metric.calinski_harabaz_score(input_feature,
                                                  assigned_label)
        # no_label_metrics['davie_bouldin_score'] = \
        #     cluster_metric.davies_bouldin_score(input_feature,
        #                                         assigned_label)
        if print_metric:
            print('Metrics without true labels')
            print("silhouette score: %s" %
                  no_label_metrics['silhouette_score'])
            print("calinski score: %s" % no_label_metrics['calinski_score'])
            # print("davie bouldin score: % s"
            #       % no_label_metrics['davie_bouldin_score'])

        return no_label_metrics
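The commented-out Davies-Bouldin lines only need scikit-learn >= 0.20, where sklearn.metrics.davies_bouldin_score was introduced. A hedged standalone sketch, assuming cluster_metric aliases sklearn.metrics as in the example:

import numpy as np
from sklearn import metrics as cluster_metric

X = np.random.RandomState(0).rand(20, 2)
labels = [0] * 10 + [1] * 10

# Guard the call so the helper still works on older installs; note that unlike
# silhouette/CH, lower Davies-Bouldin values indicate better clustering.
if hasattr(cluster_metric, 'davies_bouldin_score'):
    print('davies bouldin score: %s'
          % cluster_metric.davies_bouldin_score(X, labels))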
Example #5
    def _check_silhouette(self, dataset, transformed):
        expected = KMeans().fit_predict(dataset)
        got = KMeans().fit_predict(transformed)

        if type(dataset) is not np.ndarray:
            dataset = dataset.toarray()
        if type(expected) is not np.ndarray:
            expected = expected.toarray()
        if type(got) is not np.ndarray:
            got = got.toarray()

        print("Silhouette Index: expected:",
              silhouette_score(dataset, expected), "got:",
              silhouette_score(dataset, got))
        print("Calinski-Harabaz Index: expected:",
              calinski_harabaz_score(dataset, expected), "got:",
              calinski_harabaz_score(dataset, got))
Example #6
File: mining.py  Project: quirozxc/ti-vort
def _print_clusteringMetrics(_kMean, _X):
	metrics = [['K-Means clustering', 'Observed data'],
			   ['Inercia', _kMean.inertia_],
			   ['Entropy', entropy(_kMean.labels_)],
			   ['Silhouette Score', silhouette_score(_X, _kMean.labels_, random_state = 0)],
			   ['Calinski-Harabaz Score', calinski_harabaz_score(_X, _kMean.labels_)], ]

	print('\nData Mining - K-Means Clustering - <VORT>', '\n')
	print(_kMean, '\n')
	print(look(metrics))
Example #7
def test_calinski_harabaz_score():
    assert_raises_on_only_one_label(calinski_harabaz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabaz_score)

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all the cluster means are equal
    assert_equal(0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                            [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabaz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
Example #8
def test_calinski_harabaz_score():
    assert_raises_on_only_one_label(calinski_harabaz_score)

    assert_raises_on_all_points_same_cluster(calinski_harabaz_score)

    # Assert the value is 1. when all samples are equal
    assert_equal(1., calinski_harabaz_score(np.ones((10, 2)),
                                            [0] * 5 + [1] * 5))

    # Assert the value is 0. when all the cluster means are equal
    assert_equal(
        0., calinski_harabaz_score([[-1, -1], [1, 1]] * 10,
                                   [0] * 10 + [1] * 10))

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 + [[0, 4], [1, 3]] * 5 +
         [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    assert calinski_harabaz_score(X, labels) == pytest.approx(
        45 * (40 - 4) / (5 * (4 - 1)))
Example #9
def ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))

    #1. init membership and centroids
    #initial_U = initial_U(dataSet, c)
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)

    #2. update membership U
    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, A1, A2)
    last_centroids = calCentroids(dataSet, lastU, c)

    #3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)

    #while last_centroids.all() != centroids.all():
    #count = 0
    #while (count < 2):
    lastcentroids = centroids
    J, U = upUwithSimplex(dataSet, lastcentroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)
    #count += 1

    #assign data to clusters
    for k in range(n):
        for i in range(c):
            if (U[k, i] == 1):
                clusterAssment[k] = i
    """for i in range(n):
        minDist = 10000.0
        minIdex = 0
        for j in range(c):
            distance = euclDistance(dataSet[i], centroids[j])
            if distance < minDist:
                minDist = distance #update min distance
                minIdex = j #update assignment of data to cluster
        if clusterAssment[i, 0] != minIdex:
            clusterAssment[i, :] = minIdex"""

    label_pred = np.asarray(clusterAssment).ravel()  # sklearn expects 1-D labels
    #print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final U:\n', np.mat(U))
    print('final centroids:\n', centroids)
    print('objective function:', value(J.objective))
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))
    return U, centroids, J, clusterAssment
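Note that the commented-out stopping test above (last_centroids.all() != centroids.all()) compares two scalar booleans, so it never detects element-wise convergence; presumably that is why a fixed iteration count is used instead. A sketch of a tolerance-based stopping rule, reusing the example's own upUwithSimplex / calCentroids helpers and A1, A2 globals (an assumption, not the original code):

import numpy as np

max_iter = 100  # safety cap
for _ in range(max_iter):
    last_centroids = centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)
    if np.allclose(np.asarray(last_centroids), np.asarray(centroids)):
        break  # centroids stopped moving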
Example #10
def ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))

    #1. init membership and centroids
    #initial_U = initial_U(dataSet, c)
    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)

    #2. update membership U
    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, k)
    last_centroids = calCentroids(dataSet, lastU, c)

    #3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, k)
    centroids = calCentroids(dataSet, U, c)

    #while last_centroids.all() != centroids.all():
    count = 0
    while (count < 10):
        lastcentroids = centroids
        J, U = upUwithSimplex(dataSet, lastcentroids, c, k)
        centroids = calCentroids(dataSet, U, c)
        count += 1

    for p in range(n):
        for i in range(c):
            if (U[p, i] == 1):
                clusterAssment[p] = i

    label_pred = np.asarray(clusterAssment).ravel()  # sklearn expects 1-D labels
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)
    print('final U:\n', np.mat(U))
    print('final centroids:\n', centroids)
    print('objective function:', value(J.objective))
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)

    for i in range(c):
        print(np.sum(U[:, i]))
    return U, centroids, J, clusterAssment
Example #11
def main():

    min_pts = 69
    epsilon = 0.0031894736842105263
    with open('train.dat', 'r') as fh:
        data = fh.read().splitlines()

    data_csr = build_csr(data)

    # tf-idf + TruncatedSVD = LSA (latent semantic analysis)
    X = TfidfTransformer(norm='l2',
                         use_idf=True,
                         smooth_idf=True,
                         sublinear_tf=False).fit_transform(data_csr)
    selector = TruncatedSVD(n_components=5, algorithm="arpack", random_state=8)
    X = selector.fit_transform(X)
    X = sparse.csr_matrix(X)
    X = csr_l2normalize(X, copy=True)

    print('Running DBSCAN clustering algorithm now....')
    print("For eps: {}, minPts: {}".format(epsilon, min_pts))

    labels, core_pts, border_pts = DBSCAN(X, eps=epsilon, minPts=min_pts)
    labels = assign_noise_to_centroid(X, labels, recompute_centroid=True)

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print('  Estimated number of clusters: %d' % n_clusters_)

    sil_score = silhouette_score(X, labels, metric='cosine', random_state=8)
    print("  Silhouette Coefficient: %0.5f" % sil_score)

    X_dense = X.toarray()
    ch_score = calinski_harabaz_score(X_dense, labels)
    print("  Calinski Harabaz Score: %0.5f" % ch_score)

    print('Writing out predictions now....')
    submission_df = pd.read_csv("format.dat", sep="\t", header=None)
    submission_df[0] = labels
    submission_df.to_csv('submission.txt', sep='\n', index=False, header=False)
Example #12
def L1_ECBO(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))

    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)

    lastJ, lastU = upUwithSimplex(dataSet, initial_centroids, c, A1, A2)
    last_centroids = calCentroids(dataSet, lastU, c)

    # 3. update centroids
    J, U = upUwithSimplex(dataSet, last_centroids, c, A1, A2)
    centroids = calCentroids(dataSet, U, c)

    count = 0
    while count < 9:
        lastcentroids = centroids
        J, U = upUwithSimplex(dataSet, lastcentroids, c, A1, A2)
        centroids = calCentroids(dataSet, U, c)
        count += 1

    for p in range(n):
        for i in range(c):
            if (U[p, i] == 1):
                clusterAssment[p] = i

    label_pred = np.asarray(clusterAssment).ravel()  # sklearn expects 1-D labels
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)

    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids, type(centroids))
    #print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))

    return U, J, centroids, clusterAssment
Example #13
def L1_kMeans(dataSet, c):
    n, m = dataSet.shape
    clusterAssment = np.mat(np.zeros((n, 1)))

    initial_centroids = initCentroids(dataSet, c)
    print('initial centroids:\n', initial_centroids)

    lastJ, lastU = updateU(dataSet, initial_centroids, c)
    centroids = calCentroids(dataSet, lastU, c)
    J, U = updateU(dataSet, centroids, c)
    centroids = np.array(centroids)
    count = 0
    while count < 10:
        lastJ = J
        centroids = calCentroids(dataSet, U, c)
        J, U = updateU(dataSet, centroids, c)
        count += 1
    print('count:', count)

    for k in range(n):
        for i in range(c):
            if (U[k, i] == 1):
                clusterAssment[k] = i

    label_pred = np.asarray(clusterAssment).ravel()  # sklearn expects 1-D labels
    # print(label_pred)
    CHI = calinski_harabaz_score(dataSet, label_pred)

    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids)
    #print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print(np.sum(U[:, i]))

    return U, J, centroids, clusterAssment
Example #14
def fuzzyCMeans(dataSet, c, m):
    #n, p = dataSet.shape

    # 1. init centroids randomly
    initial_centroids = initCentroids(dataSet, c)

    # 2. update membership U with fixing centroids
    last_U = calUwithCent(dataSet, initial_centroids, c, m)
    last_J = objFuncJ(dataSet, last_U, initial_centroids, c, m)

    # 3. update centroids with fixing U
    centroids = calCentwithU(dataSet, last_U, c, m)

    U = calUwithCent(dataSet, centroids, c, m)
    J = objFuncJ(dataSet, U, centroids, c, m)

    count = 0
    while count < 10:
        centroids = calCentwithU(dataSet, U, c, m)
        U = calUwithCent(dataSet, centroids, c, m)
        J = objFuncJ(dataSet, U, centroids, c, m)
        count += 1

    clusterAssment = getCluster(dataSet, U, c)
    #label_pred = clusterAssment

    CHI = calinski_harabaz_score(dataSet, clusterAssment)

    print('final membership:', U)
    print('objective function:', J)
    print('final centroids:', centroids)
    print('cluster assignment:', clusterAssment)
    print('Calinski-Harabaz Index:', CHI)
    for i in range(c):
        print("cluster size:", np.sum(U[:, i]))

    return U, centroids, J, clusterAssment
Example #15
    #x = glass.values
    #data = x[:, [3, 5]]

    #wine = datasets.load_wine()
    #x = wine.data
    #data = x[:, [0, 11]]

    #iris = datasets.load_iris()
    #original_x = iris.data
    #data = original_x[:, :2] + original_x[:, 2:]

    #data = pd.read_csv('D:/Tsukuba/My Research/Program/dataset/4_three_ciecles_with_diffR/three_ciecles_with_diffR.csv')
    #x = data.values[1:, :2]
    #y = data.values[1:, -1]
    #print(x, y)
    data = pd.read_csv('D:/Tsukuba/My Research/Program/dataset/diff_var.csv')
    x = data.values[:, 1:3]
    y = data.values[:, 0]

    pso = PSO(n_cluster=3, n_particle=10, data=x)  # max_iter, print_debug
    pso.run()
    pred_cluster = pso.cluster

    ari = adjusted_rand_score(y, pred_cluster)
    ch = calinski_harabaz_score(x, pred_cluster)
    print('ARI:', ari)
    print('CH:', ch)
    pso.show_cluter()


Example #16
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import calinski_harabaz_score, silhouette_score

# Load the data
data = pd.read_csv('data.csv')
# print(data.head())
x_train = data[["2019年国际排名", "2018世界杯", "2015亚洲杯"]]

# Feature engineering - standardization (zero mean, unit variance)
scaler = preprocessing.StandardScaler()
x_train = scaler.fit_transform(x_train)

# Create the KMeans clustering estimator
estimator = KMeans(n_clusters=3)

# Train the model
estimator.fit(x_train)
y_predict = estimator.predict(x_train)

# Model evaluation
# Calinski-Harabasz (CH) index: higher values indicate better clustering
print('CH index:', calinski_harabaz_score(x_train, y_predict))
# The mean silhouette coefficient ranges over [-1, 1]; higher values indicate better clustering
print('mean silhouette coefficient:', silhouette_score(x_train, y_predict))

# Merge the cluster labels back into the original data
result = pd.concat((data, pd.DataFrame(y_predict)), axis=1)
result.rename({0: u'cluster'}, axis=1, inplace=True)
print(result)
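The scaler variable in this example was originally named min_max_scaler even though StandardScaler is used; if [0, 1] min-max scaling was actually intended, the drop-in alternative (same data.csv assumption as above) is:

import pandas as pd
from sklearn import preprocessing

# MinMaxScaler rescales each feature to [0, 1] instead of zero mean / unit variance.
data = pd.read_csv('data.csv')
min_max_scaler = preprocessing.MinMaxScaler()
x_train = min_max_scaler.fit_transform(
    data[["2019年国际排名", "2018世界杯", "2015亚洲杯"]])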

Example #17
for K in range(4, 5):
    print('\n\n\nK = ', K)
    kmeans = KMeans(n_clusters=K).fit(numpy_selected_data)
    labels = kmeans.labels_
    save_name = 'Set1_KMeans_K=' + str(K) + '.csv'
    np.savetxt(save_name, labels, delimiter=',')
    label_counter = collections.Counter(labels)
    print(label_counter)
    unique_clusters, centroids = getAveragePlayerFromCluster(
        non_normalized_data, labels)

    print("Start calculating centroids")
    i = 0
    column_names = []
    for _cent in centroids:
        if i == 0:
            new_file = pd.DataFrame(_cent.values).T
            column_names = _cent.index
        else:
            new_file = new_file.append(pd.DataFrame(_cent.values).T)
        i = i + 1
    new_file.columns = column_names
    new_file.to_csv('KMeans_K=' + str(K) + '.csv', index=False)
    unique_clusters = set(labels)
    palette = sns.color_palette('hls', len(unique_clusters))
    cluster_colors = [palette[col] for col in labels]
    print("K-Means Calinski-Harabaz: " +
          str(smc.calinski_harabaz_score(selected_data, labels)))
Example #18
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs  # the samples_generator module was removed
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import calinski_harabaz_score

# 1. Generate the data
X, y = make_blobs(n_samples=1000,
                  n_features=2,
                  centers=[(-1, -1), (0, 0), (1, 1), (2, 2)],
                  cluster_std=[0.4, 0.2, 0.2, 0.2])

print(X)

# 2. Cluster with KMeans
estimator = KMeans(n_clusters=4)
y_pred = estimator.fit_predict(X)

# 3. Visualize the clustering result
plt.figure(figsize=(5, 4), dpi=80)
# Scatter plot to inspect how the points are distributed
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
# Show the figure
plt.show()

# Clustering evaluation
# CH scores observed for n_clusters = 2, 3, 4:
# 2 3096.7473856516135
# 3 2940.6149446783725
# 4 5866.614435267102
# CH index: higher values indicate better clustering.
print('CH index:', calinski_harabaz_score(X, y_pred))
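A final note on the metric name used throughout these examples: scikit-learn 0.20 introduced the corrected spelling calinski_harabasz_score and deprecated calinski_harabaz_score, which later releases removed. A small compatibility import keeps the snippets above working on either side of the rename:

# Prefer the new spelling (harabasz) and fall back to the old one (harabaz)
# on pre-0.20 installs.
try:
    from sklearn.metrics import calinski_harabasz_score as calinski_harabaz_score
except ImportError:
    from sklearn.metrics import calinski_harabaz_score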