Example #1
def _cluster(params):
    cls = None
    method = sh.getConst('method')
    if method=='kmedoid':
        assert False
        # from kmedoid import kmedsoid
        # cls = kmedoid
    elif method=='dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        assert False, 'FATAL: unknown cluster method'

    ##
    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    nLabels = len(set(labels))

    ##
    sil = None; cal = None
    if 2 <= nLabels <= len(labels) - 1:
        sil = met.silhouette_score(mat,labels,'precomputed')
        cal = met.calinski_harabaz_score(mat,labels)
    perf = dict(silhouette_score=sil,calinski_harabaz_score=cal)

    return (labels,perf)
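A minimal standalone sketch of the same pattern, assuming a synthetic dataset and an older scikit-learn (<=0.22) where the metric is still exposed as calinski_harabaz_score (newer releases rename it calinski_harabasz_score):

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances, silhouette_score, calinski_harabaz_score

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
mat = pairwise_distances(X, metric='euclidean')           # plays the role of sh.getConst('mat')
labels = DBSCAN(eps=1.5, min_samples=5, metric='precomputed').fit_predict(mat)

if 2 <= len(set(labels)) <= len(labels) - 1:
    print(silhouette_score(mat, labels, metric='precomputed'))
    print(calinski_harabaz_score(X, labels))              # this score expects feature vectors rather than distances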
Example #2
def tryOne(label, kDict):
    data = kDict['data']
    kValues = kDict['k']
    if 'main' in kDict:
        main = kDict['main']
        try:
            main()
        except:
            traceback.print_exc()
    for nc in kValues:
        print('%s[%d]:' % (label, nc))
        kmeans = KMeans(n_clusters=nc)
        try:
            kmeans.fit(data)
        except:
            traceback.print_exc()
            continue
        print(kmeans.cluster_centers_)
        labels = kmeans.labels_

        # https://sklearn.org/modules/clustering.html#silhouette-coefficient
        sscore = metrics.silhouette_score(data, labels)
        print('Silhouette Coefficient: %f' % sscore)

        # https://sklearn.org/modules/clustering.html#calinski-harabaz-index
        chindex = metrics.calinski_harabaz_score(data, labels)
        print('Calinski-Harabaz Index: %f' % chindex)
Example #3
def question_two(X):
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn import metrics

    kmean = KMeans(n_clusters=4, random_state=9)
    y_pred = kmean.fit_predict(X)
    
    data = X.copy()
    data[u'标签'] = kmean.labels_
    for i in range(4):
        tmp = data[data[u'标签'] == i]
        print('share of patients in this cluster:', tmp.shape[0] / data.shape[0])
        print('diag_1 (一级诊断) icd9 code range:', min(tmp[u'一级诊断']), max(tmp[u'一级诊断']))
        print('diag_2 (二级诊断) icd9 code range:', min(tmp[u'二级诊断']), max(tmp[u'二级诊断']))
        print('diag_3 (三级诊断) icd9 code range:', min(tmp[u'三级诊断']), max(tmp[u'三级诊断']))
    
    # Plot a 3D scatter of the three diagnosis codes
    x, y, z = list(map(eval, list(X[u'一级诊断']))), list(map(eval, list(X[u'二级诊断']))), list(map(eval, list(X[u'三级诊断'])))
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection
    ax = plt.subplot(111, projection='3d')  # create a 3D axes
    # color the points by predicted cluster
    ax.scatter(x, y, z, c=y_pred, s=0.1)  # draw the data points
    
    ax.set_zlabel('diag_3')  # axis labels
    ax.set_ylabel('diag_2')
    ax.set_xlabel('diag_1')
    plt.show()
    print(metrics.calinski_harabaz_score(X, y_pred))    
    
    
    # Search for the best k by Calinski-Harabasz score
    result = []
    for i in range(3, 100):
        y_pred = KMeans(n_clusters=i, random_state=9).fit_predict(X)
        tmp = metrics.calinski_harabaz_score(X, y_pred)
        result.append(tmp)
        print("Calinski-Harabasz Score", tmp)

    plt.scatter(list(range(3, 100)), result, alpha=0.5)
    plt.show()
Example #4
 def metrics(pred, labels=None, embeddings=None):
     from sklearn import metrics
     print('Estimated number of clusters: {}'.format(len(np.unique(pred))))
     if labels is not None:
         print("Homogeneity: {:0.3f}".format(
             metrics.homogeneity_score(labels, pred)))
         print("Completeness: {:0.3f}".format(
             metrics.completeness_score(labels, pred)))
         print("V-measure: {:0.3f}".format(
             metrics.v_measure_score(labels, pred)))
         print("Adjusted Rand Index: {:0.3f}".format(
             metrics.adjusted_rand_score(labels, pred)))
         print("Adjusted Mutual Information: {:0.3f}".format(
             metrics.adjusted_mutual_info_score(labels, pred)))
     if embeddings is not None:
         print("Silhouette Coefficient: {:0.3f}".format(
             metrics.silhouette_score(embeddings, pred)))
         print("Calinski-Harabaz Index: {:0.3f}".format(
             metrics.calinski_harabaz_score(embeddings, pred)))
Example #5
def cluster_range(X, clusterer, k_start, k_stop):
    """Calculate the internal validation criteria for different number of
    clusters
    
    Parameters
    ----------
    X : array
        Design matrix with each row corresponding to a point
    clusterer : class
        Contains the random state for replicability
    k_start, k_stop : int
        Starting and last number of clusters to perform the internal 
        validation on.

    Returns
    -------
    cluster : dict
        dictionary containing the internal validation values for different k's
    """
    # YOUR CODE HERE
    ys = []
    inertias = []
    chs = []
    scs = []
    iidrs = []

    for k in range(k_start, k_stop + 1):
        print('Executing k=', k)
        kmeans_X = KMeans(n_clusters=k, random_state=clusterer.random_state)
        y_predict_X = kmeans_X.fit_predict(X)
        ys.append(y_predict_X)
        inertias.append(kmeans_X.inertia_)
        iidrs.append(intra_to_inter(X, y_predict_X, euclidean, 50))
        chs.append(calinski_harabaz_score(X, y_predict_X))
        scs.append(silhouette_score(X, y_predict_X))
        clear_output()
    cluster = {}
    cluster['ys'] = ys
    cluster['inertias'] = inertias
    cluster['chs'] = chs
    cluster['iidrs'] = iidrs
    cluster['scs'] = scs
    return cluster
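A hedged usage sketch for cluster_range; it assumes the notebook's own intra_to_inter helper, scipy's euclidean, and IPython's clear_output are importable alongside KMeans and the two sklearn scores:

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=4, random_state=1337)
res = cluster_range(X, KMeans(random_state=1337), k_start=2, k_stop=6)
print(res['chs'])   # Calinski-Harabasz scores for k = 2..6
print(res['scs'])   # silhouette scores for k = 2..6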
Example #6
def cluster_score(X, y, true_labels=None, _metric="silhouette_score", **kwds):
    """
    A score to compare clusterings.
    If you know the true labels, use fowlkes_mallows_score. Otherwise use silhouette_score.
    """
    # Unsupervised metrics
    if _metric == 'calinski_harabaz_score':
        return metrics.calinski_harabaz_score(X, y)
    elif _metric == 'silhouette_score':
        return metrics.silhouette_score(X, y, **kwds)

    # Supervised metrics
    elif _metric == 'adjusted_rand_score':
        return metrics.adjusted_rand_score(true_labels, y)
    elif _metric == 'fowlkes_mallows_score':
        return metrics.fowlkes_mallows_score(true_labels, y)

    else:
        raise ValueError('Unimplemented metric')
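A hedged usage sketch on toy data (it assumes `from sklearn import metrics` at module level, as the function body requires):

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, true_labels = make_blobs(n_samples=150, centers=3, random_state=0)
y = KMeans(n_clusters=3, random_state=0).fit_predict(X)

print(cluster_score(X, y))                                            # silhouette (default)
print(cluster_score(X, y, _metric='calinski_harabaz_score'))
print(cluster_score(X, y, true_labels, _metric='fowlkes_mallows_score'))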
Example #7
def calculoMedidas(subset, predictions):
    normalized_set = preprocessing.normalize(subset, norm='l2')

    meditions = []

    for pred in predictions:
        # Compute Calinski-Harabasz
        metric_CH = metrics.calinski_harabaz_score(normalized_set, pred[1])
        # Compute Silhouette on 50% of the population
        metric_SC = metrics.silhouette_score(normalized_set,
                                             pred[1],
                                             metric='euclidean',
                                             sample_size=floor(0.5 *
                                                               len(subset)),
                                             random_state=123456)

        meditions.append((pred[0], metric_CH, metric_SC))

    return meditions
Example #8
def Grid_Birch(param_grid, features):
    '''
    Grid search over BIRCH clustering parameters
    Parameters:
        param_grid:     parameter grid (threshold, branching_factor)
        features:       feature matrix
    '''
    for threshold, branching_factor in zip(param_grid['threshold'],
                                           param_grid['branching_factor']):
        clf = Birch(n_clusters=4,
                    threshold=threshold,
                    branching_factor=branching_factor)
        clf.fit(features)
        predicted = clf.predict(features)

        plot_scatter(features, predicted)
        print('threshold:', threshold, 'branching_factor:', branching_factor)
        print('metrics.calinski_harabaz_score:',
              metrics.calinski_harabaz_score(features, predicted))
Example #9
def k_means_equi(vectors, prod=4, min_size=100, max_size=500):
    MIN_SIZE = min_size
    MAX_SIZE = max_size
    clusters = np.zeros(vectors.shape[0], dtype=int)
    counter = pd.Series(collections.Counter(clusters))
    last_max = np.inf
    cluster_centers = dict()

    while max(counter.values) > MAX_SIZE and last_max != max(counter.values):
        last_max = max(counter.values)
        last_n_cluster = max(clusters)

        i = counter[counter > MAX_SIZE].sort_values(ascending=False).keys()[0]
        km = KMeans(prod * counter[i] // MAX_SIZE, init="random")

        reduced_vectors = vectors[list(np.where(clusters == i)[0])]
        reduced_clusters = km.fit_predict(reduced_vectors)
        reduced_counter = pd.Series(collections.Counter(reduced_clusters))
        while (min(reduced_counter.values) < MIN_SIZE):
            j = reduced_counter[
                reduced_counter < MIN_SIZE].sort_values().keys()[0]
            clusters_dist = pd.Series([
                np.linalg.norm(km.cluster_centers_[j] - km.cluster_centers_[k])
                for k in reduced_counter.index
            ],
                                      index=[k for k in reduced_counter.index])
            clusters_dist[j] = np.inf
            k = clusters_dist.sort_values().keys()[0]
            km.cluster_centers_[k] = (
                reduced_counter[k] * km.cluster_centers_[k] +
                reduced_counter[j] * km.cluster_centers_[j]) / (
                    reduced_counter[k] + reduced_counter[j])
            km.cluster_centers_[j] = np.inf
            np.place(reduced_clusters, reduced_clusters == j, k)
            reduced_counter = pd.Series(collections.Counter(reduced_clusters))
        clusters[list(np.where(
            clusters == i)[0])] = last_n_cluster + reduced_clusters + 1
        counter = pd.Series(collections.Counter(clusters))
        for i in np.unique(reduced_clusters):
            cluster_centers[last_n_cluster + i] = km.cluster_centers_[i]

    return (clusters, cluster_centers, counter,
            calinski_harabaz_score(vectors, clusters))
Example #10
 def cluster(self, eps, min_samples):
     if self.X is None:
         raise ValueError
     db = DBSCAN(eps=eps, min_samples=min_samples,
                 n_jobs=-1).fit(self.X, sample_weight=self.weight_array)
     # The DBSCAN algorithm views clusters as areas of high density separated by areas of low density.
     # Due to this rather generic view, clusters found by DBSCAN can be any shape,
     # as opposed to k-means which assumes that clusters are convex shaped.
     # The central component to the DBSCAN is the concept of core samples, which are samples that are in areas
     # of high density. A cluster is therefore a set of core samples,
     # each close to each other (measured by some distance measure) and a set of non-core samples that are close
     # to a core sample (but are not themselves core samples).
     # There are two parameters to the algorithm, min_samples and eps, which define formally what we mean when we say dense.
     # Higher min_samples or lower eps indicate higher density necessary to form a cluster.
     # Cite:
     # “A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise”
     # Ester, M., H. P. Kriegel, J. Sander, and X. Xu,
     # In Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining,
     # Portland, OR, AAAI Press, pp. 226–231. 1996
     self.core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
     self.core_samples_mask[db.core_sample_indices_] = True
     self.labels = db.labels_
     self.n_clusters = len(set(
         self.labels)) - (1 if -1 in self.labels else 0)
     try:
         self.si_score = silhouette_score(self.X, self.labels)
     # The Silhouette Coefficient is calculated using the mean intra-cluster distance (a)
     # and the mean nearest-cluster distance (b) for each sample.
     # The Silhouette Coefficient for a sample is (b - a) / max(a, b).
     # To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of.
     # Note that Silhouette Coefficient is only defined if number of labels is 2 <= n_labels <= n_samples - 1.
     # Cite:
     # Peter J. Rousseeuw (1987). “Silhouettes: a Graphical Aid to the Interpretation and Validation of Cluster Analysis”.
     # Computational and Applied Mathematics 20: 53-65.
     except ValueError:
         self.si_score = -1
     try:
         self.calinski = calinski_harabaz_score(self.X, self.labels)
         # The score is defined as ratio between the within-cluster dispersion and the between-cluster dispersion.
         # Cite:
         # T.Calinski and J.Harabasz, 1974. “A dendrite method for cluster analysis”.Communications in Statistics
     except ValueError:
         self.calinski = 0
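The same weighted-DBSCAN-then-score pattern as a standalone hedged sketch (again assuming scikit-learn <=0.22 for the calinski_harabaz_score name):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, calinski_harabaz_score

X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
weights = np.ones(len(X))                                  # stand-in for self.weight_array
db = DBSCAN(eps=0.5, min_samples=10, n_jobs=-1).fit(X, sample_weight=weights)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
try:
    si_score = silhouette_score(X, labels)
    calinski = calinski_harabaz_score(X, labels)
except ValueError:                                         # fewer than two clusters found
    si_score, calinski = -1, 0
print(n_clusters, si_score, calinski)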
Example #11
def cluster_spectralclustering(n_clusters):
    """
    SpectralClustering聚类算法
    :param n_clusters:质心数量
    :return:
    """
    data = get_data("../data/feature_vector_pca.csv")
    spectral = SpectralClustering(n_clusters=n_clusters, gamma=0.01)
    clusters = spectral.fit_predict(data)
    # Iterate over the hyperparameters to find the best setting
    # for index, gamma in enumerate((0.01, 0.1, 1, 10)):
    #     for index2, k in enumerate((15, 20, 25, 30)):
    #         clusters = SpectralClustering(n_clusters=k, gamma=gamma).fit_predict(data)
    #         print("Calinski-Harabasz Score with gamma=", gamma, "n_clusters=", k, "score:",
    #               metrics.calinski_harabaz_score(data, clusters))
    print("Calinski-Harabasz Score",
          metrics.calinski_harabaz_score(data, clusters))
    print("每个样本点所属类别索引", clusters)
    data_labeled_to_csv(clusters, "data/data_labeld_birch.csv")
Example #12
def BestClusteringGMM(objectives):
    """

    :type objectives: dataframe of objectives
    """
    X = objectives.values
    num_clusters = np.arange(3, 7)
    models = [
        GMM(n_components=n, covariance_type='full').fit(X)
        for n in num_clusters
    ]
    scores = [metrics.calinski_harabaz_score(X, m.predict(X)) for m in models]
    # scores2 = [metrics.cluster.silhouette_score(X,m.predict(X)) for m in models ]
    max_score = np.max(scores)
    max_index = scores.index(max_score)
    best_num_cluster = num_clusters[max_index]
    print("Best number of clusters is {} with the calinski_harabaz_score={}".
          format(best_num_cluster, round(max_score, 3)))
    return best_num_cluster
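A hedged usage sketch; GMM is assumed to be scikit-learn's GaussianMixture imported under that alias, as the n_components/covariance_type arguments suggest, with numpy and sklearn.metrics imported as the function expects:

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.mixture import GaussianMixture as GMM

X, _ = make_blobs(n_samples=400, centers=5, random_state=0)
objectives = pd.DataFrame(X, columns=['obj1', 'obj2'])
best_k = BestClusteringGMM(objectives)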
Example #13
def task1():
    X = pd.read_csv("pluton.csv")

    for numIterations in [1, 2]:

        kmeans = KMeans(n_clusters=3, random_state=0, max_iter=numIterations).fit(X)

        colormap = plt.get_cmap('hsv')
        norm = matplotlib.colors.Normalize(vmin=0, vmax=3)
        axes = pd.plotting.scatter_matrix(X, color=colormap(norm(kmeans.labels_)))
        plt.suptitle(f'max_iter = {numIterations}')

        labels = kmeans.labels_
        print(f'max_iter = {numIterations}, n_iter = {kmeans.n_iter_}')
        print('Silhouette-Score = ', metrics.silhouette_score(X, labels, metric='euclidean'))
        print('Calinski-Harabaz Index = ', metrics.calinski_harabaz_score(X, labels))
        print('Davies-Bouldin Index', metrics.davies_bouldin_score(X, labels), end='\n\n')

    matplotlib.pyplot.show()
Example #14
def cluster(values, max_cluster):
    algorithm = "auto"
    silhouette_scores = []
    calinski_harabaz_scores = []
    best_silhouette_avg = 0.
    cluster_centers = []
    cluster_labels = []
    for n_clusters in range(2, max_cluster, 1):
        print(n_clusters)
        kmeans = KMeans(n_clusters=n_clusters,
                        random_state=10,
                        algorithm=algorithm,
                        n_init=30).fit(values)
        labels_temp = kmeans.labels_
        silhouette_avg = metrics.silhouette_score(values, labels_temp)

        silhouette_scores.append(silhouette_avg)

        calinski_harabaz_score = metrics.calinski_harabaz_score(
            values, labels_temp)
        calinski_harabaz_scores.append(calinski_harabaz_score)

        nb_values_over = 0
        print("cluster:", n_clusters, "silhouette score:", silhouette_avg,
              "calinski_harabaz:", calinski_harabaz_score)
        if best_silhouette_avg < silhouette_avg:
            best_silhouette_avg = silhouette_avg
            cluster_centers = kmeans.cluster_centers_
            cluster_labels = labels_temp

        silhouette_samples_values = metrics.silhouette_samples(
            values, labels_temp)
        for cluster in range(n_clusters):
            cluster_values = silhouette_samples_values[cluster_labels ==
                                                       cluster]
            nb_values_over = nb_values_over + len(
                cluster_values[np.where(cluster_values > silhouette_avg)])
        print("Number of values over average:", nb_values_over,
              "({:04.1f}%)".format(nb_values_over / len(values) * 100))

    return cluster_centers.shape[
        0], cluster_centers, cluster_labels, silhouette_scores, calinski_harabaz_scores
Example #15
def cluster_range(X, clusterer, k_start, k_stop, actual=None):

    chs = []
    iidrs = []
    inertias = []
    scs = []
    ys = []
    amis = []
    ars = []
    ps = []

    for i in range(k_start, k_stop + 1):

        clusterer2 = clusterer
        clusterer2.n_clusters = i
        ys.append(clusterer2.fit_predict(X))

        iidrs.append(intra_to_inter(X, ys[-1], euclidean, 50))
        chs.append(calinski_harabaz_score(X, ys[-1]))
        inertias.append(clusterer2.inertia_)
        scs.append(silhouette_score(X, ys[-1]))

    keys = ['ys', 'iidrs', 'chs', 'inertias', 'scs']
    values = [ys, iidrs, chs, inertias, scs]

    if actual is not None:

        for i in ys:
            ps.append(purity(actual, i))
            ars.append(adjusted_rand_score(actual, i))
            amis.append(adjusted_mutual_info_score(actual, i))

        keys.extend(['ps', 'ars', 'amis'])
        values.extend([ps, ars, amis])

    return dict(zip(keys, values))
Example #16
def cluster(method, dis_matrix, n, Mode, ef, p):
    if (method == "KMeans"):
        M = "KM"
        #print("KMeans n = "+str(n))
        from sklearn.cluster import KMeans
        #eigen_values, eigen_vectors = np.linalg.eigh(dis_matrix)
        clusters_KMeans = KMeans(n_clusters=n,
                                 init='k-means++').fit(dis_matrix)
        labels = clusters_KMeans.labels_
        output_file = open(
            "C:/tmp2/result_cluster/" + Mode + "/" + str(p) +
            "_out_cluster_KMeans_" + str(n) + ".txt", "w")
        for item in labels:
            output_file.write("%s\n" % item)
        output_file.close()

    elif (method == "Spectral"):
        M = "SP"
        #print("SpectralClustering n = "+str(n))
        from sklearn.cluster import SpectralClustering
        cl = SpectralClustering(n_clusters=n, affinity='precomputed')
        clusters_SpectralClustering = cl.fit(dis_matrix)
        labels = clusters_SpectralClustering.labels_
        output_file = open(
            "C:/tmp2/result_cluster/" + Mode + "/" + str(p) +
            "_out_cluster_SpectralClustering_" + str(n) + ".txt", "w")
        for item in labels:
            output_file.write("%s\n" % item)
        output_file.close()

    else:
        print("No method found")
        return -1

    from sklearn import metrics
    ef.write("%d%%\t%s\tn=%d\t%.7f\t%.7f\n" %
             (p, M, n,
              metrics.silhouette_score(dis_matrix, labels, metric='euclidean'),
              metrics.calinski_harabaz_score(dis_matrix, labels)))
    #ef.write("%d0%% %s n=%d C-Score: %.7f\n" % (p, M, n, metrics.calinski_harabaz_score(dis_matrix, labels)))

    return 0
Example #17
def plot_variance_ratio(adata,
                        res_list,
                        X='latent',
                        out='./clustering/',
                        prefix='',
                        rep='latent',
                        save=True):
    """
    res_list (list of float): list of resolution
    X (str): representation or layer to use {'latent', 'X', 'raw'}
    """
    if 'X_{}'.format(X) in adata.obsm:
        data = adata.obsm['X_{}'.format(X)]
    elif X == 'X':
        data = adata.X
    else:
        data = adata.layers[X]

    fig, ax = plt.subplots()
    for method in ['Louvain', 'Leiden']:
        keys = []
        resolution = []
        for res in res_list:
            key = prefix + '{}Res{}_{}'.format(method, res, rep)
            # include resolution with more than one cluster
            if len(adata.obs[key].cat.categories) > 1:
                keys.append(key)
                resolution.append(res)
        scores = [
            calinski_harabaz_score(data, adata.obs[key].values) for key in keys
        ]
        ax.plot(resolution, scores, label=method)
    ax.legend()
    ax.set_ylabel('Variance Ratio Criterion')
    ax.set_xlabel('Resolution')
    ax.set_title(X)
    if save:
        fig.savefig(os.path.join(out, f'variance_ratio_criterion_{X}.png'))
        plt.close()
    else:
        plt.show()
    return
Example #18
def kmeans(kvalue):
    dataMat = []
    fr = open("E:\\code\\python\\data\\dshl_kmeans.txt")
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # convert every element to float
        dataMat.append(fltLine)
    km = KMeans(n_clusters=int(kvalue))  # initialize the model
    km.fit(dataMat)  # fit
    km_preds = km.predict(dataMat).tolist()  # predict
    centers = km.cluster_centers_.tolist()  # cluster centroids
    result_list = []
    for i, centerPoint in enumerate(centers):
        result = {'kIndex': i + 1, 'centerPoint': centerPoint, 'dataCount': 0}
        result_list.append(result)
    for km_pred in km_preds:
        result_list[km_pred][
            'dataCount'] = result_list[km_pred]['dataCount'] + 1
    ch_value = metrics.calinski_harabaz_score(dataMat, km_preds)
    return jsonify({'resultList': result_list, 'chValue': ch_value})
Example #19
def test_clusterer_calinskiHarabaz(XY):
    X = XY[0]
    Y = XY[1]

    # "_args": [{  "type": "numpy.ndarray",  "dtype": "float32"},
    #           {"type": "numpy.ndarray", "dtype": "int32"}],
    # "_return": [{"type": "float"}]

    # we only want to test cosine metric for this example, but it could be a parameter in other cases

    from sklearn import metrics

    print('test_clusterer_calinskiHarabaz')
    min_score = 0
    max_score = 200

    calinski_harabaz = metrics.calinski_harabaz_score(X, Y)

    calinski_harabaz = (calinski_harabaz - min_score) / (max_score - min_score)
    return calinski_harabaz
Example #20
def evaluate_clusters(X, ids, labels_file):

    print("Evaluating: "+ labels_file)
    clusters, label_list = load_clusters(labels_file, ids)

    # run evaluations
    # 1. Silhouette Coefficient
    sc = 0.0
    for i in range(100):
        sc += silhouette_score(X, clusters, sample_size=1000)
    sc /= 100.0

    # 2. Variance Ratio Criterion
    vrc = calinski_harabaz_score(X, clusters)

    # 3. Davies-Bouldin Index
    dbs = davies_bouldin_score(X, clusters)

    print("Silhouette, Calinski-Harabaz, Davies-Bouldin")
    print([sc,vrc,dbs])
Example #21
  def _cluster_plot(self, embedding, labels):
    silhouette = silhouette_score(embedding.squeeze(), labels)
    chs = calinski_harabaz_score(embedding.squeeze(), labels)
    dbs = davies_bouldin_score(embedding.squeeze(), labels)

    n_labels = len(set(labels))

    self.writer.add_scalar(f"silhouette {n_labels}", silhouette, self.step_id)
    self.writer.add_scalar(f"chs {n_labels}", chs, self.step_id)
    self.writer.add_scalar(f"dbs {n_labels}", dbs, self.step_id)

    indices = list(range(len(labels)))
    random.shuffle(indices)
    samples_to_plot = indices[:1000]
    sample_labels = [labels[idx] for idx in samples_to_plot]
    sample_embedding = embedding[samples_to_plot]
    pca = PCA(2).fit_transform(sample_embedding.squeeze())
    fig, ax = plt.subplots()
    ax.scatter(pca[:, 0], pca[:, 1], c=sample_labels, cmap="tab20")
    self.writer.add_figure(f"clustering {n_labels}", fig, self.step_id)
Example #22
def clustering_in_eval(labels, X, mode='sil'):
    """
    :param mode:
           sil: Silhouette Coefficient
           cal: Calinski-Harabaz
    :param labels: clustering labels
    :param X: M x N document-feature matrix
    :return:
    """
    if np.unique(labels).size == 1:
        print('labels: ', labels)
        return None
    if mode == 'sil':
        value = metrics.silhouette_score(X, labels, metric='euclidean')
        print("Silhouette Coefficient: ", value)
    elif mode == 'cal':
        value = metrics.calinski_harabaz_score(X, labels)
        print("Calinski-Harabaz: ", value)

    return value
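A hedged usage sketch (the function assumes numpy and sklearn.metrics are already imported as np and metrics):

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=42)
labels = KMeans(n_clusters=3, random_state=42).fit_predict(X)
clustering_in_eval(labels, X, mode='sil')
clustering_in_eval(labels, X, mode='cal')
clustering_in_eval(np.zeros(len(X), dtype=int), X)   # single label -> prints and returns None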
Example #23
def cluster_score(X, y_pred):
    """计算聚类的效果 越大越好
    [in]  X: array-like, shape(n_samples, n_features), 数据矩阵
          y_pred: array-like, shape(n_samples,), 聚类结果
    [out] score: double, 聚类效果得分
    """
    try:
        if not isinstance(X, np.ndarray):
            X = X.toarray()
        score = calinski_harabaz_score(X, y_pred)
        logging.info("Calinski-Harabasz Score : %.4f" % score)
    except Exception as e:
        score = -1.0
        logging.info("calculate score failed.")
        logging.info(e)
    return score
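A hedged sketch showing why the toarray() branch matters: the same call also works for a scipy sparse matrix (assuming numpy, logging, and calinski_harabaz_score are imported as the function expects):

import logging
import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabaz_score

logging.basicConfig(level=logging.INFO)
X_sparse = sparse.random(200, 20, density=0.1, format='csr', random_state=0)
y_pred = KMeans(n_clusters=3, random_state=0).fit_predict(X_sparse)
print(cluster_score(X_sparse, y_pred))    # converted to dense internally before scoring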
Example #24
def clustering_centers_D():
    model = KMeans(n_clusters=5,
                   init="k-means++",
                   n_init=228,
                   precompute_distances=True,
                   random_state=None,
                   max_iter=300).fit(audio_array_scaled)
    #    labels = model.fit_predict(audio_array_scaled)

    centers = np.array(model.cluster_centers_)
    for r in range(len(centers)):
        print({"The center of cluster " + str(r): centers[r]})

    print("The value of calinski_harabaz_score is = " +
          str(metrics.calinski_harabaz_score(audio_array_scaled, labels)))
Example #25
def dbscan_clustering(data_frame: pd.core.frame.DataFrame):
    print("DBSCAN clustering")
    x = data_frame.values
    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(x)
    normalized_data_frame = pd.DataFrame(x_scaled)

    DBSCAN_clustering = DBSCAN()

    cluster_labels = DBSCAN_clustering.fit_predict(normalized_data_frame)
    calinski_harabaz_avg = calinski_harabaz_score(normalized_data_frame,
                                                  cluster_labels)
    print("The average calinski_harabaz_score is :", calinski_harabaz_avg)

    pca2D = decomposition.PCA(2)
    # Turn the data into two columns with PCA
    plot_columns = pca2D.fit_transform(normalized_data_frame)
    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=cluster_labels)
    plt.show()
Example #26
def MyKmeans10(Im, ImageType, MaxClusters):

    if ImageType == 'Hyper':
        r, c = Im.shape[0:2]
        Im = np.reshape(Im, (r * c, Im.shape[2]))
        pca = PCA(n_components=0.95)
        data = pca.fit_transform(Im)
    # pre_processing
    if ImageType == 'RGB':
        r, c = Im.shape[0:2]
        data = np.zeros((r * c, 3))
        #    r,c,p = self._weights.shape[0:3]
        #    weights = np.zeros((r*c,p))
        n = -1
        for i in range(r):
            for j in range(c):
                n = n + 1
                data[n, :] = Im[i, j, :]

    metric = []
    for numclust in range(2, MaxClusters + 1):
        kmeans = KMeans(n_clusters=numclust)
        kmeans.fit(data)
        labels = kmeans.labels_
        metric.append(metrics.calinski_harabaz_score(data, labels))
    metric = np.array(metric)
    index = np.argwhere(metric == max(metric))
    index = index + 2
    if len(index) > 1:
        index = min(index)
    index = int(index)
    kmeans1 = KMeans(n_clusters=index)
    kmeans1.fit(data)
    labels1 = kmeans1.labels_ + 1
    ClusterIm = np.zeros((r, c))
    n2 = -1
    for i2 in range(r):
        for j2 in range(c):
            n2 = n2 + 1
            ClusterIm[i2, j2] = labels1[n2]
    return ClusterIm
Example #27
def cluster_embeddings(word_vectors, cluster_num, batch_size, init, max_iter, max_no_improvement, verbose):

    """Cluster word embedding vectors, and evaluate the variance"""

    kmeans_handler = MiniBatchKMeans(n_clusters=cluster_num, batch_size=batch_size, init=init, max_iter=max_iter, max_no_improvement=max_no_improvement, verbose=verbose)

    # get the cluster indices of each embedding
    indices = kmeans_handler.fit_predict(word_vectors.vectors)

    # create a dictionary that maps each word with its cluster number
    word_cluster_map = dict(zip(word_vectors.index2word, indices))

    # calculate the ratio of between-cluster to within-cluster dispersion (the higher, the better)
    variance_ratio = calinski_harabaz_score(word_vectors.vectors, indices)

    # get the lists of words forming each cluster
    word_clusters = [list() for _ in range(cluster_num)]
    for word in word_cluster_map.keys():
        word_clusters[word_cluster_map[word]].append(word)

    return word_clusters, variance_ratio
Example #28
def kmeans_blocks(x, box_num=3):
    '''
        Binning by k-means clustering
    :param x: list of values, dtype=[]
    :param box_num: number of bins
    :return: bin boundaries
    '''
    len_clocks = min(box_num, len(x), len(set(x)) + 1)
    if len_clocks <= 1:
        return [-np.inf, np.inf]
    X = np.array(x).reshape([-1, 1])
    km = KMeans(n_clusters=len_clocks - 1, random_state=666)
    y_pre = km.fit_predict(X)
    # Calinski-Harabasz Index of the clustering: the higher the score, the better the clustering
    if km.cluster_centers_.size > 1:  # more than one cluster
        kmeans_score = calinski_harabaz_score(X, y_pre)
        print("Clustering score: {}".format(kmeans_score))
    tb = km.cluster_centers_.reshape([-1])
    tb.sort()
    blocks = np.concatenate([[-np.inf], tb, [np.inf]])
    return blocks.tolist()
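A hedged usage sketch (assuming numpy, KMeans, and calinski_harabaz_score are imported as in the snippet's original module), binning a small list with the returned boundaries:

import numpy as np

x = [1, 1, 2, 8, 9, 10, 25, 26, 40]
edges = kmeans_blocks(x, box_num=3)        # [-inf, centre_1, centre_2, inf]
bins = np.digitize(x, edges[1:-1])         # bin index for every value
print(edges, bins)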
Example #29
File: fitness.py  Project: ZaidUsm/EvoNP
def CHI(points, labelsPred):
    """    
    Calinski and Harabasz Index measure
    
    Parameters
    ---------- 
    points : ndarray
        The data points to evaluate
    labelsPred : list
        A list of predicted labels for the points, for each chromosome

    Returns
    -------
    float
        fitness: The fitness value (1 / Calinski-Harabasz score)
    """
    global fitnessFunc
    fitnessFunc = "CH"
    ch = metrics.calinski_harabaz_score(points, labelsPred)
    fitness = 1 / ch
    return fitness
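A hedged usage sketch (assuming sklearn's metrics module is imported as in the original fitness.py); note that the fitness inverts the index, so lower fitness means a better clustering:

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

points, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labelsPred = KMeans(n_clusters=3, random_state=0).fit_predict(points)
print(CHI(points, labelsPred))     # 1 / calinski_harabaz_score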
Example #30
def get_hdbsan_best_eps_minsamples(df_features, max_num_members):

    calinski_score_dict = {}
    for i in range(2, max_num_members):
        min_samples = i
        min_cluster_size = min_samples
        estimator = get_model('hdbscan', min_samples,
                              df_features)  # build clustering estimator
        estimator.fit(df_features)
        labels = estimator.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        try:
            calinski_score = metrics.calinski_harabaz_score(
                df_features, labels)
        except:
            calinski_score = 0
        print('min_cluster_size', min_cluster_size, 'minsample', min_samples,
              'n_cluster: ', n_clusters_, 'calinski_score: ', calinski_score)
        calinski_score_dict[str(int(min_samples))] = calinski_score
    return get_highest_score(calinski_score_dict)
Example #31
def kmeans(normalizedDataFrame, df_new):

    ### set kmeans for k=5
    k = 5
    kmeans = KMeans(n_clusters=k)

    # Cluster and put every row into a cluster
    cluster_labels = kmeans.fit_predict(
        normalizedDataFrame)  # 100 numbers which divided to k groups

    # Use the calinski_harabaz score to assess the clustering quality
    calinski_avg = metrics.calinski_harabaz_score(normalizedDataFrame,
                                                  cluster_labels)
    print("For n_clusters = ", str(k),
          " the average calinski_harabaz_score is :", calinski_avg)

    # PCA
    # Let's convert our high dimensional data to 2 dimensions
    pca2D = decomposition.PCA(2)

    # Turn the data into two columns with PCA
    plot_columns = pca2D.fit_transform(normalizedDataFrame)

    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=cluster_labels)
    plt.title("For kmeans method,PCA for k=" + str(k))
    plt.savefig("For kmeans method, PCA for k=" + str(k))
    plt.show()
    plt.clf()
    plt.close()

    ## Get labels for each sector
    print("\n**************Kmeans**************\n")
    df_new['labels'] = cluster_labels
    dfa = pd.concat([df_new['sector'], df_new['labels']], axis=1)
    for i in range(0, 5):
        dfa0 = dfa.loc[dfa['labels'] == i, :]

        print("For cluster", i)
        print(dfa0['sector'].value_counts())
Example #32
def ward(normalizedDataFrame, df_new):

    Z = linkage(normalizedDataFrame, method='ward', metric='euclidean')

    # we set k=5
    k = 5
    labels_1 = fcluster(Z, t=k, criterion='maxclust')

    # Use the calinski_harabaz score to assess the clustering quality
    calinski_avg = metrics.calinski_harabaz_score(normalizedDataFrame,
                                                  labels_1)
    print("For n_clusters = ", str(k),
          " the average calinski_harabaz_score is :", calinski_avg)

    #####
    # PCA
    # Let's convert our high dimensional data to 2 dimensions
    # using PCA
    pca2D = decomposition.PCA(2)

    # Turn the data into two columns with PCA
    plot_columns = pca2D.fit_transform(normalizedDataFrame)

    # Plot using a scatter plot and shade by cluster label
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels_1)
    plt.title("For hierarchical method,PCA for k=" + str(k))
    plt.savefig("For hierarchical method,PCA for k=" + str(k))
    plt.show()
    plt.clf()
    plt.close()

    ## Get labels for each sector
    print("\n**************Hierarchical**************\n")
    df_new['labels'] = labels_1
    dfa = pd.concat([df_new['sector'], df_new['labels']], axis=1)
    for i in range(1, 6):
        dfa0 = dfa.loc[dfa['labels'] == i, :]

        print("For cluster", i)
        print(dfa0['sector'].value_counts())
Example #33
    def find_optimal_K(training_data, min_number_of_clusters,
                       max_number_of_clusters):
        max_calinski_harabaz = {'num_clusters': 0, 'score': -1}

        for i in range(min_number_of_clusters, max_number_of_clusters + 1):
            cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(training_data.T,
                                                             i,
                                                             5,
                                                             error=0.005,
                                                             maxiter=1000,
                                                             init=None,
                                                             seed=1)

            cluster_membership = np.argmax(u, axis=0)

            score = calinski_harabaz_score(training_data, cluster_membership)

            if score >= max_calinski_harabaz['score']:
                max_calinski_harabaz['num_clusters'] = i
                max_calinski_harabaz['score'] = score

        return max_calinski_harabaz['num_clusters']
Example #34
def main():
    if len(sys.argv)!=4:
        print "Usage: python kmedoid.py [e|ic|gpcr|nr] [dataDir] [outputDir]"
        return

    dataset = sys.argv[1]
    dataPath = sys.argv[2]
    outPath = sys.argv[3]

    # Load file
    print "Preparing data"
    _,comList,proList = yam.loadComProConnMat(dataset,dataPath+"/Adjacency")
    kernel = yam.loadKernel(dataset,dataPath)

    nComp = len(comList)
    nProtein = len(proList)

    comSimMat = np.zeros((nComp,nComp), dtype=float)
    proSimMat = np.zeros((nProtein,nProtein), dtype=float)

    for row,i in enumerate(comList):
        for col,j in enumerate(comList):
            comSimMat[row][col] = kernel[(i,j)]

    for row,i in enumerate(proList):
        for col,j in enumerate(proList):
            proSimMat[row][col] = kernel[(i,j)]

    # convert similarity matrix to distance Matrix
    proDisMat = simToDis(proSimMat)
    comDisMat = simToDis(comSimMat)

    print "Clustering"
    proMedoid,proClust = kMedoids(len(proList)/2, proDisMat)
    comMedoid,comClust = kMedoids(len(comList)/2, comDisMat)
    # Take each label for each sample
    comLabelList = np.zeros((nComp))
    proLabelList = np.zeros((nProtein))
    proMetaClust = dict()
    comMetaClust = dict()

    for lab in proClust:
        meta = []
        for idx in proClust[lab]:
            meta.append(proList[idx])
            proLabelList[idx] = lab
        proMetaClust[lab] = meta


    for lab in comClust:
        meta = []
        for idx in comClust[lab]:
            meta.append(comList[idx])
            comLabelList[idx] = lab
        comMetaClust[lab] = meta

    print "Evaluation"

    comSilhouette = met.silhouette_score(comDisMat,comLabelList,metric="precomputed")
    proSilhouette = met.silhouette_score(proDisMat,proLabelList,metric="precomputed")

    comCalinskiHarabaz = met.calinski_harabaz_score(comDisMat,comLabelList)
    proCalinskiHarabaz = met.calinski_harabaz_score(proDisMat,proLabelList)

    print ("Silhouette score :\nCompound cluster = "+str(comSilhouette)+
            ",Protein cluster = "+str(proSilhouette))

    print ("Calinski Harabaz score :\nCompound cluster = "+str(comCalinskiHarabaz)+
            ", Protein cluster = "+str(proCalinskiHarabaz))

    print "Writing Output"

    perf = {'silhouette_score_':{'compound':comSilhouette,'protein':proSilhouette},
            'calinski_harabaz_score':{'compound':comCalinskiHarabaz,'protein':
            proCalinskiHarabaz}}

    with open(outPath+"/perf_medoid_"+dataset+".json",'w') as f:
        json.dump(perf,f, indent=2, sort_keys=True)

    with open(outPath+"/cluster_medoid_com_"+dataset+".json",'w') as f:
        json.dump(comMetaClust,f, indent=2, sort_keys=True)

    with open(outPath+"/cluster_medoid_pro_"+dataset+".json",'w') as f:
        json.dump(proMetaClust,f, indent=2, sort_keys=True)
Example #35
    def run(self):
        """ Process data """
        data = copy.copy(self.indata['Raster'])
        self.update_vars()

        no_clust = range(self.min_cluster, self.max_cluster+1)

        self.reportback('Cluster analysis started')

# Section to deal with different bands having different null values.
        masktmp = data[0].data.mask
        for i in data:
            masktmp += i.data.mask
        for i, _ in enumerate(data):
            data[i].data.mask = masktmp
        X = np.array([i.data.compressed() for i in data]).T

        if self.radiobutton_sscale.isChecked():
            X = skp.StandardScaler().fit_transform(X)
        elif self.radiobutton_rscale.isChecked():
            X = skp.RobustScaler().fit_transform(X)

        dat_out = []
        for i in self.pbar.iter(no_clust):
            if self.cltype != 'DBSCAN':
                self.reportback('Number of Clusters:'+str(i))
            elif i > no_clust[0]:
                continue

            if self.cltype == 'k-means':
#                cfit = skc.KMeans(n_clusters=i, tol=self.tol,
#                                  max_iter=self.max_iter).fit(X)
                cfit = skc.MiniBatchKMeans(n_clusters=i, tol=self.tol,
                                           max_iter=self.max_iter).fit(X)
            elif self.cltype == 'DBSCAN':
                cfit = skc.DBSCAN(eps=self.eps,
                                  min_samples=self.min_samples).fit(X)

            elif self.cltype == 'Birch':
                cfit = skc.Birch(n_clusters=i, threshold=self.bthres,
                                 branching_factor=self.branchfac).fit(X)

            dat_out.append(Clust())
            for k in data:
                dat_out[-1].input_type.append(k.dataid)

            zonal = np.ma.masked_all(data[0].data.shape)
            alpha = (data[0].data.mask == 0)
            zonal[alpha == 1] = cfit.labels_

            dat_out[-1].data = zonal
            dat_out[-1].nullvalue = zonal.fill_value
            dat_out[-1].no_clusters = i
            dat_out[-1].center = np.zeros([i, len(data)])
            dat_out[-1].center_std = np.zeros([i, len(data)])
            if cfit.labels_.max() > -1:
                dat_out[-1].vrc = skm.calinski_harabaz_score(X, cfit.labels_)

            if self.cltype == 'k-means':
                dat_out[-1].center = np.array(cfit.cluster_centers_)

            self.log = ("Cluster complete" + ' (' + self.cltype+')')

        for i in dat_out:
            i.tlx = data[0].tlx
            i.tly = data[0].tly
            i.xdim = data[0].xdim
            i.ydim = data[0].ydim
            i.nrofbands = 1
            i.dataid = 'Clusters: '+str(i.no_clusters)
            if self.cltype == 'DBSCAN':
                i.dataid = 'Clusters: '+str(int(i.data.max()+1))
            i.rows = data[0].rows
            i.cols = data[0].cols
            i.nullvalue = data[0].nullvalue

        self.reportback("Cluster complete" + ' ('+self.cltype + ' ' + ')')

        for i in dat_out:
            i.data += 1
            i.data = i.data.astype(np.uint8)

        self.outdata['Cluster'] = dat_out
        self.outdata['Raster'] = self.indata['Raster']

        return True
Example #36
def compute_calinski_harabaz_score(estimator, X, y=None):
    return 0.0 if estimator.n_clusters == 1 else calinski_harabaz_score(X, estimator.fit_predict(X))
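The (estimator, X, y) signature lets this one-liner act as a scorer. A hedged sketch of wiring it into GridSearchCV over KMeans (assuming an older scikit-learn where calinski_harabaz_score still exists and is imported at module level):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabaz_score   # needed by the scorer above
from sklearn.model_selection import GridSearchCV

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
search = GridSearchCV(KMeans(random_state=0),
                      param_grid={'n_clusters': list(range(1, 8))},
                      scoring=compute_calinski_harabaz_score,
                      cv=3)
search.fit(X)
print(search.best_params_)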
Example #37
            transf_list = arguments.sc3_transf.split(",")
            print('\nThere are {0} transformations given.'.format(len(transf_list)))
            for ts in transf_list:
                print('- Adding transformation {0}'.format(ts))
                trg_clustering.add_dimred_calculation(partial(sc.transformations, components=max_pca_comp, method=ts))

            trg_clustering.add_intermediate_clustering(partial(sc.intermediate_kmeans_clustering, k=trg_k))
            trg_clustering.set_build_consensus_matrix(sc.build_consensus_matrix)
            trg_clustering.set_consensus_clustering(partial(sc.consensus_clustering, n_components=trg_k))
            trg_clustering.apply()

        # --------------------------------------------------
        # 4. EVALUATE CLUSTER ASSIGNMENT
        # --------------------------------------------------
        print('\nUnsupervised evaluation:')
        accs[0, j, i] = metrics.calinski_harabaz_score(
            trg_clustering.pp_data.T, trg_clustering.cluster_labels)
        accs[1, j, i] = metrics.silhouette_score(
            trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='euclidean')
        accs[2, j, i] = metrics.silhouette_score(
            trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='correlation')
        accs[3, j, i] = metrics.silhouette_score(
            trg_clustering.pp_data.T, trg_clustering.cluster_labels, metric='jaccard')
        print '  -Calinski-Harabaz : ', accs[0, j, i]
        print '  -Silhouette (euc) : ', accs[1, j, i]
        print '  -Silhouette (corr): ', accs[2, j, i]
        print '  -Silhouette (jacc): ', accs[3, j, i]
        if trg_labels is not None:
            print('\nSupervised evaluation:')
            accs[4, j, i] = metrics.adjusted_rand_score(
                trg_labels[trg_clustering.remain_cell_inds], trg_clustering.cluster_labels)
            print '  -ARI: ', accs[4, j, i]
print("2nd component: ", pca.components_[1])


# In[51]:

from sklearn.metrics import silhouette_samples, silhouette_score
Resultk=[0]*9
ResultC=[0]*9
for k in [2,3,4,5,6,7,8,9,10]:
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
    cluster_labels = kmeans.fit_predict(reduced_data[:,:pca_comps])
    #cluster_labels=kmeans.fit(reduced_data[:,:2])
    #silhouette_avg = silhouette_score(reduced_data[:,:pca_comps], cluster_labels)
    #silhouette_avg = silhouette_score(reduced_data[:,:2], cluster_labels)
    #print("For n_clusters =", k, "The average silhouette_score is :", silhouette_avg)
    calinski_harabaz_score_avg = metrics.calinski_harabaz_score(reduced_data[:,:pca_comps], cluster_labels)
    #calinski_harabaz_score_avg = metrics.calinski_harabaz_score(reduced_data[:,:2], cluster_labels)
    print("For n_clusters =", k," the average metrics.calinski_harabaz_score is :", calinski_harabaz_score_avg)
    Resultk[k-2]=k
    ResultC[k-2]=calinski_harabaz_score_avg    
plt.plot(Resultk,ResultC,'r*-.')


# In[56]:

n_clusters = 2
kmeans = KMeans(init='k-means++', n_clusters=2, n_init=10)
pca = PCA().fit(data)
cluster_labels = kmeans.fit_predict(reduced_data[:,:2])
plt.figure()
plt.plot(range(len(pca.explained_variance_ratio_)), np.cumsum(pca.explained_variance_ratio_))
from sklearn import metrics

km2_silc = metrics.silhouette_score(X, km2_labels, metric='euclidean')
km5_silc = metrics.silhouette_score(X, km5_labels, metric='euclidean')

print('Silhouette Coefficient for num clusters=2: ', km2_silc)
print('Silhouette Coefficient for num clusters=5: ', km5_silc)


# ## Calinski-Harabaz Index

# In[30]:


km2_chi = metrics.calinski_harabaz_score(X, km2_labels)
km5_chi = metrics.calinski_harabaz_score(X, km5_labels)

print('Calinski-Harabaz Index for num clusters=2: ', km2_chi)
print('Calinski-Harabaz Index for num clusters=5: ', km5_chi)


# # Model tuning

# ## Build and Evaluate Default Model

# In[31]:


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

X = np.array([],dtype = np.float)
n_cls = 4

fo = open('anchor_ratio.txt','r')

for line in fo:
    if float(line) < 1.0:  # and float(line) > 3.0:
        X = np.append(X, np.float(line))

X = X.reshape(-1,1)
#print(X)

y_pred = KMeans(n_clusters=n_cls).fit_predict(X)
score = metrics.calinski_harabaz_score(X,y_pred)
print(y_pred.shape)
#print('score = {}'.format(score))

for i in range(0,n_cls):
    cls_index = np.where(y_pred == i)
    X_value = X[cls_index]
    #X_mean = np.sum(X_value)/len(X_value)
    X_mean = np.mean(X_value)	
    #X_med = np.median(X_value)	
    #print('x index {}'.format(i))
    #print(cls_index)
    #print('x value {}'.format(i))
    #print(X_value)
    print('{}: x len = {}, mean = {:8f}'.format(i, len(X_value), X_mean))