Example #1
def crank_feats(fargs):
    rss, ccs, lv, installed_in, dfile, nfeatures = fargs
    noaa_init(installed_in)
    wat = pd.read_csv(dfile).set_index('station')

    es = ['e' + str(x) for x in range(0, nfeatures)]
    #ccs = [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40]
    #rss=[0, 1]
    prefix='eigen' + str(nfeatures)
    #let's do some clustering with the six eigenvectors and see how they hold together
    flatnew, nmeans, nstds = flatten(wat[es])  #strictly speaking not necessary since 
    flatold, omeans, ostds = flatten(wat[lv])

    #note: this method flattens wat internally
    produce_kmeans_climates(wat, es, ccs, rss, prefix)

    for rs in rss:

        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            #this silhouettes thing gobbles memory, I'm guessing because each worker
            #creates an entire new metric matrix.
            kf['sil_eigen_' + str(cc)]  = silhouette_samples(flatnew, kf['vtx'+str(cc)].values)
            #pull out silhouette scores on the old metric too, just for fun...
            kf['sil_old_' + str(cc)]  = silhouette_samples(flatold, kf['vtx'+str(cc)].values)

        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)    
Example #2
def bestRep(dat, labels, outName):
    # For each cluster, return the name of the member with the highest silhouette.
    bestExample = []
    silSamp = metrics.silhouette_samples(dat, labels)
    for num in np.unique(labels):
        clusterMask = labels == num
        bestExample.append(outName[clusterMask][np.argmax(silSamp[clusterMask])])
    return bestExample
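A minimal usage sketch for bestRep (assuming the imports its body needs; the toy blobs, names, and k=3 are illustrative):

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Toy data: three well-separated blobs, one name per observation.
X, _ = make_blobs(n_samples=90, centers=3, random_state=0)
names = np.array(["obs_%d" % i for i in range(len(X))])
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)

# One representative per cluster: the member with the highest silhouette.
print(bestRep(X, labels, names))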
Example #3
def cluster_driver(a_driver):
    # (garbled debug prints of a_driver['DStats'] removed)

    X = StandardScaler().fit_transform(a_driver['DStats'])

#    db = KMeans(n_clusters=20, n_jobs=-1).fit(X)
    db = DBSCAN(eps=0.45).fit(X)
    labels = db.labels_

    print("###############################################################################")
    print("Silhouette Coefficient: %0.3f"
          % metrics.silhouette_score(X, labels, metric="mahalanobis"))
    # Rescale the per-sample silhouettes from [-1, 1] to [0, 1].
    return (metrics.silhouette_samples(X, labels, metric="mahalanobis") + 1) / 2
Example #4
    def test_silhouette_samples(self):
        result = self.df.metrics.silhouette_samples()
        expected = metrics.silhouette_samples(self.data, self.pred)

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_index_equal(result.index, self.df.index)
        self.assert_numpy_array_almost_equal(result.values, expected)
Example #5
def silhouette_analysis(clustering, labels=None):
    distance_df = clustering['distance_df']
    if labels is None:
        labels = clustering['labels']
    sample_scores = silhouette_samples(distance_df, metric='precomputed', labels=labels)
    score = np.mean(sample_scores)
    return sample_scores, score
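A minimal usage sketch for silhouette_analysis; the 'clustering' dict layout (keys 'distance_df' and 'labels') is inferred from the function body, and the toy data is illustrative:

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances, silhouette_samples

X, _ = make_blobs(n_samples=60, centers=3, random_state=1)
labels = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(X)

# Precomputed pairwise distance matrix, as the metric='precomputed' call expects.
clustering = {'distance_df': pd.DataFrame(pairwise_distances(X)),
              'labels': labels}
sample_scores, score = silhouette_analysis(clustering)
print(round(score, 3))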
Example #6
def cluster(algorithm, data, topics, make_silhouette=False):
  print(str(algorithm))
  clusters = algorithm.fit_predict(data)
  labels = algorithm.labels_
  print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
  print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
  print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
  print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
  print('Silhouette test: %0.3f' % metrics.silhouette_score(data, labels))
  print(' ***************** ')

  silhouettes = metrics.silhouette_samples(data, labels)
  num_clusters = len(set(clusters))
  print('num clusters: %d' % num_clusters)
  print('num fitted: %d' % len(clusters))

  # Make a silhouette plot if the flag is set
  if make_silhouette:
    order = numpy.lexsort((-silhouettes, clusters))
    # index samples per cluster k (the original compared against num_clusters,
    # which always produced empty sets)
    indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
    ytick = [(numpy.max(ind)+numpy.min(ind))/2 for ind in indices]
    ytickLabels = ["%d" % x for x in range(num_clusters)]
    cmap = cm.jet( numpy.linspace(0,1,num_clusters) ).tolist()
    clr = [cmap[i] for i in clusters[order]]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,   
            edgecolor='none', color=clr)
    ax.set_ylim(ax.get_ylim()[::-1])
    plt.yticks(ytick, ytickLabels)
    plt.xlabel('Silhouette Value')
    plt.ylabel('Cluster')
    plt.savefig('cluster.png')
Example #7
def visualize_silhouette_score(X,y_km):

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = metrics.silhouette_samples(X,
                                         y_km,
                                         metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                c_silhouette_vals,
                height=1.0,
                edgecolor='none',
                color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,
                color="red",
                linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
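A minimal driver for visualize_silhouette_score (assuming the function's module already imports numpy, matplotlib and sklearn.metrics; the toy blobs are illustrative):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=150, centers=3, cluster_std=0.5, random_state=0)
y_km = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
visualize_silhouette_score(X, y_km)  # one horizontal bar block per cluster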
Example #8
def silhouette_original_clusterings(dataset='CB1', neuropil='Antennal_lobe', clusterer_or_k=60):
    """Returns a pandas dataframe with the silhouette index of each cluster member.
    The dataframe has columns (cluster_id, member_id, silhouette).
    """

    # Read the expression matrix
    print('Reading expression matrix')
    Xdf = ExpressionDataset.dataset(dset=dataset, neuropil=neuropil).Xdf(index_type='string')

    # Generate a flat map cluster_id -> members
    print('Finding cluster assignments')
    clusters_df, _ = get_original_clustering(dataset=dataset, neuropil=neuropil,
                                             clusterer_or_k=clusterer_or_k)
    dfs = []
    for cluster_id, members in zip(clusters_df.cluster_id,
                                   clusters_df.original_voxels_in_cluster):
        dfs.append(pd.DataFrame({'cluster_id': cluster_id, 'member_id': members}))
    members_df = pd.concat(dfs).set_index('member_id').loc[Xdf.index]

    # Compute the distance matrix - this must be parameterised
    print('Computing distance')
    import mkl
    mkl.set_num_threads(6)
    D = dicedist_metric(Xdf)

    # Compute silhouette
    # Here we could go for the faster implementation in third_party, if needed
    print('Computing silhouette index')
    members_df['silhouette'] = silhouette_samples(D.values,
                                                  members_df.cluster_id.values,
                                                  metric='precomputed')
    return (members_df.
            reset_index().
            rename(columns=lambda col: {'index': 'member_id'}.get(col, col))
            [['cluster_id', 'member_id', 'silhouette']])
Example #9
def cluster_driver(a_driver):
    # (debug prints of a_driver['DStats'] removed)

    X = StandardScaler().fit_transform(a_driver['DStats'])

    db = DBSCAN(eps=0.6, min_samples=5).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print("###############################################################################")
    print('Estimated number of clusters: %d' % n_clusters_)
    # Rescale the per-sample silhouettes from [-1, 1] to [0, 1].
    return (metrics.silhouette_samples(X, labels) + 1) / 2
Example #10
    def fit(self, X, y=None, **kwargs):
        """
        Fits the model and generates the silhouette visualization.
        """
        # TODO: decide to use this method or the score method to draw.
        # NOTE: Probably this would be better in score, but the standard score
        # is a little different and I'm not sure how it's used.

        # Fit the wrapped estimator
        self.estimator.fit(X, y, **kwargs)

        # Get the properties of the dataset
        self.n_samples_ = X.shape[0]
        self.n_clusters_ = self.estimator.n_clusters

        # Compute the scores of the cluster
        labels = self.estimator.predict(X)
        self.silhouette_score_ = silhouette_score(X, labels)
        self.silhouette_samples_ = silhouette_samples(X, labels)

        # Draw the silhouette figure
        self.draw(labels)

        # Return the estimator
        return self
Example #11
def run_clutering(n_sites, order_dict, sim_mat):

    n_clusters = 6
    name_file = 'clustering_sil' + str(n_clusters)
    output_file = open(name_file, 'w')
    name_file1 = 'clustering_labels' + str(n_clusters)
    output_file1 = open(name_file1, 'w')

    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack',
                                          affinity='precomputed')
    labels = spectral.fit_predict(sim_mat)

    silhouette_avg = metrics.silhouette_score(sim_mat, labels)
    output_file.write(" ".join(["aver silhouette_score:", str(silhouette_avg)]))

    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(sim_mat, labels)

    for siteid in order_dict:
        stringa = ' '.join([siteid, str(sample_silhouette_values[order_dict[siteid]])])
        output_file.write(stringa + '\n')

    for siteid in order_dict:
        stringa = ' '.join([str(siteid), str(labels[order_dict[siteid]])])
        output_file1.write(stringa + '\n')

    output_file.close()
    output_file1.close()
Example #12
def calculateNumberOfIdealClusters(maxAmount, corpus):
	print("Initializing silhouette analysis")
	range_n_clusters = range(2, maxAmount)  # max amount of clusters equal to amount of jobs

	silhouette_high = 0
	silhouette_high_n_clusters = 2

	for n_clusters in range_n_clusters:
		# Initialize the clusterer with n_clusters value
		cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
		cluster_labels = cluster.fit_predict(corpus)

		# The silhouette_score gives the average value for all the samples.
		# This gives a perspective into the density and separation of the formed clusters
		silhouette_avg = silhouette_score(corpus, cluster_labels)

		print("For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg))

		if silhouette_avg > silhouette_high:
			silhouette_high = silhouette_avg
			silhouette_high_n_clusters = n_clusters

		# Compute the silhouette scores for each sample
		sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

	print("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
	return silhouette_high_n_clusters
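A minimal usage sketch (assuming a scikit-learn version where AgglomerativeClustering still accepts the affinity= keyword used above; the toy corpus is illustrative):

from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, silhouette_samples

# Dense toy "corpus": Ward-linkage clustering needs a dense feature array.
corpus, _ = make_blobs(n_samples=60, centers=3, random_state=2)
best_k = calculateNumberOfIdealClusters(10, corpus)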
Example #13
def get_silhouette(df):
    df=df[(df.AB!=".")].copy()
    df.loc[:,'AB']=pd.to_numeric(df.loc[:,'AB'])
    df.loc[:,'CN']=pd.to_numeric(df.loc[:,'CN'])

    tp=df.iloc[0,:].loc['svtype']

    [mn_CN, mn_AB]=df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB]=df.loc[:, ['CN', 'AB']].std(skipna=True)

    if df.loc[:,'GT'].unique().size==1:
        df.loc[:,'sil_gt_avg']=1
        df.loc[:, 'sil_gt']=1
        df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df

    #standardize the 2 dims
    if sd_AB>0.01:
        df.loc[:, 'AB1']=(df.loc[:,'AB']-mn_AB)/sd_AB
    else: 
        df.loc[:, 'AB1']=df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN>0.01:
        df.loc[:, 'CN1']=(df.loc[:,'CN']-mn_CN)/sd_CN
    else:
        df.loc[:, 'CN1']=df.loc[:, 'CN']

    
    gt_code={'0/0':1, '0/1':2, '1/1':3}
    df.loc[:,'gtn']=df.loc[:, 'GT'].map(gt_code)

    dist_2d_sq=spatial.distance.squareform(spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg']=metrics.silhouette_score(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt']=metrics.silhouette_samples(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df=df[ ['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
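A minimal sketch of the input get_silhouette expects; the column set is taken from the function body and the values are illustrative:

import pandas as pd
from scipy import spatial
from sklearn import metrics

toy = pd.DataFrame({'var_id': ['v1'] * 6,
                    'sample': ['s%d' % i for i in range(6)],
                    'svtype': ['DEL'] * 6,
                    'AF': [0.5] * 6,
                    'GT': ['0/0', '0/0', '0/1', '0/1', '1/1', '1/1'],
                    'CN': [2.0, 2.1, 1.5, 1.4, 1.0, 0.9],
                    'AB': [0.0, 0.05, 0.45, 0.5, 0.95, 1.0]})
print(get_silhouette(toy)[['GT', 'sil_gt']])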
Example #14
def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
    '''Find clusters, and if method is k-means run silhouette analysis
    to determine the value of k.

    Args:
        df (data frame): A data frame with normalised expression data.
        k_vals (list or range): The range over which to test k.
        how ('hierarchical' or 'kmeans'): Clustering method.

    Returns:
        A list of cluster numbers.

    '''

    ## Don't run the silhouette analysis for hierarchical clustering,
    ## just calculate the clusters using estimate of k.
    if how == 'hierarchical':
        k = int(np.sqrt((len(df) / 2.0)))
        hc = hac.linkage(df, method='average')
        optimal_clusters = hac.fcluster(hc, t=k, criterion='maxclust')

    ## If method is k-means, run silhouette analysis.
    elif how == 'kmeans':
        best_combined_score = 0
        optimal_k = 2

        ## Try values of k from range and keep track of optimal k according
        ## to silhouette score.
        for k in k_vals:
            km = KMeans(n_clusters=k, random_state=10)
            clusters = km.fit_predict(df)
            silhouette_avg = silhouette_score(df, clusters)
            sample_silhouette_values = silhouette_samples(df, clusters)
            above_mean = 0
            silhouette_sizes = []

            for i in range(k):
                ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                silhouette_sizes.append(size_cluster_i)
                if max(ith_cluster_silhouette_values) > silhouette_avg:
                    above_mean += 1

            ## This combined score should pick the best value of k
            above_mean_score = float(above_mean) / k
            std_score = 1.0/np.std(silhouette_sizes) if np.std(silhouette_sizes) > 1.0 else 1.0
            combined_score = (silhouette_avg + above_mean_score + std_score) / 3

            ## Put the clusters in the new column in the data frame.
            if combined_score > best_combined_score:
                best_combined_score = combined_score
                optimal_k = k
                optimal_clusters = clusters

        optimal_clusters = [cluster + 1 for cluster in optimal_clusters]

    return optimal_clusters
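A minimal usage sketch for the k-means path of find_clusters (the toy matrix is illustrative):

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(50, 8))  # toy normalised "expression" matrix
clusters = find_clusters(df, k_vals=[2, 3, 4], how='kmeans')
print(len(set(clusters)))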
Example #15
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)

    print(sil_score[0])

    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    
    print(silhouette_samples(X, ypred)[:10])
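test_gmm relies on module-level X and ypred (and the pyclust package); a minimal setup under those assumptions might look like:

from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.mixture import GaussianMixture

X, _ = make_blobs(n_samples=100, centers=3, random_state=0)
ypred = GaussianMixture(n_components=3, random_state=0).fit_predict(X)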
Example #16
def compute_sil_score_vector(filelist):
	"""returns dictionary indexed by num_clusters and 
	values which are vectors of silscore for all samples
	"""
	silscore = dict()
	for f in filelist:
		y, X = get_labels_features(f)
		num_clusters = np.unique(y).shape[0]
		silscore[num_clusters]= sklm.silhouette_samples(X,y)

	return silscore
Example #17
def silhouette_samples(clusters, word2vec_model):
    labels = []
    matrix = []
    for i in range(len(clusters)):
        words = clusters[i][-1]
        _, mat = get_words_matrix(words, word2vec_model)
        for j in range(len(mat)):
            matrix.append(list(mat[j]))
            labels.append(i)
    matrix = np.array(matrix)
    labels = np.array(labels)
    samples_score = metrics.silhouette_samples(matrix, labels)
    return labels, samples_score
Example #18
def grind_kmeans(fargs):
    rss, ccs, mus, lv, installed_in, mdat, prefix = fargs

    noaa_init(installed_in)
    produce_kmeans_climates(mdat, lv, ccs, rss, prefix)
    for rs in rss:

        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            #this silhouettes thing gobbles memory, I'm guessing because each worker
            #creates an entire new metric matrix.
            kf['sil' + str(cc)]  = silhouette_samples(mus, kf['vtx'+str(cc)].values, metric='precomputed')
        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)    
Example #19
def plotter_3d(pit, day, clusters, files):
    data = pickle.load(open("/usr/local/bee/beemon/beeW/Chris/" + pit + "/" + day + "/clusterdata_" + str(clusters) + "_" + str(files) + "_reduced.p", 'rb'), encoding = 'bytes')
    labels = pickle.load(open("/usr/local/bee/beemon/beeW/Chris/" + pit + "/" + day + "/clusterdata_" + str(clusters) + "_" + str(files) + ".pkl", 'rb'), encoding = 'bytes')
    try:
        if len(labels[0]) < 400:
            silhouettes = silhouette_samples(data, labels[0])
            print(silhouettes)
            print(np.mean(silhouettes))
    except Exception:
        print("Silhouette scoring cannot be done.")
    num1 = int(clusters)
    path1 = "Pit:" + pit + " Day:" + day + " Clusters:" + str(clusters) + " Files:" + str(files)
    graph_3d(data, num1, path1, labels)
    plt.close()
Example #20
def Silhouette(D, labels, k):
    """
    Taken from sklearn's plot kmeans example
    D = distance matrix
    k = number of clusters
    """
    plt.ion()
    fig, ax1 = plt.subplots()
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(D) + (k + 1) * 10])

    sample_silhouette_values = metrics.silhouette_samples(D, labels, metric='precomputed')
    
    y_lower = 10
    
    for i in range(k):
        ith_cluster_silhouette_values = \
                sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / k)  # cm.spectral was removed in matplotlib 2.2
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                        0, ith_cluster_silhouette_values,
                                        facecolor=color, edgecolor=color, alpha=0.7)

        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        
        y_lower = y_upper + 10  
    
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    silhouette_avg = metrics.silhouette_score(D, labels, metric='precomputed')
    
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([]) 
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    
    plt.suptitle(("Silhouette analysis with n_clusters =",k," and average = ",silhouette_avg),
    fontsize=14, fontweight='bold')

    plt.show()
Example #21
def get_silhouette_scores(X, km, nc):
    
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    #print ("For n_clusters =" + str(nc) + "The average silhouette_score is :"
    #      + str(silhouette_avg))

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    
    return silhouette_avg, sample_silhouette_values
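A minimal usage sketch for get_silhouette_scores, passing an already-fitted KMeans (toy data illustrative):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, silhouette_samples

X, _ = make_blobs(n_samples=120, centers=4, random_state=3)
km = KMeans(n_clusters=4, n_init=10, random_state=3).fit(X)
avg, per_sample = get_silhouette_scores(X, km, 4)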
Example #22
 def identify_accurate_number_of_clusters(self, model, compounds, max_range=3):
     silhouette_avg_scores = []
     for n_cluster in range(2, max_range):
         assigned_cluster = cluster.KMeans(n_clusters=n_cluster,
                                           n_init=20).fit_predict(model)
         silhouette_avg = silhouette_score(model, assigned_cluster)
         silhouette_avg_scores.append(silhouette_avg)
     max_silhouette_score = max(silhouette_avg_scores)
     index_max_score = silhouette_avg_scores.index(max_silhouette_score)
     final_cluster_num = range(2, max_range)[index_max_score]
     final_assigned_cluster = cluster.KMeans(n_init=20,
                                             n_clusters=final_cluster_num).fit_predict(model)
     final_sample_sil_vals = silhouette_samples(model, final_assigned_cluster)
     return final_assigned_cluster, final_cluster_num, final_sample_sil_vals
Example #23
def kmeans(cluster_data):
    # Data pre-processing
    scaler = preprocessing.StandardScaler()
    # n_clust (the number of clusters) is expected to be defined globally
    km = KMeans(n_clust, init='random')  # random_state=10
    km.fit(cluster_data)
    centers = km.cluster_centers_
    # Compute the silhouette score
    labels = km.labels_
    score = metrics.silhouette_score(cluster_data, labels)
    sample_score = metrics.silhouette_samples(cluster_data, labels)
    sum_of_squares = km.inertia_
    return(labels, score, sample_score, centers, sum_of_squares)
Example #24
def fit_plot_save(k, smoothExprs, day, probeID, geneSymbol, organ, strain, path):
    """
    Fit k-means, plot and save results

    Arguments
    =========
    k - no. of clusters
    smoothExprs - gene expression rows = genes, columns = day
    day - day
    probeID - probeID
    geneSymbol - geneSymbol
    path - path
    
    Returns
    =========
    None - results are plotted and saved

    """
    model = KMeans(n_clusters=k)
    model.fit(smoothExprs)
    clustCentre = model.cluster_centers_
    # Plot results
    plot_silhouette(silhouette_samples(smoothExprs, model.labels_), model.labels_)
    clust.multi_plot(smoothExprs, clustCentre, day, model.labels_)
    # Hierarchical clustering
    # Ward + Euclidean
    header = ["Cluster%i" % label for label in np.unique(model.labels_)]    
    hclust = hc.linkage(clustCentre, method='ward', metric='euclidean')
    plt.figure(); plt.title("Hclust() Ward + Euclidean")
    hc.dendrogram(hclust, color_threshold=0.0, labels=header)
    #seed=101
    #embedding = tsne.tsne(smoothExprs, no_dims = 3, initial_dims = 20, perplexity = 30.0, seed=seed) # low dimensional embedding
    #tsne.plot(embedding, model.labels_)

    # Save model 
    io.save_pickle(os.path.join(path['Clust']['Model'], organ + strain + ".pickle"), model)
    # Save Gene/Probe List    
    geneList = clust.get_gene_list(model.labels_, geneSymbol)
    probeList = clust.get_gene_list(model.labels_, probeID)
    io.write_list_to_csv(os.path.join(path['Clust']['GeneList'], organ + strain + ".csv"), header, geneList) # Gene list 
    io.write_list_to_csv(os.path.join(path['Clust']['ProbeList'], organ + strain + ".csv"), header, probeList) # Probe list    
    # Save Cluster "centres"    
    dataMatrix = np.hstack((np.array(header)[:, None], clustCentre)) 
    header = list(itertools.chain.from_iterable([["Cluster"], list(day)]))    
    io.write_to_csv(os.path.join(path['Clust']['Centres'], organ + strain + ".csv"), header, dataMatrix) # Cluster "centres"   
    # Save Alternate plot     
    hFig = clust.multi_plot(smoothExprs, clustCentre, day, model.labels_)    
    io.save_pdf(os.path.join(path['Clust']['Plot'], organ + strain + "2.pdf"), hFig) # Plot 
Example #25
    def _internal(cluster_list, affinity_matrix, dist_matrix,
                  idx, n_jobs, n, queue_y):
        for i in range(idx, n, n_jobs):
            sp = SpectralClustering(n_clusters=cluster_list[i],
                                    affinity='precomputed',
                                    norm_laplacian=True,
                                    n_init=1000)
            sp.fit(affinity_matrix)

            save_results_clusters("res_spectral_{:03d}_clust.csv"
                                  .format(cluster_list[i]),
                                  sample_names, sp.labels_)

            silhouette_list = silhouette_samples(dist_matrix, sp.labels_,
                                                 metric="precomputed")
            queue_y[i] = np.mean(silhouette_list)
Example #26
def main():
    
    X, y = make_blobs(n_samples=150,
            n_features=2,
            centers=3,
            cluster_std=0.5,
            shuffle=True,
            random_state=0)
    
    km = KMeans(n_clusters=2,
            init='k-means++',
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
    y_km = km.fit_predict(X)

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X,
            y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                c_silhouette_vals,
                height=1.0,
                edgecolor='none',
                color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,
            color='red',
            linestyle='--')
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette Coefficient')
    plt.show()

    return
Example #27
    def eval_silhouette(self, verbose=True):
        """Evaluate each estimator via silhouette score."""
        for k in self.n_clusters:
            # The silhouette_score gives the average value for all the samples.
            # This gives a perspective into the density and separation of the formed
            # clusters
            cluster_labels = self.estimators[k].labels_

            results = munch.Munch()

            results.silhouette_avg = silhouette_score(self.X, cluster_labels)
            if verbose:
                print("For n_clusters = {k} The average silhouette_score is : {avg_sil}".format(k=k, avg_sil=results.silhouette_avg))

            # Compute the silhouette scores for each sample
            results.sample_silhouette_values = silhouette_samples(self.X, cluster_labels)
            self.silhouette_results[k] = results
Example #28
def silhouette_clusters(data, clusters):
    """
        :param data: n*d where n is the number of observations and d the dimensions of
                     each observation
        :param clusters: an array of length n

        compute silhoutte score for every cluster
    """
    silhouette_samples_score = metrics.silhouette_samples(data, clusters, metric='euclidean')
    values_possible = np.unique(clusters)
    # 1-D array: the original np.zeros((1, n)) shape breaks the k-indexed writes below
    silhouette_mean_clusters = np.zeros(len(values_possible))
    k = 0
    for i in values_possible:
        index = np.where(clusters == i)
        silhouette_mean_clusters[k] = np.mean(silhouette_samples_score[index])
        k += 1

    return silhouette_mean_clusters
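A minimal usage sketch for silhouette_clusters (toy blobs illustrative):

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=90, centers=3, random_state=4)
clusters = KMeans(n_clusters=3, n_init=10, random_state=4).fit_predict(data)
print(silhouette_clusters(data, clusters))  # one mean silhouette per cluster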
Example #29
def optimum_clusters(range_n_clusters,y):

    logger.info('Start deciding Optimum Number of Clusters')

    dic = {}

    for n_clusters in range_n_clusters:

        s_cluster = []
        cluster = KMeans(n_clusters=n_clusters, random_state=10)

        cluster_labels = cluster.fit_predict(y)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        #silhouette_avg = silhouette_score(y, cluster_labels)
        #print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

        # centers = cluster.cluster_centers_
        # print (centers)

        sample_silhouette_values = silhouette_samples(y, cluster_labels)
        for i in range(n_clusters):

            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]

            s_cluster.append(size_cluster_i)

        mean_s = np.mean(s_cluster)
        mse = 0
        for size in s_cluster:
            # s_cluster holds cluster sizes; the original indexed the list with
            # its own values, which raises IndexError for sizes >= len(s_cluster)
            mse = mse + (size - mean_s)**2

        dic[n_clusters] = mse
    #print (dic)

    min_d = min(dic, key=dic.get)
    logger.info('Optimum number of Clusters Decided')
    return (min_d)
Example #30
 def silhouette_analysis(self):
     if not self.pca_reduced:
         self.pc_analysis()
     range_n_clusters = range(2, 10)
     for n_clusters in range_n_clusters:
         fig, (ax1, ax2) = plt.subplots(1, 2)
         fig.set_size_inches(18, 7)
         ax1.set_xlim([-0.1, 1])
         ax1.set_ylim([0, len(self.pca_reduced) + (n_clusters + 1) * 10])
         clusterer = KMeans(n_clusters=n_clusters, random_state=10)
         cluster_labels = clusterer.fit_predict(self.pca_reduced)
         silhouette_avg = silhouette_score(self.pca_reduced, cluster_labels)
         print("For n_clusters =", n_clusters, "the average silhouette_score is :", silhouette_avg)
         sample_silhouette_values = silhouette_samples(self.pca_reduced, cluster_labels)
         y_lower = 10
         for i in range(n_clusters):
             ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
             ith_cluster_silhouette_values.sort()
             size_cluster_i = ith_cluster_silhouette_values.shape[0]
             y_upper = y_lower + size_cluster_i
             color = cm.nipy_spectral(float(i) / n_clusters)  # cm.spectral was removed in matplotlib 2.2
             ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values,
                               facecolor=color, edgecolor=color, alpha=0.7)
             ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
             y_lower = y_upper + 10
         ax1.set_title("The silhouette plot for the various clusters.")
         ax1.set_xlabel("The silhouette coefficient values")
         ax1.set_ylabel("Cluster label")
         ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
         ax1.set_yticks([])
         ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
         colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
         ax2.scatter(self.pca_reduced[:, 0], self.pca_reduced[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)
         centers = clusterer.cluster_centers_
         ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200)
         for i, c in enumerate(centers):
             ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50)
         ax2.set_title("The visualization of the clustered data.")
         ax2.set_xlabel("Feature space for the 1st feature")
         ax2.set_ylabel("Feature space for the 2nd feature")
         plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                       "with n_clusters = %d" % n_clusters),
                      fontsize=14, fontweight='bold')
Example #31
    def k_medoids_over_instances(self,
                                 dataset,
                                 cols,
                                 k,
                                 distance_metric,
                                 max_iters,
                                 n_inits=5,
                                 p=1):
        # If we set it to default we use the pyclust package...
        temp_dataset = dataset[cols]
        if distance_metric == 'default':
            km = pyclust.KMedoids(n_clusters=k, n_trials=n_inits)
            km.fit(temp_dataset.to_numpy())  # .as_matrix() was removed in pandas 1.0
            cluster_assignment = km.labels_

        else:
            self.p = p
            cluster_assignment = []
            best_silhouette = -1

            # Compute all distances
            D = self.compute_distance_matrix_instances(temp_dataset,
                                                       distance_metric)

            for it in range(0, n_inits):
                # First select k random points as centers:
                centers = random.sample(range(0, len(dataset.index)), k)
                prev_centers = []
                points_to_cluster = []

                n_iter = 0
                while (n_iter < max_iters) and not (centers == prev_centers):
                    n_iter += 1
                    prev_centers = centers
                    # Assign points to clusters.
                    points_to_centroid = D[centers].idxmin(axis=1)

                    new_centers = []
                    for i in range(0, k):
                        # And find the new center that minimizes the sum of the
                        # differences (.ix was removed in pandas 1.0).
                        members = points_to_centroid == centers[i]
                        best_center = D.loc[members, members].sum().idxmin()
                        new_centers.append(best_center)
                    centers = new_centers

                # Convert centroids to cluster numbers:

                points_to_centroid = D[centers].idxmin(axis=1)
                current_cluster_assignment = []
                for i in range(0, len(dataset.index)):
                    current_cluster_assignment.append(
                        centers.index(points_to_centroid.iloc[i]))  # .ix was removed in pandas 1.0

                silhouette_avg = silhouette_score(
                    temp_dataset, np.array(current_cluster_assignment))
                if silhouette_avg > best_silhouette:
                    cluster_assignment = current_cluster_assignment
                    best_silhouette = silhouette_avg

        # And add the clusters and silhouette scores to the dataset.
        dataset['cluster'] = cluster_assignment
        silhouette_avg = silhouette_score(temp_dataset,
                                          np.array(cluster_assignment))
        silhouette_per_inst = silhouette_samples(temp_dataset,
                                                 np.array(cluster_assignment))
        dataset['silhouette'] = silhouette_per_inst

        return dataset
Example #32
def kmeans_base_clustering(corr: Union[np.ndarray, pd.DataFrame],
                           names_features: list = None,
                           max_num_clusters: int = 10,
                           **kwargs: Any) -> (pd.DataFrame, dict, pd.Series):
    """
    Perform base clustering with Kmeans.

    Arguments
    ---------
    corr: numpy.array or pd.DataFrame
      Correlation matrix.
    names_features : list of str
      List of names for features.
    max_num_clusters: int
      Maximum number of clusters.
    **kwargs
        Arbitrary keyword arguments for sklearn.cluster.KMeans().

    Returns
    -------
    pd.DataFrame
      Clustered correlation matrix.
    dictionary
      List of clusters and their content.
    pd.Series
      Silhouette scores.

    Notes
    -----
      Function adapted from "Machine Learning for Asset Managers",
      Marcos López de Prado (2020).

      To learn more about sklearn.cluster.KMeans():
      https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
    """

    # Initializations
    corr = pd.DataFrame(data=corr,
                        index=names_features,
                        columns=names_features)
    silh_score = pd.Series(dtype=float)  # explicit dtype avoids a pandas warning

    # Define the observations matrix
    Xobs = (((1 - corr.fillna(0)) / 2.)**.5).values

    # Modify it to get an Euclidean distance matrix
    X = np.zeros(shape=Xobs.shape)
    for i, j in itertools.product(range(X.shape[0]), range(X.shape[1])):
        X[i, j] = np.sqrt(sum((Xobs[i, :] - Xobs[j, :])**2))
    X = pd.DataFrame(data=X, index=names_features, columns=names_features)

    # Loop to generate different numbers of clusters
    for i in range(2, max_num_clusters + 1):

        # Define model and fit
        kmeans_current = cluster.KMeans(n_clusters=i, **kwargs).fit(X)

        # Compute silhouette score
        silh_current = silhouette_samples(X, kmeans_current.labels_)

        # Compute clustering quality q (t-statistic of silhouette score)
        quality_current = silh_current.mean() / silh_current.std()
        quality = silh_score.mean() / silh_score.std()

        # Keep best quality scores and clustering
        if np.isnan(quality) or (quality_current > quality):
            silh_score = silh_current
            kmeans = kmeans_current

    # Extract index according to sorted labels
    new_idx = np.argsort(kmeans.labels_)

    # Reorder rows and columns
    clustered_corr = corr.iloc[new_idx]
    clustered_corr = clustered_corr.iloc[:, new_idx]

    # Form clusters
    clusters = {
        i: clustered_corr.columns[np.where(kmeans.labels_ == i)[0]].tolist()
        for i in np.unique(kmeans.labels_)
    }

    # Define a series with the silhouette score
    silh_score = pd.Series(silh_score, index=X.index)

    return clustered_corr, clusters, silh_score
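A minimal usage sketch for kmeans_base_clustering; the random correlation matrix and feature names are illustrative, and the extra keywords are forwarded to sklearn.cluster.KMeans:

import numpy as np

rng = np.random.RandomState(5)
returns = rng.randn(250, 8)                # toy observations
corr = np.corrcoef(returns, rowvar=False)  # 8x8 correlation matrix
names = ['f%d' % i for i in range(8)]

clustered_corr, clusters, silh = kmeans_base_clustering(
    corr, names_features=names, max_num_clusters=5,
    n_init=10, random_state=5)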
Example #33
def cluster_observation_matrix(X: pd.DataFrame,
                               n_clust_range: range,
                               model: cluster,
                               verbose: bool = True,
                               **kwargs: Any) -> (dict, dict):
    """
    Apply clustering for an arbitrary model as long as the model has an argument 'n_clusters'.

    Parameters
    ----------
    X : pd.DataFrame
      The Observation matrix on which the clustering is based.
    n_clust_range: range
      Range of integer values for the number of clusters to be tested.
    model : sklearn.cluster
      The clustering model to be used from sklearn.
    verbose : bool
      Verbose option.
    **kwargs :
      Arguments for the clustering model.

    Returns
    -------
    dict
      Labels corresponding to the different clusters.
    dict
      Quality corresponding to the different clusters.

    Notes
    -----
      To learn more about sklearn.cluster:
      https://scikit-learn.org/stable/modules/classes.html?highlight=cluster#module-sklearn.cluster
    """

    # Checks
    if min(n_clust_range) < 2:
        raise AssertionError(
            "Argument n_clust_range must have values starting at values >= 2.")

    # Initialization
    save_labels = {}
    save_quality = {}
    n_clust_range_min = n_clust_range[0]
    n_clust_range_max = n_clust_range[-1]
    n_clust_range_step = int(n_clust_range[-1] - n_clust_range[-2])

    # Looping
    for k in n_clust_range:

        # Build clusters
        fitted_model = model(n_clusters=k, **kwargs).fit(X)
        save_labels[k] = fitted_model.labels_.tolist()

        # Compute scores
        silh = silhouette_samples(X, fitted_model.labels_)
        save_quality[k] = silh.mean() / silh.std()

    # Plot qualities
    if verbose:
        plt.xticks(ticks=n_clust_range)
        plt.plot(n_clust_range, list(save_quality.values()))

        # Make it cute
        plt.title("Normalized Silhouette Score")
        plt.xlabel("Number of clusters")
        plt.ylabel("Score")

    # Make bars containing the clusters composition
    m = len(save_labels)
    assert (m == len(n_clust_range))
    bars = np.zeros(shape=(m, n_clust_range_max))

    # Loop over max number of clusters
    for k in n_clust_range:

        # Count appearing values
        count_vals = []
        for j in range(n_clust_range_max):
            count_vals.append(int(save_labels[k].count(j)))

        # Distribute these values to build bars
        for i in range(n_clust_range_max):
            bars[(k - n_clust_range_min) // n_clust_range_step,
                 i] = count_vals[i]

    # Plot clusters compositions with bar plot
    if verbose:
        plt.figure(figsize=(10, 5))
        m = bars.shape[0]
        # Use an ndarray: += on a plain Python list would extend it, not add
        sum_bars = np.zeros(m)

        for i in range(n_clust_range_max):
            if i > 0:
                sum_bars += bars[:, i - 1]
            plt.bar(n_clust_range, bars[:, i], width=0.8, bottom=sum_bars)

        # Make it cute
        plt.xticks(ticks=n_clust_range)
        plt.title("Composition of clusters")
        plt.xlabel("Number of clusters")
        plt.ylabel("Composition")

    # Return labels
    return save_labels, save_quality
Example #34
 def sample_silhouette_score(self, x, cluster_clients):
     self.sample_silh_score = silhouette_samples(x, cluster_clients)
Example #35
def SilhouetteScores(vectors, mode, cluster_range, view, output):
    """
    Compute silhouette and elbow method (nltk/sklearn) for range of clusters
    param1: array of vectors (3 dimensions)
    param2: mode ('nltk'/'sklearn'/'agglomerative')
    param3: list/range of number of clusters
    param4: output path of figure (needs {} to format number of clusters)
    output: tuple of two dicts with elbow and silhouette scores (nltk/sklearn),
            dict with silhouette scores (agglomerative)
    """
    if mode.lower() == "nltk":
        rng = random.Random()
        rng.seed(123)
        wcss = {}  # for elbow method
        s_scores = {}  # for silhouette scores
        for NUM_CLUSTERS in tqdm(cluster_range):
            kclusterer = KMeansClusterer(
                NUM_CLUSTERS,
                distance=nltk.cluster.util.cosine_distance,
                repeats=25,
                rng=rng,
                avoid_empty_clusters=True)
            labels = kclusterer.cluster(vectors, assign_clusters=True)
            # elbow method
            # the centroids: kclusterer.means()
            centroid_array = np.vstack(
                [kclusterer.means()[label] for label in labels])
            wcss[NUM_CLUSTERS] = nltk_inertia(vectors, centroid_array)
            # silhouette scores
            if 1 < NUM_CLUSTERS:
                silhouette_s = metrics.silhouette_samples(vectors,
                                                          labels,
                                                          metric='cosine')
                ss_max = max(silhouette_s)
                ss_min = min(silhouette_s)
                ss_mean = float(sum(silhouette_s) / len(silhouette_s))
                s_scores[NUM_CLUSTERS] = (ss_max, ss_min, ss_mean)
                # plotting
                scatterplot3D(vectors,
                              color=labels,
                              view=view,
                              output=output.format(NUM_CLUSTERS))
        return (wcss, s_scores)

    elif mode.lower() == "sklearn":
        wcss = {}  # for elbow method
        s_scores = {}  # for silhouette scores
        for NUM_CLUSTERS in tqdm(cluster_range):
            kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS,
                                    n_init=25,
                                    random_state=42)
            kmeans.fit(vectors)  # compute k-means clustering
            labels = kmeans.labels_
            # elbow method
            wcss[NUM_CLUSTERS] = kmeans.inertia_
            # silhouette scores
            if 1 < NUM_CLUSTERS:
                silhouette_s = metrics.silhouette_samples(vectors,
                                                          labels,
                                                          metric='euclidean')
                ss_max = max(silhouette_s)
                ss_min = min(silhouette_s)
                ss_mean = float(sum(silhouette_s) / len(silhouette_s))
                s_scores[NUM_CLUSTERS] = (ss_max, ss_min, ss_mean)
                # plotting
                scatterplot3D(vectors,
                              color=labels,
                              view=view,
                              output=output.format(NUM_CLUSTERS))
        return (wcss, s_scores)

    elif mode.lower() == "agglomerative":
        s_scores = {}
        for NUM_CLUSTERS in tqdm(cluster_range):
            agglo = cluster.AgglomerativeClustering(n_clusters=NUM_CLUSTERS)
            agglo.fit(vectors)  # compute agglomerative clustering
            labels = agglo.labels_
            # silhouette scores
            if 1 < NUM_CLUSTERS:
                silhouette_s = metrics.silhouette_samples(vectors,
                                                          labels,
                                                          metric='euclidean')
                ss_max = max(silhouette_s)
                ss_min = min(silhouette_s)
                ss_mean = float(sum(silhouette_s) / len(silhouette_s))
                s_scores[NUM_CLUSTERS] = (ss_max, ss_min, ss_mean)
                # plotting
                scatterplot3D(vectors,
                              color=labels,
                              view=view,
                              output=output.format(NUM_CLUSTERS))
        return s_scores
Example #36
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)  # cm.spectral was removed in matplotlib 2.2
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
Example #37
    def calculateSilhouetteScore(self, dataFile):
        """Calculate the silhouette score for different numbers of clusters.

        :param self: An instance of the class SilhouetteScore.
        :param dataFile: An array with the input data points.
        :return: A list with the names of the image files created.
        """
        instanceKmeans = KmeansRunner()
        X = instanceKmeans.retrieveData(dataFile)
        if (X.shape[0] > 10000):
            size = round(X.shape[0] * 0.001)
            idx = np.random.randint(X.shape[0], size=size)
            subset = X[idx, :]
            X = subset
        range_n_clusters = [2, 3, 4, 5, 6]
        list_images = []

        for n_clusters in range_n_clusters:

            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            ax1.set_xlim([-0.1, 1])

            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            clusterer = KMeans(n_clusters=n_clusters, random_state=10)
            cluster_labels = clusterer.fit_predict(np.array(X))

            silhouette_avg = silhouette_score(X, cluster_labels)
            print("For n_clusters =", n_clusters,
                  "The average silhouette_score is :", silhouette_avg)

            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):

                ith_cluster_silhouette_values = \
                    sample_silhouette_values[cluster_labels == i]

                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                color = cm.nipy_spectral(float(i) / n_clusters)  # cm.spectral was removed in matplotlib 2.2
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0, ith_cluster_silhouette_values,
                                  facecolor=color, edgecolor=color, alpha=0.7)

                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                y_lower = y_upper + 10

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7,
                        c=colors)

            centers = clusterer.cluster_centers_
            ax2.scatter(centers[:, 0], centers[:, 1],
                        marker="o", c="white", alpha=1, s=200)

            for i, c in enumerate(centers):
                ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50)

            ax2.set_title("The visualization of the clustered data.")
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")

            plt.suptitle(("Silhouette analysis for k-means"
                          "clustering on sample data "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14, fontweight="bold")
            fig.savefig("cluster_" + str(n_clusters) + ".png")
            list_images.append("cluster_" + str(n_clusters) + ".png")
        return list_images
Example #38
def silhouette(dataset, n):

    range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
    if n < 5:
        range_n_clusters = [2, 3, 4, 5, 6, 7]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(dataset) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(dataset)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(dataset, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(dataset, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(dataset[:, 0],
                    dataset[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors,
                    edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200,
                    edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0],
                        c[1],
                        marker='$%d$' % i,
                        alpha=1,
                        s=50,
                        edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(
            ("Silhouette analysis for KMeans clustering on sample data "
             "with n_clusters = %d" % n_clusters),
            fontsize=14,
            fontweight='bold')

    plt.show()
Example #39
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(peopleMatrixPcaTransform) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(peopleMatrixPcaTransform)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = metrics.silhouette_score(peopleMatrixPcaTransform,
                                              cluster_labels)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(
        peopleMatrixPcaTransform, cluster_labels)

    # The score is bounded between -1 for incorrect clustering and +1 for highly dense clustering.
    # Scores around zero indicate overlapping clusters.
    # The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.

    print(
        "\n\n\nFor n_clusters =", n_clusters,
        "\n\nThe average silhouette_score is :", silhouette_avg,
        "\n\n* The silhouette score is bounded between -1 for incorrect clustering and +1 for highly dense clustering.",
        "\n* Scores around zero indicate overlapping clusters.",
        "\n* The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster",
        "\n\nThe individual silhouette scores were :",
        sample_silhouette_values, "\n\nAnd their assigned clusters were :",
        cluster_labels,
        "\n\nWhich correspond to : 'Jane', 'Bob', 'Mary', 'Mike', 'Alice', 'Skip', 'Kira', 'Moe', 'Sara', and 'Tom'"
Exemplo n.º 40
0
    def __plot_clusters_onto_2D(self,
                                clusterer,
                                X,
                                dim_reduction_method,
                                perplexity,
                                plot=False):
        """
        For high dimensional data, use t-SNE to reduce the dimensionality and plot result
        on a 2D plane.
        Args:
            perplexity: The perplexity is related to the number of nearest neighbors that is used
                        in other manifold learning algorithms. Larger datasets usually require a
                        larger perplexity. Consider selecting a value between 5 and 50. The choice
                        is not extremely critical since t-SNE is quite insensitive to this parameter.
        """

        if hasattr(clusterer, 'predict'):
            cluster_labels = clusterer.predict(X)
        else:
            cluster_labels = clusterer.labels_
        if len(set(cluster_labels)) == 1:
            print(
                "Clustering failed: unable to group the data into two or more clusters."
            )
            return

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)

        n_clusters = clusterUtilities.get_n_clusters(clusterer)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        if plot is True:
            cmap = cm.get_cmap("CMRmap")
            # Create a subplot with 1 row and 2 columns
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)

            # The 1st subplot is the silhouette plot
            # The silhouette coefficient can range from -1, 1 but in this example all
            # lie within [-0.1, 1]
            ax1.set_xlim([-0.1, 1])
            # The (n_clusters+1)*10 is for inserting blank space between silhouette
            # plots of individual clusters, to demarcate them clearly.
            ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

            # Compute the silhouette scores for each sample
            sample_silhouette_values = silhouette_samples(X, cluster_labels)

            y_lower = 10
            for i in range(n_clusters):
                # Aggregate the silhouette scores for samples belonging to
                # cluster i, and sort them
                ith_cluster_silhouette_values = \
                                                sample_silhouette_values[cluster_labels == i]

                ith_cluster_silhouette_values.sort()

                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                y_upper = y_lower + size_cluster_i

                # cm.spectral has been removed since matplotlib 2.2
                #color = cm.spectral(float(i) / n_clusters)
                color = cmap(float(i) / n_clusters)
                ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                  0,
                                  ith_cluster_silhouette_values,
                                  facecolor=color,
                                  edgecolor=color,
                                  alpha=0.7)

                # Label the silhouette plots with their cluster numbers at the middle
                ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                # Compute the new y_lower for next plot
                y_lower = y_upper + 10  # 10 for the 0 samples

            ax1.set_title("The silhouette plot for the various clusters.")
            ax1.set_xlabel("The silhouette coefficient values")
            ax1.set_ylabel("Cluster label")

            # The vertical line for average silhouette score of all the values
            ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

            ax1.set_yticks([])  # Clear the yaxis labels / ticks
            ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

            # map X (high dimensional) to 2D
            if X.shape[1] > 2:
                # reduce the dimensionality of input data to 2D
                if dim_reduction_method == DIM_REDUCTION_METHOD_KERNEL_PCA:
                    kpca = KernelPCA(n_components=2,
                                     kernel="rbf",
                                     gamma=10,
                                     random_state=0)
                    X = kpca.fit_transform(X)
                else:
                    tsne = manifold.TSNE(n_components=2,
                                         perplexity=perplexity,
                                         init='pca',
                                         random_state=0)
                    X = tsne.fit_transform(X)

            # 2nd Plot showing the actual clusters formed
            colors = cmap(cluster_labels.astype(float) / n_clusters)
            ax2.scatter(X[:, 0],
                        X[:, 1],
                        marker='.',
                        s=100,
                        lw=0,
                        alpha=0.7,
                        c=colors,
                        edgecolor='k')

            ax2.set_title(
                "The visualization of the clustered data({}).".format(
                    dim_reduction_method))
            ax2.set_xlabel("reduced feature space of 1st dimension")
            ax2.set_ylabel("reduced feature space of 2nd dimension")

            plt.suptitle(("Silhouette analysis for clustering methods "
                          "with n_clusters = %d" % n_clusters),
                         fontsize=14,
                         fontweight='bold')

            plt.show()
Exemplo n.º 41
0
def plot_silhouette(clf, X, title='Silhouette Analysis', metric='euclidean', copy=True, ax=None,
                    figsize=None, title_fontsize="large", text_fontsize="medium"):
    """Plots silhouette analysis of clusters using fit_predict.

    Args:
        clf: Clusterer instance that implements ``fit`` and ``fit_predict`` methods.

        X (array-like, shape (n_samples, n_features)):
            Data to cluster, where n_samples is the number of samples and
            n_features is the number of features.

        title (string, optional): Title of the generated plot. Defaults to "Silhouette Analysis"

        metric (string or callable, optional): The metric to use when calculating distance
            between instances in a feature array. If metric is a string, it must be one of
            the options allowed by sklearn.metrics.pairwise.pairwise_distances. If X is
            the distance array itself, use "precomputed" as the metric.

        copy (boolean, optional): Determines whether ``fit`` is used on **clf** or on a
            copy of **clf**.

        ax (:class:`matplotlib.axes.Axes`, optional): The axes upon which to plot
            the learning curve. If None, the plot is drawn on a new set of axes.

        figsize (2-tuple, optional): Tuple denoting figure size of the plot e.g. (6, 6).
            Defaults to ``None``.

        title_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "large".

        text_fontsize (string or int, optional): Matplotlib-style fontsizes.
            Use e.g. "small", "medium", "large" or integer-values. Defaults to "medium".

    Returns:
        ax (:class:`matplotlib.axes.Axes`): The axes on which the plot was drawn.

    Example:
        >>> import scikitplot.plotters as skplt
        >>> kmeans = KMeans(n_clusters=4, random_state=1)
        >>> skplt.plot_silhouette(kmeans, X)
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe967d64490>
        >>> plt.show()

        .. image:: _static/examples/plot_silhouette.png
           :align: center
           :alt: Silhouette Plot
    """
    if copy:
        clf = clone(clf)

    cluster_labels = clf.fit_predict(X)

    n_clusters = len(set(cluster_labels))

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    sample_silhouette_values = silhouette_samples(X, cluster_labels, metric=metric)

    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    ax.set_title(title, fontsize=title_fontsize)
    ax.set_xlim([-0.1, 1])

    ax.set_ylim([0, len(X) + (n_clusters + 1) * 10 + 10])

    ax.set_xlabel('Silhouette coefficient values', fontsize=text_fontsize)
    ax.set_ylabel('Cluster label', fontsize=text_fontsize)

    y_lower = 10

    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in matplotlib 2.2; nipy_spectral is its replacement
        color = cm.nipy_spectral(float(i) / n_clusters)

        ax.fill_betweenx(np.arange(y_lower, y_upper),
                         0, ith_cluster_silhouette_values,
                         facecolor=color, edgecolor=color, alpha=0.7)

        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i), fontsize=text_fontsize)

        y_lower = y_upper + 10

    ax.axvline(x=silhouette_avg, color="red", linestyle="--",
               label='Silhouette score: {0:0.3f}'.format(silhouette_avg))

    ax.set_yticks([])  # Clear the y-axis labels / ticks
    ax.set_xticks(np.arange(-0.1, 1.0, 0.2))

    ax.tick_params(labelsize=text_fontsize)
    ax.legend(loc='best', fontsize=text_fontsize)

    return ax
Exemplo n.º 42
0
def perform_silhouette_analysis(path, segment_lengths, range_nclusters,
                                plot_silhouette):

    for iseg_length in segment_lengths:

        print("Segment Length = " + str(iseg_length))

        if plot_silhouette:
            fig = plt.figure(1)

        df_features = pd.read_csv(
            os.path.join(
                path,
                "Data/length" + str(iseg_length) + "/segment_features.csv"))
        #df_xys      = pd.read_csv(os.path.join(path, "Data/length" + str(iseg_length) + "/segment_xys.csv"))

        numpy_features = df_features.iloc[:, 4:12].values
        #fit kmeans
        features_scaled = preprocessing.scale(numpy_features)

        plot_index = 0

        for n_clusters in range_nclusters:
            if plot_silhouette:
                plot_index = plot_index + 1
                ax1 = plt.subplot(len(range_nclusters), 1, plot_index)

                fig.set_size_inches(7, 18)

                # The 1st subplot is the silhouette plot
                # The silhouette coefficient can range from -1, 1 but in this example all
                # lie within [-0.1, 1]
                ax1.set_xlim([-0.1, 1])
                # The (n_clusters+1)*10 is for inserting blank space between silhouette
                # plots of individual clusters, to demarcate them clearly.
                ax1.set_ylim([0, len(features_scaled) + (n_clusters + 1) * 10])

            # perform kmeans
            clusterer = KMeans(n_clusters=n_clusters,
                               random_state=0,
                               max_iter=1000)
            cluster_labels = clusterer.fit_predict(features_scaled)

            # Calculate the average silhouette value for all segments and print to screen
            #pdb.set_trace()
            silhouette_avg = silhouette_score(features_scaled, cluster_labels)
            print("n_clusters = " + str(n_clusters) +
                  "   Avg silhouette_score = " + str(silhouette_avg))

            if plot_silhouette:

                # Compute the silhouette scores for each sample
                sample_silhouette_values = silhouette_samples(
                    features_scaled, cluster_labels)

                y_lower = 10
                for i in range(n_clusters):
                    # Aggregate the silhouette scores for samples belonging to
                    # cluster i, and sort them
                    ith_cluster_silhouette_values = \
                        sample_silhouette_values[cluster_labels == i]

                    ith_cluster_silhouette_values.sort()

                    size_cluster_i = ith_cluster_silhouette_values.shape[0]
                    y_upper = y_lower + size_cluster_i

                    # cm.spectral was removed in matplotlib 2.2; use nipy_spectral
                    color = cm.nipy_spectral(float(i) / n_clusters)
                    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                                      0,
                                      ith_cluster_silhouette_values,
                                      facecolor=color,
                                      edgecolor=color,
                                      alpha=0.7)

                    # Label the silhouette plots with their cluster numbers at the middle
                    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

                    # Compute the new y_lower for next plot
                    y_lower = y_upper + 10  # 10 for the 0 samples

                #ax1.set_title("The silhouette plot for the various clusters.")
                ax1.set_xlabel("The silhouette coefficient values")
                ax1.set_ylabel("Cluster label")

                # The vertical line for average silhouette score of all the values
                ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

                ax1.set_yticks([])  # Clear the yaxis labels / ticks
                ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

                #    # 2nd Plot showing the actual clusters formed
                #    colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
                #    ax2.scatter(features_scaled[:, 0], features_scaled[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                #                c=colors, edgecolor='k')
                #
                #    # Labeling the clusters
                #    centers = clusterer.cluster_centers_
                #    # Draw white circles at cluster centers
                #    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                #                c="white", alpha=1, s=200, edgecolor='k')
                #
                #    for i, c in enumerate(centers):
                #        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                #                    s=50, edgecolor='k')
                #
                #    ax2.set_title("The visualization of the clustered data.")
                #    ax2.set_xlabel("Feature space for the 1st feature")
                #    ax2.set_ylabel("Feature space for the 2nd feature")

                plt.suptitle((
                    "Silhouette analysis for KMeans clustering on sample data "
                    "with n_clusters = %d" % n_clusters),
                             fontsize=14,
                             fontweight='bold')

        if plot_silhouette:

            # save the accumulated subplots first, then display them
            plt.savefig(
                os.path.join("figs/",
                             "silhouette_length" + str(iseg_length) + ".pdf"))
            plt.show()
            plt.close(fig)


#perform_silhouette_analysis(path = "../../",
#                            segment_lengths = [100,150,200,250,300],
#                            range_nclusters = range(2,11),
#                            plot_silhouette = False)
Exemplo n.º 43
0
def test_clustering3(_x, _y, _data, _xLab, _yLab, N_CLUSTERS, _latLon_params,
                     _basemp, **kwargs):
    pred_dict = {}
    np.random.seed(0)

    colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
    colors = np.hstack([colors] * 20)

    plot_num = 1
    X = _data
    print(_data[:10])
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # Compute distances
    # create clustering estimators
    print(kwargs['models'])
    alg_list = []
    for model in kwargs['models']:

        # estimate bandwidth for mean shift (only needed when that model is requested)
        if model == 'MeanShift':
            bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
            alg_list.append(('MeanShift', ms))

        if model in ['Ward', 'AgglomerativeClustering']:
            # connectivity matrix for structured Ward
            connectivity = kneighbors_graph(X, n_neighbors=10)
            # make connectivity symmetric
            connectivity = 0.5 * (connectivity + connectivity.T)
            if model == 'Ward':
                ward = cluster.AgglomerativeClustering(
                    n_clusters=N_CLUSTERS,
                    linkage='ward',
                    connectivity=connectivity)
                alg_list.append(('Ward', ward))
            if model == 'AgglomerativeClustering':
                average_linkage = cluster.AgglomerativeClustering(
                    linkage="average",
                    affinity="cityblock",
                    n_clusters=N_CLUSTERS,
                    connectivity=connectivity)
                alg_list.append(('AgglomerativeClustering', average_linkage))
        if model == 'MiniBatchKMeans':
            two_means = cluster.MiniBatchKMeans(n_clusters=N_CLUSTERS)
            alg_list.append(('MiniBatchKMeans', two_means))
        if model == 'SpectralClustering':
            spectral = cluster.SpectralClustering(n_clusters=N_CLUSTERS,
                                                  eigen_solver='arpack',
                                                  affinity="nearest_neighbors")
            alg_list.append(('SpectralClustering', spectral))
        if model == 'DBSCAN':
            dbscan = cluster.DBSCAN(eps=.2)
            alg_list.append(('DBSCAN', dbscan))
        if model == 'AffinityPropagation':
            affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                               preference=-200)
            alg_list.append(('AffinityPropagation', affinity_propagation))

    print(alg_list)
    models = {}
    for name, algorithm in alg_list:
        models[name] = {}
        # predict cluster memberships
        models[name]['start'] = time.time()
        algorithm.fit(X)
        models[name]['end'] = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)  # np.int was removed from NumPy
            models[name]['y_pred'] = y_pred
        else:
            y_pred = algorithm.predict(X)
            models[name]['y_pred'] = y_pred

        models[name]['sil_score'] = metrics.silhouette_score(
            X, y_pred, metric='euclidean')
        models[name]['sample_sil_vals'] = silhouette_samples(X, y_pred)

        models[name]['model'] = algorithm
        models[name]['N_CLUSTERS'] = N_CLUSTERS

    return models
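
A minimal, hypothetical driver for test_clustering3() above: the _x/_y/label/basemap arguments are unused by the fragment shown, so placeholders are passed, and the snippet's own imports (numpy, time, sklearn's cluster and metrics utilities) are assumed.

from sklearn.datasets import make_blobs

# synthetic 2-D data; any (n_samples, n_features) array works
demo_data, _ = make_blobs(n_samples=300, centers=3, random_state=0)
results = test_clustering3(None, None, demo_data, None, None, 3,
                           None, None, models=['MiniBatchKMeans'])
for name, info in results.items():
    print(name, round(info['sil_score'], 3),
          'fit time: %.3fs' % (info['end'] - info['start']))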
Exemplo n.º 44
0
        ax1.set_ylim([0, len(data_vectorized) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 1 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=1)
        cluster_labels = clusterer.fit_predict(data_vectorized)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(data_vectorized, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(data_vectorized,
                                                      cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)
Exemplo n.º 45
0
def silhouette(name, n_clusters, x):
    averages = []

    for n_cluster in n_clusters:
        plot.style.use('seaborn-darkgrid')
        plot.title(
            f'Silhouette on the {name} dataset, using {n_cluster}-means')
        ax = plot.gca()

        ax.set_xlim([-0.1, 1])
        ax.set_ylim([0, len(x) + (n_cluster + 1) * 10])

        clusterer = KMeans(n_clusters=n_cluster, random_state=0)
        cluster_labels = clusterer.fit_predict(x)

        silhouette_avg = silhouette_score(x, cluster_labels)
        averages.append(silhouette_avg)

        sample_silhouette_values = silhouette_samples(x, cluster_labels)

        y_lower = 10
        for i in range(n_cluster):
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            ax.fill_betweenx(np.arange(y_lower, y_upper),
                             0,
                             ith_cluster_silhouette_values,
                             facecolor=color,
                             edgecolor=color,
                             alpha=0.7)

            ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            y_lower = y_upper + 10

        ax.set_xlabel("The silhouette coefficient values")
        ax.set_ylabel("Cluster labels")

        ax.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax.set_yticks([])
        ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        plot.show()

    lb = np.min(averages)
    ub = np.max(averages)
    amplitude = ub - lb
    lb -= 0.2 * amplitude
    ub += 0.2 * amplitude

    plot.style.use('seaborn-darkgrid')
    plot.title(f'Silhouette averages on the {name} dataset using k-means')
    plot.bar(n_clusters, averages)
    plot.xticks(n_clusters)
    plot.xlabel('Number of clusters')
    plot.ylabel('Silhouette averages')
    plot.ylim([lb, ub])
    plot.show()

    print(f'{name}: {averages}')
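
A short, hypothetical driver for the silhouette() helper above, assuming the snippet's existing imports (matplotlib.pyplot as plot, numpy as np, cm, KMeans, silhouette_score, silhouette_samples).

from sklearn.datasets import make_blobs

# four well-separated blobs, so the average silhouette should peak at k = 4
X_demo, _ = make_blobs(n_samples=400, centers=4, random_state=0)
silhouette('demo-blobs', [2, 3, 4, 5, 6], X_demo)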
Exemplo n.º 46
0
def K_Means_silhouette_analysis(X, y):
    cluster_range = [3, 5, 7, 9, 11, 13, 15]
    for num_cluster in cluster_range:
        figure_to_show, (ax1, ax2) = plt.subplots(1, 2)
        figure_to_show.set_size_inches(20, 8)

        ax1.set_xlim([-0.1, 1])
        ax1.set_ylim([0, len(X) + (num_cluster + 1) * 10])

        clusterer = KMeans(n_clusters=num_cluster, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters = ", num_cluster,
              "The average silhouette_score is :", silhouette_avg)
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(num_cluster):
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / num_cluster)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              ith_cluster_silhouette_values,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        colors = cm.nipy_spectral(cluster_labels.astype(float) / num_cluster)
        ax2.scatter(X[:, 0],
                    X[:, 1],
                    marker='.',
                    s=30,
                    lw=0,
                    alpha=0.7,
                    c=colors,
                    edgecolor='k')

        centers = clusterer.cluster_centers_

        ax2.scatter(centers[:, 0],
                    centers[:, 1],
                    marker='o',
                    c="white",
                    alpha=1,
                    s=200,
                    edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0],
                        c[1],
                        marker='$%d$' % i,
                        alpha=1,
                        s=50,
                        edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle((
            "Silhouette analysis for KMeans clustering on sample data with num_cluster = %d"
            % num_cluster),
                     fontsize=14,
                     fontweight='bold')
    plt.show()
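
A hypothetical call for K_Means_silhouette_analysis(); y is accepted by the signature but never used in the body, so any array-like placeholder works.

from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=500, centers=5, random_state=1)
K_Means_silhouette_analysis(X_demo, y_demo)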
Exemplo n.º 47
0
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 6 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=6)  # n_jobs was removed from KMeans in scikit-learn 1.0
    cluster_labels = clusterer.fit_predict(sample_descriptors)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(sample_descriptors,
                                      cluster_labels,
                                      sample_size=100)
    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(sample_descriptors,
                                                  cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels
                                                                 == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # cm.spectral was removed in matplotlib 2.2; use nipy_spectral
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
Exemplo n.º 48
0
import pandas as pd
from sklearn import cluster
from sklearn import metrics

votes = pd.read_csv("/home/algo/Downloads/congress.csv")
votes.shape
cluster_model = cluster.AgglomerativeClustering(n_clusters=2,
                                                affinity='euclidean',
                                                linkage='ward')
cluster_model.fit(votes.iloc[:, 3:])

labels = cluster_model.labels_

silhouette_avg = metrics.silhouette_score(votes.iloc[:, 3:],
                                          labels,
                                          metric='euclidean')
silhouette_samples = metrics.silhouette_samples(votes.iloc[:, 3:],
                                                labels,
                                                metric='euclidean')
ch_score = metrics.calinski_harabasz_score(votes.iloc[:, 3:], labels)

for n_clusters in range(2, 6):
    cluster_model = cluster.AgglomerativeClustering(
        n_clusters=n_clusters)  # AgglomerativeClustering takes no random_state
    cluster_labels = cluster_model.fit_predict(votes.iloc[:, 3:])
    silhouette_avg = metrics.silhouette_score(votes.iloc[:, 3:],
                                              cluster_labels,
                                              metric='euclidean')
    print("For n_clusters =", n_clusters, "The average silhouette_score is:",
          silhouette_avg)
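
For non-Euclidean notions of distance, the same per-sample scores can be computed from a precomputed distance matrix; a small sketch reusing the votes frame and labels from above:

from sklearn.metrics import pairwise_distances

D = pairwise_distances(votes.iloc[:, 3:], metric='manhattan')
sil_manhattan = metrics.silhouette_samples(D, labels, metric='precomputed')
print("Mean Manhattan silhouette:", sil_manhattan.mean())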
Exemplo n.º 49
0

for i in range(100):
    print('Trial number ', i)
    start_time = time.time()

    for k in n_clusters:
        data = init_data.copy()

        # QUALITY CHECK DATA
        for j in range(1000):
            X = data.values
            kmeans = KMeans(n_clusters=k)
            cluster_labels = kmeans.fit_predict(X)

            sample_sil_coefficients = metrics.silhouette_samples(
                X, cluster_labels)

            data, count_negative = qualityCheck(data, sample_sil_coefficients)

            print('Number of data retained after quality check', len(data))

            if count_negative == 0:
                X = data.values
                kmeans = KMeans(n_clusters=k)
                cluster_labels = kmeans.fit_predict(X)
                sil_score = metrics.silhouette_score(X, cluster_labels)
                ssd_center = kmeans.inertia_
                scores[counter] = {
                    'trial': i + 1,
                    'cluster_number': k,
                    'silhouette_score': sil_score,
                    'ssd_center': ssd_center,
                }
Exemplo n.º 50
0
def vis(X, y, nameappendix, k):
    
    scaler = MinMaxScaler(feature_range=[0,100])
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X))
    
    
    
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(15, 6)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(X) + (k + 1) * 10])
    
    print("Num of clusters: ", k)
    clusters = KMeans(n_clusters = k, random_state = 10).fit(X)
    labels = clusters.labels_
    print("NMI score: %.5f" % normalized_mutual_info_score(y, labels))

    silhouette_avg = sil_score(X, labels)
    print("Silhouette score: ", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, labels)


    y_lower = 10
    for i in range(k):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

#             color = plt.spectral(float(i) / numOfCluster)
        color = plt.get_cmap('Spectral')(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("Silhouette Coefficients for Clusters.")
    ax1.set_xlabel("Silhouette Coefficient Values")
    ax1.set_ylabel("Cluster Labels")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
#         colors = plt.spectral(labels.astype(float) / numOfCluster)
    colors = plt.get_cmap('Spectral')(labels.astype(float) / k)
#     print(X.values[:, 10])
#         colors = ["b","g","r","c","m","y","k"]
    ax2.scatter(X.values[:, 3], X.values[:,5], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusters.cluster_centers_

    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 3], centers[:, 5], marker='o', c="white", alpha=1, s=200, edgecolor='k')

    for i, c in enumerate(centers):
        ax2.scatter(c[3], c[5], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

    ax2.set_title("Clustering Visualization")
    ax2.set_xlabel("1st feature: Pressure X4")
    ax2.set_ylabel("2nd feature: Pressure X5")

    plt.suptitle("Analysis for KMeans for " + str(k) + " Clusters", fontsize=14, fontweight='bold')
#     plt.savefig('img/kmeans_vis' + str(k) + '.png')
    plt.show()
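
A hypothetical driver for vis(): the input needs at least six columns because feature indices 3 and 5 are hard-coded in the scatter plots, and the snippet's aliases (sil_score for silhouette_score, etc.) are assumed to be imported.

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 8)             # 8 features, so columns 3 and 5 exist
y_demo = rng.randint(0, 3, size=200)  # reference labels for the NMI score
vis(X_demo, y_demo, 'demo', 3)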
Exemplo n.º 51
0
def make_new_outputs(corr: Union[np.ndarray, pd.DataFrame], clusters: dict,
                     clusters2: dict) -> (pd.DataFrame, dict, pd.Series):
    """
    Makes new outputs for kmeans_advanced_clustering() by recombining two sets of clusters
    together, recomputing their correlation matrix, distance matrix, kmeans labels and silhouette scores.

    Arguments
    ---------
    corr : numpy.array or pd.DataFrame
      Correlation matrix.
    clusters : dict
      First set of clusters.
    clusters2 : dict
      Second set of clusters.

    Returns
    -------
    pd.DataFrame
      Clustered correlation matrix.
    dictionary
      List of clusters and their content.
    pd.Series
      Silhouette scores.

    Notes
    -----
      Function adapted from "Machine Learning for Asset Managers",
      Marcos López de Prado (2020).
    """

    # Initializations
    # Add clusters keys to the new cluster
    clusters_new = {}
    for i in clusters.keys():
        clusters_new[len(clusters_new.keys())] = list(clusters[i])
    for i in clusters2.keys():
        clusters_new[len(clusters_new.keys())] = list(clusters2[i])

    # Compute new correlation matrix
    new_idx = [j for i in clusters_new for j in clusters_new[i]]
    corr_new = corr.loc[new_idx, new_idx]

    # Compute the observation matrix
    Xobs = (((1 - corr.fillna(0)) / 2.)**.5).values

    # Compute the Euclidean distance matrix
    X = np.zeros(shape=Xobs.shape)
    for i, j in itertools.product(range(X.shape[0]), range(X.shape[1])):
        X[i, j] = np.sqrt(sum((Xobs[i, :] - Xobs[j, :])**2))
    new_names_features = corr_new.columns.tolist()
    X = pd.DataFrame(data=X,
                     index=new_names_features,
                     columns=new_names_features)

    # Add labels together
    kmeans_labels = np.zeros(len(X.columns))
    for i in clusters_new.keys():
        idxs = [X.index.get_loc(k) for k in clusters_new[i]]
        kmeans_labels[idxs] = i

    # Compute the silhouette scores
    silh_new = pd.Series(silhouette_samples(X, kmeans_labels), index=X.index)

    return corr_new, clusters_new, silh_new
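
A minimal sketch of calling make_new_outputs() on toy data (hypothetical names; the function's own imports of numpy, pandas, itertools and silhouette_samples are assumed):

import numpy as np
import pandas as pd

names = ['a', 'b', 'c', 'd']
corr = pd.DataFrame(np.eye(4), index=names, columns=names)
corr.loc['a', 'b'] = corr.loc['b', 'a'] = 0.9  # one correlated pair
corr.loc['c', 'd'] = corr.loc['d', 'c'] = 0.8  # another correlated pair

clusters = {0: ['a', 'b']}   # first set of clusters
clusters2 = {0: ['c', 'd']}  # second set of clusters

corr_new, clusters_new, silh_new = make_new_outputs(corr, clusters, clusters2)
print(clusters_new)  # {0: ['a', 'b'], 1: ['c', 'd']}
print(silh_new)      # per-name silhouette scores as a pd.Series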
Exemplo n.º 52
0
    print("Algorithm failed")

###
###     PLOTTING SILHOUETTE
###

classes_to_test = [2, 3, 4, 5, 6]

for classNumber in classes_to_test:
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    ax1.set_ylim([0, len(dataset) + (classNumber + 1) * 10])
    cluster_labels, centroids = clusterer.predictLabels(dataset, classNumber)
    silhouette_avg = silhouette_score(dataset, cluster_labels)
    sample_silhouette_values = silhouette_samples(dataset, cluster_labels)
    y_lower = 10
    for i in range(classNumber):
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / classNumber)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
Exemplo n.º 53
0
    __print_pca_info__(pca)

    # compute variance per numbers of clusters
    cs = range(2, n_clusters + 1)
    scores = np.zeros((len(cs), 2))
    silhouette = np.zeros((len(cs), 2))

    for i, n in enumerate(cs):
        # cluster the samples
        k = KMeans(n_clusters=n)
        k.fit(X)

        # compute total variance and average silhouette score
        scores[i] = [n, k.inertia_]
        silhouette[i] = [n, silhouette_score(X, k.labels_)]
        silhouette_sample_values = silhouette_samples(X, k.labels_)
        silhouette_sample_values = \
          [sorted(silhouette_sample_values[k.labels_ == c], reverse=True) for c in range(n)]

        # sample cluster data for plotting
        clusters = [X[k.labels_ == c] for c in range(n)]
        cluster_samples = [
            samples[:, FEATURES][k.labels_ == c] for c in range(n)
        ]

        # create object summaries of the data
        cluster_assignments, sample_assignments = summarize(k, samples)

        # store a heat map of the distribution of the original samples into the clusters per object
        hm = sample_heat_map(sample_assignments, n)
        with open(
Exemplo n.º 54
0
                n_clusters=n_cl,
                memory='/home/winz3r/Documents/Data/')
            t0 = time.time()
            model.fit(X)
            tim_spec = time.time() - t0

            hier_labels = model.labels_
            label_name = dir_name + 'hierarchical_cluster_labels_K' + str(
                n_cl) + '_' + str(i) + '.csv'
            f_out = open(label_name, 'w')
            for w in hier_labels:
                f_out.write(str(w) + '\n')
            f_out.close()

            ##Silhouette Calculation
            sil_spec = (silhouette_samples(X, hier_labels)).mean(axis=0)

            ##SSE Calculation
            SSE_spec = 0
            for k in range(n_cl):
                members = hier_labels == k
                centre = X[members, :].mean(axis=0)
                for x in X[members]:
                    SSE_spec += np.dot(x - centre, (x - centre).T)

            tim.append(n_cl)
            SSE.append(n_cl)
            sil.append(n_cl)
            tim.append(tim_spec)
            sil.append(sil_spec)
            SSE.append(SSE_spec)
Exemplo n.º 55
0
                 str(i - reduced_data.shape[0]),
                 color=color,
                 fontdict={
                     'weight': 'bold',
                     'size': size
                 })

# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
    silhouette_avg = silhouette_score(reduced_data, assigned_cluster)
    print("For n_clusters =", clusters, "The average silhouette_score is :",
          silhouette_avg)

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(reduced_data,
                                                  assigned_cluster)

    #ax1 = plt.subplot(111)
    #y_lower = 10
    #for i in range(clusters):
    #	# Aggregate the silhouette scores for samples belonging to
    #	# cluster i, and sort them
    #	ith_cluster_silhouette_values = \
    #	sample_silhouette_values[assigned_cluster == i]

    #	ith_cluster_silhouette_values.sort()

    #	size_cluster_i = ith_cluster_silhouette_values.shape[0]
    #	y_upper = y_lower + size_cluster_i

    #	color = cm.nipy_spectral(float(i) / clusters)
Exemplo n.º 56
0
plt.ylabel("Distortion")
plt.show()

# silhouette analysis
km = KMeans(n_clusters=3,
            init="k-means++",
            n_init=10,
            max_iter=300,
            tol=1e-04,
            random_state=0)
y_km = km.fit_predict(X)

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
# Compute the silhouette coefficient for each sample
silhouette_vals = silhouette_samples(X, y_km, metric="euclidean")
y_ax_lower, y_ax_upper = 0, 0  # running extents of the bars along the y axis
yticks = []

for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]  # silhouette coefficients of one cluster
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(
        range(y_ax_lower, y_ax_upper),  # one bar per sample in this cluster
        c_silhouette_vals,
        height=1.0,
        edgecolor="none",
        color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)  # y position of the cluster's tick label
    y_ax_lower += len(c_silhouette_vals)
Exemplo n.º 57
0
cluster = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
y_pred = cluster.labels_
pre = cluster.fit_predict(X)
cluster_smallsub = KMeans(n_clusters=n_clusters, random_state=0).fit(X[:200])
y_pred_ = cluster_smallsub.predict(X)
centroid = cluster.cluster_centers_
inertia = cluster.inertia_
print("总距离", inertia)
color = ["red", "pink", "orange", "gray"]
fig, ax1 = plt.subplots(1)
for i in range(n_clusters):
    ax1.scatter(X[y_pred == i, 0], X[y_pred == i, 1], marker='o', s=8, c=color[i]
                )
ax1.scatter(centroid[:, 0], centroid[:, 1], marker="x", s=15, c="black")
plt.show()

# Silhouette coefficient: evaluates the clustering; the closer to 1 the better.
# Here it shows that 4 clusters is the best grouping.
n_clusters = 4
cluster_4 = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
n_clusters = 5
cluster_5 = KMeans(n_clusters=n_clusters, random_state=0).fit(X)

score3 = silhouette_score(X, y_pred)
score4 = silhouette_score(X, cluster_4.labels_)
score5 = silhouette_score(X, cluster_5.labels_)
silhouette_samples(X, y_pred)

score_cal = calinski_harabasz_score(X, y_pred)  # this one runs much faster

timestamp = datetime.datetime.fromtimestamp(time()).strftime("%Y-%m-%d %H:%M:%S")  # avoid shadowing time()
Exemplo n.º 58
0
def plot_silhouettes(instance_matrix, origin_path, plot_title):
    """
    "translated" from https://towardsdatascience.com/k-means-clustering-algorithm-applications-evaluation-methods-and-drawbacks-aa03e644b48a

    :param X_std:
    :param max_range:
    :return:
    """
    files = os.listdir(f"{origin_path}")

    if ".DS_Store" in files:
        files.remove(".DS_Store")

    if plot_title + ".png" in files:
        files.remove(plot_title + ".png")

    sorted_files = sorted(files,
                          key=lambda s: int(s.split("_")[0].split("=")[1]))
    #sorted_files = sorted(os.listdir("resources/small/clustering/m_1.5"), key=str.lower)
    not_odd_files = sorted_files[0::2]  # return just every 2nd item
    print(not_odd_files)

    list_k = sorted(
        [int(file.split("_")[0].split("=")[1]) for file in not_odd_files])

    for k, filename in zip(list_k, not_odd_files):

        file = origin_path + "/" + filename

        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Kmeans object
        k_means = pickle.load(open(file, "rb"))

        labels = k_means.cluster_mapping
        centroids = np.vstack(k_means.centroids)
        #X = np.vstack(k_means.instances)

        instance_matrix_less = instance_matrix[:len(labels)]

        # Get silhouette samples
        silhouette_vals = silhouette_samples(instance_matrix_less, labels)

        # Silhouette plot
        y_ticks = []
        y_lower, y_upper = 0, 0
        for i, cluster in enumerate(np.unique(labels)):
            cluster_silhouette_vals = silhouette_vals[labels == cluster]
            cluster_silhouette_vals.sort()
            y_upper += len(cluster_silhouette_vals)
            ax1.barh(range(y_lower, y_upper),
                     cluster_silhouette_vals,
                     edgecolor='none',
                     height=1)
            ax1.text(-0.03, (y_lower + y_upper) / 2, str(i + 1))
            y_lower += len(cluster_silhouette_vals)

        # Get the average silhouette score and plot it
        avg_score = np.mean(silhouette_vals)
        ax1.axvline(avg_score, linestyle='--', linewidth=2, color='green')
        ax1.set_yticks([])
        ax1.set_xlim([-0.1, 1])
        ax1.set_xlabel('Silhouette coefficient values')
        ax1.set_ylabel('Cluster labels')
        ax1.set_title('Silhouette plot for the various clusters', y=1.02)

        # Scatter plot of data colored with labels
        ax2.scatter(instance_matrix[:, 0], instance_matrix[:, 1], c=labels)
        ax2.scatter(centroids[:, 0], centroids[:, 1], marker='*', c='r', s=250)
        ax2.set_xlim([-2, 2])
        ax2.set_ylim([-2, 2])
        ax2.set_xlabel('Eruption time in mins')
        ax2.set_ylabel('Waiting time to next eruption')
        ax2.set_title('Visualization of clustered data', y=1.02)
        ax2.set_aspect('equal')
        plt.tight_layout()
        plt.suptitle(f'Silhouette analysis using k = {k}',
                     fontsize=16,
                     fontweight='semibold',
                     y=1.05)
        plt.show()
Exemplo n.º 59
0
# elbow method to reduce distortion
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i, n_init=10, max_iter=300, random_state=0)
    km.fit(x)
    distortions.append(km.inertia_)

plt.plot(range(1, 11), distortions, marker='o')
plt.xlabel('Number of cluster')
plt.ylabel('Distortion')
plt.show()

# quantifying qual by silhouette plots
cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
silhouette_vals = silhouette_samples(x, y_km, metric='euclidean')
y_ax_lower, y_ax_upper = 0, 0
yticks = []

for i, c in enumerate(cluster_labels):
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals,
             height=1.0,
             edgecolor='none',
             color=color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)
Exemplo n.º 60
0
    def get_silhouette_graph(self, document_list, df_result, num_clusters):
        X = self.__get_tfidf_matrix(document_list)
        figures = []
        color_list = [
            '#1f77b4',  # muted blue
            '#ff7f0e',  # safety orange
            '#2ca02c',  # cooked asparagus green
            '#d62728',  # brick red
            '#9467bd',  # muted purple
            '#8c564b',  # chestnut brown
            '#e377c2',  # raspberry yogurt pink
            '#7f7f7f',  # middle gray
            '#bcbd22',  # curry yellow-green
            '#17becf'  # blue-teal
        ]
        cmap = cm.get_cmap("Spectral")
        fig = tools.make_subplots(rows=1,
                                  cols=2,
                                  print_grid=False,
                                  subplot_titles=('Silhouette Graph',
                                                  'Clutering Graph'))

        # Initialize Silhouette Graph
        fig['layout']['xaxis1'].update(title='Silhouette Coefficient',
                                       range=[-0.1, 1])
        fig['layout']['yaxis1'].update(
            title='Cluster Label',
            showticklabels=False,
            range=[0, len(X) + (num_clusters + 1) * 10])

        # Compute K-Means Cluster
        clusterer = KMeans(n_clusters=num_clusters, random_state=10)
        #KMeans(n_clusters = num_clutsers, init='k-means++', n_init=num_init, max_iter=max_iterations, random_state=0)
        cluster_labels = clusterer.fit_predict(X)

        # Compute Average Silhouette Score
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", num_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the Silhouette Scores for Each Sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
        y_lower = 10

        for i in range(num_clusters):
            ith_cluster_silhouette_values = sample_silhouette_values[
                cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            # use a single color per cluster; plotly's line.color cannot take
            # a per-sample color array
            cluster_color = 'rgb' + str(
                matplotlib.colors.colorConverter.to_rgb(
                    cmap(float(i) / num_clusters)))
            filled_area = go.Scatter(y=np.arange(y_lower, y_upper),
                                     x=ith_cluster_silhouette_values,
                                     mode='lines',
                                     showlegend=False,
                                     line=dict(width=0.5, color=cluster_color),
                                     fill='tozerox')
            fig.append_trace(filled_area, 1, 1)

            y_lower = y_upper + 10  # 10 for the 0 samples

            # Vertical Line for Average Silhouette Score
            axis_line = go.Scatter(x=[silhouette_avg, silhouette_avg],
                                   y=[0, len(X) + (num_clusters + 1) * 10],
                                   showlegend=False,
                                   mode='lines',
                                   line=dict(color="red", dash='dash',
                                             width=1))
            fig.append_trace(axis_line, 1, 1)

            # Cluster Graph
            colors = matplotlib.colors.colorConverter.to_rgb(
                cmap(float(i) / num_clusters))
            colors = 'rgb' + str(colors)
            clusters = go.Scatter(x=df_result['x'],
                                  y=df_result['y'],
                                  showlegend=False,
                                  mode='markers',
                                  text=cluster_labels,
                                  marker=dict(color=[
                                      color_list[cluster_label]
                                      for cluster_label in cluster_labels
                                  ],
                                              size=10))
            fig.append_trace(clusters, 1, 2)

            fig['layout']['xaxis2'].update(
                title='Feature space for the 1st feature', zeroline=False)
            fig['layout']['yaxis2'].update(
                title='Feature space for the 2nd feature', zeroline=False)
            fig['layout'].update(
                title="Silhouette Analysis for KMeans Clustering - " +
                str(num_clusters) + " Cluster")

        return iplot(fig, filename='basic-line')