def computeLinkage(self, printDendogram=False):
    """Cluster the per-building averages with Ward linkage.

    The values of ``self.buildingAverages`` become the observation matrix
    ``self.X``; the resulting linkage matrix is stored on ``self.Z``.

    Parameters
    ----------
    printDendogram : bool
        When true, draw the full dendrogram of ``self.Z`` and call
        ``plt.show()``.

    Returns
    -------
    The linkage matrix (the same object as ``self.Z``).
    """
    # dict.values() is a view on Python 3; handing the view straight to
    # array() (presumably numpy's -- confirm against the file's imports)
    # yields a 0-d object array, so materialise it as a list first.
    self.X = array(list(self.buildingAverages.values()))

    # Generate the linkage matrix.
    self.Z = linkage(self.X, 'ward')

    if printDendogram:
        plt.figure(figsize=(25, 10))
        # NOTE(review): the title says "(truncated)" but no truncate_mode is
        # passed, so the full tree is drawn.
        plt.title('Hierarchical Clustering Dendrogram (truncated)')
        plt.xlabel('Dendogram of Dartmouth campus buildings clusters')
        plt.ylabel('distance')
        dendrogram(
            self.Z,
            show_leaf_counts=True,
            leaf_rotation=90.,
            leaf_font_size=12.,
            show_contracted=True,
        )
        plt.show()

    return self.Z
def make_good_heatmap(D):
    """Show matrix ``D`` as a heatmap with a dendrogram on the left and on top.

    ``1 - D`` (diagonal forced to zero) is used as the distance matrix for a
    complete-linkage clustering. The rows and columns of ``D`` are reordered
    by the dendrogram leaf order before being drawn with ``matshow``.
    """
    def hide_ticks(axis):
        axis.set_xticks([])
        axis.set_yticks([])

    distances = 1. - D
    np.fill_diagonal(distances, 0.)
    condensed = squareform(distances)

    figure = plt.figure()

    # Left-hand dendrogram; add_axes takes [x, y, width, height].
    left_axes = figure.add_axes([0.09, 0.1, 0.2, 0.6])
    tree = linkage(condensed, method='complete')
    row_dendro = dendrogram(tree, orientation='right', color_threshold=.7)
    hide_ticks(left_axes)

    # Top dendrogram, drawn from the same linkage.
    top_axes = figure.add_axes([0.3, 0.71, 0.6, 0.2])
    col_dendro = dendrogram(tree, color_threshold=.7)
    hide_ticks(top_axes)

    # Heatmap of D with rows/columns in leaf order.
    heat_axes = figure.add_axes([0.3, 0.1, 0.6, 0.6])
    reordered = D[row_dendro['leaves'], :][:, col_dendro['leaves']]
    image = heat_axes.matshow(reordered, aspect='auto', origin='lower')
    hide_ticks(heat_axes)

    # Colorbar in its own narrow axes.
    bar_axes = figure.add_axes([0.91, 0.1, 0.02, 0.6])
    plt.colorbar(image, cax=bar_axes)
    show()
def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average',
                   sort_x=True, sort_y=True):
    """Draw DataFrame ``a`` as an image, optionally ordered by clustering.

    Parameters
    ----------
    a : pandas DataFrame
        Values to display; they are shown with the colour range fixed to
        [0, 1].
    cmap : colormap passed to ``imshow``.
    metric, method : forwarded to ``pdist`` / ``linkage``.
    sort_x, sort_y : bool
        Only the literal ``True`` enables clustering of the columns
        (dendrogram on top) or rows (dendrogram on the left).
    """
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    black = lambda x: 'black'  # draw every dendrogram link in black

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=black)
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=black)
        plt.gca().set_axis_off()
        # DataFrame.ix no longer exists in pandas >= 1.0. The leaves are row
        # positions, so select them positionally.
        a = a.iloc[ydendro['leaves']]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()
    plt.show()
def save_mat(c2map, filepath):
    """Save ``c2map['mat']`` as a heatmap with two dendrograms to ``filepath``.

    The left dendrogram uses centroid linkage and orders the rows; the top
    dendrogram uses single linkage and orders the columns.

    Parameters
    ----------
    c2map : mapping with a ``'mat'`` entry (indexed as a 2-D array here).
    filepath : destination handed to ``fig.savefig``.
    """
    mat = c2map['mat']
    fig = pylab.figure(figsize=(8, 8))
    try:
        # First dendrogram (rows).
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        Y = sch.linkage(mat, method='centroid')
        Z1 = sch.dendrogram(Y, orientation='right')
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Second dendrogram (columns).
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Y = sch.linkage(mat, method='single')
        Z2 = sch.dendrogram(Y)
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Matrix reordered by the two leaf orders.
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        mat = mat[Z1['leaves'], :]
        mat = mat[:, Z2['leaves']]
        im = axmatrix.matshow(mat, aspect='auto', origin='lower',
                              cmap=pylab.cm.YlGnBu)
        axmatrix.set_xticks([])
        axmatrix.set_yticks([])

        # Colorbar.
        axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
        pylab.colorbar(im, cax=axcolor)
        fig.savefig(filepath)
    finally:
        # The figure is only needed for the file; without closing it every
        # call leaves another figure registered with pyplot.
        pylab.close(fig)
def plot_dist_matrix(matrix, fasta_names, heatmap_out, dendrogram_out):
    """Cluster the distance matrix hierarchically and plot using seaborn.

    Average linkage with the euclidean metric is used for both outputs.

    Parameters
    ----------
    matrix : square matrix; wrapped in a DataFrame labelled by ``fasta_names``.
    fasta_names : sequence of labels for rows and columns.
    heatmap_out : path for the seaborn clustermap image.
    dendrogram_out : path for the stand-alone dendrogram image.
    """
    # Plotting modules are loaded here so the non-interactive backend can be
    # selected first.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    from scipy.cluster.hierarchy import dendrogram, linkage

    # Labelled copy of the matrix.
    pdm = pd.DataFrame(matrix, index=fasta_names, columns=fasta_names)

    # Heatmap: the figure grows with the number of sequences.
    figsizex = max(10, len(fasta_names) / 4)
    clustergrid = sns.clustermap(pdm, metric='euclidean', method='average',
                                 figsize=(figsizex, figsizex))
    clustergrid.savefig(heatmap_out)

    # Dendrogram.
    sns.set_style('white')
    figsizey = max(10, len(fasta_names) / 8)
    f, ax = plt.subplots(figsize=(figsizex, figsizey))
    # NOTE(review): the rows of the matrix are clustered as observation
    # vectors, matching what clustermap does above.
    link = linkage(pdm, metric='euclidean', method='average')
    # Pass a plain list: some SciPy releases truth-test `labels`, which raises
    # for a pandas Index.
    dendrogram(link, labels=pdm.index.tolist(), ax=ax)
    no_spine = {'left': True, 'bottom': True, 'right': True, 'top': True}
    sns.despine(**no_spine)
    plt.xticks(rotation=90)
    f.tight_layout()
    # Save through the figure handle rather than whatever is "current".
    f.savefig(dendrogram_out)
def make_dendrogram_w(LinkageMatrix, GraphFolder, Method, Metric, CorrCoeff,
                      Labels, Colors, DisplayLevels):
    """Draw a level-truncated, left-oriented dendrogram and save it as a PNG.

    Parameters
    ----------
    LinkageMatrix : linkage matrix handed to ``dendrogram``.
    GraphFolder : output directory; created when missing.
    Method, Metric : strings used in the axis label and the file name.
    CorrCoeff, Colors : accepted for interface compatibility; not used here.
    Labels : leaf labels.
    DisplayLevels : value of ``p`` for ``truncate_mode="level"``; also part
        of the file name.
    """
    import matplotlib

    if not os.path.exists(GraphFolder):
        os.makedirs(GraphFolder)

    plt.figure(figsize=(12, 24))
    plt.title("Plays clustered by topic probabilities", fontsize=14)
    plt.xlabel("Distance\n(Parameters: "+Method+" / "+Metric+")", fontsize=12)
    # Global setting: stays in effect for later plots as well.
    matplotlib.rcParams['lines.linewidth'] = 1.2
    dendrogram(
        LinkageMatrix,
        p=DisplayLevels,
        truncate_mode="level",
        color_threshold=30,
        show_leaf_counts=True,
        no_labels=False,
        orientation="left",
        labels=Labels,
        leaf_rotation=0,
        leaf_font_size=4,
    )

    file_name = "dendrogram_"+Method+"-"+Metric+"-"+str(DisplayLevels)+".png"
    # os.path.join keeps the file inside GraphFolder even when the folder is
    # given without a trailing separator. `figsize` is not a savefig option
    # (the size is fixed when the figure is created), so it is not passed.
    plt.savefig(os.path.join(GraphFolder, file_name), dpi=300,
                bbox_inches="tight")
    plt.close()
def buildEmbeddingsTree(indexMap, embeddings, comparator=None):
    """Cluster embeddings by pairwise dissimilarity and show a dendrogram.

    Parameters
    ----------
    indexMap : mapping whose keys are path-like names; the last '/'-separated
        component of each key is used as a leaf label.
    embeddings : indexable collection of vectors.
    comparator : optional callable ``(a, b) -> dissimilarity``. Defaults to
        euclidean distance plus ``1 / (2 + 2 * cosine similarity)``. It must
        be symmetric, because the result is stored as a condensed distance
        matrix.
    """
    if comparator is None:
        # Only fall back to the built-in measure when the caller gave none
        # (the argument used to be overwritten unconditionally).
        def comparator(a, b):
            return (vectors.euclideanDistance(a, b)
                    + 1 / (2 + 2 * vectors.cosineSimilarity(a, b)))

    embeddingsCount = len(embeddings)
    comparisons = numpy.zeros((embeddingsCount, embeddingsCount))
    # Each unordered pair is evaluated once and mirrored; the diagonal stays 0.
    for i, j in itertools.combinations(range(embeddingsCount), 2):
        value = comparator(embeddings[i], embeddings[j])
        comparisons[i, j] = value
        comparisons[j, i] = value

    # Normalise by the largest dissimilarity, then condense for linkage().
    comparisons = comparisons / comparisons.max()
    links = linkage(ssd.squareform(comparisons))

    fig, ax = plt.subplots()
    fig.subplots_adjust(right=0.8)

    # NOTE(review): labels are sorted alphabetically, which only lines up with
    # the embedding order if indices were assigned in that order -- confirm.
    names = sorted(name.split('/')[-1] for name in indexMap)

    dendrogram(
        links,
        leaf_rotation=90.,
        leaf_font_size=8.,
        orientation='right',
        labels=names,
        show_contracted=True,
        show_leaf_counts=True)

    plt.show()
def plot_corr_dendrogram(
        corr, cluster_method='weighted', **dendrogram_kwargs):
    """
    Plot a correlation matrix as a dendrogram (on the current axes).

    Correlations are converted to distances with ``(1 - corr) / 2`` before
    clustering.

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
        Square correlation matrix.
    cluster_method : String
        Method to use to amalgamate clusters.
        Either 'single', 'complete', 'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    dendrogram_kwargs : Additional kwargs
        Pass to the call of scipy.cluster.hierarchy.dendrogram()

    Returns
    -------
    The dictionary returned by scipy.cluster.hierarchy.dendrogram().
    """
    import numpy as np

    # Work on a plain ndarray in all cases, remembering the labels.
    if isinstance(corr, pd.DataFrame):
        names = corr.index.tolist()
        corr = corr.values
    else:
        names = list(range(len(corr)))

    dist = (1 - np.asarray(corr, dtype=float)) / 2.

    # squareform() insists on an exactly symmetric matrix with an exactly
    # zero diagonal. Remove rounding noise only; genuinely invalid input is
    # left alone so squareform still rejects it.
    if dist.ndim == 2 and dist.shape[0] == dist.shape[1]:
        if np.allclose(dist, dist.T):
            dist = (dist + dist.T) / 2.
        if np.allclose(np.diag(dist), 0.):
            np.fill_diagonal(dist, 0.)

    Z = linkage(squareform(dist), method=cluster_method)
    return dendrogram(Z, labels=names, **dendrogram_kwargs)
def city_comparison(file_array):
    """Cluster cities by their fingerprints and show the dendrogram.

    For every entry of ``file_array`` the external ``fingerprint`` helper
    returns two sequences of ``dphi`` values. A pairwise distance matrix is
    built from them and clustered with ``linkage`` defaults.

    Parameters
    ----------
    file_array : sequence of inputs accepted by ``fingerprint``.

    Returns
    -------
    True, once the plot has been shown.
    """
    dphi = 40
    n_cities = len(file_array)

    cities_fingerprint = np.zeros([n_cities, 2, dphi], 'double')
    for indx, path in enumerate(file_array):
        alph1, alph2 = fingerprint(path, dphi, indx, plot=False)
        cities_fingerprint[indx, 0, :] = alph1
        cities_fingerprint[indx, 1, :] = alph2

    distance_matrix = np.zeros([n_cities, n_cities], 'double')
    for i in range(n_cities):  # range (not xrange) also runs on Python 3
        city_i = cities_fingerprint[i, :, :]
        # Scaled squared differences, summed over the dphi bins ...
        d_cities = np.sum(np.square(cities_fingerprint - city_i) * 0.01, 2)
        # ... then squared and summed over the two fingerprint channels.
        distance_matrix[i, :] = np.sum(np.square(d_cities), 1)

    distance_condensed = squareform(distance_matrix, checks=False)
    linkage_matrix = linkage(distance_condensed)

    # Figure number equals the number of cities, as before.
    plt.figure(n_cities)
    dendrogram(linkage_matrix, orientation='right')
    plt.title('Dendrogram\nHierarchical Clustering of %d Cities'%len(file_array))
    plt.show()
    return True
def HierarchicalCluster(A, filename='dendrogram.png'):
    """Plot the column-correlation matrix of ``A`` with two dendrograms.

    The correlation matrix ``np.corrcoef(A.T)`` is clustered with centroid
    linkage, reordered by the leaf order on both axes, shown and saved.

    See http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python

    Parameters
    ----------
    A : 2-D array; its transpose is passed to ``np.corrcoef``.
    filename : output image path (defaults to the previously hard-coded
        'dendrogram.png').
    """
    Corr = np.corrcoef(A.T)
    fig = plt.figure(figsize=(8, 8))

    # Both dendrograms show the same tree, so compute the linkage once.
    # NOTE(review): the correlation matrix rows are clustered as observations.
    Y = hrc.linkage(Corr, method='centroid')

    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Z1 = hrc.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = hrc.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    Corr = Corr[Z1['leaves'], :]
    Corr = Corr[:, Z2['leaves']]
    im = axmatrix.matshow(Corr, aspect='auto', origin='lower')

    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig(filename)
def check_dendrogram_plot(self, orientation):
    """Check dendrogram() output while plotting with ``orientation``.

    The result is compared with the reference dictionary twice: once when
    drawing on an explicit ``ax`` and once when drawing on the current axes.
    """
    Z = linkage(hierarchy_test_data.ytdist, 'single')
    expected = dict(
        color_list=['g', 'b', 'b', 'b', 'b'],
        dcoord=[[0.0, 138.0, 138.0, 0.0],
                [0.0, 219.0, 219.0, 0.0],
                [0.0, 255.0, 255.0, 219.0],
                [0.0, 268.0, 268.0, 255.0],
                [138.0, 295.0, 295.0, 268.0]],
        icoord=[[5.0, 5.0, 15.0, 15.0],
                [45.0, 45.0, 55.0, 55.0],
                [35.0, 35.0, 50.0, 50.0],
                [25.0, 25.0, 42.5, 42.5],
                [10.0, 10.0, 33.75, 33.75]],
        ivl=['2', '5', '1', '0', '3', '4'],
        leaves=[2, 5, 1, 0, 3, 4],
    )

    # Case 1: dendrogram accepts the ax keyword.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    on_given_axes = dendrogram(Z, ax=axes, orientation=orientation)
    plt.close()
    assert_equal(on_given_axes, expected)

    # Case 2: plotting to gca (will import pylab).
    on_current_axes = dendrogram(Z, orientation=orientation)
    plt.close()
    assert_equal(on_current_axes, expected)
def cengci(data):
    """Ward-cluster ``data``, print the cophenetic correlation, draw the tree.

    Parameters
    ----------
    data : observation matrix accepted by ``pdist`` and ``linkage``.
    """
    # Pairwise distances are computed once and reused for cophenet().
    distMatrix = pdist(data)
    Z = linkage(data, 'ward')
    c, _ = cophenet(Z, distMatrix)
    # Single-argument print() behaves the same on Python 2 and 3.
    print(c)
    dendrogram(Z)
def clustering(X, labels, algo='hcluster', n_clusters=5, figname='cluster_result.png'):
    """
    Clustering data.

    Params:
        X: ndarray of n x d size (n samples, d features)
        labels: labels of samples, for visualizing result.
        algo: specify clustering algorithms, e.g., "hcluster", "kmeans"
        n_clusters: #.of.cluster in case of kmeans
        figname: file name to save figure (hcluster only)

    With "hcluster" a dendrogram is saved to ``figname``; with "kmeans" the
    labels of each cluster are printed.
    """
    assert algo in ['hcluster', 'kmeans'], "Invalid algorithm!"

    if algo == 'hcluster':
        linkage_mat = hcluster(X, metric='correlation', method='average')
        fig = plt.figure(figsize=(30, 20), dpi=100)
        fig.clf()
        hier.dendrogram(linkage_mat, labels=labels, leaf_rotation=90, leaf_font_size=20)
        plt.savefig(figname)
    else:
        labels = np.asarray(labels)
        result = kmeans(X, n_clusters=n_clusters)
        for cid in range(n_clusters):
            print('Cluster %d:' % (cid + 1))
            for a in labels[result == cid]:
                # Python 2 unicode labels are encoded for the console as
                # before; native str labels are printed unchanged.
                print(a if isinstance(a, str) else a.encode('utf-8'))
            print('-' * 30)
def plot_dendrogram(self, method='complete', metric='euclidean'):
    """
    Plot a dendrogram of the absolute values of ``self.array``.

    Leaves are labelled with the index of ``self.df``.

    Parameters
    ------------
    method: str
        method to use for scipy.cluster.hierarchy.linkage. Default is 'complete'
    metric: str
        metric to use for scipy.cluster.hierarchy.linkage. Default is 'euclidean'

    Returns
    ------------
    None; the dendrogram is drawn on a new figure.
    """
    # The docstring has to precede this import, otherwise it is only a
    # discarded string expression.
    import scipy.cluster.hierarchy as sch

    # Parameter names as a plain list: some SciPy releases truth-test
    # `labels`, which fails for an ndarray.
    pars = self.df.index.tolist()

    D = np.abs(self.array)
    Y = sch.linkage(D, method=method, metric=metric)
    plt.figure()
    sch.dendrogram(Y, labels=pars)
    plt.tight_layout()
def labeledDendrogram(dmat, labels, method='complete', cmap=None):
    """Plot a left-oriented dendrogram with a colour strip encoding ``labels``.

    The current figure is cleared and reused. ``dmat`` is passed to
    ``sch.linkage``; ``labels`` are turned into colours by the external
    ``mapColors2Labels`` helper and drawn next to the leaves in leaf order.

    TODO: add tick labels, with sparsity option
    """
    Z = sch.linkage(dmat, method=method)

    figh = plt.gcf()
    figh.clf()

    denAX = figh.add_axes([0.32, 0.05, 0.6, 0.9])
    cbAX = figh.add_axes([0.25, 0.05, 0.05, 0.9])

    plt.sca(denAX)
    # color_threshold=np.inf: no link lies above the threshold, so the whole
    # tree is drawn without per-cluster colouring.
    denD = sch.dendrogram(Z, color_threshold=np.inf, orientation='left')
    ind = denD['leaves']
    clean_axis(denAX)

    # presumably cbSE is a pandas Series of colours and lookup maps
    # label -> colour; verify against mapColors2Labels.
    cbSE, lookup = mapColors2Labels(labels, cmap=cmap, returnLookup=True)
    cbAX.imshow([[x] for x in cbSE.iloc[ind].values],
                interpolation='nearest', aspect='auto', origin='lower')
    clean_axis(cbAX)

    colorLegend(list(lookup.values()), list(lookup.keys()), axh=denAX)
def dendrogram_pdf(args, dm, leafLabels):
    """Write an average-linkage dendrogram of ``dm`` to a PDF file.

    The file is ``<args.basedir>/tmp/<args.prefix>_dendrogram.pdf``; the
    figure height scales with the number of leaf labels.
    """
    from scipy.cluster.hierarchy import linkage, dendrogram
    import matplotlib
    matplotlib.use('PDF')  # select the PDF backend before importing pyplot
    import matplotlib.pyplot as plt

    leaf_count = len(leafLabels)
    plt.figure(figsize=(14, (leaf_count * 0.25)), dpi=100)

    # Margins are applied one at a time, in this order.
    margins = (('left', 0.01), ('right', 0.65), ('top', 0.7), ('bottom', 0.25))
    for side, value in margins:
        plt.subplots_adjust(**{side: value})

    tree = linkage(dm, method="average")
    dendrogram(tree,
               color_threshold=1,
               leaf_font_size=6,
               orientation='right',
               labels=leafLabels)

    target = os.path.join(args.basedir, 'tmp', args.prefix+'_dendrogram.pdf')
    plt.savefig(target)
def Hierarchical_cluster_part(
        csvFile,
        figure_path='/Users/CeciliaLee/Dropbox/Intren/HKIA/2/Dendrogram_Tree(part).png'):
    """Ward-cluster the rows of a CSV file and plot the full dendrogram.

    Parameters
    ----------
    csvFile : path of the CSV file; its first column is dropped before
        clustering.
    figure_path : where the dendrogram image is saved (defaults to the
        location that used to be hard-coded).

    Returns
    -------
    (c, Z) : the cophenetic correlation coefficient and the linkage matrix.
    """
    df = pd.read_csv(csvFile)
    # DataFrame.as_matrix() no longer exists in current pandas; .values is
    # available in every version.
    data = df.values[:, 1:]

    # Generate the linkage matrix.
    Z = linkage(data, 'ward')
    c, coph_dists = cophenet(Z, pdist(data))
    print(c)

    # Plot the full dendrogram.
    plt.figure(figsize=(140, 60))
    plt.title('Hierarchical Clustering Dendrogram(part)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=2.,  # font size for the x axis labels
    )
    plt.savefig(figure_path)
    plt.show()
    return c, Z
def dendrogram(data, vectorizer, method="ward", color_threshold=1, size=10, filename=None):
    """Vectorize graphs, cluster them and draw (or save) the dendrogram.

    Parameters
    ----------
    data : iterable of objects with a ``.graph`` dict; the optional ``'id'``
        entry becomes the leaf label.
    vectorizer : object whose ``transform`` turns the iterable into a matrix.
    method : linkage method, one of "median", "centroid", "weighted",
        "single", "ward", "complete", "average".
    color_threshold : forwarded to scipy's dendrogram.
    size : side length of the square figure.
    filename : when given, the figure is saved there instead of shown.

    Raises
    ------
    Exception : if ``data`` is not iterable.
    """
    if not hasattr(data, '__iter__'):
        raise Exception('ERROR: Input must be iterable')
    import itertools
    iterable_1, iterable_2 = itertools.tee(data)

    # One entry per graph (None when it has no id), so labels stay aligned
    # with the rows of X.
    ids = [graph.graph.get('id', None) for graph in iterable_2]

    # Transform input into sparse vectors.
    X = vectorizer.transform(iterable_1)

    if all(label is None for label in ids):
        labels = [str(i) for i in range(X.shape[0])]
    else:
        # Only a missing id falls back to the row number; falsy ids such as 0
        # are real labels and are kept.
        labels = [str(i) if label is None else label
                  for i, label in enumerate(ids)]

    from sklearn import metrics
    from scipy.cluster.hierarchy import linkage
    from scipy.cluster.hierarchy import dendrogram as scipy_dendrogram
    D = metrics.pairwise.pairwise_distances(X)
    # NOTE(review): D is a square distance matrix, but linkage() treats a 2-D
    # input as observation vectors; confirm whether a condensed matrix
    # (squareform) was intended.
    Z = linkage(D, method=method)
    plt.figure(figsize=(size, size))
    scipy_dendrogram(Z, color_threshold=color_threshold, labels=labels, orientation='right')
    if filename is not None:
        plt.savefig(filename)
    else:
        plt.show()
def hcluster(self):
    """Draw an average-linkage dendrogram of the nodes.

    Distances are the shortest path lengths in the undirected version of the
    graph; pairs without a path keep distance 0.

    .. plot::
        :include-source:
        :width: 50%

        from cno import XCNOGraph, cnodata
        c = XCNOGraph(cnodata("PKN-ToyPB.sif"), cnodata("MD-ToyPB.csv"))
        c.hcluster()

    .. warning:: experimental
    """
    from scipy.cluster import hierarchy
    from scipy.spatial import distance

    # list()/dict() accept both the list/dict results of older networkx and
    # the view/iterator results of newer releases.
    nodes = list(self.nodes())
    position = {node: i for i, node in enumerate(nodes)}
    path_length = dict(nx.all_pairs_shortest_path_length(self.to_undirected()))

    n = len(nodes)
    distances = np.zeros((n, n))
    for u, lengths in path_length.items():
        for v, d in lengths.items():
            # Row/column i belongs to nodes[i]. The former "index - 1" shifted
            # every node by one slot.
            distances[position[u], position[v]] = d

    sd = distance.squareform(distances)
    hier = hierarchy.average(sd)
    pylab.clf()
    # Let dendrogram() place the labels so that they follow the leaf order;
    # setting xticks to `nodes` afterwards ignored that permutation.
    hierarchy.dendrogram(hier, labels=nodes)
def paint_clustering(results, clusters, num, chrom, tad_names):
    """Draw one Ward dendrogram per cluster group next to the TAD plots.

    The left columns of a ``num`` x 9 grid hold the dendrograms (one per
    entry of ``results``), column 4 holds one small plot per TAD in leaf
    order, and the right columns hold the overall ``chrom.visualize`` plot.
    """
    ordered_tads = []
    dendro_axes = []
    next_row = 0
    shared_xlim = [-100, 100]

    for group, result in enumerate(results):
        # Hide x tick labels of the previous dendrogram panel.
        if dendro_axes:
            dendro_axes[-1].set_xticklabels([], visible=False)

        tree = linkage(result, method='ward')
        leaves = dendrogram(tree, orientation='right', no_plot=True)['leaves']
        ordered_tads += reversed(list([clusters[group][n] for n in leaves]))

        panel = plt.subplot2grid((num, 9), (next_row, 0),
                                 rowspan=len(result), colspan=4)
        dendro_axes.append(panel)
        dendrogram(tree, orientation='right',
                   labels=[tad_names[c] for c in clusters[group]])

        # Track the largest left limit and the smallest right limit so far.
        if shared_xlim[0] < panel.get_xlim()[0]:
            shared_xlim[0] = panel.get_xlim()[0]
        if shared_xlim[1] > panel.get_xlim()[1]:
            shared_xlim[1] = panel.get_xlim()[1]

        next_row += len(result)

    for panel in dendro_axes:
        panel.set_xlim(left=shared_xlim[0], right=shared_xlim[1])

    # One small panel per TAD, in dendrogram order.
    for row, tad_index in enumerate(ordered_tads):
        tad_axes = plt.subplot2grid((num, 9), (row, 4))
        chrom.visualize('exp1',
                        tad=chrom.get_experiment('exp1').tads[tad_names[tad_index]],
                        axe=tad_axes, show=False)
        tad_axes.set_axis_off()

    overview_axes = plt.subplot2grid((num, 9), (0, 5), rowspan=num, colspan=4)
    chrom.visualize('exp1', paint_tads=True, axe=overview_axes)
    plt.draw()
def _draw_dendrogram(axes, Z, labels=None):
    """Draw the given linkage information as a dendrogram on the given Axes
    object.

    Change the drawing parameters so that the dendrogram will blend nicely into
    the figure showing multiple dendrograms.

    Arguments:
    axes -- matplotlib.axes.Axes object where to draw the plot
    Z -- numpy.ndarray in the format as specified in the
        scipy.cluster.hierarchy.linkage's docstring

    Keyword arguments:
    labels -- list or tuple (optional) where i-th value is the text to put
        under the i-th leaf node

    """
    # set current axes instance
    plt.sca(axes)
    # draw the dendrogram
    dendrogram(Z, labels=labels, orientation="left")
    # remove x-axis labels
    axes.set_xticks(())
    # remove the black border around axes
    # (.values() works on Python 2 and 3; .itervalues() is Python 2 only)
    for spine in axes.spines.values():
        spine.set_visible(False)
    # decrease the font size of y tick labels
    for ytl in axes.get_yticklabels():
        ytl.set_fontsize("small")
def plot_data(self):
    """
    Plot a dendrogram of the hierarchical clustering.

    Uncomment the matplotlib import if you call this function.
    Nothing is drawn when ``pre_cluster()`` returns None.
    :return: None
    """
    matrix = self.pre_cluster()
    if matrix is None:  # original matrix empty
        return

    Z = self.get_cluster_matrix(matrix)
    article_titles = self.matrix_creator.get_article_titles()

    plt.figure(figsize=(10, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('distance')
    plt.axhline(y=2.3, c='k')

    # NOTE(review): no truncate_mode is passed, so `p` may have no effect.
    dendrogram_options = dict(
        p=5,
        orientation="right",
        labels=article_titles,
        show_leaf_counts=False,
        leaf_font_size=9.,
    )
    dendrogram(Z, **dendrogram_options)
    plt.show()
def hier_cluster_and_display(dist_matrix, leaf_labels, colorthresh, to_cluster = 'all', m = 'complete', imgsize = 25, fontsize=16):
    '''
    clusters domains using hierarchical clustering and displays dendrogram.

    arguments:
        dist_matrix : distance matrix between domains
        leaf_labels: list (or array) of domain names
        colorthresh: threshold to color dendrogram nodes
        to_cluster (list of ints, optional, default='all'):
            if 'all', clusters all domains
            else clusters only domains corresponding to indices in list
        m (default='complete'): method used in hierarchical clustering.
            'single' and 'average' also work; as in scipy.
        imgsize (default=25): size of image (imgsize,imgsize) of dendrogram
            to produce.
        fontsize (default=16): font size of dendrogram leaf labels.

    returns:
        result as outputted by scipy's hierarchical clustering.
    '''
    # Compare against 'all' only for strings: `array == 'all'` is ambiguous.
    if isinstance(to_cluster, str) and to_cluster == 'all':
        cluster_indices = list(range(dist_matrix.shape[0]))
    else:
        cluster_indices = to_cluster

    plt.figure(figsize=(imgsize, imgsize))
    result = hier_cluster(dist_matrix, cluster_indices, m)

    # Pick labels one by one so that a plain list of names works as well as a
    # numpy array (a list cannot be indexed with a list of positions).
    selected_labels = [leaf_labels[i] for i in cluster_indices]
    dendrogram(result, orientation='left', labels=selected_labels,
               color_threshold=colorthresh, leaf_font_size=fontsize)
    return result
def plot_dendrogram(model, **kwargs):
    '''
    Turn a fitted hierarchical model into a dendrogram
    (taken from an online example in an sklearn fork).

    Only ``model.children_`` is read. Merge heights and cluster sizes are
    not available, so placeholder values are used: the i-th merge gets
    height i and count i + 2.

    Extra keyword arguments go to scipy.cluster.hierarchy.dendrogram, whose
    result dictionary is returned.
    '''
    from scipy.cluster.hierarchy import dendrogram

    # Children of hierarchical clustering
    children = model.children_

    # Uniform, increasing merge heights used purely for plotting.
    distance = np.arange(children.shape[0])

    # Placeholder observation counts (2, 3, ...), not real cluster sizes.
    no_of_observations = np.arange(2, children.shape[0] + 2)

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack(
        [children, distance, no_of_observations]).astype(float)

    return dendrogram(linkage_matrix, **kwargs)
def ben_gen():
    """Yield "okay" after drawing each dendrogram figure.

    For every combination of data set, hierarchy and grey option (taken from
    module-level state) two figures are drawn: first with automatic y limits,
    then with the y axis limited to ``(0, ylimit)``. Each figure is closed
    when the generator is resumed.
    """
    for num_data, hier_num, grey_option in itertools.product(
        np.arange(len(data)), np.arange(len(desired_hier)), [0, 1]
    ):
        hierarchy_structure = all_clustering_data[num_data][grey_option][hier_num]
        for limit_y_axis in (False, True):
            plt.figure()
            dendrogram(hierarchy_structure, color_threshold=1.6)
            if limit_y_axis:
                plt.ylim(0, ylimit)
            plt.title(data_names[num_data] + grey_output + names_distances[hier_num + 1])
            yield "okay"
            plt.close()
    # Falling off the end finishes the generator. An explicit
    # `raise StopIteration` is turned into RuntimeError on Python 3.7+.
def create_dendrogram(cds, clusters=None, filename=None):
    """Cluster scene-averaged data and show (optionally save) the dendrogram.

    Parameters
    ----------
    cds : dataset with ``shape``, ``samples`` (indexed with three axes here)
        and, when ``clusters`` is omitted, ``a.event_bounds``.
    clusters : optional sequence of scene boundaries along the last axis of
        ``cds.samples``; consecutive pairs delimit one scene.
    filename : optional path; when given the figure is saved there.
    """
    num_subj = cds.shape[0]
    num_voxels = cds.shape[1]

    # `is None`, because `== None` is evaluated element-wise for arrays.
    if clusters is None:
        clusters = cds.a.event_bounds
    num_scenes = len(clusters)

    # Average over the samples of each scene.
    ds_list = np.zeros((num_subj, num_voxels, num_scenes - 1))
    for i in range(num_scenes - 1):
        ds_list[:, :, i] = np.mean(
            cds.samples[:, :, clusters[i]:clusters[i + 1]], axis=2)

    # Scenes are the observations: mean over the first axis, then transpose.
    Z = hierarchy.linkage(np.mean(ds_list, axis=0).T, metric='correlation')

    fig = plt.figure(figsize=(14, 8))
    hierarchy.dendrogram(Z)
    # Save before the blocking show() so the file does not depend on the
    # state of the window afterwards.
    if filename is not None:
        fig.savefig(filename)
    plt.show()
def hierarchical_clustering(self, data='open_shut'):
    '''
    Ward-cluster the entries of ``self.cluster_list`` and show the dendrogram.

    ``data`` selects the features: 'open_shut' (log of mean open and mean
    shut), 'popen' or 'amp'. Any other value contributes no features.
    '''
    if data == 'open_shut':
        def features_of(cluster):
            return [np.log(cluster._get_mean_open()),
                    np.log(cluster._get_mean_shut())]
    elif data == 'popen':
        def features_of(cluster):
            return [cluster.popen, ]
    elif data == 'amp':
        def features_of(cluster):
            return [cluster.mean_amp, ]
    else:
        features_of = None

    feature_list = [features_of(cluster)
                    for cluster in self.cluster_list
                    if features_of is not None]

    Z = linkage(feature_list, 'ward')

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
    plt.show()
def show(self, distance_metric='euclidean', linkage_method='ward'):
    '''Run hierarchical clustering and display it as a dendrogram.

    The linkage result is also stored on ``self.cluster``.
    '''
    #: hierarchical clustering with the requested method and metric
    tree = hierarchy.linkage(self.hofstede_data,
                             method=linkage_method,
                             metric=distance_metric)

    #: draw the dendrogram
    hierarchy.dendrogram(tree,
                         orientation='left',
                         color_threshold=150,
                         labels=numpy.array(self.data_name),
                         leaf_font_size=18)

    #: colour the tick label of Japan red
    for tick_label in plt.gca().get_ymajorticklabels():
        if tick_label.get_text() == self.JAPAN_NAME:
            tick_label.set_color("r")

    self.cluster = tree
    plt.show()
def cal_idf_overlap():
    """Cluster ``utils.list_subject`` by pairwise IDF overlap and plot it.

    Builds a corpus of non-stop-word tokens, computes ``IDF.cal_overlap()``
    for every ordered pair of subjects, Ward-clusters the resulting matrix,
    writes the linkage rows to a text file and shows a full and a truncated
    dendrogram.
    """
    list_subj = utils.list_subject

    # A set makes the per-token stop-word test O(1).
    stop_words = set(get_stop_words('en'))
    tmp_corpus = [token
                  for subject in list_subj
                  for token in str(subject).split(" ")
                  if token not in stop_words]

    length = len(list_subj)
    ls_distance_final = []
    for i in range(length):
        if i in (500, 1000, 1500):
            print(i)
        ls_distance_row = []
        for j in range(length):
            # Formatted explicitly so the output is "i j" on Python 2 and 3.
            print("%s %s" % (i, j))
            idf_instance = IDF.IDF(str(list_subj[i]), str(list_subj[j]), tmp_corpus)
            ls_distance_row.append(idf_instance.cal_overlap())
        ls_distance_final.append(ls_distance_row)

    myarray = np.asarray(ls_distance_final)
    print(myarray)
    Z = linkage(myarray, "ward")

    # The context manager closes (and flushes) the file; it used to stay open.
    with open('/Users/Aaron/test.txt', 'w') as thefile:
        for item in Z:
            thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,  # to get a distribution impression in truncated branches
    )
    plt.show()
def plot_corr_dendrogram(
        corr, cluster_method='weighted', **dendrogram_kwargs):
    """
    Plot a correlation matrix as a dendrogram (on the current axes).

    Uses scipy.cluster.hierarchy.linkage to compute clusters based on
    distance between samples. Since correlation is passed in, it is
    converted to a distance with ``(1 - corr) / 2``, so highly correlated
    points have low distance, and vice versa.

    Parameters
    ----------
    corr : numpy ndarray or pandas DataFrame
        corr[i, j] is the correlation (should be between -1 and 1) of
        samples i and j.
    cluster_method : String
        Method to use to amalgamate clusters.
        Either 'single', 'complete', 'average', or 'weighted'.
        See scipy.cluster.hierarchy.linkage for details.
    dendrogram_kwargs : Additional kwargs
        Pass to the call of scipy.cluster.hierarchy.dendrogram()

    Returns
    -------
    The dictionary returned by scipy.cluster.hierarchy.dendrogram().
    """
    import numpy as np

    # Reduce the input to a plain ndarray, keeping DataFrame labels.
    if not isinstance(corr, pd.DataFrame):
        names = list(range(len(corr)))
    else:
        names = corr.index.tolist()
        corr = corr.values

    dist = (1 - np.asarray(corr, dtype=float)) / 2.

    # squareform() requires exact symmetry and an exactly zero diagonal.
    # Correlations computed in floating point often miss that by rounding
    # error, so snap near-valid matrices. Anything further off is left
    # untouched and still rejected by squareform.
    is_square = dist.ndim == 2 and dist.shape[0] == dist.shape[1]
    if is_square and np.allclose(dist, dist.T):
        dist = (dist + dist.T) / 2.
    if is_square and np.allclose(np.diag(dist), 0.):
        np.fill_diagonal(dist, 0.)

    Z = linkage(squareform(dist), method=cluster_method)
    return dendrogram(Z, labels=names, **dendrogram_kwargs)
#plotting the results into line graph plt.plot(range(1, 11), wcss) plt.title("Elbow method") plt.xlabel("No of clusters") plt.ylabel("WCSS") plt.show() # ## Using dendogram to find optimal no of clusters. # ## Hierarchical clustering # In[12]: import scipy.cluster.hierarchy as sch dendrogram = sch.dendrogram(sch.linkage(X, method='ward')) plt.title("Dendrogram") plt.xlabel("Species") plt.ylabel("Euclidean Distance") plt.show() # optimum clusters will be cluster after which wcss remains almost constant. From above two graphs, optimum no of clusters is 3. # ## creating kmeans classifier # In[13]: kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10,
'''
Hierarchical clustering of the grain data

The SciPy linkage() function performs hierarchical clustering on an array of
samples. This script clusters the grain samples with linkage() and
visualizes the result with dendrogram(). The grain measurements are provided
in the array `samples`; the variety of each grain sample is given by the
list `varieties`.

Steps:
- Import linkage and dendrogram from scipy.cluster.hierarchy, and
  matplotlib.pyplot as plt.
- Cluster `samples` with linkage() using method='complete' and assign the
  result to `mergings`.
- Plot a dendrogram of `mergings` with labels=varieties, leaf_rotation=90
  and leaf_font_size=6.
'''

# Imports needed for clustering and plotting
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Complete-linkage clustering of the samples
mergings = linkage(samples, method='complete')

# Dendrogram labelled with the grain varieties
leaf_options = dict(labels=varieties, leaf_rotation=90, leaf_font_size=6)
dendrogram(mergings, **leaf_options)
plt.show()
np.savez('/scratch/PI/mcovert/dvanva/sequencing/smFISH/' + str(t) + "_dynamics_distance_matrix_kshape.npz", distance_matrix=distance_matrix) dynamics_load = np.load('/scratch/PI/mcovert/dvanva/sequencing/smFISH/' + str(t) + "_dynamics_distance_matrix_kshape.npz") distance_matrix = dynamics_load['distance_matrix'] Y = sch.linkage(distance_matrix, method='ward') ind_dynamics = sch.fcluster(Y, 0.5 * np.amax(Y[:, 2]), 'distance') - 1 """ Plot dendrogram """ fig = plt.figure() ax_dendro = fig.add_axes([0.09, 0.1, 0.2, 0.8], frame_on=False) Z = sch.dendrogram(Y, orientation='right', color_threshold=0.5 * np.amax(Y[:, 2])) ax_dendro.set_xticks([]) ax_dendro.set_yticks([]) """ Plot heatmap """ ax_heatmap = fig.add_axes([0.3, 0.1, 0.6, 0.8]) index = Z['leaves'] dynamics_ordered = dynamics_matrix[index, :] im = ax_heatmap.matshow(dynamics_ordered, aspect='auto', origin='lower', cmap=plt.get_cmap('Reds'),
def _next_linkage_filename(linkList):
    """Return the first unused 'LinkageComparison_<links>_NN.png' name in the cwd."""
    stem = 'LinkageComparison_' + ''.join(name + '_' for name in linkList)
    existing = set(glob.glob("*.png"))
    count = 1
    while True:
        # Two-digit counter below 10, plain number from 10 upwards.
        candidate = '%s%02d.png' % (stem, count)
        if candidate not in existing:
            return candidate
        count += 1


def linkageComparison(file, num_comps, linkList):
    '''
    Compares 2-4 linkage functions on a given set of data.

    linkageComparison requires a file, number of comparisons, and a list of
    linkage functions.

    Input:

    file - include full file path, use the tkinter filedialog functionality
        for ease of obtaining file path
    num_comps - integer (2, 3 or 4); linkList must hold at least that many
        entries.
    linkList - list of linkage functions that you would like to have compared.

    Output:

    linkageComparison saves a .png file of the output to the current working
    directory. Problems with the file or the arguments are logged and the
    function returns without plotting.
    '''
    # Subplot grid used for each supported number of comparisons.
    grids = {2: (1, 2), 3: (1, 3), 4: (2, 2)}

    # set recursion limit above the common max for our data.
    sys.setrecursionlimit(10**8)

    # Log that user called linkage comparison function
    logging.info(': User called the Linkage Comparison function.')

    # check that the file is appropriate for our data set
    metab_data = GB.fileCheck(file)
    if metab_data is None:
        # Logs error and returns function to ensure soft exit.
        logging.error(': Error loading in excel file check log file!')
        return

    # An unsupported count used to skip plotting and still save whatever
    # figure happened to be current; exit softly instead.
    if num_comps not in grids or len(linkList) < num_comps:
        logging.error(': Linkage comparison needs 2-4 linkage functions!')
        return

    # read in column data
    data = GB.readInColumns(metab_data)

    # Standardize the data before clustering the results
    logging.info('Standardizing the data.')
    for row in range(metab_data.shape[0]):
        data[row, :] = GB.standardize(data[row, :])

    # One linkage matrix per requested linkage function.
    linkages = [linkage(data, linkList[k]) for k in range(num_comps)]

    # Create the figure that holds the dendrograms side by side.
    n_rows, n_cols = grids[num_comps]
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(8, 8))
    if num_comps == 4:
        plt.title('Linkage Comparison')

    # axes.flat walks the grid row by row, matching the order of linkList.
    for ax, link in zip(axes.flat, linkages):
        dendrogram(link, ax=ax, above_threshold_color='y',
                   orientation='left', no_labels=True)
    del linkages

    linkFile = _next_linkage_filename(linkList)
    plt.savefig(linkFile)
    plt.show()

    # log the completion of the linkage comparison
    logging.info(
        ': Sucessfuly completed the comparison of the linkage functions!')
    return
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

from knock60 import load_model
from knock67 import collect_target_vecs

if __name__ == '__main__':
    # Read one country name per line, dropping surrounding whitespace.
    with open('./countries.txt') as fp:
        countries = [line.strip() for line in fp]

    vecs, target_countries = collect_target_vecs(countries)

    # Ward-linkage hierarchical clustering of the country vectors.
    plt.figure(figsize=(32.0, 24.0))
    link = linkage(vecs, method='ward')
    dendrogram(link, labels=target_countries, leaf_rotation=90, leaf_font_size=10)

    # Save BEFORE show(): once the interactive window is closed the current
    # figure is gone, so saving afterwards writes an empty image.
    plt.savefig('ward.png')
    plt.show()
# Label every point with its title. Rows are walked in positional order;
# this replaces the removed `df.ix[i]` accessor.
for x, y, title in zip(df['x'], df['y'], df['title']):
    ax.text(x, y, title, size=8)

plt.show()  # show the plot

from scipy.cluster.hierarchy import ward, dendrogram

# define the linkage_matrix using ward clustering pre-computed distances
# NOTE(review): if `dist` is a square matrix, scipy treats it as observation
# vectors rather than as pairwise distances -- confirm this is intended.
linkage_matrix = ward(dist)

sklearn.externals.joblib.dump(dist, 'title_dist.pkl')
sklearn.externals.joblib.dump(titles, 'titles.pkl')
sklearn.externals.joblib.dump(urls, 'urls.pkl')

fig, ax = plt.subplots(figsize=(30, 60))  # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=titles)

# Booleans instead of the legacy 'off' strings: on current matplotlib any
# non-empty string is truthy, so 'off' would leave the ticks switched ON.
plt.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)

plt.tight_layout()  # show plot with tight layout

# save figure next to the input file name
plt.savefig(basename(args.fileName) + '.png')
# to determine the number of clusters directly from scipy.cluster.hierarchy import fcluster k = 5 clusters = fcluster(Z, k, criterion='maxclust') fig = pylab.figure(figsize=(18, 50)) def llf(id): return '[%s %s %s]' % (pdf['manufact'][id], pdf['model'][id], int(float(pdf['type'][id]))) dendro = hierarchy.dendrogram(Z, leaf_label_func=llf, leaf_rotation=0, leaf_font_size=12, orientation='right') # Clustering using sci-kit learn #dist_matrix = distance_matrix(feature_mtx,feature_mtx) agglom = AgglomerativeClustering(n_clusters=6, linkage='complete') agglom.fit(feature_mtx) pdf['cluster_'] = agglom.labels_ # to save as csv file # pdf.to_csv("D:/VIT/rtt.csv") import matplotlib.cm as cm n_clusters = max(agglom.labels_) + 1
import numpy as np import matplotlib.pyplot as plt import pandas as pd import scipy.cluster.hierarchy as sch from sklearn.cluster import AgglomerativeClustering dataset = pd.read_csv('dataset.csv') X = dataset.iloc[:, [0, 3]].values dendrogram = sch.dendrogram(sch.linkage(X, method="ward")) plt.title('Dendrogram') plt.xlabel('Customers') plt.ylabel('Euclidean distances') plt.show() hc = AgglomerativeClustering(n_clusters=5, affinity='euclidean', linkage='ward') y_hc = hc.fit_predict(X) plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Cluster 1') plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s=100, c='blue', label='Cluster 2') plt.scatter(X[y_hc == 2, 0],
# Scatter plot of X coloured by the cluster assignment of `agg`.
assignment = agg.fit_predict(X)
mglearn.discrete_scatter(X[:,0], X[:,1], assignment)
plt.legend(['cluster0', 'cluster1','cluster2'], loc='best')
plt.xlabel('특성0')
plt.ylabel('특성1')

# %%
# Jupyter magic: this cell only runs inside IPython/Jupyter.
%matplotlib inline

# Compare 3-cluster and 5-cluster agglomerative clusterings side by side.
agg1 = AgglomerativeClustering(n_clusters=3).fit_predict(X)
agg2 = AgglomerativeClustering(n_clusters=5).fit_predict(X)

fig = plt.figure(figsize=(8,5))
fig.add_subplot(1,2,1)
mglearn.discrete_scatter(X[:,0], X[:,1], agg1)
plt.legend(['cluster0', 'cluster1','cluster2'], loc='best')
fig.add_subplot(1,2,2)
mglearn.discrete_scatter(X[:,0], X[:,1], agg2)
plt.legend(['cluster0', 'cluster1','cluster2', 'cluster3', 'cluster4'], loc='best')

# %%
from scipy.cluster.hierarchy import dendrogram, ward
import pandas as pd

# Fresh blob data, wrapped in a DataFrame with columns x, y, label.
X, y = make_blobs()
df = pd.DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
df.columns

# %%
# Ward linkage on the two coordinate columns, then draw the dendrogram.
linkage_array = ward(df[['x', 'y']])
linkage_array
dendrogram(linkage_array)

# %%
def __init__(self, files, file_format, method=None):
    """Open the given documents and compare or cluster their contents.

    Args:
        files: Paths of the documents to process; each one is wrapped in
            ``OpenFile``.
        file_format: ``"docx"`` / ``"pptx"`` (content read from ``doc.text``)
            or ``"xlsx"`` (``doc.tables`` serialised as JSON). Also used to
            name the model and plot files under ``models/``.
        method: ``"fuzzywuzzy"`` stores pairwise fuzzy ratios in the
            database; ``"inference"`` loads the saved Doc2Vec model and
            prints the most similar known document and its cluster; any
            other value trains a Doc2Vec model, clusters the document
            vectors (affinity propagation and average linkage), saves the
            plots and stores the results.
    """
    self.files = files
    self.files_opened = [OpenFile(f) for f in self.files]
    self.db_server = db_handler()
    self.docLabels = [doc.location for doc in self.files_opened]

    # `data` holds the content of every document, in docLabels order.
    # NOTE(review): `db` stays undefined for any other file_format, which
    # only matters to the "fuzzywuzzy" branch below.
    data = []
    if file_format == "docx" or file_format == "pptx":
        db = db_ds
        for doc in self.files_opened:
            data.append(doc.text)
    elif file_format == "xlsx":
        db = db_xs
        for i, doc in enumerate(self.files_opened):
            try:
                data.append(json.dumps(doc.tables, skipkeys=True))
            except Exception:
                # Best effort: keep an empty placeholder so `data` stays
                # aligned with docLabels.
                print("error parsing document {}".format(self.docLabels[i]))
                data.append("")
    data = nlp_clean(data)

    if method == "fuzzywuzzy":
        # Compare every unordered pair (i, j) with i < j. The label and the
        # doc_id use index j of the document actually compared; the previous
        # code always used i + 1, mislabelling all non-adjacent pairs and
        # overwriting their database entries.
        for i, f1 in enumerate(data):
            for j in range(i + 1, len(data)):
                f2 = data[j]
                x = fuzz.ratio(f1, f2)
                y = fuzz.partial_ratio(f1, f2)
                print(
                    "overall similarity ration: {} %\npartial similarity ration: {}"
                    .format(x, y))
                db_data = {
                    'dok_id': {
                        'dok_1': self.docLabels[i],
                        'dok_2': self.docLabels[j]
                    },
                    'kullanici': user_default,
                    'overall similarity ratio': x,
                    'partial similarity ratio': y
                }
                self.db_server.save(db,
                                    db_data,
                                    doc_id=self.docLabels[i] + "_" +
                                    self.docLabels[j])

    elif method == "inference":
        model_loc = "models/doc2vec_{}.model".format(file_format)
        # loading the model
        d2v_model = gensim.models.doc2vec.Doc2Vec.load(model_loc)
        # infer_vector is non-deterministic; i.e. the resulting vector is
        # different each time, but it should be similar enough with a good
        # model. Only the first document is inferred.
        infervec = d2v_model.infer_vector(data[0],
                                          alpha=0.025,
                                          min_alpha=0.025,
                                          steps=300)
        similar_doc = d2v_model.docvecs.most_similar([infervec])
        most_similar = similar_doc[0][0]
        print(type(most_similar))
        print("most similar: {}".format(most_similar))

        db_res = self.db_server.query(db_dc, ["docs", "clusters"],
                                      query_key="_id",
                                      query_value=file_format)
        print(db_res)
        # Flatten the stored document names and their cluster labels.
        db_res_a = []
        db_res_b = []
        for row in db_res:
            db_res_a.extend(row.key[0])
            db_res_b.extend(row.key[1])

        most_similar_class = db_res_b[db_res_a.index(most_similar)]
        print("most likely class: {}".format(most_similar_class))
        print("other documents in same category")
        for doc_name, doc_class in zip(db_res_a, db_res_b):
            if doc_class == most_similar_class:
                print(doc_name)

    else:
        # iterator returned over all documents
        it = LabeledLineSentence(data, self.docLabels)
        model = gensim.models.Doc2Vec(vector_size=300,
                                      min_count=0,
                                      alpha=0.025,
                                      min_alpha=0.025)
        model.build_vocab(it)
        # training of model
        # NOTE(review): alpha drops by 0.002 on each of 100 passes and goes
        # negative after the 13th -- confirm this schedule is intended.
        for epoch in range(100):
            model.train(it, total_examples=model.corpus_count, epochs=3)
            model.alpha -= 0.002
            model.min_alpha = model.alpha

        model_path = 'models/doc2vec_{}.model'.format(file_format)
        model.save(model_path)
        db_g = db_gensim
        db_data = {"time": "time", "path": dataset_path}
        self.db_server.save(db_g,
                            db_data,
                            doc_id=file_format,
                            attachment=model_path)
        print("model saved")

        # loading the model
        d2v_model = gensim.models.doc2vec.Doc2Vec.load(model_path)

        # One trained vector per document, in docLabels order.
        X = np.array(
            [d2v_model.docvecs[i] for i in range(len(self.docLabels))])

        # #####################################################################
        # Compute Affinity
        af = AffinityPropagation(preference=-50).fit(X)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        n_clusters_ = len(cluster_centers_indices)
        print("number of clusters: {}".format(n_clusters_))

        # cluster id -> indices of its members; dic2 maps label -> cluster id.
        dic = {i: np.where(labels == i)[0] for i in range(n_clusters_)}
        dic2 = {}
        for key, value in dic.items():
            print("cluster {}:".format(key))
            for e in value:
                print("{} : {}".format(e, self.files[e].split('/')[-1]))
                dic2[self.docLabels[e]] = key
        print(dic2)
        print("Silhouette Coefficient: %0.3f" %
              metrics.silhouette_score(X, labels, metric='sqeuclidean'))

        # #####################################################################
        # Plot result
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D
        from itertools import cycle

        plt.close('all')
        plt.figure(figsize=(25, 10))
        plt.clf()
        colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
        # Only the first two vector components are plotted.
        for k, col in zip(range(n_clusters_), colors):
            class_members = labels == k
            cluster_center = X[cluster_centers_indices[k]]
            plt.plot(X[class_members, 0], X[class_members, 1], col + '.')
            plt.plot(cluster_center[0],
                     cluster_center[1],
                     'o',
                     markerfacecolor=col,
                     markeredgecolor='k',
                     markersize=5)
            # Spoke from the cluster centre to each member.
            for x in X[class_members]:
                plt.plot([cluster_center[0], x[0]],
                         [cluster_center[1], x[1]], col)
        plt.title(
            'Clustering with Affinity Propagation | Estimated number of clusters: %d'
            % n_clusters_)
        affinity_png = 'models/{}_affinity_clusters.png'.format(file_format)
        plt.savefig(affinity_png, dpi=300)
        plt.show()

        db_data = dic2
        db_data["docs"] = self.docLabels
        db_data["clusters"] = labels.tolist()
        self.db_server.save(db_dc,
                            db_data,
                            doc_id=file_format,
                            attachment=affinity_png)

        # #########################
        # hierarchical
        linkage_matrix = []
        linkage_matrix.append(
            linkage(X, method='average', metric='euclidean'))
        hierarchical_png = 'models/{}_hierarchical_clusters.png'.format(
            file_format)
        for link in linkage_matrix:
            # calculate full dendrogram
            plt.figure(figsize=(25, 10))
            plt.title('Hierarchical Clustering Dendrogram')
            plt.ylabel('word')
            plt.xlabel('distance')
            dendrogram(
                link,
                leaf_rotation=0.,  # rotates the x axis labels
                leaf_font_size=16.,  # font size for the x axis labels
                orientation='left',
                leaf_label_func=lambda v: str(self.files[v].split('/')[-1]))
            plt.savefig(hierarchical_png, dpi=300)
            plt.show()
        db_data = {}
        self.db_server.save(db_dc,
                            db_data,
                            doc_id=file_format,
                            attachment=hierarchical_png)
the method='single' keyword argument. Assign the result to mergings. Plot a dendrogram of the hierarchical clustering, using the list country_names as the labels. In addition, specify the leaf_rotation=90, and leaf_font_size=6 keyword arguments as you have done earlier. ''' #Done by DataCamp import pandas as pd from numpy import genfromtxt country_names = pd.read_csv( 'E:/DataCamp/Unsupervised-learning-in-python/data/eurovision-2016.csv') country_names = country_names.iloc[:, 0].unique().tolist() samples = genfromtxt( 'E:/DataCamp/Unsupervised-learning-in-python/data/eurovision_votes.csv', delimiter=',') #End done by DataCamp # Perform the necessary imports import matplotlib.pyplot as plt from scipy.cluster.hierarchy import linkage, dendrogram # Calculate the linkage: mergings mergings = linkage(samples, method='single') # Plot the dendrogram dendrogram(mergings, labels=country_names, leaf_rotation=90, leaf_font_size=6) plt.show()
# Load the customer data and derive two per-item ratios.
clients = pd.read_csv("data/customer_online_closing_store.csv")
clients["return_rate"] = clients["items_returned"] / clients["items_purchased"]
clients["average_price"] = clients["total_spent"] / clients["items_purchased"]

# Feature matrix used for clustering.
X = clients[["average_price", "return_rate", "overall_rating"]]
print(X)

# Rescale the features with MinMaxScaler before clustering.
min_max_scaler = sk_preprocessing.MinMaxScaler()
X = min_max_scaler.fit_transform(X)
print(X)

plt.title("Customer dendrogram")
linkage_method = "ward"  # single complete average ward
dendrogram = sp_clustering_hr.dendrogram(sp_clustering_hr.linkage(X, method=linkage_method))

# Flat clustering into 4 groups with the same linkage method.
agglomerative_model = sk_clustering.AgglomerativeClustering(n_clusters=4, linkage=linkage_method)
agglomerative_model.fit(X)
clients["class"] = agglomerative_model.labels_
print(clients[["average_price", "return_rate", "overall_rating", "class"]])

# Per-cluster summary: mean of each feature and the number of customers.
client_pivot_table = clients.pivot_table(index="class",
                                         values=["average_price", "return_rate", "overall_rating", "customer_id"],
                                         aggfunc={"average_price": np.mean,
                                                  "return_rate": np.mean,
                                                  "overall_rating": np.mean,
                                                  "customer_id": len})
print(client_pivot_table)
plt.show()
for Method in Methods: Z = linkage(X, method=Method, metric=Metric) # Compute and display clusters by thresholding the dendrogram cls = fcluster(Z, criterion='maxclust', t=Maxclust) plt.figure(3 + 2*i, figsize=(12, 9)) plt.title('Hierarchical clustering using {} method'.format(Method)) plt.xlabel('PC 1') plt.ylabel('PC 2') clusterplot(X, cls, y=y) # Display dendrogram plt.figure(4 + 2*i, figsize=(15, 8)) plt.title('Hierarchical clustering using {} method'.format(Method)) dendrogram(Z, truncate_mode='lastp', p=max_display_levels) plt.show() # Evaluate hierarchical method Rand_hier[i], Jaccard_hier[i], NMI_hier[i] = clusterval(y, cls) i += 1 print('###################################################') print('# MODELS QUALITY EVALUATION #') print('###################################################') Jaccard = {'gmm': Jaccard_gmm} NMI = {'gmm': NMI_gmm} Rand = {'gmm': Rand_gmm} for i in range(len(Methods)): Jaccard[Methods[i]] = Jaccard_hier[i]
feature_vectors.append(vec) # print feature_vectors import numpy mat = numpy.empty((n, n)) for i in xrange(0, n): for j in xrange(0, n): mat[i][j] = nltk.cluster.util.cosine_distance( feature_vectors[i], feature_vectors[j]) #calculating the cosine distance # hierarchical clustering from scipy.cluster.hierarchy import dendrogram, linkage t = 0.8 Z = linkage(mat, 'single') d = dendrogram(Z, color_threshold=t) from matplotlib import pyplot print Z import pylab pylab.savefig("dendo.png", dpi=800) ## extract data def extract_clusters(Z, threshold, n): clusters = {} ct = n for row in Z: if row[2] < threshold: n1 = int(row[0]) n2 = int(row[1])
import pandas as pd
import plotly.graph_objs as go
import os

# Load the intensity table and use the 'Metabolite' column as the row index.
values = pd.read_csv('Intensity.values.csv')
values.index = values['Metabolite']
del values['Metabolite']

# Transposed view: one row per sample.
values_t = values.T
values_t.index.name = 'Sample'

#plt.figure(figsize=(10, 7))
#plt.title("Customer Dendograms")

# Ward dendrogram over the rows of `values`; 'ivl' holds the leaf labels.
dend_metabolite = shc.dendrogram(shc.linkage(values, method='ward'),labels=values.index)
dend_metabolite_order = dend_metabolite['ivl']

# Same for the rows of the transposed table.
dend_sample = shc.dendrogram(shc.linkage(values_t, method='ward'),labels=values_t.index)
dend_sample_order = dend_sample['ivl']

# Reorder columns, then rows, to follow the two dendrogram leaf orders.
df = values[dend_sample_order]
df = df.reindex(dend_metabolite_order)

values_t = values.T
values_t.index.name = 'Sample'
#plt.figure(figsize=(10, 7))
#!/usr/bin/env python3 from matplotlib import pyplot as plt import numpy as np from scipy.cluster.hierarchy import dendrogram from imagecluster import calc as ic from imagecluster import io as icio images = icio.read_images('pics/', size=(224,224)) model = ic.get_model() fingerprints = ic.fingerprints(images, model) clusters,extra = ic.cluster(fingerprints, sim=0.5, extra_out=True) # linkage matrix Z fig,ax = plt.subplots() dendrogram(extra['Z'], ax=ax) # Adjust yaxis labels (values from Z[:,2]) to our definition of the `sim` # parameter. ymin, ymax = ax.yaxis.get_data_interval() tlocs = np.linspace(ymin, ymax, 5) ax.yaxis.set_ticks(tlocs) tlabels = np.linspace(1, 0, len(tlocs)) ax.yaxis.set_ticklabels(tlabels) ax.set_xlabel("image index") ax.set_ylabel("sim") fig.savefig('dendrogram.png') plt.show()
def heatmap_vec(x, y, vec):
    """Draw a row dendrogram beside a heatmap and return the ordered labels.

    Rows of ``vec`` whose mean is not strictly positive are discarded. The
    remaining rows are passed to ``analyse.all_corr`` and clustered with
    complete linkage; the heatmap shows the returned ``norm`` rows in
    reversed dendrogram order.

    Args:
        x: Column tick labels for the heatmap.
        y: Row labels, parallel to the rows of ``vec``.
        vec: 2-D array-like of values.

    Returns:
        The kept row labels in dendrogram leaf order.
    """
    vec = np.array(vec)
    row_means = np.mean(vec, axis=1)

    # Keep only rows with a strictly positive mean, with matching labels.
    kept_rows = []
    kept_labels = []
    for row_idx, row_mean in enumerate(row_means):
        if row_mean > 0.:
            kept_rows.append(vec[row_idx])
            kept_labels.append(y[row_idx])
    mat = np.array(kept_rows)

    norm, corr, dist = analyse.all_corr(mat.T)
    del corr

    fig = plt.figure(figsize=(8, 8))

    # Left panel: dendrogram of the kept rows.
    ax1 = fig.add_axes([0.09, 0.1, 0.25, 0.6])
    print('fastcluster...')
    z = fastcluster.linkage(dist, method='complete')
    del dist
    Z1 = sch.dendrogram(z, orientation='right')

    # Relabel the distance axis with halved tick values.
    halved_ticks = np.array(ax1.get_xticks())
    halved_ticks /= 2.
    ax1.set_xticklabels(['%.1f' % a for a in halved_ticks])

    idx1 = Z1['leaves']
    yy = [kept_labels[int(leaf)] for leaf in idx1]
    ax1.set_yticks(range(len(yy)), yy)

    # The font size shrinks as the number of labels grows.
    label_count = len(yy)
    for upper_bound, size in ((20, 12), (50, 6), (150, 4), (250, 3),
                              (500, 2), (1500, 1)):
        if label_count < upper_bound:
            label_size = size
            break
    else:
        label_size = .2
    ax1.set_yticklabels(yy, fontsize=label_size)

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.4, 0.1, 0.5, 0.6])
    D = norm[idx1, :][::-1, :]
    im = axmatrix.matshow(D,
                          aspect='auto',
                          origin='lower',
                          cmap='RdYlBu',
                          alpha=0.8,
                          vmin=0)
    plt.xticks(np.arange(len(x)), x)
    plt.xticks(rotation=90)
    mytemplate(D)
    plt.xticks(fontsize=6)
    axmatrix.set_yticks([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.3, 0.01, 0.4])
    plt.colorbar(im, cax=axcolor)
    return yy
motif_dict[i] = 1 motif_list = list(motif_dict.keys()) #### Matrix for levenstein_distances cols = len(motif_list) rows = cols distance_matrix = np.zeros((rows, cols)) for i in range(rows): distance_matrix[i][i] = 0 for ii in range(i + 1, cols): distance_matrix[i][ii] = \ iterative_levenshtein(motif_list[i],motif_list[ii],costs =cost) ################# flip matrix distance_matrix[ii][i] = distance_matrix[i][ii] return distance_matrix dist_matrix = levenstein_distances(motif, (2, 2, 2)) print(dist_matrix) #dist_matrix_unroll = [item for sublist in dist_matrix for item in sublist] from scipy.cluster import hierarchy import matplotlib.pyplot as plt Z = hierarchy.linkage(dist_matrix) plt.xlabel('Selected Motifs') plt.ylabel('Levinstein Edit Distances (No. of Edits)') plt.title('Motif Candidate Cluster') hierarchy.dendrogram(Z, leaf_rotation=10, leaf_font_size=7, labels=motif) plt.show()
# LabelEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Integer-encode every categorical column in place. One encoder instance is
# refitted per column, in this order.
categorical_columns = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'cylindernumber',
    'fuelsystem',
]
for column in categorical_columns:
    train_x[column] = le.fit_transform(train_x[column])

# Normalize to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
train_x = min_max_scaler.fit_transform(train_x)

# Cluster analysis
from scipy.cluster.hierarchy import dendrogram, ward
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt

model = AgglomerativeClustering(linkage='ward', n_clusters=10)
y = model.fit_predict(train_x)
print(y)

# Dendrogram of the Ward linkage over the same scaled features.
linkage_matrix = ward(train_x)
dendrogram(linkage_matrix)
plt.show()
topics_matrix = lda.show_topics(formatted=False)

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between the rows returned by lda.get_topics().
dist = 1 - cosine_similarity(lda.get_topics())

# =============================================================================
# MDS()
# mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
#
# pos = mds.fit_transform(dist)  # shape (n_components, n_samples)
#
# xs, ys = pos[:, 0], pos[:, 1]
#
#
# plt.scatter(x = xs, y = ys)
# plt.show()
# =============================================================================

# NOTE(review): `dist` is a square matrix here; scipy treats a 2-D input as
# observation vectors, not as pairwise distances -- confirm this is intended.
linkage_matrix = ward(dist)

plt.clf()
plt.figure(figsize=(40, 20))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('person Id')
plt.ylabel('Distance')
dendrogram(linkage_matrix, leaf_rotation=90., leaf_font_size=12.)
# Saved before show() so the file contains the drawn figure.
plt.savefig('ward_dendrogram.png', dpi=200)
plt.show()
# print(linkage_matrix)
# print(lda.get_topics())
def create_heatmap(matrix,
                   main_title=None,
                   output_filename=None,
                   pd_data_1=None,
                   pd_data_2=None,
                   data_hist_1=None,
                   data_hist_2=None,
                   graphs=None,
                   ordered=False,
                   fitness_slice=500):
    """Draw a heatmap of ``matrix`` surrounded by optional auxiliary plots.

    Args:
        matrix: 2-D data for the heatmap (wrapped in a ``DataFrame``).
        main_title: Figure title. When ``pd_data_1`` is given, its last
            ``'y'`` value is appended in parentheses.
        output_filename: If set, the figure is saved there and closed;
            otherwise it is shown.
        pd_data_1: Line data with ``'x'``/``'y'`` entries, or a
            ``(title, data)`` tuple, drawn above the heatmap.
        pd_data_2: Same shape as ``pd_data_1``, drawn below the heatmap.
        data_hist_1: Values (or ``(title, values)``) for the first bottom
            histogram.
        data_hist_2: Values (or ``(title, values)``) for the second bottom
            histogram; skipped when empty.
        graphs: Iterable of ``(title, graph, histogram)`` triples, one grid
            row each, in the two left-hand columns.
        ordered: Reorder rows and columns by complete-linkage dendrograms.
        fitness_slice: Width of the x-window shown for ``pd_data_1``.
    """
    matrixdf = pd.DataFrame(matrix)
    font = {'family': 'normal', 'weight': 'normal', 'size': 8}
    last_data_1 = 0.0
    matplotlib.rc('font', **font)

    if ordered:
        # NOTE(review): linkage() receives the *square* distance matrix, which
        # scipy treats as observation vectors -- confirm this is intended.
        row_pairwise_dists = squareform(pdist(matrixdf))
        row_clusters = linkage(row_pairwise_dists, method='complete')
        row_dendogram = dendrogram(row_clusters,
                                   no_plot=True,
                                   count_sort='ascending')
        col_pairwise_dists = squareform(pdist(matrixdf.T))
        col_clusters = linkage(col_pairwise_dists, method='complete')
        col_dendogram = dendrogram(col_clusters,
                                   no_plot=True,
                                   count_sort='ascending')

    # plot the results
    fig = plt.figure(figsize=(12.5, 10))
    plot_gridspec = gridspec.GridSpec(
        5, 5, width_ratios=[0.15, 0.15, 0.2, 0.2, 0.2])

    ### top line plot ###
    if pd_data_1 is not None:
        title = ''
        if isinstance(pd_data_1, tuple):
            title, pd_data_1 = pd_data_1[0], pd_data_1[1]
        last_data_1 = pd_data_1['y'][len(pd_data_1) - 1]
        ax1 = plt.subplot(plot_gridspec[0, 2:])
        slice_base = max(0, len(pd_data_1) - fitness_slice)
        plt.plot(pd_data_1['x'], pd_data_1['y'], linestyle='-')
        plt.xlim(slice_base, len(pd_data_1))
        plt.title(title)

    ### graphs and their component-size histograms ###
    if graphs is not None:
        # enumerate() gives every graph its own grid row. The old manual
        # counter was skipped by `continue`, so a graph with an empty
        # histogram made the next graph overdraw the same row.
        for gs_index, (title, graph, graph_histogram) in enumerate(graphs):
            ax3 = plt.subplot(plot_gridspec[gs_index, 0])
            graph = graph.to_undirected()
            # we don't care about the weight because we already are filtering here
            nx.draw(graph,
                    node_size=2,
                    width=0.4,
                    with_labels=False,
                    pos=nx.spring_layout(graph, weight=None))
            plt.title(title)

            ax3 = plt.subplot(plot_gridspec[gs_index, 1])
            # let's add the histogram, but remove all 1 values
            graph_histogram_without_one = [
                v for v in graph_histogram if v != 1
            ]
            print(str(graph_histogram_without_one))
            if not graph_histogram_without_one:
                continue
            binwidth = 1
            min_bin = numpy.min(graph_histogram_without_one)
            max_bin = numpy.max(graph_histogram_without_one)
            bins = range(min_bin, max_bin + binwidth, binwidth)
            ax3.hist(graph_histogram_without_one,
                     bins=bins,
                     facecolor='red',
                     alpha=0.45)
            plt.xticks(numpy.unique(graph_histogram_without_one))
            plt.tick_params(axis='both', which='major', labelsize=5)
            plt.tick_params(axis='both', which='minor', labelsize=5)
            if gs_index == 0:
                plt.title("Components size\nhistogram")

    ### heatmap ###
    heatmap_subplot = fig.add_subplot(plot_gridspec[1:4, 2:])
    if ordered:
        # Dendrogram leaves are positions, so index positionally
        # (`.ix` no longer exists in pandas).
        heat_data = matrixdf.iloc[row_dendogram['leaves'],
                                  col_dendogram['leaves']]
    else:
        heat_data = matrixdf
    axi = heatmap_subplot.imshow(heat_data,
                                 interpolation='nearest',
                                 aspect='auto',
                                 origin='lower')
    # removes ticks
    heatmap_subplot.get_xaxis().set_ticks([])
    heatmap_subplot.get_yaxis().set_ticks([])
    axcolor = fig.add_axes([0.91, 0.27, 0.02, 0.45])
    plt.colorbar(axi, cax=axcolor)

    ### bottom row ###
    if pd_data_2 is not None:
        title = ''
        if isinstance(pd_data_2, tuple):
            title, pd_data_2 = pd_data_2[0], pd_data_2[1]
        ax3 = fig.add_subplot(plot_gridspec[4, 2])
        plt.plot(pd_data_2['x'], pd_data_2['y'], linestyle='-', marker='.')
        plt.xlim(min(pd_data_2['x']), max(pd_data_2['x']))
        plt.title(title)

    if data_hist_1 is not None:
        title = ''
        if isinstance(data_hist_1, tuple):
            title, data_hist_1 = data_hist_1[0], data_hist_1[1]
        ax3 = fig.add_subplot(plot_gridspec[4, 3])
        ax3.hist(data_hist_1, facecolor='blue', alpha=0.45)
        plt.tick_params(axis='both', which='major', labelsize=5)
        plt.tick_params(axis='both', which='minor', labelsize=5)
        plt.title(title)

    if data_hist_2 is not None:
        title = ''
        if isinstance(data_hist_2, tuple):
            title, data_hist_2 = data_hist_2[0], data_hist_2[1]
        # len() instead of bare truthiness so numpy arrays work as well.
        if len(data_hist_2) > 0:
            ax3 = fig.add_subplot(plot_gridspec[4, 4])
            ax3.hist(data_hist_2, facecolor='blue', alpha=0.45)
            plt.tick_params(axis='both', which='major', labelsize=5)
            plt.tick_params(axis='both', which='minor', labelsize=5)
            plt.title(title)

    if main_title:
        if pd_data_1 is not None:
            main_title = main_title + '\n(' + str(last_data_1).strip() + ')'
        plt.suptitle(main_title)

    if output_filename:
        plt.savefig(output_filename)
        plt.close()
    else:
        plt.show()
# In[117]: market = pd.read_csv('C:/Users/USER/Desktop/test/DirectMarketing.csv') from scipy.cluster.hierarchy import linkage, dendrogram from scipy.cluster.hierarchy import fcluster from sklearn.cluster import AgglomerativeClustering from sklearn.preprocessing import StandardScaler market = market.dropna() scaler_s = StandardScaler() market = market[['Salary', 'Children', 'Location', 'AmountSpent']] used_data_dummy = pd.get_dummies(market, drop_first=True) data_features_s = pd.DataFrame(scaler_s.fit_transform(used_data_dummy)) clustering = linkage(data_features_s, 'ward') dn = dendrogram(clustering) # #### (b) 군집의 수를 2개부터 10개까지 늘려가면서 실루엣 스코어를 구하고, 군집 수와 실루엣 스코어를 그래프로 나타내라. 실루엣 스코어로 보았을 때 가장 적절한 군 집의 수는 몇개인가? (7점) # In[114]: si = [] K = range(2, 11) for k in K: cl = fcluster(clustering, k, criterion='maxclust') si.append(silhouette_score(data_features_s, cl, metric='euclidean')) plt.plot(K, si, 'bx-') plt.xlabel('k') plt.ylabel('silhouette Score') plt.show()
def cluster_distance_mat(dist_mat, names, figsize=(8, 8)):
    """Plot a distance matrix reordered by average-linkage clustering.

    A dendrogram is drawn above the matrix, and both the rows and the
    columns of the matrix follow its leaf order.

    Parameters
    ----------
    dist_mat : np.array
        Distance matrix array.
    names : list_like
        Names of ticks for distance matrix. Any indexable sequence works
        (list, tuple, array).
    figsize : tuple
        Size of figure, passed to matplotlib

    Returns
    -------
    fig : matplotlib figure
        The figure holding the dendrogram, the matrix and the colorbar.
    """
    fig = plt.figure(figsize=figsize)

    # Compute and plot the dendrogram above the matrix.
    # NOTE(review): a square `dist_mat` is treated by scipy's linkage() as
    # observation vectors, not as pairwise distances -- confirm intended.
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Y = sch.linkage(dist_mat, method='average')
    Z2 = sch.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])

    # reorder matrix rows, columns and names by the dendrogram leaf order
    idx1 = Z2['leaves']
    dist_mat = dist_mat[idx1, :]
    dist_mat = dist_mat[:, idx1]
    # Element-wise lookup, so plain lists work (list[list] raises TypeError).
    names = [names[i] for i in idx1]

    # create figure
    im = axmatrix.matshow(dist_mat,
                          aspect='auto',
                          origin='lower',
                          cmap=plt.cm.Reds,
                          vmin=0,
                          vmax=1)

    # add xtick labels
    axmatrix.set_xticks(range(len(names)))
    axmatrix.set_xticklabels(names, minor=False)
    axmatrix.xaxis.set_label_position('bottom')
    axmatrix.xaxis.tick_bottom()
    plt.xticks(rotation=90, fontsize=8)

    # add ytick labels
    axmatrix.set_yticks(range(len(names)))
    axmatrix.set_yticklabels(names, minor=False)
    axmatrix.yaxis.set_label_position('left')
    axmatrix.yaxis.tick_left()
    plt.yticks(rotation=0, fontsize=8)

    # add colorbar
    axcolor = fig.add_axes([0.94, 0.1, 0.02, 0.6])
    plt.colorbar(im, cax=axcolor)
    return fig
def plot_heatmap(matrix=None,
                 matrixdf=None,
                 main_title=None,
                 output_filename=None,
                 titles=None,
                 ordered=False,
                 font_size=9,
                 font_family='normal',
                 font_weight='normal',
                 figsize=None,
                 tight_layout=None,
                 titles_x=None,
                 titles_y=None,
                 values_on=False,
                 values_on_text=None,
                 vmin=-1.0,
                 vmax=1.0,
                 grid=False,
                 x_label=None,
                 y_label=None,
                 set_yticks=None,
                 set_xticks=None,
                 subplot_adjust=None,
                 colorbar_on=True,
                 **kargs):
    """Draw a single heatmap, optionally reordered by hierarchical clustering.

    Args:
        matrix: 2-D data; wrapped in a ``DataFrame`` when given.
        matrixdf: ``DataFrame`` to plot when ``matrix`` is not given.
        main_title: Figure title.
        output_filename: If set, the figure is saved there and closed.
        titles: Tick labels used for both axes unless ``titles_x`` /
            ``titles_y`` override them.
        ordered: Reorder rows and columns by complete-linkage dendrograms.
        font_size, font_family, font_weight: Accepted for compatibility;
            not used by the current implementation.
        figsize: Figure size passed to matplotlib.
        tight_layout: ``None`` leaves the layout alone; a falsy value calls
            ``plt.tight_layout()``; a truthy value is passed as ``rect``.
        titles_x, titles_y: Per-axis tick labels.
        values_on: Write each cell's value onto the heatmap.
        values_on_text: Alternative per-cell text (formatted with ``{:s}``);
            defaults to the matrix values (formatted with ``{:0.2f}``).
        vmin, vmax: Colour scale limits.
        grid: Draw a grid.
        x_label, y_label: Axis labels.
        set_yticks, set_xticks: Explicit tick positions.
        subplot_adjust: Positional arguments for ``plt.subplots_adjust``.
        colorbar_on: Add a colorbar.
        **kargs: Forwarded to ``matshow``.
    """
    assert (matrix is not None
            or matrixdf is not None), "Give me matrix or matrixdf!"
    if matrix is not None:
        matrixdf = pd.DataFrame(matrix)

    if ordered:
        # NOTE(review): linkage() receives the *square* distance matrix, which
        # scipy treats as observation vectors -- confirm this is intended.
        row_pairwise_dists = squareform(pdist(matrixdf))
        row_clusters = linkage(row_pairwise_dists, method='complete')
        row_dendogram = dendrogram(row_clusters,
                                   no_plot=True,
                                   count_sort='ascending')
        col_pairwise_dists = squareform(pdist(matrixdf.T))
        col_clusters = linkage(col_pairwise_dists, method='complete')
        col_dendogram = dendrogram(col_clusters,
                                   no_plot=True,
                                   count_sort='ascending')
        row_leaves = row_dendogram['leaves']
        col_leaves = col_dendogram['leaves']

    # plot the results (figsize=None falls back to matplotlib's default)
    fig = plt.figure(figsize=figsize)

    ### heatmap ###
    heatmap_subplot = fig.add_subplot(111)
    if titles and not titles_x:
        titles_x = titles
    if titles and not titles_y:
        titles_y = titles

    if ordered:
        # Dendrogram leaves are positions, so index positionally
        # (`.ix` no longer exists in pandas).
        axi = heatmap_subplot.matshow(matrixdf.iloc[row_leaves, col_leaves],
                                      interpolation='nearest',
                                      aspect='auto',
                                      origin='lower',
                                      vmin=vmin,
                                      vmax=vmax,
                                      **kargs)
        if titles_x:
            heatmap_subplot.set_xticklabels(
                [titles_x[i] for i in col_leaves], rotation=90)
        if titles_y:
            heatmap_subplot.set_yticklabels(
                [titles_y[i] for i in row_leaves])
    else:
        axi = heatmap_subplot.matshow(matrixdf,
                                      interpolation='nearest',
                                      aspect='auto',
                                      origin='lower',
                                      vmin=vmin,
                                      vmax=vmax,
                                      **kargs)
        if titles_x:
            heatmap_subplot.set_xticklabels(titles_x, rotation=0)
            # Booleans, not 'on'/'off': any non-empty string is truthy, so
            # labeltop='off' would keep the top labels on current matplotlib.
            heatmap_subplot.tick_params(labelbottom=True, labeltop=False)
        if titles_y:
            heatmap_subplot.set_yticklabels(titles_y)

    if set_xticks:
        heatmap_subplot.set_xticks(set_xticks)
    if set_yticks:
        heatmap_subplot.set_yticks(set_yticks)
    if colorbar_on:
        plt.colorbar(axi)

    # Cell annotations: caller-supplied text, or the values to 2 decimals.
    values_on_text_format = '{:s}'
    if values_on_text is None:
        values_on_text = matrixdf
        values_on_text_format = '{:0.2f}'
    if values_on:
        if ordered:
            cell_values = values_on_text.iloc[row_leaves, col_leaves]
        else:
            cell_values = values_on_text
        for (i, j), z in np.ndenumerate(cell_values):
            heatmap_subplot.text(j,
                                 i,
                                 values_on_text_format.format(z),
                                 ha='center',
                                 va='center',
                                 weight='medium')

    if tight_layout is not None:
        if not tight_layout:
            plt.tight_layout()
        else:
            plt.tight_layout(rect=tight_layout)
    if subplot_adjust:
        plt.subplots_adjust(*subplot_adjust)
    if grid:
        plt.grid()
    if x_label:
        plt.xlabel(x_label)
    if y_label:
        plt.ylabel(y_label)
    if main_title:
        plt.suptitle(main_title)
    if output_filename:
        plt.savefig(output_filename)
        plt.close()
def plot_heat_dendrogram(Y1, Y2, dist, labels, figname, cmap=pylab.cm.YlGnBu, ratio=0.6):
    """
    Plot a distance matrix as a heatmap framed by two dendrograms, so that two
    clusterings of the same items (e.g. centroid vs single linkage) can be
    compared.

    @Y1 linkage matrix drawn on the left; its leaf order orders the rows
    @Y2 linkage matrix drawn on top; its leaf order orders the columns
    @dist distances between the items, converted with squareform (a condensed
          vector is assumed, so that the result is a square matrix)
    @labels one label per item, used as tick labels
    @figname path the figure is saved to; nothing is saved when empty/None
    @cmap, color map to use
    @ratio fraction of the largest merge distance used as color threshold
    """
    Dist_Matrix = squareform(dist)
    # Accept plain lists as well as arrays: labels are fancy-indexed below.
    labels = np.asarray(labels)

    # Compute and plot first dendrogram (left, defines the row order).
    fig = pylab.figure(figsize=(25, 25))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
    Z1 = dendrogram(Y1, orientation='right',
                    color_threshold=ratio * max(Y1[:, 2]), ax=ax1)
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Compute and plot second dendrogram (top, defines the column order).
    ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
    Z2 = dendrogram(Y2, color_threshold=ratio * max(Y2[:, 2]), ax=ax2)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # Plot distance matrix, rows following Z1 and columns following Z2.
    axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
    row_order = Z1['leaves']
    col_order = Z2['leaves']
    Dist_Matrix = Dist_Matrix[row_order, :]
    Dist_Matrix = Dist_Matrix[:, col_order]
    im = axmatrix.matshow(Dist_Matrix, aspect='auto', origin='lower', cmap=cmap)

    # The x axis runs over columns (Z2 order), the y axis over rows (Z1 order).
    axmatrix.set_xticks(range(np.shape(Dist_Matrix)[1]))
    axmatrix.set_xticklabels(labels[col_order], minor=False)
    axmatrix.xaxis.set_label_position('bottom')
    axmatrix.xaxis.tick_bottom()
    pylab.xticks(rotation=-90, fontsize=9)

    axmatrix.set_yticks(range(np.shape(Dist_Matrix)[0]))
    axmatrix.set_yticklabels(labels[row_order], minor=False)
    axmatrix.yaxis.set_label_position('right')
    axmatrix.yaxis.tick_right()

    # Plot colorbar
    axcolor = fig.add_axes([0.95, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)

    # Save only when a target path was actually given.
    if figname:
        fig.savefig(figname)
def cluster(p_y_x, beta, visual):
    # The main clustering function - performs bottom-up clustering using the IB criterion
    # Inputs:
    # p_y_x: Conditional probability p(y|x)
    #        A numpy array of size [N,P]
    # beta:  Tradeoff parameter in the IB objective
    #        A scalar
    # visual: Print dendrogram
    #        Boolean value
    #
    # Outputs:
    # Z: Dendrogram variable, a numpy array of size [N-1,4]. Each row is
    #    [label of merged cluster a, label of merged cluster b,
    #     accumulated merge cost, 2]; the last column is always written as 2.
    # C: List with the labels of the two clusters that remain before the final merge
    #
    # Objective: Min (1/beta)*I(X,C) - I(Y,C)
    # X: Features at segment-level
    # Y: Relevance variable, typically components from a GMM
    #
    # NOTE: This function ALWAYS creates 2 clusters. Use the fcluster() method to prune the dendrogram
    # variable with the desired criterion. Refer infoBottleneck.py for usage
    print("Performing agglomerative clustering using IB objective...")
    N, P = np.shape(p_y_x)
    np.random.seed(1000)
    N_init = N

    # NOTE(review): the joint is not used below; the call is kept in case the
    # helper validates its input - confirm before removing.
    p_x_y_joint = getJointFromConditional(p_y_x)

    print("Initialization...")
    # Every sample starts as its own cluster.
    C = list(range(N))
    p_c = np.full(N, 1.0 / N)
    p_y_c = np.array(p_y_x, dtype=float)  # p(y|c), NOT p(y,c)
    p_c_x = np.eye(N)  # rows: samples x, columns: clusters c
    p_x_c = np.eye(N)  # rows: clusters c, columns: samples x

    # Merge costs live in the strict lower triangle; everything else is inf so
    # that argmin never selects the diagonal or a duplicate pair.
    delta_F = np.full((N, N), np.inf)
    for i in range(N):
        for j in range(i):
            delta_F[i, j] = fastverbose_computeDeltaObj(p_y_c[i, :], p_y_c[j, :],
                                                        p_x_c[i, :], p_x_c[j, :],
                                                        p_c, i, j, beta)

    # Clustering
    max_clust_ind = max(C)
    Z = np.empty((max_clust_ind, 4))
    curr_val = 0
    iterIndex = 0
    print("Number of clusters = " + str(N))

    while len(C) > 2:
        if N % 100 == 0:
            print("Number of clusters = " + str(N))

        # Cheapest pair of clusters to merge.
        i_opt, j_opt = np.unravel_index(np.argmin(delta_F), delta_F.shape)
        curr_val += abs(delta_F[i_opt, j_opt])
        Z[iterIndex] = [C[i_opt], C[j_opt], curr_val, 2]
        iterIndex += 1

        merged = [i_opt, j_opt]
        merged_weight = p_c[i_opt] + p_c[j_opt]

        # p(y|c) of the union: weighted average of the two merged clusters.
        merged_p_y_c = (p_y_c[i_opt, :] * p_c[i_opt] +
                        p_y_c[j_opt, :] * p_c[j_opt]) / merged_weight

        # p(c|x) of the union: a sample belongs to the new cluster iff it
        # belonged to either merged cluster. (i_opt/j_opt index the *current*
        # clusters, which stop coinciding with sample indices after the first
        # merge, so they must not be used as sample positions.)
        merged_p_c_x = p_c_x[:, i_opt] + p_c_x[:, j_opt]

        # Drop the two merged clusters and append the new one at the end.
        C = [label for k, label in enumerate(C) if k != i_opt and k != j_opt]
        C.append(max_clust_ind + 1)
        max_clust_ind += 1

        p_y_c = np.vstack((np.delete(p_y_c, merged, 0), merged_p_y_c))
        p_c_x = np.column_stack((np.delete(p_c_x, merged, 1), merged_p_c_x))
        p_c = np.append(np.delete(p_c, merged), merged_weight)
        delta_F = np.delete(np.delete(delta_F, merged, 0), merged, 1)

        # Update p(x|c) = p(c|x) p(x) / p(c) with p(x) = 1/N_init;
        # shape (number of clusters, N_init).
        p_x_c = p_c_x.T / (N_init * p_c[:, np.newaxis])

        N -= 1

        # Flush tiny values to exactly zero (10e-10 is 1e-9).
        p_y_c[p_y_c < 10e-10] = 0.
        p_c_x[p_c_x < 10e-10] = 0.
        p_x_c[p_x_c < 10e-10] = 0.
        p_c[p_c < 10e-10] = 0.

        # Update delta_F: a new row with the cost of merging every remaining
        # cluster with the new one, and a new column of "inf".
        last = len(p_c) - 1
        newrow = np.zeros(N - 1)
        for i in range(N - 1):
            newrow[i] = fastverbose_computeDeltaObj(p_y_c[i, :], p_y_c[last, :],
                                                    p_x_c[i, :], p_x_c[last, :],
                                                    p_c, i, last, beta)
        newcol = np.full((N, 1), np.inf)
        delta_F = np.hstack((np.vstack((delta_F, newrow)), newcol))

    # Complete the dendrogram variable: the final merge is placed slightly
    # above the last recorded one (or above 0 when there was none, i.e. N == 2).
    max_val = Z[-2, 2] if iterIndex > 0 else 0.0
    Z[-1] = [C[0], C[1], max_val + 0.01, 2]

    # Visualization, not really feasible for large utterances
    if visual == 1:
        plt.figure(figsize=(25, 10))
        dendrogram(Z)
        plt.show()

    return Z, C
# +
from scipy.cluster.hierarchy import fcluster, linkage, dendrogram

# Complete-linkage clustering on Euclidean distances, cut into 3 flat clusters.
Z = linkage(X_train, method="complete", metric="euclidean")

# Translate the cluster ids 1..3 into plotting colors.
clusters = pd.Series(fcluster(Z, 3, criterion="maxclust")).map(
    {1: "r", 2: "b", 3: "y"}
)

X_train.plot.scatter(
    x="PetalLength",
    y="PetalWidth",
    c=clusters,
    marker="x",
    alpha=0.5,
)
# -

dendrogram(Z)

# # Exercises

# **Exercise 1.** Fit a hierarchical clustering model to the Titanic passengers dataset (`https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv`). You are free to choose which features to include (but include both categorical and quantitative features) and the linkage function. Then, choose a number of clusters that seems appropriate. Look at the profiles of the passengers in each cluster. Can you come up with an "interpretation" of each cluster based on the passengers in it?

# +
# TYPE YOUR CODE HERE
# -

# **Exercise 2.** The code below reads in the "two moons" dataset, a synthetic dataset that is used to evaluate clustering algorithms. What clusters do you think hierarchical clustering will find if you use single linkage? What if you use average linkage? Once you have a hypothesis for each type of linkage, test out your hypothesis by fitting the model to this dataset and plotting the resulting clusters.

# TYPE YOUR CODE HERE
moons = pd.read_csv(
    "https://raw.githubusercontent.com/dlsun/data-science-book/master/data/two_moons.csv"
)
# Hierarchical Clustering

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('Mall_Customers.csv')
# Keep columns 3 and 4 as a 2-D array (one row per record, two features).
X = dataset.iloc[:, 3:5].values

# Using the dendrogram to find the optimal number of clusters
import scipy.cluster.hierarchy as sch
# Ward linkage merges the pair of clusters that least increases the
# within-cluster variance.
dendogram = sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Dendogram')
plt.xlabel('Customers')
plt.ylabel('Eucledian Distance')
# Must be *called*; a bare `plt.show` is a no-op and nothing is displayed.
plt.show()

# Fitting hierarchical clustering to the mall dataset
from sklearn.cluster import AgglomerativeClustering
# Ward linkage only supports (and defaults to) Euclidean distance, so the
# distance keyword - named `affinity` in older scikit-learn and `metric` in
# newer releases - is left at its default to work with both.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_hc = hc.fit_predict(X)

# Visualising the clusters
plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s=100, c='red', label='Careful')
# NOTE(review): the script is cut off in the middle of the next call; the
# dangling fragment is preserved below as a comment until the remaining
# arguments are restored.
# plt.scatter(X[y_hc == 1, 0],
def cluster_dendogram(
    corpus: List[str],
    vectorizer,
    titles: List[str] = None,
    stopwords=get_stopwords,
    cleaning=simple_textcleaning,
    random_samples: float = 0.3,
    ngram: Tuple[int, int] = (1, 3),
    figsize: Tuple[int, int] = (17, 9),
    batch_size: int = 20,
):
    """
    plot hierarchical dendogram with similar texts.

    Parameters
    ----------

    corpus: List[str]
    vectorizer: class
        vectorizer class. If it has a `fit` method it is fitted on the sampled
        corpus and `transform` is used; otherwise `vectorize` (and `attention`,
        when available) is called batch by batch.
    titles: List[str], (default=None)
        list of titles, length must same with corpus. The titles of the
        randomly sampled texts are used as leaf labels. If not provided,
        a label is generated for each text from its `ngram[1]` highest-weighted
        features (or highest-attention words).
    stopwords: List[str], (default=malaya.texts.function.get_stopwords)
        A callable that returned a List[str], or a List[str], or a Tuple[str]
    cleaning: function, (default=malaya.text.function.simple_textcleaning)
        function to clean the corpus.
    random_samples: float, (default=0.3)
        random samples from the corpus, 0.3 means 30%. Must be strictly
        between 0 and 1.
    ngram: Tuple[int, int], (default=(1,3))
        n-grams size to train a corpus. Only `ngram[1]` is read here, as the
        number of terms in a generated title.
    figsize: Tuple[int, int], (default=(17, 9))
        size of the matplotlib figure.
    batch_size: int, (default=20)
        size of strings for each vectorization and attention. Only useful if use transformer vectorizer.

    Returns
    -------
    dictionary: {'linkage_matrix': linkage_matrix, 'titles': titles}

    Raises
    ------
    ValueError
        if `titles` and `corpus` differ in length, if `random_samples` is not
        in (0, 1), or if fewer than two texts are sampled.
    ModuleNotFoundError
        if matplotlib, seaborn or scipy cannot be imported.
    """
    if titles:
        if len(titles) != len(corpus):
            raise ValueError('length of titles must be same with corpus')

    validator.validate_object_methods(vectorizer, ['vectorize', 'fit'], 'vectorizer')
    stopwords = validator.validate_stopwords(stopwords)
    validator.validate_function(cleaning, 'cleaning')

    if not (random_samples < 1 and random_samples > 0):
        raise ValueError('random_samples must be between 0 and 1')

    # Only a failed import means "not installed"; other errors must surface.
    try:
        import matplotlib.pyplot as plt
        import seaborn as sns
        from scipy.cluster.hierarchy import ward, dendrogram
    except ImportError as e:
        raise ModuleNotFoundError(
            'matplotlib and seaborn not installed. Please install it and try again.'
        ) from e
    sns.set()

    # Sample positions rather than texts so that user-supplied titles can be
    # subsampled identically and stay aligned with the dendrogram leaves.
    sampled = random.sample(
        range(len(corpus)), k=int(random_samples * len(corpus))
    )
    if len(sampled) < 2:
        raise ValueError(
            'random_samples selects fewer than 2 texts, cannot build a dendrogram'
        )
    corpus = [corpus[i] for i in sampled]
    if titles:
        titles = [titles[i] for i in sampled]

    if cleaning is not None:
        corpus = [cleaning(text) for text in corpus]
    text_clean = [
        ' '.join(word for word in text.split() if word not in stopwords)
        for text in corpus
    ]

    uses_fit = hasattr(vectorizer, 'fit')
    if uses_fit:
        vectorizer.fit(text_clean)
        transformed_text_clean = vectorizer.transform(text_clean)
        # get_feature_names() no longer exists in recent scikit-learn releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = vectorizer.get_feature_names_out()
        else:
            features = vectorizer.get_feature_names()
    else:
        transformed_text_clean, attentions = [], []
        for start in range(0, len(text_clean), batch_size):
            batch = text_clean[start:start + batch_size]
            transformed_text_clean.append(vectorizer.vectorize(batch))
            if hasattr(vectorizer, 'attention'):
                attentions.extend(vectorizer.attention(batch))
            else:
                # No attention available: every word gets the same weight.
                attentions.extend(
                    [(w, 1.0) for w in s.split()] for s in batch
                )
        transformed_text_clean = np.concatenate(transformed_text_clean, axis=0)

    dist = 1 - cosine_similarity(transformed_text_clean)
    # NOTE(review): `dist` is a square matrix, which scipy's ward() treats as
    # observation vectors rather than pairwise distances - confirm this is
    # intended before changing it, since callers receive the linkage matrix.
    linkage_matrix = ward(dist)

    if not titles:
        top_k = ngram[1]
        titles = []
        for row in range(transformed_text_clean.shape[0]):
            if uses_fit:
                # Feature indices sorted by descending weight for this text.
                weights = np.array(transformed_text_clean[row].todense())[0]
                ranked = np.argsort(weights)[::-1]
                titles.append(' '.join(features[j] for j in ranked[:top_k]))
            else:
                # Ascending sort, so the strongest words are at the end.
                attentions[row].sort(key=lambda pair: pair[1])
                titles.append(
                    ' '.join(pair[0] for pair in attentions[row][-top_k:])
                )

    plt.figure(figsize=figsize)
    dendrogram(linkage_matrix, orientation='right', labels=titles)
    # Booleans, not 'off': non-empty strings are truthy and would keep the ticks.
    plt.tick_params(
        axis='x',
        which='both',
        bottom=False,
        top=False,
        labelbottom=False,
    )
    plt.tight_layout()
    plt.show()

    return {'linkage_matrix': linkage_matrix, 'titles': titles}
# Tokenise every line of every novel with jieba and collect the token lists.
for novel in novels:
    print ("处理:{}".format(novel))
    with codecs.open('{}.txt'.format(novel), encoding="utf8") as f:
        sentences.extend(list(jieba.cut(line.strip())) for line in f)

# NOTE(review): `size=`, `name in model` and `model[name]` follow the pre-4.0
# gensim API - confirm the gensim version this script is meant to run on.
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)

# Example: nearest neighbours of one character name
#for k, s in model.most_similar(positive=["牧尘"]):
    #print (k,s)

# Keep only the names the model has a vector for, then look the vectors up.
all_names = np.array(
    [name for name in novel_names["\ufeff斗破苍穹"] if name in model]
)
word_vectors = np.array([model[name] for name in all_names])

import scipy.cluster.hierarchy as sch

# Ward-linkage dendrogram of the name vectors, leaves labelled with the names.
Y = sch.linkage(word_vectors, method="ward")
_, ax = plt.subplots(figsize=(10, 40))
Z = sch.dendrogram(Y, orientation='right')
idx = Z['leaves']
ax.set_xticks([])
ax.set_yticklabels(all_names[idx], fontproperties=font_yahei_consolas)
ax.set_frame_on(False)
plt.show()
#designed by pwy