def dendro(X, metric='cosine', combine='average', showdendro=True,
           leaf_label_func=identity, **kw):
    """Build a linkage for ``X`` and optionally draw its dendrogram.

    ``X`` is handed to ``pdist`` with ``metric``; the resulting distances are
    linked with ``linkage`` using ``combine`` as the linkage method.  When
    ``showdendro`` is true the tree is drawn with ``dendrogram`` (receiving
    ``leaf_label_func`` and any extra keyword arguments) and ``show()`` is
    called.

    Returns the linkage result in every case.
    """
    tree = linkage(pdist(X, metric), combine)
    if not showdendro:
        return tree
    dendrogram(tree, leaf_label_func=leaf_label_func, **kw)
    show()
    return tree
def gethclinks(exparray, method):
    """Return the merged index pairs of a hierarchical clustering.

    ``method`` is forwarded to ``hcluster.pdist`` as its second argument;
    ``hcluster.linkage`` is then called with its defaults.  Every row of the
    linkage result contributes one ``[int(row[0]), int(row[1])]`` pair.
    """
    merges = hcluster.linkage(hcluster.pdist(exparray, method))
    return [[int(row[0]), int(row[1])] for row in merges]
def main():
    """Print a greeting, then cluster a random 10x100 array and draw it."""
    print("hola")
    samples = rand(10, 100)
    # Double the first five rows so they differ from the remaining five.
    samples[0:5, :] *= 2
    dendrogram(linkage(pdist(samples)))
def test_cluster_slink(repeat, runs, dist_m):
    """Time ``hcluster.linkage(dist_m, 'single')`` over ``repeat * runs`` calls.

    Each call is bracketed by ``time.time()`` and ``time.clock()`` readings;
    the differences are stored in two ``(repeat, runs)`` arrays.  A summary
    line is printed after all calls.

    Returns ``(mean_time, std_time, mean_clock, std_clock)``.
    """
    np.random.seed(int(time.time()))
    shape = (repeat, runs)
    clocks = np.empty(shape)
    times = np.empty(shape)
    for rep in xrange(repeat):
        for run in xrange(runs):
            print('a')
            wall_start = time.time()
            clock_start = time.clock()
            hcluster.linkage(dist_m, 'single')
            clock_end = time.clock()
            wall_end = time.time()
            print('b')
            clocks[rep, run] = clock_end - clock_start
            times[rep, run] = wall_end - wall_start
    mean_clock, std_clock = np.mean(clocks), np.std(clocks)
    mean_time, std_time = np.mean(times), np.std(times)
    # NOTE(review): the object/feature counts in this message are literals,
    # not derived from dist_m.
    summary = '5000 objects, 20 features: clocks=%f +- %f, times=%f +- %f' % (
        mean_clock, std_clock, mean_time, std_time)
    print(summary)
    return mean_time, std_time, mean_clock, std_clock
def dendro(X, metric="cosine", combine="average", showdendro=True,
           leaf_label_func=identity, **kw):
    """Compute a linkage for ``X`` and, if requested, plot the dendrogram.

    The pairwise distances come from ``pdist(X, metric)`` and are linked with
    ``linkage(..., combine)``.  With ``showdendro`` true, ``dendrogram`` is
    called with ``leaf_label_func`` plus any extra keyword arguments, followed
    by ``show()``.

    Returns the linkage result.
    """
    distances = pdist(X, metric)
    merged = linkage(distances, combine)
    if showdendro:
        plot_options = dict(kw, leaf_label_func=leaf_label_func)
        dendrogram(merged, **plot_options)
        show()
    return merged
def plotSampleDistanceDendrogram(ds):
    """Plot a complete-linkage dendrogram over all samples of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset.  Its ``samples`` are clustered, and ``labels``
        together with ``labels_map`` provide the leaf captions.
    """
    # labels_map goes literal -> numeric; invert it so the numeric labels of
    # the samples can be shown as literals on the leaves.
    literal_of = dict((num, lit) for lit, num in ds.labels_map.iteritems())

    # Pairwise sample distances with pdist's default metric, then
    # complete-linkage clustering.
    link = clust.linkage(clust.pdist(ds.samples), 'complete')

    # The original author notes this plot does not work with etch's version
    # of matplotlib (verified for matplotlib 0.98).
    leaf_names = [literal_of[l] for l in ds.labels]
    clust.dendrogram(
        link,
        colorthreshold=0,
        labels=leaf_names,
        # draw every link in black
        link_color_func=lambda x: 'black',
        distance_sort=False)

    # Rotate the leaf captions so they do not overlap.
    P.setp(P.gca().get_xticklabels(), rotation=90, fontsize=9)
def test_cluster_ward(repeat, runs, data):
    """Time ``hcluster.linkage(data, 'ward')`` over ``repeat * runs`` calls.

    Each call is bracketed by ``time.time()`` and ``time.clock()`` readings;
    the differences are stored in two ``(repeat, runs)`` arrays.  A summary
    line including ``data.shape`` is printed after all calls.

    Returns ``(mean_time, std_time, mean_clock, std_clock)``.
    """
    np.random.seed(int(time.time()))
    shape = (repeat, runs)
    clocks = np.empty(shape)
    times = np.empty(shape)
    for rep in xrange(repeat):
        for run in xrange(runs):
            print('a')
            wall_start = time.time()
            clock_start = time.clock()
            hcluster.linkage(data, 'ward')
            clock_end = time.clock()
            wall_end = time.time()
            print('b')
            clocks[rep, run] = clock_end - clock_start
            times[rep, run] = wall_end - wall_start
    mean_clock, std_clock = np.mean(clocks), np.std(clocks)
    mean_time, std_time = np.mean(times), np.std(times)
    summary = '%d objects, %d features: clocks=%f +- %f, times=%f +- %f' % (
        data.shape[0], data.shape[1],
        mean_clock, std_clock, mean_time, std_time)
    print(summary)
    return mean_time, std_time, mean_clock, std_clock
def hierarchicalClustering(p_dist, word_list, cons_words):
    """Find the two sub-clusters that first join two given words.

    A linkage is built from ``p_dist``.  For ``cons_words[0]`` and
    ``cons_words[1]`` the path returned by ``findPath`` is computed; the
    smallest node id common to both paths selects one linkage row, and
    ``findCluster`` is called on each of that row's two children.

    Returns ``(split_pair, merge1, merge2)`` where ``split_pair`` is the tuple
    of the two words.  The linkage, both paths, the word list and both merged
    clusters are printed along the way.
    """
    n_words = len(word_list)
    Z = linkage(p_dist)

    first_word, second_word = cons_words[0], cons_words[1]

    idx_a = word_list.index(first_word)
    assert idx_a >= 0
    path_a = findPath(Z, idx_a, n_words)

    idx_b = word_list.index(second_word)
    assert idx_b >= 0
    path_b = findPath(Z, idx_b, n_words)

    print(Z)
    print(path_a)
    print(path_b)

    # Both paths share at least the root node.
    shared = set(path_a) & set(path_b)
    lowest_shared = min(shared)
    assert lowest_shared >= n_words
    row = lowest_shared - n_words  # node id -> linkage row

    merge1 = findCluster(Z, Z[row][0], word_list)
    merge2 = findCluster(Z, Z[row][1], word_list)

    print(word_list)
    print(merge1)
    print(merge2)
    return (first_word, second_word), merge1, merge2
def _train(self, trainset):
    """Train a tree classifier whose structure comes from a confusion matrix.

    The wrapped classifier is cross-validated on ``trainset`` with the
    "confusion" state enabled.  The confusion matrix is turned into a
    distance matrix, clustered with ``hcluster.linkage``, converted with
    ``hcluster.to_tree``, and the resulting tree classifier is trained on
    ``trainset``.

    Sets ``self._dataset``, ``self.ulabels``, ``self.cvterr``,
    ``self.linkage``, ``self.tree`` and ``self._tree_clf``.
    """
    self._dataset = trainset
    self.ulabels = trainset.uniquelabels

    # Cross-validate the plain classifier to obtain a confusion matrix.
    self.cvterr = CrossValidatedTransferError(
        TransferError(self._clf), self._splitter,
        enable_states=["confusion"])
    self.cvterr(self._dataset)

    dist = self.cvterr.confusion.matrix
    # Kind of inversion: high confusion -> similar -> small distance.
    dist = dist.max() - dist
    # A distance must be symmetric.  Divide by 2.0 so that an integer
    # confusion matrix is not floor-divided under Python 2.
    dist = (dist + dist.T) / 2.0
    # Distance to self must be zero.
    dist -= np.diag(np.diag(dist))

    self.linkage = hcluster.linkage(hcluster.squareform(dist))

    # Build the tree and the corresponding tree classifier.
    self.tree = hcluster.to_tree(self.linkage)
    self._tree_clf = self.build_tree_classifier_from_linkage_tree(
        self.tree)[0]
    self._tree_clf.train(trainset)
def test():
    """Run the common-ancestor cluster search on 15 random 2-D points.

    Random points are clustered, the dendrogram is drawn, and for the words
    'C' and 'B' the paths from ``findPath`` are intersected to pick the
    linkage row whose two children are passed to ``findCluster``.  All
    intermediate results are printed; nothing is returned.
    """
    word_list = list('ABCDEFGHIJKLMNO')
    cons_words = ['C', 'B']
    n_words = len(word_list)

    X = rand(15, 2)
    # X = [[0.35, 0.37], [0.40, 0.40], [0.53, 0.53], [0.34, 0.51]]
    print(X)
    Y = pdist(X)
    print(Y)
    Z = linkage(Y)
    dendrogram(Z)

    idx_a = word_list.index(cons_words[0])
    assert idx_a >= 0
    path_a = findPath(Z, idx_a, n_words)

    idx_b = word_list.index(cons_words[1])
    assert idx_b >= 0
    path_b = findPath(Z, idx_b, n_words)

    print(Z)
    print(path_a)
    print(path_b)

    shared = set(path_a) & set(path_b)
    lowest_shared = min(shared)
    assert lowest_shared >= n_words
    row = lowest_shared - n_words  # node id -> linkage row

    merge1 = findCluster(Z, Z[row][0], word_list)
    merge2 = findCluster(Z, Z[row][1], word_list)
    print(merge1)
    print(merge2)
def do_it(self, sources): for source in sources: words = nltk.wordpunct_tokenize(source.headline) words.extend(nltk.wordpunct_tokenize(source.summary)) lowerwords=[x.lower() for x in words if len(x) > 1] self.ct += 1 print self.ct, "TITLE",source.headline self.corpus.append(lowerwords) self.titles.append(source.headline) self.links.append(source.url) [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus] self.ct=-1 for doc in self.corpus: self.ct+=1 print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus)) for document in self.corpus: vec=[] [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list] self.feature_vectors.append(vec) self.n=len(self.corpus) mat = numpy.empty((self.n, self.n)) for i in xrange(0,self.n): for j in xrange(0,self.n): mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j]) Z = linkage(mat, 'single') dendrogram(Z, color_threshold=self.t) clusters = self.extract_clusters(Z,self.t,self.n) stories = [] for key in clusters: print "=============================================" story = Story() for id in clusters[key]: story.add_source(sources[id]) print id,self.titles[id],sources[id].url stories.append(story) return stories
def time_subcluster(self, locs):
    """Scale pairwise haversine distances of ``locs`` by per-cluster ratios.

    ``locs`` is clustered with Ward linkage into at most 50 flat clusters
    (the original comment calls this Mapzen's limit).  ``self.mapzen_matrix``
    is queried for the cluster centroids, and the element-wise ratio of that
    matrix to the centroids' haversine distances, normalised by its mean, is
    used to rescale the haversine distance between every pair of ``locs``
    according to the clusters the two points belong to.

    Returns the rescaled square distance matrix.
    """
    max_clusters = 50
    cluster_linkage = linkage(locs, method='ward')
    clusters = fcluster(cluster_linkage, max_clusters, criterion='maxclust')

    # fcluster may return fewer than max_clusters groups (e.g. few points);
    # averaging over a non-existent group would give NaN centroids.
    n_clusters = int(clusters.max())
    cluster_means = np.array([
        np.mean(locs[clusters == label], axis=0)
        for label in range(1, n_clusters + 1)
    ])

    # assumes columns of locs are (lon, lat) -- TODO confirm
    mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
    mapzen_matrix = self.mapzen_matrix(mapzen_locs)

    # Zero-based cluster index per point, used to look up the ratios.
    cl = clusters - 1

    # Ratio between the Mapzen matrix and the centroid haversine distances;
    # nan_to_num absorbs the 0/0 on the diagonal.
    cluster_km_dist = squareform(
        pdist(cluster_means, (lambda u, v: haversine(u, v))))
    dist_ratio_matrix = np.nan_to_num(
        np.divide(mapzen_matrix, cluster_km_dist))
    # Divide by the mean to normalise a bit.
    dist_ratio_matrix = np.nan_to_num(
        np.divide(dist_ratio_matrix, dist_ratio_matrix.mean()))

    locs_km_dist = squareform(pdist(locs, (lambda u, v: haversine(u, v))))

    # Entry (i, j) is scaled by the ratio of (cluster of i, cluster of j).
    return locs_km_dist * dist_ratio_matrix[np.ix_(cl, cl)]
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    """Cluster or plot the correlation distances of ``transpose(Y)``.

    The absolute 'correlation' distances from ``pdist`` are computed first.
    With ``return_clusters`` true they are single-linked, cut with
    ``fcluster`` at distance ``ct`` and returned via ``cluster_sets``.
    Otherwise ``plot_clusters`` is called and nothing is returned.
    """
    D = abs(pdist(transpose(Y), 'correlation'))
    if not return_clusters:
        plot_clusters(D, ct)
        return
    tree = linkage(D, method='single', metric='cosine')
    flat = fcluster(tree, ct, criterion='distance')
    return cluster_sets(flat)
def cluster_elut(mat):
    """Show ``mat`` with its rows reordered by dendrogram leaf order.

    A dendrogram is drawn on a fresh figure only to obtain the leaf order;
    the figure is then cleared and the reordered matrix is displayed with
    ``imshow``.
    """
    import hcluster
    tree = hcluster.linkage(hcluster.pdist(mat))
    figure()
    leaf_order = hcluster.dendrogram(tree)['leaves']
    clf()
    imshow(mat[leaf_order, :])
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    """Return flat clusters of ``transpose(Y)`` or plot them.

    Distances are the absolute values of ``pdist(..., 'correlation')``.
    When ``return_clusters`` is true the distances are single-linked, cut at
    distance ``ct`` and passed through ``cluster_sets``; otherwise
    ``plot_clusters(D, ct)`` is called and ``None`` is returned.
    """
    distances = pdist(transpose(Y), 'correlation')
    distances = abs(distances)
    if return_clusters:
        return cluster_sets(
            fcluster(linkage(distances, method='single', metric='cosine'),
                     ct, criterion='distance'))
    plot_clusters(distances, ct)
def __call__(self):
    """Assign a Ward-linkage cluster id to every entry of ``self.locations``.

    ``self.point_arr`` is linked with method 'ward' and cut into at most
    ``self.num_clusters`` flat clusters.  Each location gets its cluster id
    stored under the key ``'group'``.

    Returns ``self.locations``.
    """
    # Can continue to play around with these settings.
    self.cluster_linkage = linkage(self.point_arr, method='ward')
    self.clusters = fcluster(
        self.cluster_linkage, self.num_clusters, criterion='maxclust')
    for location, group in zip(self.locations, self.clusters.tolist()):
        location.update({'group': group})
    return self.locations
def performHierarchicalClusterin(matrix, titlesCat):
    """Cluster ``matrix`` by cosine distance and save a dendrogram image.

    Pairwise cosine distances are computed, average-linked, and drawn as a
    left-oriented dendrogram labelled with ``titlesCat``.  The figure is
    written to ``images/clusteringImage.png``.  Nothing is returned.
    """
    from scipy.spatial.distance import squareform

    # Pairwise distances with the "cosine" metric (square matrix).
    distanceMatrix = pairwise_distances(matrix, metric='cosine')

    # linkage() treats a 2-D array as raw observations, so the square matrix
    # must be condensed first; otherwise the rows of the distance matrix are
    # clustered instead of the cosine distances themselves.  checks=False
    # tolerates floating-point asymmetry in the computed matrix.
    condensed = squareform(distanceMatrix, checks=False)

    # Average linkage: cluster similarity is the mean of element similarities.
    Z = linkage(condensed, method='average')

    # 'descending' is the value dendrogram recognises; the previous
    # 'descendent' spelling is not one of them.
    dendrogram(Z,
               labels=titlesCat,
               distance_sort='descending',
               leaf_font_size=2,
               orientation='left',
               show_contracted=False)

    # Save the generated dendrogram image.
    pylab.savefig("images/clusteringImage.png", dpi=300, bbox_inches='tight')
def t_dendrogram(X, nclusters):
    """Draw and show the dendrogram of ``X``.

    ``nclusters`` is accepted but not used by this function.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand
    # X = X[:10, :]
    dendrogram(linkage(pdist(X)))
    show()
def do_it(self): for feed in self.feeds: d = feedparser.parse(feed) for e in d['entries']: words = nltk.wordpunct_tokenize(self.clean_html(e['description'])) words.extend(nltk.wordpunct_tokenize(e['title'])) lowerwords=[x.lower() for x in words if len(x) > 1] self.ct += 1 print self.ct, "TITLE",e['title'] self.corpus.append(lowerwords) self.titles.append(e['title']) self.links.append(e['link']) [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus] self.ct=-1 for doc in self.corpus: self.ct+=1 print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus)) for document in self.corpus: vec=[] [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list] self.feature_vectors.append(vec) self.n=len(self.corpus) mat = numpy.empty((self.n, self.n)) for i in xrange(0,self.n): for j in xrange(0,self.n): mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j]) Z = linkage(mat, 'single') dendrogram(Z, color_threshold=self.t) clusters = self.extract_clusters(Z,self.t,self.n) for key in clusters: print "=============================================" for id in clusters[key]: print id,self.titles[id]
def printMostSimilarCluster(matrix, titlesCat):
    """Print the leaf-to-leaf merges among the first ten linkage steps.

    Pairwise cosine distances of ``matrix`` are average-linked.  For each of
    the first ten merges (or fewer when the linkage is shorter) whose two
    members are both original items, the two titles are printed.
    """
    from scipy.spatial.distance import squareform

    # Pairwise distances with the "cosine" metric (square matrix).
    distanceMatrix = pairwise_distances(matrix, metric='cosine')

    # linkage() treats a 2-D array as raw observations, so condense the
    # square matrix first; checks=False tolerates floating-point asymmetry.
    condensed = squareform(distanceMatrix, checks=False)

    # Average linkage: cluster similarity is the mean of element similarities.
    Z = linkage(condensed, method='average')  # ,method='centroid')

    print("first closest cluster\n")
    lenTitle = len(titlesCat)
    # Never read past the end of the linkage when there are fewer than
    # ten merges.
    for idx in range(min(10, len(Z))):
        left, right = int(Z[idx, 0]), int(Z[idx, 1])
        # Ids below lenTitle are original items, not merged clusters.
        if left < lenTitle and right < lenTitle:
            print("itr " + str(idx) + ":\n"
                  + titlesCat[left] + " " + titlesCat[right])
def get_clustering_as_tree(vectors, ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete',
                           progress=progress):
    """Cluster ``vectors`` hierarchically and return the result as a tree.

    With ``ward`` true, ``hcluster.ward`` is applied to ``vectors`` directly.
    Otherwise distances are computed with ``clustering_distance`` and linked
    with ``clustering_method``.  Each step is announced through
    ``progress.update``.

    Returns ``hcluster.to_tree`` of the clustering result.
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance'
                        % clustering_distance)
        distances = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage'
                        % clustering_method)
        result = hcluster.linkage(distances, method=clustering_method)
    else:
        progress.update(
            'Clustering data with Ward linkage and euclidean distances')
        result = hcluster.ward(vectors)

    progress.update('Returning results')
    return hcluster.to_tree(result)
def do_clusters(cluster_coords, Labels=None, link_method='single', d=0.2):
    """Group observations into flat clusters by cosine distance.

    Parameters:
        cluster_coords: observations, one per row, passed to ``pdist``.
        Labels: optional sequence giving a label per observation.  When it is
            missing or empty, the observation indices are used instead.
        link_method: linkage method handed to ``linkage``.
        d: distance threshold at which the tree is cut.

    Returns:
        A ``defaultdict(list)`` mapping each flat-cluster id to the labels
        (or indices) of its members, in observation order.
    """
    # Very small negative distances (around -2e-16) can show up from
    # rounding; take the absolute value.
    D = abs(pdist(cluster_coords, 'cosine'))
    L = linkage(D, method=link_method, metric='cosine')
    # Only the criterion is meaningful here; the former extra positional
    # 'cosine' argument landed in fcluster's ``depth`` parameter.
    F = fcluster(L, d, criterion='distance')

    # Decide once, and without truth-testing the container itself, so that
    # numpy arrays of labels work as well as lists.
    use_labels = Labels is not None and len(Labels) > 0

    C = defaultdict(list)
    for i, cluster_id in enumerate(F):
        C[cluster_id].append(Labels[i] if use_labels else i)
    return C
def generate_dendrogram(root):
    """Cluster a random 10x100 array, print the results and draw the tree.

    ``root`` is accepted but not used by this function.
    """
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand
    import matplotlib

    samples = rand(10, 100)
    # Double the first five rows so they differ from the remaining five.
    samples[0:5, :] *= 2
    distances = pdist(samples)
    tree = linkage(distances)
    print(distances)
    print(tree)
    dendrogram(tree)
def fetch_clusters(self, mat, n):
    """
    Link ``mat``, save the dendrogram image and extract the clusters.

    The dendrogram is coloured at threshold ``self.t`` and written to
    ``self.cluster_image`` at ``self.dpi``.

    :param mat: The similarity matrix
    :param n: The length of the corpus
    :return: The clusters
    """
    # NOTE(review): mat is passed to linkage unchanged; confirm it is in the
    # form linkage expects.
    tree = linkage(mat, 'single')
    dendrogram(tree, color_threshold=self.t)
    pylab.savefig(self.cluster_image, dpi=self.dpi)
    return self.__extract_clusters(tree, self.t, n)
def wavelet_clusters(Y, ct=0.5, weights=False, return_clusters=False,
                     swt=False):
    """Cluster or plot the distances that ``c_dists`` computes for ``Y``.

    With ``weights`` true, ``c_dists`` is called with ``level_weights=True``
    and the strict upper triangle of its absolute result is flattened row by
    row into a list.  Otherwise the result of ``c_dists(Y, use_swt=swt)`` is
    used as is.

    With ``return_clusters`` true the distances are single-linked, cut at
    distance ``ct`` and returned via ``cluster_sets``; otherwise
    ``plot_clusters`` is called and nothing is returned.
    """
    if not weights:
        Dr = c_dists(Y, use_swt=swt)
    else:
        D = abs(c_dists(Y, level_weights=True, use_swt=False))
        # Row-wise strict upper triangle.
        Dr = [value
              for i in range(D.shape[0] - 1)
              for value in D[i, i + 1:]]

    if not return_clusters:
        plot_clusters(Dr, ct)
        return
    tree = linkage(Dr, method='single', metric='cosine')
    return cluster_sets(fcluster(tree, ct, criterion='distance'))
def plot_cluster_tree(cluster_coords, Labels=None, link_method='single',
                      color_thresh=.25, fontsize=8):
    """Draw a left-oriented dendrogram of cosine distances and show it.

    ``Labels``, when non-empty, are used as leaf labels.  ``color_thresh``
    is passed as the dendrogram colour threshold and ``fontsize`` is written
    to ``pylab.rcParams`` before the figure is shown.
    """
    # Very small negative distances (around -2e-16) can show up; take the
    # absolute value.
    D = abs(pdist(cluster_coords, 'cosine'))
    L = linkage(D, method=link_method, metric='cosine')

    options = {'orientation': 'left', 'color_threshold': color_thresh}
    if Labels:
        options['labels'] = Labels
    dendrogram(L, **options)

    pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
    pylab.xlabel('Cosine Distance')
    pylab.ylabel('Strain with the Most Alignments to Each Cluster')
    pylab.rcParams.update({'font.size': fontsize})
    pylab.show()
def printSummary(updatedtfidfMatrix, queriedSentences):
    """Write the sentences in linkage order to ``foo.txt`` and return the text.

    The rows of ``updatedtfidfMatrix`` are clustered by cosine distance and
    the dendrogram is shown.  The linkage rows are walked in order to build a
    sequence of ids: ids that index ``queriedSentences`` emit the capitalised
    sentence followed by ``'. '``, larger ids (merged clusters) emit
    ``'<br></br>'``.  The text is written to ``foo.txt``, read back, printed
    and returned.
    """
    print("\n")
    a = pdist(updatedtfidfMatrix, 'cosine')
    print(a)
    b = linkage(a)
    dendrogram(b)
    show()
    print(b)

    last_leaf = len(queriedSentences) - 1
    sumOrder = []
    for row in b:
        x = int(row[0])
        y = int(row[1])
        x_is_leaf = x <= last_leaf
        y_is_leaf = y <= last_leaf
        if x_is_leaf:
            sumOrder.append(x)
        if y_is_leaf:
            sumOrder.append(y)
        if x_is_leaf and not y_is_leaf:
            sumOrder.append(y)
        if not x_is_leaf and not y_is_leaf:
            sumOrder.append(x)

    queriedSentences = [sentence.capitalize()
                        for sentence in queriedSentences]

    # ``with`` guarantees the file is closed even if a write fails.
    with open("foo.txt", "w") as f:
        for num in sumOrder:
            if num > last_leaf:
                f.write('<br></br>')
            else:
                f.write(queriedSentences[num])
                f.write('.')
                f.write(' ')

    with open("foo.txt", "r") as myfile:
        data = myfile.read()
    print(data)
    return data
def cluster_analysis_hcluster(self, vectors):
    """Assign ``vectors`` to flat clusters using the hcluster settings.

    Linkage method, linkage metric, cutoff and cutoff criterion are read from
    ``self.params.multiple_lattice_search.cluster_analysis.hcluster``.

    Returns the cluster ids as a ``flex.int`` array.
    """
    from hcluster import linkage, fcluster
    import numpy

    params = self.params.multiple_lattice_search.cluster_analysis.hcluster
    observations = numpy.array(vectors)
    method = params.linkage.method
    metric = params.linkage.metric
    criterion = params.cutoff_criterion

    tree = linkage(observations, method=method, metric=metric)
    assignments = fcluster(tree, params.cutoff, criterion=criterion)
    return flex.int(assignments.astype(numpy.int32))
def output_dendrogram(imgs, kernel, method="complete",
                      dend_fn="_dendrogram.png"):
    """Render a dendrogram of ``kernel`` and paste thumbnails beside it.

    The dendrogram is written to ``method + "_" + dend_fn``.  That image is
    loaded again, the y tick labels are read as integer indices into
    ``imgs`` (in reversed order), each indexed image is shrunk in place to a
    30x30 thumbnail and pasted into the figure, and the result is saved with
    a ``"fig_"`` prefix.
    """
    tmp_dend_fn = method + "_" + dend_fn
    links = linkage(pdist(kernel), method=method)
    axis = dendrogram(links,
                      orientation="left",
                      figsize=(7, 12),
                      outfilename=tmp_dend_fn)[1]
    figimg = libpil.loadImage(tmp_dend_fn)

    order = [int(tick._text) for tick in axis.get_yticklabels()]
    order.reverse()

    for position, img_index in enumerate(order):
        thumb = imgs[img_index]
        thumb.thumbnail((30, 30))
        # Vertical position: thumbnail height plus a 4-pixel gap per slot,
        # starting 120 pixels from the top.
        offset = position * (thumb.size[1] + 4) + 120
        figimg.paste(thumb, (52, offset))
    figimg.save("fig_" + tmp_dend_fn)
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often column pairs share a partition.

    For every latent state in ``X_L_list`` the column partition assignments
    are compared pairwise; entry (i, j) of the matrix is the fraction of
    states in which columns i and j have the same assignment.  Rows and
    columns are reordered by the leaf order of a dendrogram of that matrix
    and the result is written to ``filename``.

    ``X_D_list`` is accepted but not used by this function.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

    # Unordered co-assignment matrix.
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = X_L['column_partition']['assignments']
        for i in range(num_cols):
            same_as_i = [assignments[i] == assignments[j]
                         for j in range(num_cols)]
            z_matrix[i, :] += same_as_i
    z_matrix /= float(num_latent_states)

    # Hierarchically cluster the matrix; the dendrogram is drawn on a
    # throw-away figure only to read the leaf order from its tick labels.
    tree = hcluster.linkage(hcluster.pdist(z_matrix))
    pylab.figure()
    hcluster.dendrogram(tree)
    reorder_indices = [int(tick.get_text())
                       for tick in pylab.gca().get_xticklabels()]
    pylab.close()

    # Reorder both axes and the names.
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]

    # Create the actual figure.
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()

    # With few columns every tick is labelled; otherwise the y axis labels
    # the even positions and the x axis the odd ones.
    if num_cols < 14:
        y_pick = slice(None)
        x_pick = slice(None)
        x_font = 'x-small'
    else:
        y_pick = slice(None, None, 2)
        x_pick = slice(1, None, 2)
        x_font = 'small'
    axes = pylab.gca()
    axes.set_yticks(range(num_cols)[y_pick])
    axes.set_yticklabels(column_names_reordered[y_pick], size='x-small')
    axes.set_xticks(range(num_cols)[x_pick])
    axes.set_xticklabels(column_names_reordered[x_pick], rotation=90,
                         size=x_font)

    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
def OnLeftDClick(self, event):
    """Handle a left double click by plotting the model's dendrogram.

    The DEVS model is fetched with ``self.getDEVSModel()``.  When there is
    one, its ``vectors`` are clustered and the dendrogram is shown;
    otherwise an information message box is displayed.
    """
    devs = self.getDEVSModel()
    if not devs:
        wx.MessageBox(
            _("No DEVS model is instanciated.\nGo back to the simulation!"),
            _("Info"),
            wx.OK|wx.ICON_INFORMATION)
        return
    dendrogram(linkage(pdist(devs.vectors)))
    show()
def cluster_path_times(self, path_times, display):
    """Cluster recordings by duration and time of day, printing the results.

    Each recording becomes a two-element point: its ``time`` in seconds
    (including the microsecond fraction) and the minute of the day taken
    from its ``date``.  The points, the linkage and the output of
    ``self.calculate_variances`` are printed; the dendrogram is shown when
    ``display`` is true.
    """
    X = [
        [rec.time.seconds + rec.time.microseconds / 10**6.,
         rec.date.hour * 60 + rec.date.minute]
        for rec in path_times.recordings
    ]
    print(X)
    Z = linkage(pdist(X))
    dendrogram(Z)
    for i, point in enumerate(X):
        print('{0}, {1}'.format(i, point))
    print(Z)
    print(self.calculate_variances(X, Z))
    if display:
        show()
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist, \
            prefix_output = None):
    """Single-link ``items`` and return the hierarchy as a ``Tree``.

    Parameters:
        items: the objects to cluster.
        cache_clustering_file: optional open file holding a pickled
            ``[Y, Z]`` pair.  When given, the distances and the linkage are
            loaded from it instead of being computed.
        dist_fn: distance function handed to ``dist_matrix``.
        prefix_output: string prepended to ``"clustering_dump.pkl"`` when the
            freshly computed clustering is dumped.  ``None`` means no prefix.

    Returns:
        The root ``Tree`` node.  Every child node is named after the id of
        its cluster node and gets half of its parent's cluster distance as
        branch length.
    """
    if not cache_clustering_file:
        print("Generating distance matrix...")
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)
        print("Linkage clustering...")
        sys.stdout.flush()
        Z = linkage(Y, "single")  # average, complete = max, single = min ?
        print("Dumping clustering information into cache file")
        sys.stdout.flush()
        # The default prefix is None, which cannot be concatenated with a
        # string; treat it as an empty prefix.
        dump_path = (prefix_output or "") + "clustering_dump.pkl"
        # ``with`` makes sure the dump is flushed and the handle is closed.
        with open(dump_path, "w") as dump_file:
            cPickle.dump([Y, Z], dump_file)
    else:
        print("Loading clustering cache from '%s'"
              % cache_clustering_file.name)
        # NOTE(security): unpickling can execute arbitrary code; only pass
        # cache files that come from a trusted source.
        Y, Z = cPickle.load(cache_clustering_file)

    print("Converting into ETE tree...")
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"

    # Maps every visited cluster node to the Tree node created for it.
    item2node = {T: root}
    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in [node.left, node.right]:
            if ch_node:
                ch = Tree()
                ch.dist = cl_dist
                ch.name = str(ch_node.id)
                item2node[node].add_child(ch)
                item2node[ch_node] = ch
                to_visit.append(ch_node)
    return root
def hierarchical(self,lst,fulldataset):
    """Plot an average-linkage dendrogram and compute a silhouette score.

    The transposed ``fulldataset`` is clustered with average linkage, the
    dendrogram is saved to 'Average Hierarchical Clustering.pdf' with leaf
    labels coloured by sample type, and ``self.labels`` / ``self.score`` are
    set from the class labels of ``lst``.
    """
    # Samples are coloured according to their sample type:
    # names starting with 'cancer' are red, 'normal' blue; others are skipped.
    label_color={}
    for i in self.numbering(self.classLabel(lst)):
        r=('r')
        b=('b')
        if i[0:6]=='cancer':
            label_color[i]=r
            #print label_colors
        elif i[0:6]=='normal' :
            label_color[i]=b
            #print label_colors
        else:
            continue
    # Transpose the dataset before computing pairwise distances.
    tg=zip(*fulldataset)
    Y = pdist(tg)
    # Average linkage is applied.
    Z = linkage(Y,method='average')
    sch.set_link_color_palette(['black'])
    a=sch.dendrogram(Z,leaf_font_size=6,labels=self.newlist)
    # The dendrogram is plotted; recolour each x tick label by its text.
    # NOTE(review): a label whose text was skipped above has no entry in
    # label_color and would raise KeyError here -- confirm this cannot occur.
    ax = plt.gca()
    xlbls = ax.get_xmajorticklabels()
    for lbl in xlbls:
        lbl.set_color(label_color[lbl.get_text()])
    plt.title("Average Hierarchical Clustering Algorithm")
    plt.savefig('Average Hierarchical Clustering.pdf',dpi=500)
    #plt.show()
    plt.close()
    self.labels=array([])
    c=array([1])
    n=array([0])
    # Silhouette test.
    # Samples are converted into 1 ('cancer') or 0 (anything else).
    for i in self.classLabel(lst):
        if i=='cancer':
            self.labels=np.concatenate([self.labels,c])
        else:
            self.labels=np.concatenate([self.labels,n])
    # NOTE(review): np.delete receives the *value* of the last label (0 or 1)
    # as the index to remove, so this drops element 0 or 1 rather than the
    # last one -- presumably meant to shorten the labels by one; confirm.
    self.labels=np.delete(self.labels,self.labels[-1])
    # NOTE(review): the score is computed on the linkage result Z, not on
    # the clustered samples -- confirm this is intended.
    self.score=metrics.silhouette_score(Z, self.labels, metric='euclidean')
def hcluster(self, stim):
    """Cluster ``stim.group`` hierarchically and plot the resulting groups.

    The strict upper triangle of ``stim.group`` is linked with method
    'single'.  The dendrogram is coloured at the distance of the
    second-to-last merge and shown; then four clusters are requested from
    ``self.get_clusters`` and the ``stim.indices`` of each cluster are
    plotted in their own colour.
    """
    #from hcluster import pdist, linkage, dendrogram
    import hcluster

    iu = np.triu_indices(len(stim.group), 1)
    # NOTE(review): 'ward' is passed in the metric position -- confirm intent.
    Z = hcluster.linkage(stim.group[iu], 'single', 'ward')
    # The leftover pdb.set_trace() breakpoint that used to sit here stopped
    # every call in the debugger; it has been removed.

    thres = Z[-2, 2]
    hcluster.dendrogram(Z, color_threshold=thres)
    plt.show()

    clusters = self.get_clusters(Z, n_clusters=4)  # thres=thres)
    colors = self.get_colors(len(clusters))
    for cluster, color in zip(clusters, colors):
        sel = stim.indices[np.array(cluster)]
        plt.plot(sel[:, 1], sel[:, 0], 'o', color=color)
    plt.show()
def wavelet_clusters(Y, ct=0.5, weights=False, return_clusters=False,
                     swt=False):
    """Return flat clusters of the ``c_dists`` distances of ``Y`` or plot them.

    ``weights`` selects the level-weighted variant of ``c_dists``, whose
    absolute result is flattened to its row-wise strict upper triangle;
    otherwise ``c_dists(Y, use_swt=swt)`` is used directly.

    When ``return_clusters`` is true the distances are single-linked, cut at
    distance ``ct`` and passed through ``cluster_sets``.  Otherwise
    ``plot_clusters`` is called and ``None`` is returned.
    """
    if weights:
        weighted = abs(c_dists(Y, level_weights=True, use_swt=False))
        Dr = []
        for row in range(weighted.shape[0] - 1):
            # Entries to the right of the diagonal in this row.
            Dr.extend(weighted[row, row + 1:])
    else:
        Dr = c_dists(Y, use_swt=swt)

    if return_clusters:
        flat = fcluster(linkage(Dr, method='single', metric='cosine'),
                        ct, criterion='distance')
        return cluster_sets(flat)
    plot_clusters(Dr, ct)
def hierarchicalcluster(datamatrix, dimlabels, similarity='euclidean',
                        colorthresh='default'):
    '''Plot a dendrogram of datamatrix and return (clustering, dend).

    clustering is the linkage result: an (items-1) x 4 array whose first two
    columns are the indices of the merged clusters, the third the distance
    between them and the fourth the number of original observations in the
    new cluster. dend is the dictionary of data structures computed to
    render the dendrogram.
    see api here: http://hcluster.damianeads.com/cluster.html

    colorthresh is the fraction of the largest merge distance used as the
    colour threshold; 'default' means 0.7. All descendents below a cluster
    node k get the same colour if k is the first node below the threshold;
    links at or above the threshold are coloured blue.
    '''
    import hcluster

    # Warnings raised while linking are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clustering = hcluster.linkage(datamatrix, metric=similarity)

    fraction = 0.7 if colorthresh == 'default' else colorthresh
    color_threshold = fraction * max(clustering[:, 2])

    plt.figure()
    dend = hcluster.dendrogram(clustering,
                               labels=dimlabels,
                               leaf_rotation=90,
                               color_threshold=color_threshold)
    plt.tight_layout()
    return clustering, dend
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True,
                norm_rows=True, bigarr=None, **kwargs):
    """Return ``gids`` in dendrogram leaf order, optionally plotting.

    Parameters:
        gids: ids, one per row of the clustered array.
        unnorm_eluts, sp, norm_rows: forwarded to ``single_array`` when
            ``bigarr`` is not given.
        gt: optional object whose ``id2name`` maps an id to a display name.
            When it is ``None`` the ids themselves label the ticks.
        dist: metric passed to ``hcluster.pdist``.
        do_plot: draw the dendrogram and a heat map of the reordered array.
        bigarr: precomputed array to cluster instead of ``single_array``.
        **kwargs: forwarded to ``hcluster.dendrogram``.

    Returns:
        The ids as a list, reordered by the dendrogram leaves.
    """
    import plotting as pl
    import hcluster

    arr = (bigarr if bigarr is not None
           else single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows))
    ymat = hcluster.pdist(arr, metric=dist)
    zmat = hcluster.linkage(ymat)
    # Clamp negative entries to zero (and cap very large ones).
    zmat = np.clip(zmat, 0, 10**8)

    if do_plot:
        pl.figure()
    order = hcluster.dendrogram(zmat, no_plot=not do_plot, **kwargs)['leaves']

    # Index a list so that any iterable of ids works, as the return
    # statement already assumed.
    gid_list = list(gids)
    if do_plot:
        # gt defaults to None; fall back to the ids themselves instead of
        # failing on gt.id2name.
        if gt is not None:
            tick_names = [gt.id2name[gid_list[ind]] for ind in order]
        else:
            tick_names = [gid_list[ind] for ind in order]
        ax = pl.gca()
        ax.axes.set_xticklabels(tick_names)
        pl.figure()
        pl.imshow(arr[order, :])
    return list(np.array(gid_list)[order])
def _train(self, dataset):
    """Train a tree classifier whose structure comes from a confusion matrix.

    The wrapped classifier is cross-validated on ``dataset`` with the
    "confusion" state enabled.  The confusion matrix is symmetrised,
    inverted into a distance matrix, clustered with ``hcluster.linkage``,
    converted with ``hcluster.to_tree``, and the resulting tree classifier
    is trained on ``dataset``.

    Sets ``self._dataset``, ``self.ulabels``, ``self.cvterr``,
    ``self.linkage``, ``self.tree`` and ``self._tree_clf``.
    """
    self._dataset = dataset
    self.ulabels = self._dataset.uniquelabels

    # Cross-validate the plain classifier to obtain a confusion matrix.
    self.cvterr = CrossValidatedTransferError(
        TransferError(self._clf), self._splitter,
        enable_states=["confusion"])
    self.cvterr(self._dataset)

    dist = self.cvterr.confusion.matrix
    # A distance must be symmetric.  Divide by 2.0 so that an integer
    # confusion matrix is not floor-divided under Python 2.
    dist = (dist + dist.T) / 2.0
    # Kind of inversion: high confusion -> similar -> small distance.
    dist = dist.max() - dist
    # Distance to self must be zero.
    dist -= np.diag(np.diag(dist))

    self.linkage = hcluster.linkage(hcluster.squareform(dist))

    # Build the tree and the corresponding tree classifier.
    self.tree = hcluster.to_tree(self.linkage)
    self._tree_clf = self.build_tree_classifier_from_linkage_tree(
        self.tree)[0]
    self._tree_clf.train(self._dataset)
def DrawDendrogram(feature_vector, obj_names, motion_name):
    """Show an average-linkage dendrogram of ``feature_vector``.

    Leaves are labelled with ``obj_names`` and the figure is titled
    ``motion_name + '_average'``.

    Returns the value returned by ``hierarchy.dendrogram``.
    """
    linkage_list = ['single', 'average', 'complete']
    method = linkage_list[1]

    Z = linkage(pdist(feature_vector), method)
    dendrogram_options = dict(
        # p=51,
        # truncate_mode='level',
        # show_contracted=True,
        color_threshold=1.5,
        labels=obj_names,
        orientation='left',
        show_leaf_counts=True,
        leaf_font_size=10,
    )
    render = hierarchy.dendrogram(Z, **dendrogram_options)
    plt.title(motion_name + '_' + method)
    plt.show()
    # plt.savefig(motion_name+'_dendro_complete.png')
    return render
def get_clustering_as_tree(vectors, ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete',
                           progress=progress):
    """Return the hierarchical clustering of ``vectors`` as a tree.

    ``ward`` selects ``hcluster.ward`` on the raw vectors.  When it is false,
    a distance matrix is computed with ``clustering_distance`` and linked
    with ``clustering_method``.  Progress messages are sent through
    ``progress.update`` before each step.

    Returns the result of ``hcluster.to_tree``.
    """
    def _ward():
        progress.update(
            'Clustering data with Ward linkage and euclidean distances')
        return hcluster.ward(vectors)

    def _pairwise():
        progress.update('Computing distance matrix using "%s" distance' %
                        clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' %
                        clustering_method)
        return hcluster.linkage(distance_matrix, method=clustering_method)

    clustering_result = _ward() if ward else _pairwise()
    progress.update('Returning results')
    return hcluster.to_tree(clustering_result)
def dendrogramBuild(tfidfMatrix, queriedSentences, degree):
    """Repeatedly drop one sentence of the closest pair while it is too close.

    The rows of ``tfidfMatrix`` are clustered by cosine distance.  While the
    first merge has a distance below ``degree``, the member of that pair
    with the smaller ``tfidf.magnitude`` (the first member on ties) is
    removed from both ``tfidfMatrix`` and ``queriedSentences`` and the
    clustering is recomputed.  Distances, linkage and removed index are
    printed on every round.

    Both arguments are modified in place and returned as a tuple.
    """
    while True:
        a = pdist(tfidfMatrix, 'cosine')
        print(a)
        b = linkage(a)
        print(b)

        closest = b[0]
        if not closest[2] < degree:
            break

        first, second = int(closest[0]), int(closest[1])
        mag1 = tfidf.magnitude(tfidfMatrix[first])
        mag2 = tfidf.magnitude(tfidfMatrix[second])
        drop = second if mag1 > mag2 else first
        print(drop)
        tfidfMatrix.pop(drop)
        queriedSentences.pop(drop)
    return (tfidfMatrix, queriedSentences)
import numpy as np
import matplotlib.pyplot as plt
from hcluster import pdist, linkage, dendrogram, squareform
# same as import them from scipy

# Columns 1..29 of the tab-separated table, read as a structured array.
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")
# View the structured array as a plain 2-D float array.  The builtin
# ``float`` is the same dtype as the ``np.float`` alias, which current NumPy
# releases no longer provide.
data_array = data.view((float, len(data.dtype.names)))
# Rows 1..999, transposed so that each column of the table becomes one row.
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)  # computing the distance
data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!
# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# x, y, width, height
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names)
# adding/removing the axes
ax1.set_xticks([])
def main(argv): print argv if (len(argv) > 0): params = argv[::2] param_values = argv[1::2] crit_func = squared_criterion merge_func = d_min for i in range(0, len(argv), 2): if params[i] == "--criterium": if param_values[i + 1] == "silhoette": crit_func = silhouette_criterion elif param_values[i + 1] == "squared": crit_func = squared_criterion else: crit_func = silhouette_criterion elif params[i] == "--merge": if param_values[i + 1] == "de": merge_func = d_e elif param_values[i + 1] == "dmax": merge_func = d_max else: merge_func = d_e Cluster.clusters = [] Cluster.squared_criterion_values = [] Cluster.silhouette_criterion_values = [] my_data = np.genfromtxt('./data.csv', delimiter=',', dtype=float) #Make only clusterization params in array data_list = my_data[1:].tolist() maximum = 0 data_list = data_list[:] etalon = data_list[:] for i in range(len(data_list)): data_list[i] = data_list[i][2:] #normalize all lists: data_list = np.array(data_list) #count all distances print "Precounting distances" for i in range(len(data_list)): for j in range(len(data_list)): print ".", Cluster.counted_distances[(tuple(data_list[i]), tuple( data_list[j]))] = hexic_euqlid_distance( data_list[i], data_list[j]) print "Distances Counted" for i in range(len(data_list)): Cluster.etalon_clasters[tuple(data_list[i][2:])] = etalon[i][1] print Cluster.etalon_clasters.values() #Make each element = 1 cluster for x in data_list: Cluster.clusters.append(Cluster(x)) print(len(Cluster.clusters)) K_num = 1 swo(K_num, merge_func, crit_func) Y = Cluster.merge_history[1:] Z = linkage(Y) plt.subplot(121) dendrogram(Z, labels=range(len(data_list))) squared_criterion_values = Cluster.squared_criterion_values[::-1] silhouette_criterion_values = Cluster.silhouette_criterion_values[::-1] plt.subplot(122) if (crit_func == silhouette_criterion): plt.plot(range(len(silhouette_criterion_values)), silhouette_criterion_values) plt.axis([ K_num, 30, min(silhouette_criterion_values), max(silhouette_criterion_values) 
]) else: plt.plot(range(len(squared_criterion_values)), squared_criterion_values) plt.axis([ K_num, 30, min(squared_criterion_values), max(squared_criterion_values) ]) plt.show() for x in Cluster.clusters: x.etalon_to_current_mapping() print x.etalon_map
ct += 1 boom = " ".join(top_keywords(nkeywords, doc, corpus)) keywords.append(boom) feature_vectors = [] n = len(corpus) for document in corpus: vec = [] [vec.append(tfidf(word, document, corpus) if word in document else 0) for word in key_word_list] feature_vectors.append(vec) mat = numpy.empty((n, n)) for i in range(0, n): for j in range(0, n): mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i], feature_vectors[j]) t = 0.8 Z = linkage(mat,'complete') posts = [] clusters = extract_clusters(Z, t, n) ct = -1 for key in clusters: print("-------------------------------") for id in clusters[key]: ct += 1 print(ct, titles[id]) print(ct, " - ",keywords[id])
from hcluster import pdist, linkage, leaves_list, squareform, dendrogram import numpy as np import matplotlib as mp metric = 'euclidean' method = 'single' data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0], [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]]) y = pdist(data, metric=metric) Z = linkage(y, method=method, metric=metric) dendrogram(Z) Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z] # cleaning leaves = list(leaves_list(Z)) count = len(leaves) root = len(Z) + count - 1 X = squareform(y) assert len(X) == count from utils import memoise # bar-joseph optimal ordering ################################################ from barjoseph import optimal leaves = optimal( root, **{
##| cosine similarities ##`---- import numpy from nltk import cluster mat = numpy.empty((n, n)) for i in xrange(0, n): for j in xrange(0, n): mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i], feature_vectors[j]) ##,---- ##| Hierarchically Cluster mat ##`---- from hcluster import linkage t = 0.9 Z = linkage(mat, 'single') #dendrogram(Z, color_threshold=t) #import pylab #pylab.savefig( "new_agg_cluster.png" ,dpi=800) ##,---- ##| Cluster Extraction ##`---- def extract_clusters(Z, threshold, n): clusters = {} ct = n for row in Z: if row[2] < threshold:
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
# NOTE(security): unpickling executes whatever the pickle contains, and this
# one is fetched over plain HTTP.  Only run this against a trusted source.
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: the matrix shown as similarity (1 - dissimilarity).
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# Convert with squareform before linking.
dissimilarity = hcluster.squareform(dissimilarity)
# One threshold for both the flat clusters and the dendrogram colouring.
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: dendrogram coloured at the same threshold.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
def cluster(M, method='complete'):
    """Run hierarchical clustering on the distances held in ``M``.

    ``M`` is first passed through ``hcluster.squareform`` and the result is
    given to ``hcluster.linkage``.
    NOTE(review): presumably ``M`` is a square distance matrix that is being
    converted to condensed form -- confirm against callers.

    :param M: distance data accepted by ``hcluster.squareform``.
    :param method: linkage method name forwarded to ``hcluster.linkage``
        (defaults to ``'complete'``).
    :return: whatever ``hcluster.linkage`` returns for the converted input.
    """
    converted = hcluster.squareform(M)
    return hcluster.linkage(converted, method=method)
def _do_gen_matrix(self, col_function_name, X_L_list, X_D_list, M_c, T,
                   tablename='', filename=None, col=None, confidence=None,
                   limit=None, submatrix=False):
    """Compute a pairwise column matrix and order it for presentation.

    Every pair of column indices ``(i, j)`` is scored with the column
    function selected by ``col_function_name``.  The result is then either
    ranked against a single column (``col`` given) or reordered using the
    leaf order of a hierarchical-clustering dendrogram (``col`` not given).

    :param col_function_name: one of ``'mutual information'``,
        ``'dependence probability'``, ``'correlation'`` or
        ``'view_similarity'``.
    :param X_L_list: list whose first element provides
        ``['column_partition']['assignments']``; its length is used as the
        number of columns.  Forwarded untouched to the column function.
    :param X_D_list: forwarded untouched to the column function.
    :param M_c: mapping with ``'idx_to_name'`` (keyed by the index as a
        string) and ``'name_to_idx'`` (keyed by column name).
    :param T: forwarded untouched to the column function.
    :param tablename: only used in the generated title.
    :param filename: when truthy, the matrix is also plotted through
        ``utils.plot_matrix``.
    :param col: name of the column to rank the others against; when falsy
        the matrix is ordered by hierarchical clustering instead.
    :param confidence: when truthy, entries whose value is below
        ``float(confidence)`` are dropped (``col`` mode only).
    :param limit: when truthy and not infinite, keep only the first
        ``int(limit)`` ranked entries (``col`` mode only).
    :param submatrix: in ``col`` mode, return the reordered sub-matrix
        instead of the single ranked row.
    :return: ``{'data': ..., 'columns': ...}`` when ``col`` is given and
        ``submatrix`` is false; otherwise a dict with ``matrix``,
        ``column_names``, ``title``, ``filename`` and ``message``.
    :raises Exception: if ``col_function_name`` is not a supported name.
    """
    # Map each public function name onto the method that computes it.
    col_function_attrs = {
        'mutual information': '_mutual_information',
        'dependence probability': '_dependence_probability',
        'correlation': '_correlation',
        'view_similarity': '_view_similarity',
    }
    if col_function_name not in col_function_attrs:
        raise Exception('Invalid column function')
    col_function = getattr(self, col_function_attrs[col_function_name])

    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

    # extract unordered z_matrix
    z_matrix = numpy.zeros((num_cols, num_cols))
    for i in range(num_cols):
        for j in range(num_cols):
            z_matrix[i][j] = col_function(i, j, X_L_list, X_D_list, M_c, T)

    if col:
        # Rank every column by its value against ``col``, highest first.
        # sorted()/list comprehensions are used instead of zip().sort(),
        # filter() and map() so this also works where those builtins
        # return iterators rather than lists.
        z_column = list(z_matrix[M_c['name_to_idx'][col]])
        data_tuples = sorted(zip(z_column, range(num_cols)), reverse=True)
        if confidence:
            min_value = float(confidence)
            data_tuples = [tup for tup in data_tuples if tup[0] >= min_value]
        if limit and limit != float("inf"):
            data_tuples = data_tuples[:int(limit)]
        data = [tuple(d[0] for d in data_tuples)]
        columns = [d[1] for d in data_tuples]
        column_names_reordered = column_names[columns]
        if not submatrix:
            return {'data': data, 'columns': column_names_reordered}
        # Keep only the ranked columns, in ranked order, on both axes.
        z_matrix_reordered = z_matrix[columns, :][:, columns]
    else:
        # hierarchically cluster z_matrix
        import hcluster
        Y = hcluster.pdist(z_matrix)
        Z = hcluster.linkage(Y)
        pylab.figure()
        hcluster.dendrogram(Z)
        # The dendrogram's x tick labels carry the leaf order; read them
        # eagerly, before the figure is closed.
        reorder_indices = [int(label.get_text())
                           for label in pylab.gca().get_xticklabels()]
        pylab.close()
        # REORDER!
        z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
        column_names_reordered = column_names[reorder_indices]

    title = 'Pairwise column %s for %s' % (col_function_name, tablename)
    if filename:
        utils.plot_matrix(z_matrix_reordered, column_names_reordered,
                          title, filename)
    return dict(matrix=z_matrix_reordered,
                column_names=column_names_reordered,
                title=title,
                filename=filename,
                message="Created " + title)
m = castoverlap_numgenes elif o.method == 'numsamemono_norm': m = monoallelic_numgenes_norm elif o.method == 'numsamemono100_norm': m = monoallelic_numgenes_norm_100 elif o.method == 'numsameC57_norm': m = c57overlap_numgenes_norm elif o.method == 'numsameCAST_norm': m = castoverlap_numgenes_norm else: m = o.method # make clusters exparray = character_matrix hcdists = hcluster.pdist(exparray, metric=m) hclinks = hcluster.linkage(hcdists, method=o.linkage) draw_order = hcluster.leaves_list(hclinks) # draw tree scipyhcluster.dendrogram(hclinks, labels=samplenames, leaf_rotation=90) pylab.subplots_adjust(bottom=0.3) pylab.ylabel('%s (linkage=%s)' % (o.method, o.linkage)) if o.method in ('numsamemono', 'numsameC57', 'numsameCAST', 'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm'): pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 100, 200, 300, 400, 500]) elif o.method in ('numsamemono100', 'numsamemono100_norm'): pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 20, 40, 60, 80, 100]) pylab.savefig(o.fig) # bootstrap
# Demo: hierarchically cluster random data and display the dendrogram.
import numpy
from numpy.random import rand

from matplotlib.pyplot import show
from hcluster import pdist, linkage, dendrogram

# 10 random observations with 100 features each; the first five rows are
# doubled in place.
X = rand(10, 100)
X[:5] *= 2

# Pairwise distances between rows -> linkage -> dendrogram plot.
Y = pdist(X)
Z = linkage(Y)
dendrogram(Z)
show()
M = len(actsind) data = zeros((N, M), dtype=int) i = 0 parikhdict = {} for case in uniq_cases.keys(): data[i] = get_parikh(case, actsind) str_i = ','.join(map(str, data[i])) if str_i not in parikhdict: parikhdict[str_i] = [i] else: parikhdict[str_i].append(i) i = i + 1 df = DataFrame(data) data_uniq = df.drop_duplicates() Y = pdist(data_uniq, metric='euclidean') Z = linkage(Y, method='average') dendrogram(Z) show() def similarity_clusters(log, show_plot=None): """Translates traces to Parikh vectors and computes in the vector space a K-means clustering.""" def get_parikh(case, alphabet): v = zeros(len(alphabet), dtype=int) for act in case: v[alphabet[act]] = v[alphabet[act]] + 1 return v actsind = {} i = 0