def do_it(self, sources): for source in sources: words = nltk.wordpunct_tokenize(source.headline) words.extend(nltk.wordpunct_tokenize(source.summary)) lowerwords=[x.lower() for x in words if len(x) > 1] self.ct += 1 print self.ct, "TITLE",source.headline self.corpus.append(lowerwords) self.titles.append(source.headline) self.links.append(source.url) [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus] self.ct=-1 for doc in self.corpus: self.ct+=1 print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus)) for document in self.corpus: vec=[] [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list] self.feature_vectors.append(vec) self.n=len(self.corpus) mat = numpy.empty((self.n, self.n)) for i in xrange(0,self.n): for j in xrange(0,self.n): mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j]) Z = linkage(mat, 'single') dendrogram(Z, color_threshold=self.t) clusters = self.extract_clusters(Z,self.t,self.n) stories = [] for key in clusters: print "=============================================" story = Story() for id in clusters[key]: story.add_source(sources[id]) print id,self.titles[id],sources[id].url stories.append(story) return stories
def dendro(X, metric='cosine', combine='average', showdendro=True, leaf_label_func=identity, **kw):
    """Build a linkage for ``X`` and optionally display its dendrogram.

    ``X`` is passed to ``pdist`` with ``metric``; the result is passed to
    ``linkage`` with ``combine``. When ``showdendro`` is true the dendrogram
    is drawn (extra keyword arguments are forwarded) and ``show()`` is called.

    Returns the linkage produced by ``linkage``.
    """
    tree = linkage(pdist(X, metric), combine)
    if not showdendro:
        return tree
    dendrogram(tree, leaf_label_func=leaf_label_func, **kw)
    show()
    return tree
def dendro(X, metric="cosine", combine="average", showdendro=True, leaf_label_func=identity, **kw):
    """Compute a linkage for ``X``; draw and show it unless disabled.

    The pairwise distances come from ``pdist(X, metric)`` and are combined
    with ``linkage(..., combine)``. Extra keyword arguments go straight to
    ``dendrogram`` when ``showdendro`` is true.

    Returns the linkage.
    """
    distances = pdist(X, metric)
    merged = linkage(distances, combine)
    if not showdendro:
        return merged
    dendrogram(merged, leaf_label_func=leaf_label_func, **kw)
    show()
    return merged
def main(): print "hola" X = rand(10,100) X[0:5,:] *= 2 Y = pdist(X) Z = linkage(Y) dendrogram(Z)
def plotSampleDistanceDendrogram(ds):
    """Plot a sample distance cluster dendrogram using all samples and
    features of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset.
    """
    # Invert labels_map so leaf labels can be looked up by numeric label.
    literal_of = dict((number, literal)
                      for literal, number in ds.labels_map.iteritems())

    # Pairwise sample distances with pdist's default metric, then
    # complete-linkage clustering.
    tree = clust.linkage(clust.pdist(ds.samples), 'complete')

    # Plot with literal labels on the leaves and every link drawn in black.
    # (Original author's note: this does not work with etch's version of
    # matplotlib, verified for matplotlib 0.98.)
    leaf_labels = [literal_of[l] for l in ds.labels]
    clust.dendrogram(tree,
                     colorthreshold=0,
                     labels=leaf_labels,
                     link_color_func=lambda x: 'black',
                     distance_sort=False)

    # Rotate the tick labels so they stay readable.
    P.setp(P.gca().get_xticklabels(), rotation=90, fontsize=9)
def main():
    """Agglomerate the data in a file and plot the resulting dendrogram.

    Reads the file name from ``sys.argv[1]`` (default ``'iris2d.data'``),
    asks interactively for the minimum and maximum number of classes,
    repeatedly calls ``agglomerate`` on the distance structure while
    recording a linkage row per merge, reports the largest gap between
    consecutive merge distances inside the requested class range, and
    shows the dendrogram.
    """
    filename='iris2d.data'
    if len(sys.argv) > 1:
        filename = sys.argv[1]
    else:
        print 'Assuming filename \'iris2d.data\''
    data = parse_file(filename)
    minclass = int(raw_input('Minclass: '))
    maxclass = int(raw_input('Maxclass: '))
    standartizate(data)
    dists = dist_from_data(data)
    # `nick` is the id handed to the next merged cluster; ids start after
    # the original observations.
    nick = len(dists)
    lend=nick
    # n1 counts the merges still to perform.
    n1 = lend-1
    linkage=[]
    merge_points=[]
    # charlie[index used in dist matrix] = [ new cluster nick, number of children ]
    charlie=dict()
    for i in range(lend):
        charlie[i] = [i,1]
    while n1:
        n1-=1
        dists, e0, e1, d = agglomerate(dists)
        #charlie[e0][1] has all the children of both e0 and e1
        charlie[e0][1] = charlie[e0][1] + charlie[e1][1]
        linkage.append([charlie[e0][0], charlie[e1][0], d, charlie[e0][1]])
        #Fixing the indexes due to the deletion of e1
        # NOTE(review): this rebinds entries to the same list objects, so
        # the statement order below matters -- presumably e0 < e1; confirm
        # against agglomerate().
        for i in range(e1,lend-1):
            charlie[i] = charlie[i+1]
        charlie[e0][0] = nick
        nick+=1
        #n1 contains the number of classes
        if n1 <= maxclass and n1 >= minclass:
            merge_points.append(d)
    # Finding the cutting point
    max_dist=0
    index=-1
    print merge_points
    # Pick the merge followed by the largest jump in merge distance.
    for i in range(len(merge_points)-1):
        d = merge_points[i+1] - merge_points[i]
        if d > max_dist:
            max_dist = d
            index = i
    # Fails when fewer than two merge points fall in the class range or no
    # positive gap exists.
    assert index >= 0
    print 'Cutting point is at y='+str(merge_points[index])
    print 'Showing the image...'
    dendrogram(linkage)
    show()
def do_it(self): for feed in self.feeds: d = feedparser.parse(feed) for e in d['entries']: words = nltk.wordpunct_tokenize(self.clean_html(e['description'])) words.extend(nltk.wordpunct_tokenize(e['title'])) lowerwords=[x.lower() for x in words if len(x) > 1] self.ct += 1 print self.ct, "TITLE",e['title'] self.corpus.append(lowerwords) self.titles.append(e['title']) self.links.append(e['link']) [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus] self.ct=-1 for doc in self.corpus: self.ct+=1 print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus)) for document in self.corpus: vec=[] [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list] self.feature_vectors.append(vec) self.n=len(self.corpus) mat = numpy.empty((self.n, self.n)) for i in xrange(0,self.n): for j in xrange(0,self.n): mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j]) Z = linkage(mat, 'single') dendrogram(Z, color_threshold=self.t) clusters = self.extract_clusters(Z,self.t,self.n) for key in clusters: print "=============================================" for id in clusters[key]: print id,self.titles[id]
def fetch_clusters(self, mat, n):
    """
    Cluster ``mat`` with single linkage, save the dendrogram image and
    return the extracted clusters.

    The dendrogram is coloured at ``self.t`` and written to
    ``self.cluster_image`` at ``self.dpi``.

    :param mat: The matrix handed to ``linkage``
    :param n: The length of the corpus
    :return: The clusters produced by ``__extract_clusters``
    """
    tree = linkage(mat, 'single')
    dendrogram(tree, color_threshold=self.t)
    pylab.savefig(self.cluster_image, dpi=self.dpi)
    return self.__extract_clusters(tree, self.t, n)
def generate_dendrogram(root): from hcluster import pdist, linkage, dendrogram import numpy from numpy.random import rand import matplotlib X = rand(10,100) X[0:5,:] *= 2 Y = pdist(X) Z = linkage(Y) print Y print Z dendrogram(Z)
def plot_cluster_tree(cluster_coords,Labels=None,link_method='single',color_thresh=.25,fontsize=8):
    """Plot a left-oriented dendrogram of the cosine distances between clusters.

    :param cluster_coords: observations passed to ``pdist(..., 'cosine')``.
    :param Labels: optional leaf labels; ``None`` or an empty sequence means
        the dendrogram is drawn without explicit labels.
    :param link_method: ``method`` argument for ``linkage``.
    :param color_thresh: ``color_threshold`` argument for ``dendrogram``.
    :param fontsize: value written to ``rcParams['font.size']`` before showing.
    """
    D = pdist(cluster_coords, 'cosine')
    # SEEMS THERE MAY SOMETIME BE VERY SMALL NEGATIVE DISTANCES ie -2*10**-16
    D = abs(D)
    L = linkage(D, method=link_method, metric='cosine')

    options = dict(orientation='left', color_threshold=color_thresh)
    # `if Labels:` raised ValueError for multi-element NumPy arrays; test for
    # "given and non-empty" explicitly so lists and arrays both work.
    if Labels is not None and len(Labels) > 0:
        options['labels'] = Labels
    dendrogram(L, **options)

    pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
    pylab.xlabel('Cosine Distance')
    pylab.ylabel('Strain with the Most Alignments to Each Cluster')
    pylab.rcParams.update({'font.size': fontsize})
    pylab.show()
def printSummary(updatedtfidfMatrix, queriedSentences): print "\n" a = pdist(updatedtfidfMatrix,'cosine') print a b = linkage(a) dendrogram(b) show() print b sumOrder = [] count = 0 f = open("foo.txt", "w") for i in range(len(b)): x = int(b[i][0]) y = int(b[i][1]) if x <= (len(queriedSentences)-1): sumOrder.append(x) if y <= (len(queriedSentences)-1): sumOrder.append(y) if x <= (len(queriedSentences)-1) and y > (len(queriedSentences)-1): sumOrder.append(y) if x > (len(queriedSentences)-1) and y > (len(queriedSentences)-1): sumOrder.append(x) previous = 0 queriedSentences = [sentence.capitalize() for sentence in queriedSentences] for num in sumOrder: if num > (len(queriedSentences)-1): f.write('<br></br>') else: f.write(queriedSentences[num]) f.write('.') f.write(' ') f.close() with open ("foo.txt", "r") as myfile: #print myfile data=myfile.read() print data return data
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often column pairs share a partition.

    For every latent state in ``X_L_list`` the column-partition assignments
    are compared pairwise; the resulting co-assignment fractions are
    reordered by the leaf order of a hierarchical clustering and written to
    ``filename`` as an image. ``X_D_list`` is accepted but not used here.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

    # Fraction of latent states in which columns i and j share a view.
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = X_L['column_partition']['assignments']
        for i in range(num_cols):
            for j in range(num_cols):
                if assignments[i] == assignments[j]:
                    z_matrix[i, j] += 1
    z_matrix /= float(len(X_L_list))

    # Draw a throw-away dendrogram only to read the leaf order off its
    # x tick labels.
    pylab.figure()
    hcluster.dendrogram(hcluster.linkage(hcluster.pdist(z_matrix)))
    reorder_indices = [int(tick.get_text())
                       for tick in pylab.gca().get_xticklabels()]
    pylab.close()

    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]

    # Actual figure.
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()

    # With few columns every tick is labelled; otherwise y uses the even
    # positions and x the odd ones.
    if num_cols < 14:
        y_pick, x_pick, x_size = slice(None), slice(None), 'x-small'
    else:
        y_pick, x_pick, x_size = slice(None, None, 2), slice(1, None, 2), 'small'
    axes = pylab.gca()
    axes.set_yticks(range(num_cols)[y_pick])
    axes.set_yticklabels(column_names_reordered[y_pick], size='x-small')
    axes.set_xticks(range(num_cols)[x_pick])
    axes.set_xticklabels(column_names_reordered[x_pick], rotation=90, size=x_size)

    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
def dendrogram(M, method="complete", title="complete linkage clustering", **kw):
    """Render a dendrogram of ``M`` and return it as an in-memory PNG.

    ``cluster(M, method)`` supplies the linkage; extra keyword arguments are
    forwarded to ``hcluster.dendrogram``. A ``ValueError`` while drawing is
    ignored, so the returned buffer then holds the empty figure.

    Returns a ``StringIO`` positioned at the start of the PNG data.
    """
    png_buffer = StringIO.StringIO()
    pylab.figure()
    if title:
        pylab.title(title)
    try:
        try:
            tree = cluster(M, method)
            hcluster.dendrogram(tree, **kw)
        except ValueError:
            # Empty distance matrix
            pass
    finally:
        # Always emit an image and release the figure.
        pylab.savefig(png_buffer, format="png")
        png_buffer.seek(0)
        pylab.close()
    return png_buffer
def dendrogram(M, method='complete', **kw):
    """Render a dendrogram of ``M`` and return it as an in-memory PNG.

    When ``pylab`` is falsy nothing is drawn and an empty buffer is
    returned. Drawing is best-effort: if building or plotting the linkage
    fails, the buffer stays empty. The figure is always closed.

    Returns a ``StringIO``; it holds PNG data only when plotting succeeded.
    """
    s = StringIO.StringIO()
    if pylab:
        try:
            pylab.figure()
            pylab.title('complete linkage clustering')
            hcluster.dendrogram(cluster(M, method), **kw)
        except Exception:
            # Deliberately best-effort, but no longer swallows
            # KeyboardInterrupt/SystemExit as the bare `except:` did.
            pass
        else:
            pylab.savefig(s, format='png')
            s.seek(0)
        finally:
            pylab.close()
    return s
def cluster_path_times(self, path_times,display): recordings = path_times.recordings X=[] for recording in recordings: X.append([recording.time.seconds+recording.time.microseconds/10**6.,recording.date.hour*60+recording.date.minute]) print X Y=pdist(X) Z=linkage(Y) dendrogram(Z) for i in range(len(X)): print('{0}, {1}'.format(i,X[i])) print Z print self.calculate_variances(X,Z) if display: show()
def test(): word_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O' ] cons_words = ['C', 'B'] X = rand(15, 2) #X = [[0.35, 0.37], [0.40, 0.40], [0.53, 0.53], [0.34, 0.51]] print X Y = pdist(X) print Y Z = linkage(Y) R = dendrogram(Z) index1 = word_list.index(cons_words[0]) assert index1 >= 0 path1 = findPath(Z, index1, len(word_list)) index2 = word_list.index(cons_words[1]) assert index2 >= 0 path2 = findPath(Z, index2, len(word_list)) print Z print path1 print path2 common = set(path1).intersection(set(path2)) first = min(common) assert(first >= len(word_list)) first -= len(word_list) cluster_root = Z[first][0] merge1 = findCluster(Z, cluster_root, word_list) cluster_root = Z[first][1] merge2 = findCluster(Z, cluster_root, word_list) print merge1 print merge2
def OnLeftDClick(self, event):
    """ Handle a left double click.

        When a DEVS model is available, its ``vectors`` are passed through
        ``pdist`` and ``linkage`` and the dendrogram is shown; otherwise an
        information message box is displayed.
    """
    devs = self.getDEVSModel()
    if not devs:
        wx.MessageBox(_("No DEVS model is instanciated.\nGo back to the simulation!"), _("Info"), wx.OK|wx.ICON_INFORMATION)
        return

    dendrogram(linkage(pdist(devs.vectors)))
    show()
def cluster_elut(mat):
    """Show ``mat`` with its rows reordered by the dendrogram leaf order."""
    import hcluster
    tree = hcluster.linkage(hcluster.pdist(mat))
    figure()
    # The dendrogram is drawn only to obtain the leaf order, then cleared.
    leaf_order = hcluster.dendrogram(tree)['leaves']
    clf()
    imshow(mat[leaf_order, :])
def plot_with_labels(Z, num_clust):
    """Draw a dendrogram of ``Z`` with a coloured bar under every leaf.

    The colour threshold is the merge distance at row ``-num_clust + 1`` of
    ``Z``. Bar colours are looked up in the module-level ``colors`` via
    ``rowHeaders``; ``N`` bars of width 10 are drawn below the axis.
    """
    cut_height = Z[-num_clust + 1, 2]
    dg = dendrogram(Z, no_labels=True, color_threshold=cut_height)
    leaf_colors = [colors[int(rowHeaders[leaf])] for leaf in dg["leaves"]]

    # Bar height is a tenth of the final merge distance.
    bar = 0.1 * Z[-1, 2]
    plt.bar(np.arange(N) * 10,
            np.ones(N) * bar,
            bottom=-bar,
            width=10,
            color=leaf_colors,
            edgecolor="none")
    plt.gca().set_ylim((-bar, None))
    plt.show()
def dendrogram(M, method='complete', title='complete linkage clustering', **kw):
    """Draw a dendrogram for ``M`` and hand back the PNG bytes in a buffer.

    The linkage comes from ``cluster(M, method)``; remaining keyword
    arguments go to ``hcluster.dendrogram``. A ``ValueError`` during drawing
    is ignored and the (then empty) figure is still saved.

    Returns a ``StringIO`` rewound to the start of the PNG data.
    """
    image = StringIO.StringIO()
    pylab.figure()
    if title:
        pylab.title(title)
    try:
        try:
            linkage_matrix = cluster(M, method)
            hcluster.dendrogram(linkage_matrix, **kw)
        except ValueError:
            # Empty distance matrix
            pass
    finally:
        # Save and close regardless of what happened above.
        pylab.savefig(image, format='png')
        image.seek(0)
        pylab.close()
    return image
def run(self,):
    """Build a sample-distance matrix, run PCA on it and save a dendrogram.

    Reads pairwise mismatch data through ``self.readInput``, fills a
    symmetric matrix with mismatch fractions, optionally writes the mismatch
    data to ``self.outputFname``, runs ``self.runPCAOnDistanceMatrix``, saves
    the dendrogram as SVG and PNG under ``self.figureFnamePrefix`` and then
    exits the process with status 0.
    """
    if self.debug:
        # 2010-4-18 enter debug mode "~/.../variation/misc.py -b"
        import pdb
        pdb.set_trace()

    sampleId2index, samplePair2data = self.readInput(self.inputFnameLs)

    sys.stderr.write("Calculating distance matrix for aggregated data ...")
    distanceMatrix = numpy.zeros([len(sampleId2index), len(sampleId2index)])
    for samplePair, data in samplePair2data.iteritems():
        no_of_mismatches, no_of_total_non_NA = data[:2]
        # float() keeps the ratio fractional even when both counts are
        # integers (Python 2 `/` would truncate to 0).
        distance = float(no_of_mismatches)/no_of_total_non_NA
        sample1Id, sample2Id = samplePair[:2]
        sample1Index = sampleId2index[sample1Id]
        sample2Index = sampleId2index[sample2Id]
        distanceMatrix[sample1Index, sample2Index] = distance
        distanceMatrix[sample2Index, sample1Index] = distance
    sys.stderr.write("Done.\n")

    # Order the sample ids by their matrix index.
    sampleIdLs = sampleId2index.keys()
    for sampleId, list_index in sampleId2index.iteritems():
        sampleIdLs[list_index] = sampleId

    if self.outputFname:
        self.outputMismatchData(self.outputFname, samplePair2data, distanceMatrix, sampleId2index, sampleIdLs)

    massagedSampleIDLs = self.massageSampleId(sampleIdLs)
    #2012.9-6 stop massaging sample IDs for PCA output. mapper/AppendInfo2SmartPCAOutput.py could be applied to this.
    self.runPCAOnDistanceMatrix(distanceMatrix, col_id_ls=sampleIdLs, outputFname='%s_PCA.tsv'%(self.figureFnamePrefix))

    import pylab
    from hcluster import pdist, linkage, dendrogram
    pylab.clf()
    # NOTE(review): distanceMatrix is square; linkage() treats 2-D input as
    # observation vectors, not as distances -- confirm this is intended.
    Z=linkage(distanceMatrix, 'single')
    yh_matplotlib.setFontAndLabelSize(base_size=3)
    dendrogram(Z, color_threshold=0.001, labels=massagedSampleIDLs, orientation='right', leaf_font_size=None) #leaf_font_size=1 or 5 has no effect
    pylab.savefig('%s.svg'%self.figureFnamePrefix, dpi=200)
    pylab.savefig('%s.png'%self.figureFnamePrefix, dpi=300)
    sys.exit(0)
def performHierarchicalClusterin(matrix, titlesCat):
    """Cluster ``matrix`` with average linkage and save the dendrogram image.

    Cosine pairwise distances of ``matrix`` are handed to ``linkage``; the
    dendrogram is labelled with ``titlesCat`` and saved to
    ``images/clusteringImage.png``.
    """
    # Pairwise distances under the cosine metric.
    pairwise = pairwise_distances(matrix, metric='cosine')
    # Average linkage: cluster similarity is the mean of element similarities.
    tree = linkage(pairwise, method='average')

    plot_options = dict(labels=titlesCat,
                        distance_sort='descendent',
                        leaf_font_size=2,
                        orientation='left',
                        show_contracted=False)
    image = dendrogram(tree, **plot_options)

    # Persist the generated dendrogram.
    pylab.savefig("images/clusteringImage.png", dpi=300, bbox_inches='tight')
def t_dendrogram(X, nclusters):
    """Show a dendrogram of ``X``; ``nclusters`` is accepted but not used."""
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand

    # X = X[:10, :]
    tree = linkage(pdist(X))
    res = dendrogram(tree)
    show()
def augmented_dendrogram(*args, **kwargs):
    """Draw a dendrogram and annotate every merge with its height.

    All arguments are forwarded to ``dendrogram``. Unless ``no_plot`` is
    passed as true, a red dot and a ``%.3g``-formatted label are added at
    the top centre of each link.

    Returns the dictionary produced by ``dendrogram``.
    """
    ddata = dendrogram(*args, **kwargs)
    if kwargs.get('no_plot', False):
        return ddata

    for xs, ys in zip(ddata['icoord'], ddata['dcoord']):
        # Midpoint of the horizontal bar and the height it sits at.
        x = 0.5 * (xs[1] + xs[2])
        y = ys[1]
        plt.plot(x, y, 'ro')
        plt.annotate("%.3g" % y, (x, y), xytext=(0, -8),
                     textcoords='offset points',
                     va='top', ha='center')
    return ddata
def output_dendrogram(imgs, kernel, method="complete", dend_fn="_dendrogram.png"):
    """Save a dendrogram of ``kernel`` with image thumbnails beside the leaves.

    The dendrogram is first written to ``<method>_<dend_fn>``; that file is
    reloaded, each image in ``imgs`` is shrunk in place to at most 30x30 and
    pasted at its leaf position, and the result is saved with a ``fig_``
    prefix.
    """
    links = linkage(pdist(kernel), method=method)
    tmp_dend_fn = method + "_" + dend_fn
    axis = dendrogram(links, orientation="left", figsize=(7, 12),
                      outfilename=tmp_dend_fn)[1]
    figimg = libpil.loadImage(tmp_dend_fn)

    # Leaf indices as read from the y tick labels, in reverse order.
    leaf_order = [int(label._text) for label in axis.get_yticklabels()]
    leaf_order.reverse()

    for row, leaf in enumerate(leaf_order):
        thumb = imgs[leaf]
        thumb.thumbnail((30, 30))
        offset = row * (thumb.size[1] + 4) + 120
        figimg.paste(thumb, (52, offset))
    figimg.save("fig_" + tmp_dend_fn)
def hierarchical_clusters( log, show_plot=None ):
    """Translates traces to Parikh vectors and computes in the vector space
    a hierarchical clustering.

    The distinct vectors are clustered with average linkage on euclidean
    distance and the dendrogram is shown. ``show_plot`` is accepted but not
    consulted.
    """
    def get_parikh(case, alphabet):
        """Count activity occurrences in ``case``, shifted so the minimum is 0."""
        counts = zeros(len(alphabet), dtype=int)
        for act in case:
            counts[alphabet[act]] += 1
        # canonical representation
        return counts - min(counts)

    # Column index of every activity.
    actsind = dict((act, pos) for pos, act in enumerate(log.get_alphabet()))

    uniq_cases = log.get_uniq_cases()
    N = len(uniq_cases)
    M = len(actsind)
    data = zeros((N, M), dtype=int)

    # Row indices grouped by identical Parikh vector.
    parikhdict = {}
    for row, case in enumerate(uniq_cases.keys()):
        data[row] = get_parikh(case, actsind)
        signature = ','.join(map(str, data[row]))
        parikhdict.setdefault(signature, []).append(row)

    data_uniq = DataFrame(data).drop_duplicates()
    Z = linkage(pdist(data_uniq, metric='euclidean'), method='average')
    dendrogram(Z)
    show()
def hcluster(self, stim):
    """Cluster ``stim.group`` hierarchically and plot the resulting clusters.

    The upper triangle of ``stim.group`` (diagonal excluded) is passed to
    ``hcluster.linkage``. A dendrogram coloured at the second-to-last merge
    distance is shown, then the points of each of four clusters are plotted
    in their own colour.
    """
    #from hcluster import pdist, linkage, dendrogram
    import hcluster
    iu = np.triu_indices(len(stim.group), 1)
    # NOTE(review): 'ward' is passed positionally after the method
    # 'single' -- presumably it lands in the metric slot; confirm.
    Z = hcluster.linkage(stim.group[iu], 'single', 'ward')
    # The leftover interactive `pdb.set_trace()` breakpoint was removed here.
    thres = Z[-2, 2]
    dend = hcluster.dendrogram(Z, color_threshold=thres)
    plt.show()

    clusters = self.get_clusters(Z, n_clusters=4)#thres=thres)
    colors = self.get_colors(len(clusters))
    for cluster, color in zip(clusters, colors):
        sel = stim.indices[np.array(cluster)]
        plt.plot(sel[:,1], sel[:,0],'o', color=color, )
    plt.show()
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True, norm_rows=True, bigarr=None, **kwargs):
    """Return ``gids`` reordered by the leaf order of a hierarchical clustering.

    The array to cluster is ``bigarr`` when given, otherwise the result of
    ``single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows)``. Linkage
    values are clipped to ``[0, 10**8]``. When ``do_plot`` is true the
    dendrogram is drawn with tick labels from ``gt.id2name`` and the
    reordered array is shown in a second figure. Extra keyword arguments go
    to ``hcluster.dendrogram``.
    """
    import plotting as pl
    import hcluster

    if bigarr is not None:
        arr = bigarr
    else:
        arr = single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows)

    zmat = hcluster.linkage(hcluster.pdist(arr, metric=dist))
    zmat = np.clip(zmat, 0, 10**8)

    if do_plot:
        pl.figure()
    leaves = hcluster.dendrogram(zmat, no_plot=bool(1-do_plot), **kwargs)['leaves']
    if do_plot:
        names = [gt.id2name[gids[ind]] for ind in leaves]
        pl.gca().axes.set_xticklabels(names)
        pl.figure()
        pl.imshow(arr[leaves,:])

    return list(np.array(list(gids))[leaves])
def hierarchicalcluster(datamatrix, dimlabels, similarity='euclidean', colorthresh='default'):
    '''Plot a dendrogram of datamatrix and return (clustering, dend).

    clustering is the linkage array (items-1 x 4: the first two columns are
    indices of clusters, the 3rd is the distance between those clusters, the
    4th the number of original observations in the cluster). dend is the
    dictionary of data structures computed to render the dendrogram.
    see api here: http://hcluster.damianeads.com/cluster.html

    colorthresh is a fraction of the largest merge distance; 'default'
    means 0.7. Warnings raised while computing the linkage are suppressed.'''
    import hcluster
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clustering = hcluster.linkage(datamatrix, metric=similarity)

    # All descendents below a cluster node k get the same colour if k is the
    # first node below color_threshold; links at or above it are blue.
    if colorthresh == 'default':
        fraction = 0.7
    else:
        fraction = colorthresh
    color_threshold = fraction * max(clustering[:, 2])

    fig = plt.figure()
    dend = hcluster.dendrogram(clustering, labels=dimlabels, leaf_rotation=90,
                               color_threshold=color_threshold)
    plt.tight_layout()
    return clustering, dend
import numpy as np
from matplotlib.pyplot import show
from fastcluster import *
from hcluster import dendrogram

# Load columns 1-29 of the tab-separated expression table as a structured
# array (the header row supplies the field names).
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",names=True,usecols=tuple(range(1,30)),dtype=float, delimiter="\t")

# View the structured array as a plain 2-D float array. The builtin `float`
# replaces the `np.float` alias, which newer NumPy releases no longer provide.
data_array = data.view((float, len(data.dtype.names)))

# Single-linkage clustering of rows 1..999 on euclidean distance.
data_link = linkage(data_array[1:1000], method='single', metric='euclidean', preserve_input=True)

dendrogram(data_link)
show()
def dendrogram(self):
    """Draw the dendrogram of ``self.linkage``, if one has been computed.

    Leaves are labelled with the unique values of ``self._dataset.labels``.
    Nothing happens when ``self.linkage`` is ``None``.
    """
    #import pylab as p
    # Identity test: `== None` on an array compares element-wise and makes
    # the truth test ambiguous.
    if self.linkage is not None:
        hcluster.dendrogram(self.linkage,labels=np.unique(self._dataset.labels))
def _do_gen_matrix(self, col_function_name, X_L_list, X_D_list, M_c, T, tablename='', filename=None, col=None, confidence=None, limit=None, submatrix=False):
    """Compute a pairwise column matrix and optionally plot it.

    ``col_function_name`` selects the pairwise function ('mutual
    information', 'dependence probability', 'correlation' or
    'view_similarity'); any other value raises ``Exception``.

    Without ``col`` the matrix is reordered by hierarchical clustering. With
    ``col`` the columns are ranked by their value against that column,
    optionally filtered by ``confidence`` and truncated to ``limit``; unless
    ``submatrix`` is true the ranking is returned as
    ``{'data': ..., 'columns': ...}``.

    Otherwise returns a dict with ``matrix``, ``column_names``, ``title``,
    ``filename`` and ``message``; the matrix is plotted when ``filename`` is
    given.
    """
    handler_names = {
        'mutual information': '_mutual_information',
        'dependence probability': '_dependence_probability',
        'correlation': '_correlation',
        'view_similarity': '_view_similarity',
    }
    if col_function_name not in handler_names:
        raise Exception('Invalid column function')
    col_function = getattr(self, handler_names[col_function_name])

    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

    # extract unordered z_matrix
    z_matrix = numpy.zeros((num_cols, num_cols))
    for i in range(num_cols):
        for j in range(num_cols):
            z_matrix[i][j] = col_function(i, j, X_L_list, X_D_list, M_c, T)

    if col:
        # Rank all columns by their value against `col`, highest first.
        z_column = list(z_matrix[M_c['name_to_idx'][col]])
        data_tuples = sorted(zip(z_column, range(num_cols)), reverse=True)
        if confidence:
            data_tuples = [tup for tup in data_tuples
                           if tup[0] >= float(confidence)]
        if limit and limit != float("inf"):
            data_tuples = data_tuples[:int(limit)]

        data = [tuple([value for value, _ in data_tuples])]
        columns = [index for _, index in data_tuples]
        column_names_reordered = column_names[columns]

        if not submatrix:
            return {'data': data, 'columns': column_names_reordered}
        z_matrix = z_matrix[columns, :][:, columns]
        z_matrix_reordered = z_matrix
    else:
        # hierachically cluster z_matrix
        import hcluster
        tree = hcluster.linkage(hcluster.pdist(z_matrix))
        # Throw-away figure: only the leaf order on the x ticks is needed.
        pylab.figure()
        hcluster.dendrogram(tree)
        reorder_indices = [int(tick.get_text())
                           for tick in pylab.gca().get_xticklabels()]
        pylab.close()
        # REORDER!
        z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
        column_names_reordered = column_names[reorder_indices]

    title = 'Pairwise column %s for %s' % (col_function_name, tablename)
    if filename:
        utils.plot_matrix(z_matrix_reordered, column_names_reordered, title, filename)

    return dict(matrix=z_matrix_reordered,
                column_names=column_names_reordered,
                title=title,
                filename=filename,
                message="Created " + title)
# cosine similarities ######################################### import numpy mat = numpy.empty((n, n)) for i in xrange(0, n): for j in xrange(0, n): mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i], feature_vectors[j]) ######################################### # now hierarchically cluster mat ######################################### from hcluster import linkage, dendrogram t = 0.8 Z = linkage(mat, 'single') dendrogram(Z, color_threshold=t) import pylab pylab.savefig("hcluster.png", dpi=800) ######################################### # extract our clusters ######################################### def extract_clusters(Z, threshold, n): clusters = {} ct = n for row in Z: if row[2] < threshold: n1 = int(row[0]) n2 = int(row[1])
def drawDendrogram(self, dist):
    """Compute the linkage of ``dist`` and show its dendrogram."""
    tree = hcluster.linkage(dist)
    hcluster.dendrogram(tree)
    show()
# Annotate the current figure: the mean correlation of each row at x=50 and
# every off-diagonal coefficient of cc_pcorr in red, then save it.
for x in range(50):
    plt.text(50,x,'%.2f'%meancorr[x],size=6)
    for y in range(50):
        if not x==y: # skip diagonal
            plt.text(x-0.3,y,'%.2f'%cc_pcorr[x,y],size=6,color='red')
plt.savefig(basedir+'9_correlation_analysis/pcorr_corrcoefs.pdf',format='pdf')

#do clustering
# `1==1` is an always-on toggle: complete-linkage clustering of
# data_pcorr from row 2 onwards, saved as a PDF.
if 1==1:
    dst=pdist(data_pcorr[2:,:])
    Z=linkage(dst,method='complete')
    plt.figure(figsize=(14,12))
    dendrogram(Z,labels=tasknames_pcorr)
    plt.savefig(basedir+'9_correlation_analysis/pcorr_task_cluster.pdf',format='pdf')

# decompose connections using ICA and save adjacency matrices
data_pcorr_fmri=data_pcorr[2:,:]
# `1==0` keeps the ICA decomposition below switched off.
if 1==0:
    ica = FastICA(n_components=20)
    S_ = ica.fit(data_pcorr_fmri.T).transform(data_pcorr_fmri.T) # Get the estimated sources
    A_ = ica.get_mixing_matrix() # Get estimated mixing matrix
    #ncomps=20
    #nmf=decomposition.ProjectedGradientNMF(n_components=ncomps,sparseness='components',init='nndsvd')
    #nmf.fit(data_pcorr_fmri+100)
# One row per contrast counted so far; one column per voxel inside the mask.
nconds = ctr
mask = nib.load(os.path.join(dataprepdir, 'goodvoxmask.nii.gz'))
maskvox = N.where(mask.get_data() > 0)
data = N.zeros((nconds, len(maskvox[0])))

# Fill the matrix with the masked z-stat image of every dataset/task/contrast.
# NOTE(review): rows follow dict iteration order, which presumably has to
# match the order in which contrast_labels was built -- confirm.
ctr = 0
for ds in contrasts_to_use.iterkeys():
    for task in contrasts_to_use[ds].iterkeys():
        for contrast in contrasts_to_use[ds][task]:
            tmp = nib.load(
                os.path.join(
                    datadir, 'mean_%s_task%03d_zstat%d_run1.nii.gz' %
                    (ds, task, contrast))).get_data()
            data[ctr, :] = tmp[maskvox]
            ctr += 1

# Hierarchical clustering of the contrast maps on euclidean distance.
l = fastcluster.linkage(data, method=clustering_type, metric='euclidean')

# Plotting toggle: draw the dendrogram and save it as a PDF in outdir.
plot_data = True
if plot_data:
    plt.figure(figsize=(16, 10))
    plt.hold(True)
    dendrogram(l, labels=contrast_labels, orientation='right')
    #plt.show()
    plt.savefig(os.path.join(outdir,
                             'cluster_figure_%s.pdf' % clustering_type),
                format='pdf')
for contrast in contrasts_to_use[ds][task]: contrast_labels.append(ds+'_task%d:%s'%(task,contrasts[ds]['task%03d'%task][contrast])) contrast_labels_short.append(ds+'_t%d_z%d'%(task,contrast)) ctr+=1 nconds=ctr mask=nib.load(os.path.join(dataprepdir,'goodvoxmask.nii.gz')) maskvox=N.where(mask.get_data()>0) data=N.zeros((nconds,len(maskvox[0]))) ctr=0 for ds in contrasts_to_use.iterkeys(): for task in contrasts_to_use[ds].iterkeys(): for contrast in contrasts_to_use[ds][task]: tmp=nib.load(os.path.join(datadir,'mean_%s_task%03d_zstat%d_run1.nii.gz'%(ds,task,contrast))).get_data() data[ctr,:]=tmp[maskvox] ctr+=1 l=fastcluster.linkage(data,method=clustering_type,metric='euclidean') plot_data=True if plot_data: plt.figure(figsize=(16,10)) plt.hold(True) dendrogram(l,labels=contrast_labels,orientation='right') #plt.show() plt.savefig(os.path.join(outdir,'cluster_figure_%s.pdf'%clustering_type),format='pdf')
from math import *
import numpy as np

# dendrogram
import hcluster

similarity = [
    [1.00, 0.93, 0.86, 0.84, 0.69, 0.65],
    [0.93, 1.00, 0.79, 0.83, 0.64, 0.67],
    [0.86, 0.79, 1.00, 0.75, 0.82, 0.54],
    [0.84, 0.83, 0.75, 1.00, 0.57, 0.79],
    [0.69, 0.64, 0.82, 0.57, 1.00, 0.36],
    [0.65, 0.67, 0.54, 0.79, 0.36, 1.00],
]

# Single linkage works on distances, not similarities, and a square input
# would be read as observation vectors. Convert to distance = 1 - similarity
# and condense the square matrix first.
distance = 1 - np.array(similarity)
Z = hcluster.single(hcluster.squareform(distance))
hcluster.dendrogram(Z)

# k-means
data = np.array([6, 12, 18, 24, 30, 42, 48])
centroids = np.array([18, 45])
# centroids = np.array([15, 40])


def dist(x, y):
    """Return the absolute difference between two values."""
    return abs(x - y)


# Start with an empty member list for every centroid.
clusters = {}
for center in centroids:
    clusters[center] = []
def plot_clusters(Dr, ct):
    """Show a single-linkage dendrogram of ``Dr`` coloured at threshold ``ct``."""
    tree = linkage(Dr, method='single', metric='cosine')
    dendrogram(tree, color_threshold=ct)
    pylab.show()
from matplotlib.pyplot import show
from hcluster import pdist, linkage, dendrogram
import numpy
from numpy.random import rand

# Ten random 100-dimensional observations; the first five are scaled up so
# they differ from the rest.
X = rand(10, 100)
X[:5] *= 2

# Pairwise distances, then the linkage built from them.
Y = pdist(X)
Z = linkage(Y)

dendrogram(Z)
show()
# now turn that into symmatrix matrix of # cosine similarities ######################################### import numpy mat = numpy.empty((n, n)) for i in xrange(0,n): for j in xrange(0,n): mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i],feature_vectors[j]) ######################################### # now hierarchically cluster mat ######################################### from hcluster import linkage, dendrogram t = 0.8 Z = linkage(mat, 'single') dendrogram(Z, color_threshold=t) import pylab pylab.savefig( "hcluster.png" ,dpi=800) ######################################### # extract our clusters ######################################### def extract_clusters(Z,threshold,n): clusters={} ct=n for row in Z: if row[2] < threshold: n1=int(row[0]) n2=int(row[1])
def dendrogram(self):
    """Draw the dendrogram of ``self.linkage``, if one has been computed.

    Leaves are labelled with the unique values of ``self._dataset.labels``.
    Nothing happens when ``self.linkage`` is ``None``.
    """
    #import pylab as p
    # Identity test: `== None` on an array compares element-wise and makes
    # the truth test ambiguous.
    if self.linkage is not None:
        hcluster.dendrogram(self.linkage,
                            labels=np.unique(self._dataset.labels))
# Load columns 1-29 of the tab-separated expression table as a structured
# array (the header row supplies the field names).
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")
# View as a plain 2-D float array. The builtin `float` replaces the
# `np.float` alias, which newer NumPy releases no longer provide.
data_array = data.view((float, len(data.dtype.names)))
# Transpose so each table column becomes one observation.
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)  # computing the distance
data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!
# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# x ywidth height
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right', labels=data.dtype.names)
# adding/removing the axes
ax1.set_xticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
("http://www.gutenberg.org/files/21593/21593-0.txt", "Urteil"), ("http://www.gutenberg.org/cache/epub/22367/pg22367.txt", "Verwand.")] # get it from the interwebs catalogue = [] for url, name in urls: headers = {'User-Agent': 'Mozilla/5.0'} req = urllib2.Request(url, None, headers) catalogue.append(urllib2.urlopen(req).read()) # calc similarity matrix M = all_pairs(catalogue, distance=distances.jaccard, dist_kwargs=dict(mode=1), parallel=True) # plot similarity matrix pylab.figure(1) pylab.title("similarity matrix") pylab.imshow(M, aspect='auto', interpolation="nearest", cmap="Reds") pylab.colorbar() # plot complete linkage pylab.figure(2) pylab.title("complete linkage clustering") hcluster.dendrogram(cluster(M, method='complete'), leaf_label_func=lambda i: urls[i][1]) # finally show pylab.show()
def main(argv): print argv if (len(argv) > 0): params = argv[::2] param_values = argv[1::2] crit_func = squared_criterion merge_func = d_min for i in range(0, len(argv), 2): if params[i] == "--criterium": if param_values[i + 1] == "silhoette": crit_func = silhouette_criterion elif param_values[i + 1] == "squared": crit_func = squared_criterion else: crit_func = silhouette_criterion elif params[i] == "--merge": if param_values[i + 1] == "de": merge_func = d_e elif param_values[i + 1] == "dmax": merge_func = d_max else: merge_func = d_e Cluster.clusters = [] Cluster.squared_criterion_values = [] Cluster.silhouette_criterion_values = [] my_data = np.genfromtxt('./data.csv', delimiter=',', dtype=float) #Make only clusterization params in array data_list = my_data[1:].tolist() maximum = 0 data_list = data_list[:] etalon = data_list[:] for i in range(len(data_list)): data_list[i] = data_list[i][2:] #normalize all lists: data_list = np.array(data_list) #count all distances print "Precounting distances" for i in range(len(data_list)): for j in range(len(data_list)): print ".", Cluster.counted_distances[(tuple(data_list[i]), tuple( data_list[j]))] = hexic_euqlid_distance( data_list[i], data_list[j]) print "Distances Counted" for i in range(len(data_list)): Cluster.etalon_clasters[tuple(data_list[i][2:])] = etalon[i][1] print Cluster.etalon_clasters.values() #Make each element = 1 cluster for x in data_list: Cluster.clusters.append(Cluster(x)) print(len(Cluster.clusters)) K_num = 1 swo(K_num, merge_func, crit_func) Y = Cluster.merge_history[1:] Z = linkage(Y) plt.subplot(121) dendrogram(Z, labels=range(len(data_list))) squared_criterion_values = Cluster.squared_criterion_values[::-1] silhouette_criterion_values = Cluster.silhouette_criterion_values[::-1] plt.subplot(122) if (crit_func == silhouette_criterion): plt.plot(range(len(silhouette_criterion_values)), silhouette_criterion_values) plt.axis([ K_num, 30, min(silhouette_criterion_values), max(silhouette_criterion_values) 
]) else: plt.plot(range(len(squared_criterion_values)), squared_criterion_values) plt.axis([ K_num, 30, min(squared_criterion_values), max(squared_criterion_values) ]) plt.show() for x in Cluster.clusters: x.etalon_to_current_mapping() print x.etalon_map
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
# NOTE(review): this unpickles data fetched over plain HTTP; a tampered
# payload can execute arbitrary code. Only run it against a trusted source.
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: the similarity matrix (1 - dissimilarity).
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# Condense the square matrix, then cluster with single linkage and cut the
# tree at `threshold`.
dissimilarity = hcluster.squareform(dissimilarity)
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: the dendrogram, coloured at the same threshold.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
p['data_label'], 'data.pickle'))) for key, val in data.iteritems(): # for bla in [1]: # key, val = 'eagle', data['eagle'] fig = plt.figure() fig.canvas.mpl_connect('pick_event', onpick) plt.subplot(3, 1, 1) plt.title(key) proj = np.dot(val['U'][:, 0:2].T, val['vecs']) Y = pdist(proj.T) Z = linkage(Y) dendrogram(Z) ax = plt.subplot(3, 1, 2) for i in range(proj.shape[1]): col = (1 - (val['ratings'][i] / 100.0)) * 0.7 pt, = ax.plot(proj[0, i], proj[1, i], '.', color=('%f' % col), picker=3) ax.text(proj[0, i], proj[1, i], i) pt.name = val['keys'][i] plt.subplot(3, 1, 3) plt.plot(val['d']) plt.savefig(path.join(output_dir, key + ".png"))
("http://www.gutenberg.org/cache/epub/22367/pg22367.txt", "Verwand.")] # get it from the interwebs catalogue = [] for url, name in urls: headers = {'User-Agent': 'Mozilla/5.0'} req = urllib2.Request(url, None, headers) catalogue.append(urllib2.urlopen(req).read()) # calc similarity matrix M = all_pairs(catalogue, distance=distances.jaccard, dist_kwargs=dict(mode=1), parallel=True) # plot similarity matrix pylab.figure(1) pylab.title("similarity matrix") pylab.imshow(M, aspect='auto', interpolation="nearest", cmap="Reds") pylab.colorbar() # plot complete linkage pylab.figure(2) pylab.title("complete linkage clustering") hcluster.dendrogram(cluster(M, method='complete'), leaf_label_func=lambda i: urls[i][1]) # finally show pylab.show()