예제 #1
0
파일: dendro.py 프로젝트: jungikim/sbmt
def dendro(X,metric='cosine',combine='average',showdendro=True,leaf_label_func=identity,**kw):
    """Build a linkage for X and optionally draw its dendrogram.

    X is passed to pdist together with `metric`; the result is passed to
    linkage together with `combine`.  When `showdendro` is true, dendrogram()
    is called with `leaf_label_func` and any extra keyword arguments, followed
    by show().  The linkage result is returned in either case.
    """
    links = linkage(pdist(X, metric), combine)
    if not showdendro:
        return links
    dendrogram(links, leaf_label_func=leaf_label_func, **kw)
    show()
    return links
def gethclinks(exparray, method):
    """Return the merged index pairs of a hierarchical clustering.

    `method` is forwarded as the second argument of hcluster.pdist.  Each row
    of the resulting linkage contributes one ``[int, int]`` pair taken from
    its first two columns.
    """
    distances = hcluster.pdist(exparray, method)
    return [[int(row[0]), int(row[1])] for row in hcluster.linkage(distances)]
예제 #3
0
def main():
    """Print a greeting, then draw a dendrogram of a random 10x100 matrix.

    The first five rows are doubled before the distances are computed.
    """
    print("hola")
    samples = rand(10,100)
    samples[0:5,:] *= 2
    dendrogram(linkage(pdist(samples)))
예제 #4
0
def test_cluster_slink(repeat, runs, dist_m):
    """Time single-linkage clustering of `dist_m` over repeat x runs trials.

    Every trial records both the wall-clock time (time.time) and the
    time.clock difference around one hcluster.linkage call.

    Returns the tuple (mean_time, std_time, mean_clock, std_clock).
    """
    np.random.seed(int(time.time()))

    clocks = np.empty((repeat, runs))
    times = np.empty((repeat, runs))

    for i in xrange(repeat):
        for j in xrange(runs):
            print('a')
            t1 = time.time()
            c1 = time.clock()
            # Only the elapsed time matters; the linkage itself is discarded.
            hcluster.linkage(dist_m, 'single')
            c2 = time.clock()
            t2 = time.time()
            print('b')
            clocks[i, j] = c2 - c1
            times[i, j] = t2 - t1

    mean_clock = np.mean(clocks)
    std_clock = np.std(clocks)
    mean_time = np.mean(times)
    std_time = np.std(times)

    # NOTE(review): the object/feature counts in this message are hard-coded
    # and are not derived from dist_m.
    print('5000 objects, 20 features: clocks=%f +- %f, times=%f +- %f' % (mean_clock, std_clock, mean_time, std_time))

    return mean_time, std_time, mean_clock, std_clock
예제 #5
0
파일: dendro.py 프로젝트: isi-nlp/sbmt
def dendro(X, metric="cosine", combine="average", showdendro=True, leaf_label_func=identity, **kw):
    """Compute a linkage for X; plot its dendrogram unless told otherwise.

    pdist(X, metric) feeds linkage(..., combine).  With `showdendro` true the
    dendrogram is drawn (extra keyword arguments go to dendrogram) and show()
    is called.  Returns the linkage result.
    """
    tree = linkage(pdist(X, metric), combine)
    if not showdendro:
        return tree
    dendrogram(tree, leaf_label_func=leaf_label_func, **kw)
    show()
    return tree
예제 #6
0
def plotSampleDistanceDendrogram(ds):
    """Plot a sample distance cluster dendrogram using all samples and features
    of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset.
    """
    # invert labels_map so the numeric labels can be turned back into the
    # literal labels shown on the dendrogram leaves
    literal_of = dict((num, name) for name, num in ds.labels_map.iteritems())
    leaf_names = [literal_of[l] for l in ds.labels]

    # pairwise sample distances (pdist's default metric), then complete
    # linkage on top of them
    link = clust.linkage(clust.pdist(ds.samples), 'complete')

    # plot dendrogram with literal labels on leaves
    # this does not work with etch's version of matplotlib (verified for
    # matplotlib 0.98)
    clust.dendrogram(
        link,
        colorthreshold=0,
        labels=leaf_names,
        # every link is drawn in black
        link_color_func=lambda x: 'black',
        distance_sort=False)

    # rotate the tick labels so long names stay readable
    tick_labels = P.gca().get_xticklabels()
    P.setp(tick_labels, rotation=90, fontsize=9)
예제 #7
0
def test_cluster_ward(repeat, runs, data):
    """Time Ward-linkage clustering of `data` over repeat x runs trials.

    Every trial records both the wall-clock time (time.time) and the
    time.clock difference around one hcluster.linkage call.  The summary line
    reports data.shape[0] objects and data.shape[1] features.

    Returns the tuple (mean_time, std_time, mean_clock, std_clock).
    """
    np.random.seed(int(time.time()))

    clocks = np.empty((repeat, runs))
    times = np.empty((repeat, runs))

    for i in xrange(repeat):
        for j in xrange(runs):
            print('a')
            t1 = time.time()
            c1 = time.clock()
            # Only the elapsed time matters; the linkage itself is discarded.
            hcluster.linkage(data, 'ward')
            c2 = time.clock()
            t2 = time.time()
            print('b')
            clocks[i, j] = c2 - c1
            times[i, j] = t2 - t1

    mean_clock = np.mean(clocks)
    std_clock = np.std(clocks)
    mean_time = np.mean(times)
    std_time = np.std(times)

    print('%d objects, %d features: clocks=%f +- %f, times=%f +- %f' % (data.shape[0], data.shape[1], mean_clock, std_clock, mean_time, std_time))

    return mean_time, std_time, mean_clock, std_clock
예제 #8
0
def hierarchicalClustering(p_dist, word_list, cons_words):
  """Split the cluster in which two given words first meet.

  A linkage is built from `p_dist`.  For the first two entries of
  `cons_words`, findPath is called with the word's position in `word_list`;
  the smallest node id shared by both paths selects a linkage row, and
  findCluster is called for each of that row's two children.

  Returns ((cons_words[0], cons_words[1]), merge1, merge2).
  """
  Z = linkage(p_dist)

  index1 = word_list.index(cons_words[0])
  assert index1 >= 0
  path1 = findPath(Z, index1, len(word_list))
  index2 = word_list.index(cons_words[1])
  assert index2 >= 0
  path2 = findPath(Z, index2, len(word_list))

  print(Z)
  print(path1)
  print(path2)

  # the two paths share at least the root
  shared = set(path1) & set(path2)
  meeting_node = min(shared)
  assert(meeting_node >= len(word_list))
  # linkage rows are indexed from zero, node ids start after the leaves
  row = meeting_node - len(word_list)
  merge1 = findCluster(Z, Z[row][0], word_list)
  merge2 = findCluster(Z, Z[row][1], word_list)

  print(word_list)
  print(merge1)
  print(merge2)

  return (cons_words[0], cons_words[1]), merge1, merge2
예제 #9
0
 def _train(self, trainset):
     """Train a tree classifier whose tree comes from classifier confusions.

     The wrapped classifier is cross-validated on `trainset`; its confusion
     matrix is turned into a distance matrix, linked with hcluster, converted
     to a tree, and the tree classifier built from it is trained on
     `trainset`.
     """
     self._dataset = trainset
     self.ulabels = trainset.uniquelabels
     # Do cross-validation for normal classifier
     self.cvterr = CrossValidatedTransferError(TransferError(self._clf),
                                               self._splitter,
                                               enable_states=["confusion"])
     self.cvterr(self._dataset)
     # From the confusion matrix, calculate linkage and tree-structure
     # First prepare distance matrix from confusion matrix
     dist = self.cvterr.confusion.matrix
     # Kind of inversion. High values in confusion -> similar -> small distance
     dist = dist.max() - dist
     # Distance must be symmetric.  Divide by 2.0 so that an integer matrix
     # is not floor-divided under Python 2 classic division.
     dist = (dist + dist.T) / 2.0
     # Distance to self must be zero -> make diagonal elements zero
     dist -= np.diag(np.diag(dist))
     # Calculate linkage matrix
     self.linkage = hcluster.linkage(hcluster.squareform(dist))
     # Build tree and according TreeClassifier
     self.tree = hcluster.to_tree(self.linkage)
     self._tree_clf = self.build_tree_classifier_from_linkage_tree(
         self.tree)[0]
     self._tree_clf.train(trainset)
def gethclinks(exparray, method):
	"""Return the merged index pairs of a hierarchical clustering.

	`method` is forwarded as the second argument of hcluster.pdist.  Each row
	of the resulting linkage contributes one ``[int, int]`` pair taken from
	its first two columns.
	"""
	distances = hcluster.pdist(exparray, method)
	return [[int(row[0]), int(row[1])] for row in hcluster.linkage(distances)]
예제 #11
0
def test():
  """Exercise findPath/findCluster on a random 15-point, 2-D data set.

  The points are clustered with pdist + linkage, a dendrogram is drawn, and
  the two sub-clusters below the node where 'C' and 'B' first meet are
  printed.
  """
  word_list = list('ABCDEFGHIJKLMNO')
  cons_words = ['C', 'B']
  X = rand(15, 2)
  print(X)
  Y = pdist(X)
  print(Y)
  Z = linkage(Y)
  dendrogram(Z)

  index1 = word_list.index(cons_words[0])
  assert index1 >= 0
  path1 = findPath(Z, index1, len(word_list))
  index2 = word_list.index(cons_words[1])
  assert index2 >= 0
  path2 = findPath(Z, index2, len(word_list))

  print(Z)
  print(path1)
  print(path2)

  shared = set(path1) & set(path2)
  meeting_node = min(shared)
  assert(meeting_node >= len(word_list))
  # linkage rows are indexed from zero, node ids start after the leaves
  row = meeting_node - len(word_list)
  merge1 = findCluster(Z, Z[row][0], word_list)
  merge2 = findCluster(Z, Z[row][1], word_list)

  print(merge1)
  print(merge2)
예제 #12
0
파일: run2.py 프로젝트: gitzain/project-x
	def do_it(self, sources):

		for source in sources:
			words = nltk.wordpunct_tokenize(source.headline)
			words.extend(nltk.wordpunct_tokenize(source.summary))
			lowerwords=[x.lower() for x in words if len(x) > 1]
			self.ct += 1
			print self.ct, "TITLE",source.headline
			self.corpus.append(lowerwords)
			self.titles.append(source.headline)
			self.links.append(source.url)



		[[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

		self.ct=-1
		for doc in self.corpus:
		   self.ct+=1
		   print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



		for document in self.corpus:
			vec=[]
			[vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
			self.feature_vectors.append(vec)



		self.n=len(self.corpus)

		mat = numpy.empty((self.n, self.n))
		for i in xrange(0,self.n):
		  for j in xrange(0,self.n):
			mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


		Z = linkage(mat, 'single')

		dendrogram(Z, color_threshold=self.t)





		clusters = self.extract_clusters(Z,self.t,self.n)
		
		stories = []

		for key in clusters:
			print "============================================="
			story = Story()  
			for id in clusters[key]:
				story.add_source(sources[id])
				print id,self.titles[id],sources[id].url
			stories.append(story)


		return stories
예제 #13
0
    def time_subcluster(self, locs):
        """Return pairwise haversine distances of `locs`, scaled per cluster pair.

        `locs` is Ward-clustered into at most 50 groups.  self.mapzen_matrix
        is queried for the cluster means, and its result is divided by the
        haversine distance between those means to obtain one ratio per
        cluster pair (normalised by the mean ratio).  Every point-to-point
        haversine distance is then multiplied by the ratio of the two
        clusters the points belong to.

        Indexing in the body implies `locs` is a NumPy array whose rows are
        (lon, lat) pairs.
        """
        # Getting subclusters at Mapzen's limit
        max_subclusters = 50
        cluster_linkage = linkage(locs, method='ward')
        clusters = fcluster(cluster_linkage, max_subclusters, criterion='maxclust')

        # fcluster labels start at 1
        cluster_means = np.array([
            np.mean(locs[np.where(clusters == label)], axis=0)
            for label in range(1, max_subclusters + 1)
        ])

        mapzen_locs = [{'lat': p[1], 'lon': p[0]} for p in cluster_means]
        mapzen_matrix = self.mapzen_matrix(mapzen_locs)

        # Cluster labels used for mapping back together
        # Subtracting one to use 0 index
        cl = clusters - 1

        # Get a matching distance matrix of lat/lon distance, get ratios
        cluster_km_dist = squareform(pdist(cluster_means,
                                           (lambda u,v: haversine(u,v))))

        dist_ratio_matrix = np.nan_to_num(np.divide(mapzen_matrix,
                                                    cluster_km_dist))
        # Divide items by mean to normalize a bit
        dist_ratio_matrix = np.nan_to_num(np.divide(dist_ratio_matrix,
                                                    dist_ratio_matrix.mean()))

        locs_km_dist = squareform(pdist(locs, (lambda u,v: haversine(u,v))))

        # Scale entry (i, j) by the ratio of the clusters of points i and j;
        # np.ix_ expands the per-point labels to the full pair grid in one go.
        locs_km_dist *= dist_ratio_matrix[np.ix_(cl, cl)]

        return locs_km_dist
예제 #14
0
def time_series_clusters(Y, ct=0.5, return_clusters=False):
    """Cluster or plot the columns of Y by absolute correlation distance.

    With `return_clusters` true, the distances are single-linked, cut at
    distance `ct`, and the result of cluster_sets() is returned.  Otherwise
    plot_clusters() is called with the distances and `ct`, and None is
    returned.
    """
    D = abs(pdist(transpose(Y), 'correlation'))
    if not return_clusters:
        plot_clusters(D, ct)
        return None
    tree = linkage(D, method='single', metric='cosine')
    return cluster_sets(fcluster(tree, ct, criterion='distance'))
예제 #15
0
def cluster_elut(mat):
    """Show the rows of `mat` reordered by their dendrogram leaf order.

    A dendrogram is drawn on a fresh figure only to obtain the leaf order;
    the figure is then cleared and the reordered matrix is displayed.
    """
    import hcluster
    tree = hcluster.linkage(hcluster.pdist(mat))
    figure()
    leaf_order = hcluster.dendrogram(tree)['leaves']
    clf()
    imshow(mat[leaf_order,:])
예제 #16
0
def time_series_clusters(Y,ct=0.5,return_clusters=False):
	"""Cluster or plot the columns of Y by absolute correlation distance.

	With `return_clusters` true, the distances are single-linked, cut at
	distance `ct`, and the result of cluster_sets() is returned.  Otherwise
	plot_clusters() is called with the distances and `ct`, and None is
	returned.
	"""
	D = abs(pdist(transpose(Y),'correlation'))
	if not return_clusters:
		plot_clusters(D,ct)
		return None
	tree = linkage(D,method='single',metric='cosine')
	return cluster_sets(fcluster(tree,ct,criterion='distance'))
예제 #17
0
    def __call__(self):
        """Assign a Ward-cluster 'group' to every location and return them.

        self.point_arr is linked with Ward's method and cut into at most
        self.num_clusters clusters; each dict in self.locations is updated in
        place with the cluster id at the same position.
        """
        # Can continue to play around with these
        self.cluster_linkage = linkage(self.point_arr, method='ward')
        self.clusters = fcluster(self.cluster_linkage,
                                 self.num_clusters,
                                 criterion='maxclust')

        for location, group in zip(self.locations, self.clusters.tolist()):
            location.update({'group': group})

        return self.locations
def performHierarchicalClusterin(matrix, titlesCat):
    """Cluster `matrix` with average linkage and save the dendrogram image.

    The dendrogram is labelled with `titlesCat` and written to
    images/clusteringImage.png.
    """
    # pairwise cosine distances between the rows of the matrix
    distanceMatrix = pairwise_distances(matrix, metric='cosine')
    # average linkage: the similarity to a cluster is the average of the
    # similarities to its elements
    # NOTE(review): distanceMatrix is a square 2-D array; linkage() documents
    # a 2-D input as observation vectors -- confirm this is intended.
    Z = linkage(distanceMatrix, method='average')
    # draw the dendrogram
    dendrogram(Z, labels=titlesCat, distance_sort='descendent',
               leaf_font_size=2, orientation='left', show_contracted=False)
    # save the generated dendrogram image
    pylab.savefig("images/clusteringImage.png", dpi=300, bbox_inches='tight')
예제 #19
0
def t_dendrogram(X, nclusters):
    """Draw and show the dendrogram of X using hcluster's defaults.

    `nclusters` is accepted for interface compatibility but is not used.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram

    dendrogram(linkage(pdist(X)))
    show()
예제 #20
0
  def do_it(self):

    for feed in self.feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
           words = nltk.wordpunct_tokenize(self.clean_html(e['description']))
           words.extend(nltk.wordpunct_tokenize(e['title']))
           lowerwords=[x.lower() for x in words if len(x) > 1]
           self.ct += 1
           print self.ct, "TITLE",e['title']
           self.corpus.append(lowerwords)
           self.titles.append(e['title'])
           self.links.append(e['link'])



    [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

    self.ct=-1
    for doc in self.corpus:
       self.ct+=1
       print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



    for document in self.corpus:
        vec=[]
        [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
        self.feature_vectors.append(vec)



    self.n=len(self.corpus)

    mat = numpy.empty((self.n, self.n))
    for i in xrange(0,self.n):
      for j in xrange(0,self.n):
        mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


    Z = linkage(mat, 'single')

    dendrogram(Z, color_threshold=self.t)





    clusters = self.extract_clusters(Z,self.t,self.n)
     
    for key in clusters:
       print "============================================="  
       for id in clusters[key]:
           print id,self.titles[id]
예제 #21
0
def t_dendrogram(X, nclusters):
    """Draw and show the dendrogram of X using hcluster's defaults.

    `nclusters` is accepted for interface compatibility but is not used.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram

    dendrogram(linkage(pdist(X)))
    show()
def printMostSimilarCluster(matrix, titlesCat):
    """Print the leaf-to-leaf merges among the first (up to 10) linkage steps.

    `matrix` is turned into pairwise cosine distances and clustered with
    average linkage.  A linkage row is reported only when both merged ids
    are original items, i.e. smaller than len(titlesCat).
    """
    # pairwise cosine distances between the rows of the matrix
    distanceMatrix = pairwise_distances(matrix, metric='cosine')
    # NOTE(review): distanceMatrix is a square 2-D array; linkage() documents
    # a 2-D input as observation vectors -- confirm this is intended.
    Z = linkage(distanceMatrix, method='average')
    print("first closest cluster\n")
    lenTitle = len(titlesCat)
    # The linkage may have fewer than 10 rows; never index past its end.
    for idx in range(min(10, len(Z))):
        left, right = int(Z[idx, 0]), int(Z[idx, 1])
        if left < lenTitle and right < lenTitle:
            print("itr " + str(idx) + ":\n" + titlesCat[left] + " " + titlesCat[right])
예제 #23
0
파일: clustering.py 프로젝트: paczian/anvio
def get_clustering_as_tree(vectors, ward = True, clustering_distance='euclidean', clustering_method = 'complete', progress = progress):
    """Cluster `vectors` and return the result converted by hcluster.to_tree.

    With `ward` true, hcluster.ward is applied to the vectors directly.
    Otherwise a distance matrix is computed with `clustering_distance` and
    linked with `clustering_method`.  Each step is announced through
    progress.update().
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance' % clustering_distance)
        distance_matrix = hcluster.pdist(vectors, clustering_distance)
        progress.update('Clustering data with "%s" linkage' % clustering_method)
        linked = hcluster.linkage(distance_matrix, method = clustering_method)
    else:
        progress.update('Clustering data with Ward linkage and euclidean distances')
        linked = hcluster.ward(vectors)

    progress.update('Returning results')
    return hcluster.to_tree(linked)
예제 #24
0
def do_clusters(cluster_coords,Labels=None,link_method='single',d=0.2):
	"""Group the rows of `cluster_coords` into flat clusters by cosine distance.

	The pairwise cosine distances are linked with `link_method` and cut at
	distance `d`.

	Returns a defaultdict mapping each flat-cluster id to the list of its
	members: the matching entries of `Labels` when a non-empty `Labels` is
	given, otherwise the row indices.
	"""
	# Cosine distances can come out as tiny negatives (around -2e-16), so
	# take the absolute value before linking.
	D = abs(pdist(cluster_coords,'cosine'))
	L = linkage(D,method=link_method,metric='cosine')
	# The cut uses the 'distance' criterion only; the stray 'cosine' that used
	# to be passed positionally landed in fcluster's `depth` parameter.
	F = fcluster(L,d,criterion='distance')
	C = defaultdict(list)
	for i, cluster_id in enumerate(F):
		C[cluster_id].append(Labels[i] if Labels else i)
	return C
예제 #25
0
def generate_dendrogram(root):
    """Print and plot the clustering of a random 10x100 matrix.

    The first five rows are doubled before the distances are computed.
    `root` is accepted for interface compatibility but is not used.
    """
    from hcluster import pdist, linkage, dendrogram
    from numpy.random import rand

    X = rand(10,100)
    X[0:5,:] *= 2
    Y = pdist(X)
    Z = linkage(Y)
    print(Y)
    print(Z)
    dendrogram(Z)
예제 #26
0
    def fetch_clusters(self, mat, n):
        """
        Link `mat`, save the dendrogram image and extract the clusters.

        :param mat: The similarity matrix
        :param n: The length of the corpus
        :return: The clusters
        """
        # NOTE(review): if `mat` is a square 2-D array, linkage() documents it
        # as observation vectors, not as a distance matrix -- confirm.
        tree = linkage(mat, 'single')
        dendrogram(tree, color_threshold=self.t)

        # The dendrogram just drawn is what gets written to disk.
        pylab.savefig(self.cluster_image, dpi=self.dpi)
        return self.__extract_clusters(tree, self.t, n)
예제 #27
0
def wavelet_clusters(Y,ct=0.5,weights=False,return_clusters=False,swt=False):
	"""Cluster or plot Y using the distances produced by c_dists.

	With `weights` true, c_dists is called with level_weights=True and
	use_swt=False, and the strict upper triangle of its absolute value is
	flattened row by row.  Otherwise c_dists(Y, use_swt=swt) is used as is.

	With `return_clusters` true, the distances are single-linked, cut at
	distance `ct`, and the result of cluster_sets() is returned.  Otherwise
	plot_clusters() is called and None is returned.
	"""
	if weights:
		D = abs(c_dists(Y,level_weights=True,use_swt=False))
		# entries above the diagonal, one row after the other
		Dr = [value for i in range(D.shape[0]-1) for value in D[i,i+1:]]
	else:
		Dr = c_dists(Y,use_swt=swt)
	if not return_clusters:
		plot_clusters(Dr,ct)
		return None
	tree = linkage(Dr,method='single',metric='cosine')
	return cluster_sets(fcluster(tree,ct,criterion='distance'))
예제 #28
0
def plot_cluster_tree(cluster_coords,Labels=None,link_method='single',color_thresh=.25,fontsize=8):
	"""Plot a left-oriented dendrogram of `cluster_coords` by cosine distance.

	The pairwise cosine distances are linked with `link_method`; the leaves
	are labelled with `Labels` when a non-empty `Labels` is given.
	`color_thresh` is passed to dendrogram as color_threshold and `fontsize`
	is written to pylab.rcParams['font.size'].
	"""
	# Set the font size before any text is created: rcParams defaults are
	# picked up when text artists are built, so updating them after plotting
	# (as before) would presumably only affect later figures.
	pylab.rcParams.update({'font.size': fontsize})
	# Cosine distances can come out as tiny negatives (around -2e-16), so
	# take the absolute value before linking.
	D = abs(pdist(cluster_coords,'cosine'))
	L = linkage(D,method=link_method,metric='cosine')
	if Labels:
		dendrogram(L,labels=Labels,orientation='left',color_threshold=color_thresh)
	else:
		dendrogram(L,orientation='left',color_threshold=color_thresh)
	pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
	pylab.xlabel('Cosine Distance')
	pylab.ylabel('Strain with the Most Alignments to Each Cluster')
	pylab.show()
예제 #29
0
def printSummary(updatedtfidfMatrix, queriedSentences):
    """Write a summary ordered by the clustering to foo.txt and return it.

    The rows of `updatedtfidfMatrix` are clustered on cosine distance and the
    dendrogram is shown.  The linkage rows are walked in order to build a
    sequence of ids: ids within the range of `queriedSentences` are written
    as the capitalised sentence followed by '. ', any other id is written as
    '<br></br>'.  The file is then read back, printed and returned.
    """
    print("\n")
    a = pdist(updatedtfidfMatrix,'cosine')
    print(a)
    b = linkage(a)
    dendrogram(b)
    show()
    print(b)

    # ids up to this value refer to original sentences; larger ids refer to
    # clusters created during linkage
    last_sentence = len(queriedSentences) - 1

    sumOrder = []
    for row in b:
        x = int(row[0])
        y = int(row[1])

        if x <= last_sentence:
            sumOrder.append(x)
        if y <= last_sentence:
            sumOrder.append(y)
        if x <= last_sentence and y > last_sentence:
            sumOrder.append(y)
        if x > last_sentence and y > last_sentence:
            sumOrder.append(x)

    queriedSentences = [sentence.capitalize() for sentence in queriedSentences]

    # `with` closes the file even if a write fails.
    with open("foo.txt", "w") as f:
        for num in sumOrder:
            if num > last_sentence:
                f.write('<br></br>')
            else:
                f.write(queriedSentences[num])
                f.write('.')
                f.write(' ')

    with open("foo.txt", "r") as myfile:
        data = myfile.read()

    print(data)
    return data
예제 #30
0
  def cluster_analysis_hcluster(self, vectors):
    """Return flat cluster ids for `vectors` as a flex.int array.

    The linkage method and metric, the cutoff and the cutoff criterion are
    all read from self.params.multiple_lattice_search.cluster_analysis.hcluster.
    """
    from hcluster import linkage, fcluster
    import numpy

    params = self.params.multiple_lattice_search.cluster_analysis.hcluster
    tree = linkage(numpy.array(vectors),
                   method=params.linkage.method,
                   metric=params.linkage.metric)
    cluster_ids = fcluster(tree, params.cutoff, criterion=params.cutoff_criterion)
    return flex.int(cluster_ids.astype(numpy.int32))
예제 #31
0
def output_dendrogram(imgs, kernel, method="complete", dend_fn="_dendrogram.png"):
    """Render a dendrogram of `kernel` and paste image thumbnails beside it.

    The dendrogram is written to '<method>_<dend_fn>', reloaded through
    libpil, decorated with a 30x30 thumbnail of the matching entry of `imgs`
    for every y tick label (taken in reverse order), and saved again with a
    'fig_' prefix.  The entries of `imgs` are shrunk in place by thumbnail().
    """
    tmp_dend_fn = method + "_" + dend_fn
    links = linkage(pdist(kernel), method=method)
    axis = dendrogram(links, orientation="left", figsize=(7, 12), outfilename=tmp_dend_fn)[1]
    figimg = libpil.loadImage(tmp_dend_fn)
    # tick label texts are the leaf indices
    leaf_ids = [int(label._text) for label in axis.get_yticklabels()]
    for i, ind in enumerate(reversed(leaf_ids)):
        thumb = imgs[ind]
        thumb.thumbnail((30, 30))
        offset = i * (thumb.size[1] + 4) + 120
        figimg.paste(thumb, (52, offset))
    figimg.save("fig_" + tmp_dend_fn)
예제 #32
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often column pairs share a partition.

    For every latent state in `X_L_list`, each pair of columns with the same
    'column_partition' assignment is counted; the counts are divided by the
    number of latent states.  Rows and columns are reordered by the leaf
    order of a dendrogram of that matrix and the figure is saved to
    `filename`.  `X_D_list` is not used.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])
    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
        assignments = X_L['column_partition']['assignments']
        for i in range(num_cols):
            for j in range(num_cols):
                if assignments[i] == assignments[j]:
                    z_matrix[i, j] += 1
    z_matrix /= float(num_latent_states)
    # hierarchically cluster z_matrix; the throwaway figure only serves to
    # read the leaf order back from the tick labels
    Z = hcluster.linkage(hcluster.pdist(z_matrix))
    pylab.figure()
    hcluster.dendrogram(Z)
    reorder_indices = [int(tick.get_text())
                       for tick in pylab.gca().get_xticklabels()]
    pylab.close()
    # REORDER!
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]
    # actually create figure
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered,
                 interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    ticks = range(num_cols)
    if num_cols < 14:
        pylab.gca().set_yticks(ticks)
        pylab.gca().set_yticklabels(column_names_reordered, size='x-small')
        pylab.gca().set_xticks(ticks)
        pylab.gca().set_xticklabels(column_names_reordered,
                                    rotation=90,
                                    size='x-small')
    else:
        # too many columns: label every other one, alternating between axes
        pylab.gca().set_yticks(ticks[::2])
        pylab.gca().set_yticklabels(column_names_reordered[::2],
                                    size='x-small')
        pylab.gca().set_xticks(ticks[1::2])
        pylab.gca().set_xticklabels(column_names_reordered[1::2],
                                    rotation=90,
                                    size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
예제 #33
0
def OnLeftDClick(self, event):
	""" Left double click has been invoked.
		Calls pdist and linkage on the vectors of the current DEVS model and
		shows the dendrogram; when no DEVS model is available, shows an
		information message box instead.
	"""
	devs = self.getDEVSModel()
	if not devs:
		wx.MessageBox(_("No DEVS model is instanciated.\nGo back to the simulation!"), _("Info"), wx.OK|wx.ICON_INFORMATION)
		return
	dendrogram(linkage(pdist(devs.vectors)))
	show()
    def cluster_path_times(self, path_times,display):
        """Cluster recordings by duration and time of day, printing each step.

        Every recording becomes a [seconds, minute-of-day] pair built from
        recording.time (seconds plus microseconds) and recording.date (hour
        and minute).  The pairs, the linkage and the result of
        self.calculate_variances() are printed; the dendrogram is shown only
        when `display` is true.
        """
        X = [
            [recording.time.seconds+recording.time.microseconds/10**6.,
             recording.date.hour*60+recording.date.minute]
            for recording in path_times.recordings
        ]
        print(X)
        Z = linkage(pdist(X))
        dendrogram(Z)
        for i, point in enumerate(X):
            print('{0}, {1}'.format(i,point))
        print(Z)
        print(self.calculate_variances(X,Z))
        if display:
            show()
예제 #35
0
def cluster(items, cache_clustering_file = None, dist_fn = euc_dist, \
    prefix_output = None):
    """Single-link `items` and return the result as a tree of Tree nodes.

    Without `cache_clustering_file`, the distance matrix is built with
    dist_matrix(items, dist_fn), linked, and both are pickled to
    '<prefix_output>clustering_dump.pkl' (a missing prefix is treated as an
    empty string).  With a cache file, the pair is unpickled from it instead.

    Returns the root Tree, named "root".  Every other node is named after
    the id of its cluster node and its `dist` is half the parent cluster
    node's `dist`.
    """
    if not cache_clustering_file:
        print("Generating distance matrix...")
        sys.stdout.flush()
        Y = dist_matrix(items, dist_fn)

        print("Linkage clustering...")
        sys.stdout.flush()
        Z = linkage(Y, "single")  # average, complete = max, single = min ?

        print("Dumping clustering information into cache file")
        sys.stdout.flush()
        # `with` closes the dump file; a None prefix no longer raises TypeError.
        dump_path = (prefix_output or "") + "clustering_dump.pkl"
        with open(dump_path, "w") as dump_file:
            cPickle.dump([Y, Z], dump_file)

    else:
        print("Loading clustering cache from '%s'" % cache_clustering_file.name)
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that come from a trusted source.
        Y, Z = cPickle.load(cache_clustering_file)

    print("Converting into ETE tree...")
    sys.stdout.flush()
    T = to_tree(Z)

    root = Tree()
    root.dist = 0
    root.name = "root"
    # maps each cluster node to the Tree node created for it
    item2node = {T: root}

    to_visit = [T]
    while to_visit:
        node = to_visit.pop()
        cl_dist = node.dist / 2.0
        for ch_node in (node.left, node.right):
            if not ch_node:
                continue
            ch = Tree()
            ch.dist = cl_dist
            ch.name = str(ch_node.id)
            item2node[node].add_child(ch)
            item2node[ch_node] = ch
            to_visit.append(ch_node)

    return root
    def hierarchical(self,lst,fulldataset):
        """Average-linkage clustering of the dataset with a coloured dendrogram.

        Saves the dendrogram to 'Average Hierarchical Clustering.pdf', then
        stores a 0/1 label array in self.labels and a silhouette score in
        self.score.
        """
        # Samples are coloured according to their sample type: labels whose
        # first six characters are 'cancer' get 'r', 'normal' gets 'b'; any
        # other label gets no colour entry.
        label_color={}
        for i in self.numbering(self.classLabel(lst)):
            r=('r')
            b=('b')
            if i[0:6]=='cancer':
                label_color[i]=r
            elif i[0:6]=='normal' :
                label_color[i]=b
            else:
                continue
        # transpose the dataset before computing pairwise distances
        tg=zip(*fulldataset)
        Y = pdist(tg)
        # average linkage is applied
        Z = linkage(Y,method='average')
        sch.set_link_color_palette(['black'])
        a=sch.dendrogram(Z,leaf_font_size=6,labels=self.newlist)


        # the dendrogram is plotted; recolour each tick label by sample type
        ax = plt.gca()
        xlbls = ax.get_xmajorticklabels()

        for lbl in xlbls:
            # raises KeyError for a tick text that has no colour entry
            lbl.set_color(label_color[lbl.get_text()])
        plt.title("Average Hierarchical Clustering Algorithm")
        plt.savefig('Average Hierarchical Clustering.pdf',dpi=500)
        #plt.show()
        plt.close()

        self.labels=array([])
        c=array([1])
        n=array([0])

        # Silhouette test: samples are converted into 1 ('cancer') or 0
        # (anything else) for validation
        for i in self.classLabel(lst):
            if i=='cancer':
                self.labels=np.concatenate([self.labels,c])
            else:
                self.labels=np.concatenate([self.labels,n])

        # NOTE(review): the second argument of np.delete is an index, but the
        # value of the last label (0.0 or 1.0) is passed here -- presumably
        # meant to drop one element so the labels match Z's row count; confirm.
        self.labels=np.delete(self.labels,self.labels[-1])
        # NOTE(review): the score is computed on the linkage matrix Z, not on
        # the samples themselves -- confirm that this is intended.
        self.score=metrics.silhouette_score(Z, self.labels, metric='euclidean')
예제 #37
0
파일: galaxies.py 프로젝트: qbilius/autoart
 def hcluster(self, stim):
     """Cluster `stim` hierarchically and plot the dendrogram and clusters.

     The strict upper triangle of stim.group is linked with the 'single'
     method; the dendrogram is coloured at the height of the second-to-last
     merge.  Four clusters are requested from self.get_clusters() and the
     rows of stim.indices belonging to each are plotted in its own colour.
     """
     import hcluster
     iu = np.triu_indices(len(stim.group), 1)
     Z = hcluster.linkage(stim.group[iu], 'single', 'ward')
     # (a leftover pdb.set_trace() breakpoint used to halt execution here)
     thres = Z[-2, 2]
     hcluster.dendrogram(Z, color_threshold=thres)
     plt.show()
     clusters = self.get_clusters(Z, n_clusters=4)#thres=thres)
     colors = self.get_colors(len(clusters))
     for cluster, color in zip(clusters, colors):
         sel = stim.indices[np.array(cluster)]
         plt.plot(sel[:,1], sel[:,0],'o',  color=color, )
     plt.show()
예제 #38
0
def wavelet_clusters(Y,
                     ct=0.5,
                     weights=False,
                     return_clusters=False,
                     swt=False):
    """Cluster or plot Y using the distances produced by c_dists.

    With `weights` true, c_dists is called with level_weights=True and
    use_swt=False, and the strict upper triangle of its absolute value is
    flattened row by row.  Otherwise c_dists(Y, use_swt=swt) is used as is.

    With `return_clusters` true, the distances are single-linked, cut at
    distance `ct`, and the result of cluster_sets() is returned.  Otherwise
    plot_clusters() is called and None is returned.
    """
    if weights:
        D = abs(c_dists(Y, level_weights=True, use_swt=False))
        # entries above the diagonal, one row after the other
        Dr = [value for i in range(D.shape[0] - 1) for value in D[i, i + 1:]]
    else:
        Dr = c_dists(Y, use_swt=swt)
    if not return_clusters:
        plot_clusters(Dr, ct)
        return None
    tree = linkage(Dr, method='single', metric='cosine')
    return cluster_sets(fcluster(tree, ct, criterion='distance'))
예제 #39
0
파일: NDIM.py 프로젝트: askerry/FGE_MISC
def hierarchicalcluster(datamatrix, dimlabels, similarity='euclidean', colorthresh='default'):
    '''Plot a dendrogram of datamatrix and return (clustering, dend).

    clustering is the linkage result (per the original note: an (items-1) x 4
    array holding the two merged cluster indices, their distance and the
    number of original observations in the new cluster).  dend is the
    dictionary returned by hcluster.dendrogram.
    See http://hcluster.damianeads.com/cluster.html

    colorthresh is a fraction of the largest merge distance; 'default'
    stands for 0.7.  Warnings raised while clustering and plotting are
    suppressed.'''
    import hcluster

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clustering = hcluster.linkage(datamatrix, metric=similarity)
        # The threshold is passed to dendrogram as color_threshold, which
        # controls where the link colouring changes.
        if colorthresh == 'default':
            color_threshold = 0.7 * max(clustering[:, 2])
        else:
            color_threshold = colorthresh * max(clustering[:, 2])
        plt.figure()
        dend = hcluster.dendrogram(clustering, labels=dimlabels, leaf_rotation=90, color_threshold=color_threshold)
        plt.tight_layout()
    return clustering, dend
예제 #40
0
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True,
        norm_rows=True, bigarr=None, **kwargs):
    """Return `gids` reordered by the leaf order of their clustering.

    The data array is `bigarr` when given, otherwise the result of
    single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows).  Its rows are
    clustered on the `dist` metric; the linkage values are clipped to
    [0, 10**8] before the dendrogram is computed.  With `do_plot` true the
    dendrogram is drawn with names looked up through gt.id2name and the
    reordered array is shown on a second figure.
    """
    import plotting as pl
    import hcluster
    if bigarr is not None:
        arr = bigarr
    else:
        arr = single_array(gids, unnorm_eluts, sp, norm_rows=norm_rows)
    zmat = hcluster.linkage(hcluster.pdist(arr, metric=dist))
    zmat = np.clip(zmat, 0, 10**8)
    if do_plot:
        pl.figure()
    dend = hcluster.dendrogram(zmat, no_plot=bool(1-do_plot), **kwargs)
    order = dend['leaves']
    if do_plot:
        ax = pl.gca()
        ax.axes.set_xticklabels([gt.id2name[gids[ind]] for ind in order])
        pl.figure()
        pl.imshow(arr[order,:])
    return list(np.array(list(gids))[order])
예제 #41
0
 def _train(self, dataset):
     """Train a tree classifier whose tree comes from classifier confusions.

     The wrapped classifier is cross-validated on `dataset`; its confusion
     matrix is turned into a distance matrix, linked with hcluster, converted
     to a tree, and the tree classifier built from it is trained on
     `dataset`.
     """
     self._dataset = dataset
     self.ulabels=self._dataset.uniquelabels
     # Do cross-validation for normal classifier
     self.cvterr = CrossValidatedTransferError(TransferError(self._clf),self._splitter,enable_states=["confusion"])
     self.cvterr(self._dataset)
     # From the confusion matrix, calculate linkage and tree-structure
     # First prepare distance matrix from confusion matrix
     dist = self.cvterr.confusion.matrix
     # Distance must be symmetric.  Divide by 2.0 so that an integer matrix
     # is not floor-divided under Python 2 classic division.
     dist = (dist+dist.T)/2.0
     dist = dist.max()-dist # Kind of inversion. High values in confusion -> similar -> small distance
     dist -= np.diag(np.diag(dist)) # Distance to self must be zero -> make diagonal elements zero
     # Calculate linkage matrix
     self.linkage = hcluster.linkage(hcluster.squareform(dist))
     # Build tree and according TreeClassifier
     self.tree = hcluster.to_tree(self.linkage)
     self._tree_clf = self.build_tree_classifier_from_linkage_tree(self.tree)[0]
     self._tree_clf.train(self._dataset)
예제 #42
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often column pairs share a partition.

    For every latent state in `X_L_list`, each pair of columns with the same
    'column_partition' assignment is counted; the counts are divided by the
    number of latent states.  Rows and columns are reordered by the leaf
    order of a dendrogram of that matrix and the figure is saved to
    `filename`.  `X_D_list` is not used.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])
    # extract unordered z_matrix
    num_latent_states = len(X_L_list)
    z_matrix = numpy.zeros((num_cols, num_cols))
    for X_L in X_L_list:
      assignments = X_L['column_partition']['assignments']
      for i in range(num_cols):
        for j in range(num_cols):
          if assignments[i] == assignments[j]:
            z_matrix[i, j] += 1
    z_matrix /= float(num_latent_states)
    # hierarchically cluster z_matrix; the throwaway figure only serves to
    # read the leaf order back from the tick labels
    Z = hcluster.linkage(hcluster.pdist(z_matrix))
    pylab.figure()
    hcluster.dendrogram(Z)
    reorder_indices = [int(tick.get_text())
                       for tick in pylab.gca().get_xticklabels()]
    pylab.close()
    # REORDER!
    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]
    # actually create figure
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    ticks = range(num_cols)
    if num_cols < 14:
      pylab.gca().set_yticks(ticks)
      pylab.gca().set_yticklabels(column_names_reordered, size='x-small')
      pylab.gca().set_xticks(ticks)
      pylab.gca().set_xticklabels(column_names_reordered, rotation=90, size='x-small')
    else:
      # too many columns: label every other one, alternating between axes
      pylab.gca().set_yticks(ticks[::2])
      pylab.gca().set_yticklabels(column_names_reordered[::2], size='x-small')
      pylab.gca().set_xticks(ticks[1::2])
      pylab.gca().set_xticklabels(column_names_reordered[1::2],
                                  rotation=90, size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
def DrawDendrogram(feature_vector, obj_names, motion_name):
    """Show an average-linkage dendrogram of ``feature_vector``.

    Pairwise distances come from ``pdist`` with its default metric.  The
    leaves are labelled with ``obj_names``, the tree is drawn with
    ``orientation='left'``, and the plot title is
    ``motion_name + '_average'``.  Blocks in ``plt.show()``.

    Returns whatever ``hierarchy.dendrogram`` returns.
    """
    linkage_list = ['single', 'average', 'complete']
    method = linkage_list[1]

    Z = linkage(pdist(feature_vector), method)

    dendrogram_options = dict(
        color_threshold=1.5,
        labels=obj_names,
        orientation='left',
        show_leaf_counts=True,
        leaf_font_size=10,
    )
    render = hierarchy.dendrogram(Z, **dendrogram_options)

    plt.title(motion_name + '_' + method)
    plt.show()

    return render
예제 #44
0
def get_clustering_as_tree(vectors,
                           ward=True,
                           clustering_distance='euclidean',
                           clustering_method='complete',
                           progress=progress):
    """Cluster ``vectors`` hierarchically and return the result as a tree.

    When ``ward`` is true, ``hcluster.ward`` is applied to the vectors
    directly and the distance/method arguments are ignored.  Otherwise a
    distance matrix is computed with ``clustering_distance`` and linked
    with ``clustering_method``.  Each stage is reported through
    ``progress.update``.

    Returns the value of ``hcluster.to_tree`` for the computed linkage.
    """
    if not ward:
        progress.update('Computing distance matrix using "%s" distance' %
                        clustering_distance)
        distances = hcluster.pdist(vectors, clustering_distance)

        progress.update('Clustering data with "%s" linkage' %
                        clustering_method)
        linkage_matrix = hcluster.linkage(distances,
                                          method=clustering_method)
    else:
        progress.update(
            'Clustering data with Ward linkage and euclidean distances')
        linkage_matrix = hcluster.ward(vectors)

    progress.update('Returning results')
    tree = hcluster.to_tree(linkage_matrix)
    return tree
예제 #45
0
def dendrogramBuild(tfidfMatrix, queriedSentences, degree):
    """Prune sentences until the first linkage merge is at least ``degree`` apart.

    On each pass the cosine distances between the rows of ``tfidfMatrix``
    are clustered.  If the first row of the linkage result has a distance
    below ``degree``, the member of that pair with the smaller (or equal)
    ``tfidf.magnitude`` is removed from both lists and the pass repeats.

    Both lists are modified in place and must stay index-aligned.
    The distance vector, linkage matrix and each removed index are printed
    as debugging output, exactly as before.

    Parameters:
      tfidfMatrix: list of tf-idf vectors (must support ``pop``).
      queriedSentences: list parallel to ``tfidfMatrix``.
      degree: distance threshold below which a pair counts as duplicate.

    Returns the tuple ``(tfidfMatrix, queriedSentences)``.
    """
    # Iterating instead of recursing avoids hitting the recursion limit on
    # long inputs.  The length guard stops once a single vector remains:
    # there is no pair left to compare, and clustering an empty distance
    # vector would raise.
    while len(tfidfMatrix) > 1:
        a = pdist(tfidfMatrix, 'cosine')
        print(a)
        b = linkage(a)
        print(b)

        # NOTE(review): assumes row 0 of the linkage matrix is the closest
        # pair of original observations -- confirm against the library docs.
        first_merge = b[0]
        if not first_merge[2] < degree:
            break

        left = int(first_merge[0])
        right = int(first_merge[1])
        mag1 = tfidf.magnitude(tfidfMatrix[left])
        mag2 = tfidf.magnitude(tfidfMatrix[right])
        # Keep the vector with the larger magnitude; ties drop the left one.
        drop = right if mag1 > mag2 else left
        print(drop)
        tfidfMatrix.pop(drop)
        queriedSentences.pop(drop)

    return (tfidfMatrix, queriedSentences)
예제 #46
0
import numpy as np
import matplotlib.pyplot as plt
from hcluster import pdist, linkage, dendrogram, squareform  # same as import them from scipy

# Load columns 1-29 of the tab-separated table as a structured array; with
# names=True the field names are taken from the file's header row.
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")

# Reinterpret the structured array as a plain 2-D float array (one column per
# field).  The builtin ``float`` is used because the ``np.float`` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
data_array = data.view((float, len(data.dtype.names)))
# Keep rows 1..999 and transpose so that each field becomes one observation.
# NOTE(review): names=True already consumes the header line, so starting at
# row 1 also drops the first data row -- confirm this is intended.
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)  # computing the distance

data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# add_axes takes [left, bottom, width, height] as fractions of the figure.
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right',
                labels=data.dtype.names)  # adding/removing the axes
ax1.set_xticks([])
예제 #47
0
def main(argv):
    """Cluster the rows of ``./data.csv`` and plot the dendrogram and criterion.

    ``argv`` is a flat list of ``option value`` pairs:

      --criterium VALUE   "squared" selects ``squared_criterion``; any other
                          value (including "silhoette") selects
                          ``silhouette_criterion``.  Default: squared.
      --merge VALUE       "dmax" selects ``d_max``; any other value
                          (including "de") selects ``d_e``.  Default: d_min.

    Unknown options and a trailing option without a value are ignored.

    Side effects: resets and fills class-level state on ``Cluster``, prints
    progress to stdout and opens a matplotlib window.  Returns None.
    """
    import sys

    print(argv)

    crit_func = squared_criterion
    merge_func = d_min
    # Pair every option with the value that follows it.  The previous code
    # indexed the half-length params/param_values lists with positions taken
    # from argv, which raised IndexError or paired the wrong values.
    for param, value in zip(argv[::2], argv[1::2]):
        if param == "--criterium":
            if value == "squared":
                crit_func = squared_criterion
            else:
                crit_func = silhouette_criterion
        elif param == "--merge":
            if value == "dmax":
                merge_func = d_max
            else:
                merge_func = d_e

    Cluster.clusters = []
    Cluster.squared_criterion_values = []
    Cluster.silhouette_criterion_values = []

    my_data = np.genfromtxt('./data.csv', delimiter=',', dtype=float)
    # The first row of the file is skipped; `etalon` keeps the full rows.
    etalon = my_data[1:].tolist()
    # Only the columns from index 2 onward are used for clustering.
    data_list = np.array([row[2:] for row in etalon])

    # Cache the distance for every ordered pair of rows.
    print("Precounting distances")
    for row_a in data_list:
        key_a = tuple(row_a)
        for row_b in data_list:
            # One progress dot per pair, all on the same output line.
            sys.stdout.write(". ")
            Cluster.counted_distances[(key_a, tuple(row_b))] = \
                hexic_euqlid_distance(row_a, row_b)

    print("Distances Counted")

    # NOTE(review): rows in data_list already had their first two columns
    # removed above, so the [2:] here slices a second time -- confirm that
    # these keys match what Cluster looks up.
    for row, full_row in zip(data_list, etalon):
        Cluster.etalon_clasters[tuple(row[2:])] = full_row[1]

    print(Cluster.etalon_clasters.values())

    # Start with every element in a cluster of its own.
    for x in data_list:
        Cluster.clusters.append(Cluster(x))

    print(len(Cluster.clusters))
    K_num = 1
    swo(K_num, merge_func, crit_func)
    Y = Cluster.merge_history[1:]
    Z = linkage(Y)
    plt.subplot(121)
    dendrogram(Z, labels=range(len(data_list)))

    # The criterion values are plotted in reverse of recording order.
    if crit_func == silhouette_criterion:
        criterion_values = Cluster.silhouette_criterion_values[::-1]
    else:
        criterion_values = Cluster.squared_criterion_values[::-1]
    plt.subplot(122)
    plt.plot(range(len(criterion_values)), criterion_values)
    plt.axis([
        K_num, 30,
        min(criterion_values),
        max(criterion_values)
    ])

    plt.show()

    for x in Cluster.clusters:
        x.etalon_to_current_mapping()
        print(x.etalon_map)
예제 #48
0
   ct += 1
   boom = " ".join(top_keywords(nkeywords, doc, corpus))
   keywords.append(boom)

# One feature vector per document: the tf-idf weight of every keyword that
# occurs in the document, 0 for keywords that do not.
feature_vectors = []
n = len(corpus)

for document in corpus:
    vec = [tfidf(word, document, corpus) if word in document else 0
           for word in key_word_list]
    feature_vectors.append(vec)

# Full n x n matrix of pairwise cosine distances.
mat = numpy.empty((n, n))
for i in range(0, n):
    for j in range(0, n):
        mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i], feature_vectors[j])

t = 0.8
# linkage() interprets a 2-D array as raw observation vectors and would
# cluster the *rows of the distance matrix* by Euclidean distance.  Pass the
# condensed upper triangle (row-major, i < j) so that the cosine distances
# themselves, which the threshold t refers to, drive the clustering.
Z = linkage(mat[numpy.triu_indices(n, k=1)], 'complete')

posts = []
clusters = extract_clusters(Z, t, n)
ct = -1
for key in clusters:
    print("-------------------------------")
    # doc_id rather than id, so the builtin id() is not shadowed.
    for doc_id in clusters[key]:
        ct += 1
        print(ct, titles[doc_id])
        print(ct, " - ",keywords[doc_id])
예제 #49
0
from hcluster import pdist, linkage, leaves_list, squareform, dendrogram
import numpy as np
import matplotlib as mp

# Distance metric and linkage method used for the clustering below.
metric = 'euclidean'
method = 'single'

# Six observations (rows) described by nine binary features (columns).
data = np.matrix([[1, 1, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1],
                  [0, 0, 0, 0, 0, 0, 1, 1, 0], [0, 0, 0, 1, 1, 1, 1, 1, 0],
                  [0, 0, 0, 1, 1, 1, 0, 0, 0], [0, 0, 1, 1, 1, 1, 0, 0, 0]])

# Pairwise distances between the rows, then their hierarchical clustering.
y = pdist(data, metric=metric)
Z = linkage(y, method=method, metric=metric)
dendrogram(Z)
# Convert each linkage row to plain Python values: integer ids and count,
# and a merge distance clamped so that it is never negative.
Z = [(int(l), int(r), max(0., s), int(n)) for (l, r, s, n) in Z]

# Leaf ids as returned by leaves_list for the cleaned linkage.
leaves = list(leaves_list(Z))
count = len(leaves)
# presumably the id of the last cluster formed (leaves + merges - 1);
# verify against the linkage numbering convention.
root = len(Z) + count - 1

# Square form of the distance vector; it must have one row per leaf.
X = squareform(y)
assert len(X) == count

from utils import memoise

# bar-joseph optimal ordering ################################################

from barjoseph import optimal

leaves = optimal(
    root, **{
예제 #50
0
##|  cosine similarities
##`----
import numpy
from nltk import cluster
# Full n x n matrix of pairwise cosine distances between feature vectors.
mat = numpy.empty((n, n))
for i in xrange(0, n):
    for j in xrange(0, n):
        mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i],
                                                      feature_vectors[j])

##,----
##|  Hierarchically Cluster mat
##`----
from hcluster import linkage
t = 0.9
# linkage() interprets a 2-D array as raw observation vectors and would
# cluster the rows of the distance matrix by Euclidean distance.  Pass the
# condensed upper triangle (row-major, i < j) so that the cosine distances
# themselves are what gets linked.
Z = linkage(mat[numpy.triu_indices(n, k=1)], 'single')

#dendrogram(Z, color_threshold=t)

#import pylab
#pylab.savefig( "new_agg_cluster.png" ,dpi=800)


##,----
##|  Cluster Extraction
##`----
def extract_clusters(Z, threshold, n):
    clusters = {}
    ct = n
    for row in Z:
        if row[2] < threshold:
예제 #51
0
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
# SECURITY NOTE: pickle.load can execute arbitrary code from its payload, and
# this payload is fetched over plain HTTP.  Only run this against a source
# you trust; prefer a non-executable format for downloaded data.
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: the matrix shown as similarity (1 - dissimilarity).
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# Convert with squareform before handing the distances to linkage.
dissimilarity = hcluster.squareform(dissimilarity)
# Single cut-off shared by the flat clustering and the dendrogram colouring.
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: dendrogram coloured at the same threshold.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
예제 #52
0
def cluster(M, method='complete'):
    """Return the hcluster linkage of ``M`` after a ``squareform`` conversion.

    ``M`` is passed through ``hcluster.squareform`` and the result is
    linked with the given ``method`` (default ``'complete'``).
    """
    converted = hcluster.squareform(M)
    return hcluster.linkage(converted, method=method)
예제 #53
0
    def _do_gen_matrix(self,
                       col_function_name,
                       X_L_list,
                       X_D_list,
                       M_c,
                       T,
                       tablename='',
                       filename=None,
                       col=None,
                       confidence=None,
                       limit=None,
                       submatrix=False):
        """Compute a pairwise column matrix and optionally rank, crop or plot it.

        The matrix entry (i, j) is ``col_function(i, j, X_L_list, X_D_list,
        M_c, T)``, where the function is selected by ``col_function_name``:
        'mutual information', 'dependence probability', 'correlation' or
        'view_similarity'.

        Parameters:
          col_function_name: one of the four names above.
          X_L_list, X_D_list, M_c, T: forwarded to the column function;
            ``M_c`` also supplies ``idx_to_name`` / ``name_to_idx``.
          tablename: text interpolated into the title.
          filename: if truthy, the matrix is plotted via ``utils.plot_matrix``.
          col: if given, rank all columns against this column name
            (highest value first) instead of clustering.
          confidence: with ``col``, keep only values >= ``float(confidence)``.
          limit: with ``col``, keep at most ``int(limit)`` columns.
          submatrix: with ``col``, continue with the matrix restricted to the
            ranked columns rather than returning the ranking.

        Returns:
          With ``col`` and not ``submatrix``: ``{'data': [values_tuple],
          'columns': names}``.  Otherwise a dict with keys ``matrix``,
          ``column_names``, ``title``, ``filename`` and ``message``.

        Raises:
          Exception: if ``col_function_name`` is not recognised.
        """
        method_names = {
            'mutual information': '_mutual_information',
            'dependence probability': '_dependence_probability',
            'correlation': '_correlation',
            'view_similarity': '_view_similarity',
        }
        if col_function_name not in method_names:
            raise Exception('Invalid column function')
        col_function = getattr(self, method_names[col_function_name])

        num_cols = len(X_L_list[0]['column_partition']['assignments'])
        column_names = numpy.array(
            [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])
        # extract unordered z_matrix
        z_matrix = numpy.zeros((num_cols, num_cols))
        for i in range(num_cols):
            for j in range(num_cols):
                z_matrix[i][j] = col_function(i, j, X_L_list, X_D_list, M_c, T)

        if col:
            z_column = list(z_matrix[M_c['name_to_idx'][col]])
            # (value, column index) pairs, highest value first.  sorted() and
            # list comprehensions are used because zip/filter results are
            # lazy iterators on Python 3 and support neither .sort() nor
            # slicing.
            data_tuples = sorted(zip(z_column, range(num_cols)), reverse=True)
            if confidence:
                min_value = float(confidence)
                data_tuples = [tup for tup in data_tuples
                               if tup[0] >= min_value]
            if limit and limit != float("inf"):
                data_tuples = data_tuples[:int(limit)]
            data = [tuple([d[0] for d in data_tuples])]
            columns = [d[1] for d in data_tuples]
            column_names_reordered = column_names[columns]
            if not submatrix:
                return {'data': data, 'columns': column_names_reordered}
            z_matrix = z_matrix[columns, :][:, columns]
            z_matrix_reordered = z_matrix
        else:
            # hierarchically cluster z_matrix
            import hcluster
            Y = hcluster.pdist(z_matrix)
            Z = hcluster.linkage(Y)
            # The dendrogram is drawn on a throw-away figure only to read
            # the leaf order back from its x tick labels.
            pylab.figure()
            hcluster.dendrogram(Z)
            reorder_indices = [int(label.get_text())
                               for label in pylab.gca().get_xticklabels()]
            pylab.close()
            # REORDER!
            z_matrix_reordered = z_matrix[:,
                                          reorder_indices][reorder_indices, :]
            column_names_reordered = column_names[reorder_indices]

        title = 'Pairwise column %s for %s' % (col_function_name, tablename)
        if filename:
            utils.plot_matrix(z_matrix_reordered, column_names_reordered,
                              title, filename)

        return dict(matrix=z_matrix_reordered,
                    column_names=column_names_reordered,
                    title=title,
                    filename=filename,
                    message="Created " + title)
        m = castoverlap_numgenes
    elif o.method == 'numsamemono_norm':
        m = monoallelic_numgenes_norm
    elif o.method == 'numsamemono100_norm':
        m = monoallelic_numgenes_norm_100
    elif o.method == 'numsameC57_norm':
        m = c57overlap_numgenes_norm
    elif o.method == 'numsameCAST_norm':
        m = castoverlap_numgenes_norm
    else:
        m = o.method

    # make clusters
    exparray = character_matrix
    hcdists = hcluster.pdist(exparray, metric=m)
    hclinks = hcluster.linkage(hcdists, method=o.linkage)
    draw_order = hcluster.leaves_list(hclinks)

    # draw tree
    scipyhcluster.dendrogram(hclinks, labels=samplenames, leaf_rotation=90)
    pylab.subplots_adjust(bottom=0.3)
    pylab.ylabel('%s (linkage=%s)' % (o.method, o.linkage))
    if o.method in ('numsamemono', 'numsameC57', 'numsameCAST',
                    'numsamemono_norm', 'numsameC57_norm', 'numsameCAST_norm'):
        pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0],
                     [0, 100, 200, 300, 400, 500])
    elif o.method in ('numsamemono100', 'numsamemono100_norm'):
        pylab.yticks([1.0, 0.8, 0.6, 0.4, 0.2, 0.0], [0, 20, 40, 60, 80, 100])
    pylab.savefig(o.fig)

    # bootstrap
예제 #55
0
from matplotlib.pyplot import show

from hcluster import pdist, linkage, dendrogram
import numpy
from numpy.random import rand

# Ten random 100-dimensional observations; the first five are doubled so
# that they differ in scale from the remaining five.
samples = rand(10, 100)
samples[:5] *= 2

# Pairwise distances -> linkage -> dendrogram, all with library defaults.
distances = pdist(samples)
tree = linkage(distances)
dendrogram(tree)

show()
예제 #56
0
    M = len(actsind)
    data = zeros((N, M), dtype=int)
    i = 0
    parikhdict = {}
    for case in uniq_cases.keys():
        data[i] = get_parikh(case, actsind)
        str_i = ','.join(map(str, data[i]))
        if str_i not in parikhdict:
            parikhdict[str_i] = [i]
        else:
            parikhdict[str_i].append(i)
        i = i + 1
    df = DataFrame(data)
    data_uniq = df.drop_duplicates()
    Y = pdist(data_uniq, metric='euclidean')
    Z = linkage(Y, method='average')
    dendrogram(Z)
    show()


def similarity_clusters(log, show_plot=None):
    """Translates traces to Parikh vectors and computes in the vector space
       a K-means clustering."""
    def get_parikh(case, alphabet):
        v = zeros(len(alphabet), dtype=int)
        for act in case:
            v[alphabet[act]] = v[alphabet[act]] + 1
        return v

    actsind = {}
    i = 0