예제 #1
0
파일: run2.py 프로젝트: gitzain/project-x
	def do_it(self, sources):

		for source in sources:
			words = nltk.wordpunct_tokenize(source.headline)
			words.extend(nltk.wordpunct_tokenize(source.summary))
			lowerwords=[x.lower() for x in words if len(x) > 1]
			self.ct += 1
			print self.ct, "TITLE",source.headline
			self.corpus.append(lowerwords)
			self.titles.append(source.headline)
			self.links.append(source.url)



		[[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

		self.ct=-1
		for doc in self.corpus:
		   self.ct+=1
		   print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



		for document in self.corpus:
			vec=[]
			[vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
			self.feature_vectors.append(vec)



		self.n=len(self.corpus)

		mat = numpy.empty((self.n, self.n))
		for i in xrange(0,self.n):
		  for j in xrange(0,self.n):
			mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


		Z = linkage(mat, 'single')

		dendrogram(Z, color_threshold=self.t)





		clusters = self.extract_clusters(Z,self.t,self.n)
		
		stories = []

		for key in clusters:
			print "============================================="
			story = Story()  
			for id in clusters[key]:
				story.add_source(sources[id])
				print id,self.titles[id],sources[id].url
			stories.append(story)


		return stories
예제 #2
0
파일: dendro.py 프로젝트: jungikim/sbmt
def dendro(X,metric='cosine',combine='average',showdendro=True,leaf_label_func=identity,**kw):
    """Compute a linkage for the rows of X and optionally plot its dendrogram.

    `metric` is passed to pdist, `combine` is the linkage method, and any
    extra keyword arguments are forwarded to dendrogram().
    Returns the linkage result.
    """
    linkage_matrix = linkage(pdist(X, metric), combine)
    if not showdendro:
        return linkage_matrix
    dendrogram(linkage_matrix, leaf_label_func=leaf_label_func, **kw)
    show()
    return linkage_matrix
예제 #3
0
파일: dendro.py 프로젝트: isi-nlp/sbmt
def dendro(X, metric="cosine", combine="average", showdendro=True, leaf_label_func=identity, **kw):
    """Cluster the rows of X hierarchically and return the linkage result.

    Pairwise distances come from pdist(X, metric) and are merged with the
    `combine` linkage method. When `showdendro` is true the dendrogram is
    drawn (extra keyword arguments go to dendrogram()) and shown.
    """
    distances = pdist(X, metric)
    merges = linkage(distances, combine)
    if not showdendro:
        return merges
    dendrogram(merges, leaf_label_func=leaf_label_func, **kw)
    show()
    return merges
예제 #4
0
def main():
    """Draw a dendrogram for a random 10x100 sample matrix.

    The first five rows are doubled before clustering.
    """
    print("hola")
    samples = rand(10, 100)
    samples[0:5, :] *= 2
    distances = pdist(samples)
    dendrogram(linkage(distances))
예제 #5
0
def plotSampleDistanceDendrogram(ds):
    """Plot a sample distance cluster dendrogram using all samples and features
    of a dataset.

    :Parameter:
      ds: Dataset
        The source dataset. Its `labels_map`, `samples` and `labels`
        attributes are read.
    """
    # Invert labels_map so dendrogram leaves can show the literal labels.
    lmap = {}
    for literal, numeric in ds.labels_map.iteritems():
        lmap[numeric] = literal
    leaf_labels = [lmap[l] for l in ds.labels]

    # Pairwise sample distances (pdist defaults) and complete-linkage merges.
    link = clust.linkage(clust.pdist(ds.samples), 'complete')

    # Plot the dendrogram with literal labels on the leaves, all in black.
    # This does not work with etch's version of matplotlib (verified for
    # matplotlib 0.98).
    clust.dendrogram(
        link,
        colorthreshold=0,
        labels=leaf_labels,
        link_color_func=lambda x: 'black',
        distance_sort=False)

    # Rotate the tick labels so long names stay readable.
    tick_labels = P.gca().get_xticklabels()
    P.setp(tick_labels, rotation=90, fontsize=9)
예제 #6
0
파일: main.py 프로젝트: kirawrath/RP
def main():
	"""Run a hand-rolled agglomerative clustering and show its dendrogram.

	Reads the data file named by the first command-line argument (default
	'iris2d.data'), asks for a minimum and maximum class count on stdin,
	merges clusters one pair at a time while recording rows of the form
	[cluster id, cluster id, distance, member count], reports the largest
	gap between recorded merge distances, and plots the result.
	"""
	filename='iris2d.data'
	if len(sys.argv) > 1:
		filename = sys.argv[1]
	else:
		print 'Assuming filename \'iris2d.data\''
	data = parse_file(filename)

	minclass = int(raw_input('Minclass: '))
	maxclass = int(raw_input('Maxclass: '))

	standartizate(data)
	dists = dist_from_data(data)
	# `nick` is the id handed to the next merged cluster; original points
	# use ids 0..lend-1.
	nick = len(dists)
	lend=nick
	# n1 counts the merges still to perform.
	n1 = lend-1
	# NOTE: this local list is named `linkage`; each row is appended below.
	linkage=[]
	merge_points=[]
	# charlie[index used in dist matrix] = [ new cluster nick, number of children ]
	charlie=dict()
	for i in range(lend):
		charlie[i] = [i,1]
	while n1:
		n1-=1
		# presumably returns the reduced distance matrix, the two merged
		# indices and their distance - confirm against agglomerate()
		dists, e0, e1, d = agglomerate(dists)
		
		#charlie[e0][1] has all the children of both e0 and e1
		charlie[e0][1] = charlie[e0][1] + charlie[e1][1]

		linkage.append([charlie[e0][0], charlie[e1][0], d, charlie[e0][1]])

		#Fixing the indexes due to the deletion of e1
		# NOTE(review): after this shift charlie[e0] only still refers to the
		# merged cluster if e0 < e1 - confirm agglomerate() guarantees that.
		for i in range(e1,lend-1):
			charlie[i] = charlie[i+1]

		charlie[e0][0] = nick
		nick+=1
		#n1 contains the number of classes
		if n1 <= maxclass and n1 >= minclass:
			merge_points.append(d)
			
	# Finding the cutting point
	# The cut is placed at the merge that precedes the largest increase in
	# merge distance among the recorded merge points.
	max_dist=0
	index=-1
	print merge_points
	for i in range(len(merge_points)-1):
		d = merge_points[i+1] - merge_points[i]
		if d > max_dist:
			max_dist = d
			index = i

	# Fails when fewer than two merge points were recorded or no gap is positive.
	assert index >= 0
	print 'Cutting point is at y='+str(merge_points[index])
	
	print 'Showing the image...'
	
	dendrogram(linkage)
	show()
예제 #7
0
  def do_it(self):

    for feed in self.feeds:
        d = feedparser.parse(feed)
        for e in d['entries']:
           words = nltk.wordpunct_tokenize(self.clean_html(e['description']))
           words.extend(nltk.wordpunct_tokenize(e['title']))
           lowerwords=[x.lower() for x in words if len(x) > 1]
           self.ct += 1
           print self.ct, "TITLE",e['title']
           self.corpus.append(lowerwords)
           self.titles.append(e['title'])
           self.links.append(e['link'])



    [[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

    self.ct=-1
    for doc in self.corpus:
       self.ct+=1
       print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))



    for document in self.corpus:
        vec=[]
        [vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
        self.feature_vectors.append(vec)



    self.n=len(self.corpus)

    mat = numpy.empty((self.n, self.n))
    for i in xrange(0,self.n):
      for j in xrange(0,self.n):
        mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


    Z = linkage(mat, 'single')

    dendrogram(Z, color_threshold=self.t)





    clusters = self.extract_clusters(Z,self.t,self.n)
     
    for key in clusters:
       print "============================================="  
       for id in clusters[key]:
           print id,self.titles[id]
예제 #8
0
    def fetch_clusters(self, mat, n):
        """
        Cluster the matrix with single linkage and extract the clusters.

        The dendrogram is drawn with `self.t` as colour threshold and saved
        to `self.cluster_image` at `self.dpi` before extraction.

        :param mat: The similarity matrix
        :param n: The length of the corpus
        :return: The clusters
        """
        linkage_matrix = linkage(mat, 'single')
        dendrogram(linkage_matrix, color_threshold=self.t)
        pylab.savefig(self.cluster_image, dpi=self.dpi)
        return self.__extract_clusters(linkage_matrix, self.t, n)
예제 #9
0
def generate_dendrogram(root):
    """Cluster a random 10x100 matrix, print the distances and linkage, and
    draw the dendrogram.

    `root` is accepted but not used by this function.
    """
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand
    import matplotlib

    samples = rand(10, 100)
    # Double the first five rows.
    samples[0:5, :] *= 2
    distances = pdist(samples)
    merges = linkage(distances)
    print(distances)
    print(merges)
    dendrogram(merges)
예제 #10
0
def plot_cluster_tree(cluster_coords,Labels=None,link_method='single',color_thresh=.25,fontsize=8):
	"""Plot a left-oriented dendrogram of `cluster_coords` using cosine distance.

	Parameters:
	  cluster_coords: observation matrix handed to pdist().
	  Labels: optional sequence of leaf labels (list or array); None or an
	    empty sequence plots without explicit labels.
	  link_method: linkage method name.
	  color_thresh: colour threshold forwarded to dendrogram().
	  fontsize: value written to pylab's 'font.size' rc parameter.
	"""
	D = pdist(cluster_coords,'cosine')
	# SEEMS THERE MAY SOMETIME BE VERY SMALL NEGATIVE DISTANCES ie -2*10**-16
	D = abs(D)
	L = linkage(D,method=link_method,metric='cosine')
	# A plain `if Labels:` raises ValueError for multi-element numpy arrays,
	# so emptiness is tested through the length instead.
	has_labels = Labels is not None and len(Labels) > 0
	if has_labels:
		dendrogram(L,labels=Labels,orientation='left',color_threshold=color_thresh)
	else:
		dendrogram(L,orientation='left',color_threshold=color_thresh)
	pylab.title('HMP Buccal Mucosa - Latent Strain Analysis')
	pylab.xlabel('Cosine Distance')
	pylab.ylabel('Strain with the Most Alignments to Each Cluster')
	pylab.rcParams.update({'font.size': fontsize})
	pylab.show()
예제 #11
0
def printSummary(updatedtfidfMatrix, queriedSentences):
    """Cluster the sentences and write them to foo.txt in merge order.

    The rows of `updatedtfidfMatrix` are clustered on cosine distance; the
    dendrogram is shown and the intermediate arrays are printed. The
    linkage rows are then walked to produce an ordering in which sentence
    indices emit the capitalised sentence followed by '. ' and cluster
    indices emit '<br></br>'.

    Returns the text that was written to foo.txt.
    """
    print("\n")
    a = pdist(updatedtfidfMatrix, 'cosine')
    print(a)
    b = linkage(a)
    dendrogram(b)
    show()
    print(b)

    # Indices up to last_leaf are original sentences; larger ones are merged
    # clusters created by linkage().
    last_leaf = len(queriedSentences) - 1

    sumOrder = []
    for row in b:
        x = int(row[0])
        y = int(row[1])

        if x <= last_leaf:
            sumOrder.append(x)
        if y <= last_leaf:
            sumOrder.append(y)
        if x <= last_leaf and y > last_leaf:
            sumOrder.append(y)
        if x > last_leaf and y > last_leaf:
            sumOrder.append(x)

    queriedSentences = [sentence.capitalize() for sentence in queriedSentences]

    # `with` guarantees the file is closed even if a write fails.
    with open("foo.txt", "w") as f:
        for num in sumOrder:
            if num > last_leaf:
                f.write('<br></br>')
            else:
                f.write(queriedSentences[num])
                f.write('.')
                f.write(' ')

    with open("foo.txt", "r") as myfile:
        data = myfile.read()

    print(data)
    return data
예제 #12
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Save a heat map of how often each pair of columns shares a partition.

    For every latent state in `X_L_list` the column partition assignments
    are compared pairwise; the resulting co-assignment frequencies are
    reordered by the leaf order of a hierarchical clustering and written to
    `filename` as an image. `X_D_list` is not used here.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    column_names = numpy.array(
        [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

    # Fraction of latent states in which columns i and j are co-assigned.
    z_matrix = numpy.zeros((num_cols, num_cols))
    for latent_state in X_L_list:
        assignments = latent_state['column_partition']['assignments']
        for row in range(num_cols):
            for col in range(num_cols):
                if assignments[row] == assignments[col]:
                    z_matrix[row, col] += 1
    z_matrix /= float(len(X_L_list))

    # Cluster on a throw-away figure and read the leaf order back from the
    # tick labels that dendrogram() put on the x axis.
    pylab.figure()
    hcluster.dendrogram(hcluster.linkage(hcluster.pdist(z_matrix)))
    reorder_indices = [int(tick.get_text())
                       for tick in pylab.gca().get_xticklabels()]
    pylab.close()

    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]

    # Draw the actual figure.
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered,
                 interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()

    # Few columns: label every tick. Many columns: alternate the labels
    # between the two axes so they stay legible.
    if num_cols < 14:
        y_pick, x_pick, x_size = slice(None), slice(None), 'x-small'
    else:
        y_pick, x_pick, x_size = slice(None, None, 2), slice(1, None, 2), 'small'
    axes = pylab.gca()
    axes.set_yticks(range(num_cols)[y_pick])
    axes.set_yticklabels(column_names_reordered[y_pick], size='x-small')
    axes.set_xticks(range(num_cols)[x_pick])
    axes.set_xticklabels(column_names_reordered[x_pick],
                         rotation=90,
                         size=x_size)

    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
예제 #13
0
def dendrogram(M, method="complete", title="complete linkage clustering", **kw):
    """Render a dendrogram for M into an in-memory PNG and return the buffer.

    M is clustered with cluster(M, method); extra keyword arguments are
    forwarded to hcluster.dendrogram(). A ValueError raised while
    clustering or drawing is ignored, so the (then empty) figure is still
    saved. The returned StringIO is rewound to position 0.
    """
    png_buffer = StringIO.StringIO()
    pylab.figure()
    if title:
        pylab.title(title)
    try:
        linkage_result = cluster(M, method)
        hcluster.dendrogram(linkage_result, **kw)
    except ValueError:
        # Empty distance matrix
        pass
    finally:
        pylab.savefig(png_buffer, format="png")
        png_buffer.seek(0)
        pylab.close()
    return png_buffer
예제 #14
0
def dendrogram(M, method='complete', **kw):
    """Render a dendrogram for M into an in-memory PNG and return the buffer.

    M is clustered with cluster(M, method); extra keyword arguments are
    forwarded to hcluster.dendrogram(). Plotting is best-effort: when
    `pylab` is falsy or drawing fails, the returned StringIO is empty.
    """
    s = StringIO.StringIO()
    if pylab:
        try:
            pylab.figure()
            pylab.title('complete linkage clustering')
            hcluster.dendrogram(cluster(M, method), **kw)
        except Exception:
            # Deliberately best-effort, but unlike a bare `except:` this
            # lets KeyboardInterrupt and SystemExit propagate.
            pass
        else:
            pylab.savefig(s, format='png')
            s.seek(0)
        finally:
            pylab.close()
    return s
    def cluster_path_times(self, path_times,display):
        """Cluster recordings by duration and time of day, printing the results.

        Each recording becomes a point [duration in seconds, minute of the
        day]. The points, the linkage result and `self.calculate_variances`
        are printed, and the dendrogram is shown when `display` is true.
        """
        X = [
            [rec.time.seconds + rec.time.microseconds / 10**6.,
             rec.date.hour * 60 + rec.date.minute]
            for rec in path_times.recordings
        ]
        print(X)
        Z = linkage(pdist(X))
        dendrogram(Z)
        for i, point in enumerate(X):
            print('{0}, {1}'.format(i, point))
        print(Z)
        print(self.calculate_variances(X, Z))
        if display:
            show()
예제 #16
0
def test():
  """Cluster 15 random 2-D points labelled 'A'..'O' and print the two
  sub-clusters that are joined where the paths of 'C' and 'B' first meet.
  """
  word_list = list('ABCDEFGHIJKLMNO')
  cons_words = ['C', 'B']
  n_words = len(word_list)

  X = rand(15, 2)
  print(X)
  Y = pdist(X)
  print(Y)
  Z = linkage(Y)
  dendrogram(Z)

  # Path of merged-cluster ids for each of the two constrained words.
  index1 = word_list.index(cons_words[0])
  assert index1 >= 0
  path1 = findPath(Z, index1, n_words)
  index2 = word_list.index(cons_words[1])
  assert index2 >= 0
  path2 = findPath(Z, index2, n_words)

  print(Z)
  print(path1)
  print(path2)

  # The smallest id shared by both paths; ids of merged clusters start at
  # n_words, so subtracting it gives the row in Z.
  first = min(set(path1).intersection(set(path2)))
  assert(first >= n_words)
  first -= n_words
  merge1 = findCluster(Z, Z[first][0], word_list)
  merge2 = findCluster(Z, Z[first][1], word_list)

  print(merge1)
  print(merge2)
예제 #17
0
def OnLeftDClick(self, event):
	""" Left Double Click has been invoked.
		This plugin calls the pdist function from the hcluster package on
		the vectors of the DEVS model and plots the dendrogram using
		matplotlib. If no model is available an information box is shown.
	"""
	devs = self.getDEVSModel()
	if not devs:
		wx.MessageBox(_("No DEVS model is instanciated.\nGo back to the simulation!"), _("Info"), wx.OK|wx.ICON_INFORMATION)
		return
	distances = pdist(devs.vectors)
	dendrogram(linkage(distances))
	show()
예제 #18
0
def cluster_elut(mat):
    """Show `mat` as an image with its rows reordered by dendrogram leaf order."""
    import hcluster
    linkage_matrix = hcluster.linkage(hcluster.pdist(mat))
    figure()
    # The dendrogram is only drawn to obtain the leaf order, then cleared.
    leaf_order = hcluster.dendrogram(linkage_matrix)['leaves']
    clf()
    imshow(mat[leaf_order,:])
예제 #19
0
def plot_with_labels(Z, num_clust):
    """Plot the dendrogram of Z with a colour strip under the leaves.

    The colour threshold is the merge distance in row `-num_clust + 1` of
    Z. Leaf colours come from the module-level `colors` and `rowHeaders`;
    the strip has `N` bars and is a tenth of the final merge distance high.
    """
    threshold = Z[-num_clust + 1, 2]
    dg = dendrogram(Z, no_labels=True, color_threshold=threshold)
    leaf_colors = [colors[int(rowHeaders[leaf])] for leaf in dg["leaves"]]
    strip_height = 0.1 * Z[-1, 2]
    plt.bar(np.arange(N) * 10, np.ones(N) * strip_height, bottom=-strip_height,
            width=10, color=leaf_colors, edgecolor="none")
    plt.gca().set_ylim((-strip_height, None))
    plt.show()
예제 #20
0
def dendrogram(M,
               method='complete',
               title='complete linkage clustering',
               **kw):
    """Draw a dendrogram for M and return it as a PNG in a StringIO buffer.

    Clustering is done by cluster(M, method) and extra keyword arguments go
    to hcluster.dendrogram(). If that raises ValueError the figure is saved
    without a dendrogram. The buffer is rewound before it is returned.
    """
    image = StringIO.StringIO()
    pylab.figure()
    if title:
        pylab.title(title)
    try:
        clustered = cluster(M, method)
        hcluster.dendrogram(clustered, **kw)
    except ValueError:
        # Empty distance matrix
        pass
    finally:
        pylab.savefig(image, format='png')
        image.seek(0)
        pylab.close()
    return image
예제 #21
0
def do_gen_feature_z(X_L_list, X_D_list, M_c, filename, tablename=''):
    """Plot the column co-assignment matrix, ordered by clustering, to `filename`.

    Entry (i, j) is the fraction of latent states in `X_L_list` whose
    column partition assigns columns i and j to the same group. Rows and
    columns are reordered by the leaf order of a hierarchical clustering.
    `X_D_list` is not used here.
    """
    num_cols = len(X_L_list[0]['column_partition']['assignments'])
    names = [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)]
    column_names = numpy.array(names)

    # Accumulate co-assignment counts, then normalise by the state count.
    z_matrix = numpy.zeros((num_cols, num_cols))
    for state in X_L_list:
        groups = state['column_partition']['assignments']
        for a in range(num_cols):
            for b in range(num_cols):
                if groups[a] == groups[b]:
                    z_matrix[a, b] += 1
    z_matrix /= float(len(X_L_list))

    # Hierarchically cluster on a temporary figure; the x tick labels that
    # dendrogram() draws give the leaf order.
    Z = hcluster.linkage(hcluster.pdist(z_matrix))
    pylab.figure()
    hcluster.dendrogram(Z)
    reorder_indices = [int(label.get_text())
                       for label in pylab.gca().get_xticklabels()]
    pylab.close()

    z_matrix_reordered = z_matrix[:, reorder_indices][reorder_indices, :]
    column_names_reordered = column_names[reorder_indices]

    # Actual figure.
    fig = pylab.figure()
    fig.set_size_inches(16, 12)
    pylab.imshow(z_matrix_reordered, interpolation='none',
                 cmap=pylab.matplotlib.cm.Greens)
    pylab.colorbar()
    axes = pylab.gca()
    if num_cols < 14:
        # Few columns: every tick gets a label.
        axes.set_yticks(range(num_cols))
        axes.set_yticklabels(column_names_reordered, size='x-small')
        axes.set_xticks(range(num_cols))
        axes.set_xticklabels(column_names_reordered, rotation=90, size='x-small')
    else:
        # Many columns: even indices on y, odd indices on x.
        axes.set_yticks(range(num_cols)[::2])
        axes.set_yticklabels(column_names_reordered[::2], size='x-small')
        axes.set_xticks(range(num_cols)[1::2])
        axes.set_xticklabels(column_names_reordered[1::2],
                             rotation=90, size='small')
    pylab.title('column dependencies for: %s' % tablename)
    pylab.savefig(filename)
	def run(self,):
		"""Build a pairwise mismatch-rate matrix, run PCA on it and save a dendrogram.

		Reads sample-pair data via `self.readInput`, fills a symmetric
		distance matrix with mismatches / non-NA totals, optionally writes
		the mismatch data to `self.outputFname`, runs
		`self.runPCAOnDistanceMatrix`, and saves a single-linkage dendrogram
		as SVG and PNG under `self.figureFnamePrefix`. Ends the process with
		sys.exit(0).
		"""
		if self.debug:	# 2010-4-18 enter debug mode "~/.../variation/misc.py -b"
			import pdb
			pdb.set_trace()
		sampleId2index, samplePair2data = self.readInput(self.inputFnameLs)
		
		sys.stderr.write("Calculating distance matrix for aggregated data ...")
		distanceMatrix = numpy.zeros([len(sampleId2index), len(sampleId2index)])
		for samplePair, data in samplePair2data.iteritems():
			no_of_mismatches, no_of_total_non_NA = data[:2]
			# float() forces true division: with two integer counts Python 2
			# would truncate the rate to 0 (or 1).
			distance = float(no_of_mismatches)/no_of_total_non_NA
			sample1Id, sample2Id = samplePair[:2]
			sample1Index = sampleId2index[sample1Id]
			sample2Index = sampleId2index[sample2Id]
			distanceMatrix[sample1Index, sample2Index] = distance
			distanceMatrix[sample2Index, sample1Index] = distance
		sys.stderr.write("Done.\n")
		
		# Start from a list of the right length, then place every sample ID
		# at its matrix index.
		sampleIdLs = sampleId2index.keys()
		for sampleId, list_index in sampleId2index.iteritems():
			sampleIdLs[list_index] = sampleId
		
		if self.outputFname:
			self.outputMismatchData(self.outputFname, samplePair2data, distanceMatrix, sampleId2index, sampleIdLs)
		
		massagedSampleIDLs = self.massageSampleId(sampleIdLs)
		
		#2012.9-6 stop massaging sample IDs for PCA output. mapper/AppendInfo2SmartPCAOutput.py could be applied to this.
		self.runPCAOnDistanceMatrix(distanceMatrix, col_id_ls=sampleIdLs, outputFname='%s_PCA.tsv'%(self.figureFnamePrefix))
		
		import pylab
		from hcluster import pdist, linkage, dendrogram
		pylab.clf()
		# NOTE(review): distanceMatrix is square; linkage() may interpret a
		# 2-D input as observation vectors rather than distances - confirm.
		Z=linkage(distanceMatrix, 'single')
		yh_matplotlib.setFontAndLabelSize(base_size=3)
		dendrogram(Z, color_threshold=0.001, labels=massagedSampleIDLs, orientation='right', leaf_font_size=None)	#leaf_font_size=1 or 5 has no effect
		pylab.savefig('%s.svg'%self.figureFnamePrefix, dpi=200)
		pylab.savefig('%s.png'%self.figureFnamePrefix, dpi=300)
		sys.exit(0)
def performHierarchicalClusterin(matrix, titlesCat):
    """Cluster the rows of `matrix` on cosine distance and save the dendrogram.

    Average linkage is used; `titlesCat` supplies the leaf labels. The
    image is written to images/clusteringImage.png.
    """
    from scipy.spatial.distance import squareform

    # Compute the square distance matrix with the "cosine" metric.
    distanceMatrix = pairwise_distances(matrix, metric='cosine')
    # linkage() treats a 2-D array as observation vectors, so the square
    # matrix must be converted to condensed form to be used as distances.
    # checks=False tolerates the tiny asymmetries and non-zero diagonal
    # entries that floating-point cosine distances can have.
    condensedDistances = squareform(distanceMatrix, checks=False)
    # Compute the hierarchical clustering; the similarity between clusters
    # is the average of the element similarities.
    Z = linkage(condensedDistances, method='average')
    # Create the dendrogram image.
    dendrogram(Z, labels=titlesCat, distance_sort='descendent',
               leaf_font_size=2, orientation='left', show_contracted=False)
    # Save the generated dendrogram image.
    pylab.savefig("images/clusteringImage.png",dpi=300,bbox_inches='tight')
예제 #24
0
def t_dendrogram(X, nclusters):
    """Show the dendrogram of the rows of X. `nclusters` is not used."""
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand

    distances = pdist(X)
    dendrogram(linkage(distances))
    show()
예제 #25
0
def t_dendrogram(X, nclusters):
    """Cluster the rows of X and display the dendrogram.

    `nclusters` is accepted but not used.
    """
    from matplotlib.pyplot import show
    from hcluster import pdist, linkage, dendrogram
    import numpy
    from numpy.random import rand

    merges = linkage(pdist(X))
    dendrogram(merges)
    show()
예제 #26
0
 def augmented_dendrogram(*args, **kwargs):
     """Call dendrogram() and annotate every merge with its height.

     All arguments are forwarded to dendrogram(). Unless `no_plot` is
     passed as true, a red dot and the height (3 significant digits) are
     drawn at the top centre of each link. Returns dendrogram()'s result.
     """
     ddata = dendrogram(*args, **kwargs)
     if kwargs.get('no_plot', False):
         return ddata
     for xs, ys in zip(ddata['icoord'], ddata['dcoord']):
         # Horizontal midpoint of the link's top bar and its height.
         mid_x = 0.5 * sum(xs[1:3])
         height = ys[1]
         plt.plot(mid_x, height, 'ro')
         plt.annotate("%.3g" % height, (mid_x, height),
                      xytext=(0, -8),
                      textcoords='offset points',
                      va='top',
                      ha='center')
     return ddata
예제 #27
0
def output_dendrogram(imgs, kernel, method="complete", dend_fn="_dendrogram.png"):
    """Save a dendrogram of `kernel` with image thumbnails pasted next to the leaves.

    The dendrogram is written to '<method>_<dend_fn>', loaded back, and a
    30x30 thumbnail of each image is pasted in reversed leaf-label order.
    The result is saved with a 'fig_' prefix. Images in `imgs` are
    thumbnailed in place.
    """
    links = linkage(pdist(kernel), method=method)
    tmp_dend_fn = method + "_" + dend_fn
    axis = dendrogram(links, orientation="left", figsize=(7, 12), outfilename=tmp_dend_fn)[1]
    figimg = libpil.loadImage(tmp_dend_fn)
    leaf_ids = [int(label._text) for label in axis.get_yticklabels()]
    for position, ind in enumerate(reversed(leaf_ids)):
        thumb = imgs[ind]
        thumb.thumbnail((30, 30))
        # Vertical offset of this thumbnail inside the figure image.
        offset = position * (thumb.size[1] + 4) + 120
        figimg.paste(thumb, (52, offset))
    figimg.save("fig_" + tmp_dend_fn)
예제 #28
0
파일: clustering.py 프로젝트: wx1988/PMLAB
def hierarchical_clusters( log, show_plot=None ):
    """Translates traces to Parikh vectors and computes in the vector space
       a hierarchical clustering.

       The dendrogram of the distinct vectors (euclidean distance, average
       linkage) is shown. `show_plot` is accepted but not consulted."""
    def get_parikh(case,alphabet):
        # Count the occurrences of every activity in the case.
        counts = zeros(len(alphabet),dtype=int)
        for act in case:
            counts[alphabet[act]] += 1
        # canonical representation: shift so the smallest count is 0
        return counts - min(counts)

    # Map every activity of the alphabet to a vector position.
    actsind = {}
    for position, act in enumerate(log.get_alphabet()):
        actsind[act] = position

    uniq_cases = log.get_uniq_cases()
    data = zeros((len(uniq_cases), len(actsind)),dtype=int)
    # Row indices grouped by identical Parikh vector.
    parikhdict = {}
    for row, case in enumerate(uniq_cases.keys()):
        data[row] = get_parikh(case,actsind)
        vector_key = ','.join(map(str,data[row]))
        parikhdict.setdefault(vector_key, []).append(row)

    data_uniq = DataFrame(data).drop_duplicates()
    Z = linkage(pdist(data_uniq,metric='euclidean'),method='average')
    dendrogram(Z)
    show()
예제 #29
0
파일: galaxies.py 프로젝트: qbilius/autoart
 def hcluster(self, stim):
     """Cluster `stim` hierarchically and plot the dendrogram and the clusters.

     The strict upper triangle of `stim.group` is passed to linkage() as
     the distances. The dendrogram is coloured at the height of the
     second-to-last merge, then the points of each of the clusters returned
     by `self.get_clusters` are plotted in their own colour.
     """
     import hcluster
     iu = np.triu_indices(len(stim.group), 1)
     # NOTE(review): the third positional argument is the metric; 'ward' is
     # a linkage method name, not a metric - confirm the intent.
     Z = hcluster.linkage(stim.group[iu], 'single', 'ward')
     # The leftover unconditional pdb.set_trace() breakpoint was removed so
     # the method no longer stops in the debugger on every call.
     thres = Z[-2, 2]
     hcluster.dendrogram(Z, color_threshold=thres)
     plt.show()
     clusters = self.get_clusters(Z, n_clusters=4)
     colors = self.get_colors(len(clusters))
     for cluster, color in zip(clusters, colors):
         sel = stim.indices[np.array(cluster)]
         plt.plot(sel[:,1], sel[:,0],'o',  color=color, )
     plt.show()
예제 #30
0
def cluster_ids(gids, unnorm_eluts, sp, gt=None, dist='cosine', do_plot=True,
        norm_rows=True, bigarr=None, **kwargs):
    """Return `gids` reordered by the leaf order of a hierarchical clustering.

    The data array is `bigarr` when given, otherwise it is built with
    single_array(). Extra keyword arguments are forwarded to
    hcluster.dendrogram(). When `do_plot` is true the dendrogram and the
    reordered array are drawn; tick labels are taken from `gt.id2name` when
    `gt` is provided.
    """
    import plotting as pl
    import hcluster
    arr = (bigarr if bigarr is not None else single_array(gids, unnorm_eluts,
        sp, norm_rows=norm_rows))
    ymat = hcluster.pdist(arr, metric=dist)
    zmat = hcluster.linkage(ymat)
    zmat = np.clip(zmat, 0, 10**8)
    if do_plot:
        pl.figure()
    # `not do_plot` agrees with the `if do_plot` tests for any truthy value;
    # bool(1-do_plot) did not (e.g. do_plot=2).
    order = hcluster.dendrogram(zmat, no_plot=not do_plot,
            **kwargs)['leaves']
    if do_plot:
        # gt defaults to None; skip the name labels instead of failing.
        if gt is not None:
            ax = pl.gca()
            ax.axes.set_xticklabels([gt.id2name[gids[ind]] for ind in order])
        pl.figure()
        pl.imshow(arr[order,:])
    return list(np.array(list(gids))[order])
예제 #31
0
파일: NDIM.py 프로젝트: askerry/FGE_MISC
def hierarchicalcluster(datamatrix, dimlabels, similarity='euclidean', colorthresh='default'):
    '''plots dendrogram and returns clustering (item-1 x 4 array. first two columns are indices of clusters, 3rd column = distance between those clusters, 4th column = # of
      original observations in the cluster) and dend (dictionary of the data structures computed to render the
      dendrogram). see api here: http://hcluster.damianeads.com/cluster.html

      colorthresh is the fraction of the largest merge distance used as the
      dendrogram colour threshold; 'default' means 0.7.'''
    import hcluster

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        clustering = hcluster.linkage(datamatrix, metric=similarity)
        # all descendents below a cluster node k will be assigned the same
        # color if k is the first node below color_threshold. links
        # connecting nodes with distances >= color_threshold are colored blue.
        fraction = 0.7 if colorthresh == 'default' else colorthresh
        color_threshold = fraction * max(clustering[:, 2])
        plt.figure()
        dend = hcluster.dendrogram(clustering, labels=dimlabels, leaf_rotation=90, color_threshold=color_threshold)
        plt.tight_layout()
    return clustering, dend
예제 #32
0
import numpy as np
from matplotlib.pyplot import show
from fastcluster import *
from hcluster import dendrogram 
# Load columns 1..29 of the tab-separated expression table as a structured
# array; the header row supplies the field names.
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",names=True,usecols=tuple(range(1,30)),dtype=float, delimiter="\t")

# View the structured array as a plain 2-D float array. The builtin `float`
# replaces the `np.float` alias, which has been removed from NumPy.
data_array = data.view((float, len(data.dtype.names)))
# Single-linkage clustering of rows 1..999 on euclidean distance.
data_link = linkage(data_array[1:1000], method='single', metric='euclidean', preserve_input=True)
dendrogram(data_link)
show()
예제 #33
0
 def dendrogram(self):
     """Draw the dendrogram of `self.linkage`, labelled with the unique
     dataset labels. Does nothing when no linkage has been computed."""
     # `is not None` is an identity test; `== None` on an array compares
     # element-wise and has no single truth value.
     if self.linkage is not None:
         hcluster.dendrogram(self.linkage,labels=np.unique(self._dataset.labels))
예제 #34
0
    def _do_gen_matrix(self,
                       col_function_name,
                       X_L_list,
                       X_D_list,
                       M_c,
                       T,
                       tablename='',
                       filename=None,
                       col=None,
                       confidence=None,
                       limit=None,
                       submatrix=False):
        """Compute a pairwise column matrix with the named column function.

        `col_function_name` selects one of the `_mutual_information`,
        `_dependence_probability`, `_correlation` or `_view_similarity`
        methods; anything else raises Exception.

        With `col` set, the columns are ranked by their value against that
        column (optionally filtered by `confidence` and cut to `limit`).
        Unless `submatrix` is true this ranking is returned directly as
        {'data': ..., 'columns': ...}. Without `col`, the matrix is
        reordered by hierarchical clustering.

        Otherwise returns a dict with the matrix, the column names, a title,
        the filename and a message; the matrix is also plotted when
        `filename` is given.
        """
        method_names = {
            'mutual information': '_mutual_information',
            'dependence probability': '_dependence_probability',
            'correlation': '_correlation',
            'view_similarity': '_view_similarity',
        }
        if col_function_name not in method_names:
            raise Exception('Invalid column function')
        col_function = getattr(self, method_names[col_function_name])

        num_cols = len(X_L_list[0]['column_partition']['assignments'])
        column_names = numpy.array(
            [M_c['idx_to_name'][str(idx)] for idx in range(num_cols)])

        # Evaluate the column function for every ordered pair of columns.
        z_matrix = numpy.zeros((num_cols, num_cols))
        for row in range(num_cols):
            for other in range(num_cols):
                z_matrix[row][other] = col_function(row, other, X_L_list,
                                                    X_D_list, M_c, T)

        if col:
            # Rank all columns by their value against `col`, best first.
            z_column = list(z_matrix[M_c['name_to_idx'][col]])
            data_tuples = sorted(zip(z_column, range(num_cols)), reverse=True)
            if confidence:
                data_tuples = [tup for tup in data_tuples
                               if tup[0] >= float(confidence)]
            if limit and limit != float("inf"):
                data_tuples = data_tuples[:int(limit)]
            data = [tuple([pair[0] for pair in data_tuples])]
            columns = [pair[1] for pair in data_tuples]
            column_names_reordered = column_names[columns]
            if not submatrix:
                return {'data': data, 'columns': column_names_reordered}
            z_matrix = z_matrix[columns, :][:, columns]
            z_matrix_reordered = z_matrix
        else:
            # Hierarchically cluster on a temporary figure and read the leaf
            # order from the tick labels that dendrogram() draws.
            import hcluster
            Z = hcluster.linkage(hcluster.pdist(z_matrix))
            pylab.figure()
            hcluster.dendrogram(Z)
            reorder_indices = [int(tick.get_text())
                               for tick in pylab.gca().get_xticklabels()]
            pylab.close()
            z_matrix_reordered = z_matrix[:,
                                          reorder_indices][reorder_indices, :]
            column_names_reordered = column_names[reorder_indices]

        title = 'Pairwise column %s for %s' % (col_function_name, tablename)
        if filename:
            utils.plot_matrix(z_matrix_reordered, column_names_reordered,
                              title, filename)

        return {
            'matrix': z_matrix_reordered,
            'column_names': column_names_reordered,
            'title': title,
            'filename': filename,
            'message': "Created " + title,
        }
# cosine similarities
#########################################
import numpy
# Fill an n x n matrix with nltk's cosine_distance between every pair of
# feature vectors (`n` and `feature_vectors` are defined earlier).
mat = numpy.empty((n, n))
for i in xrange(0, n):
    for j in xrange(0, n):
        mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i],
                                                      feature_vectors[j])

#########################################
# now hierarchically cluster mat
#########################################
from hcluster import linkage, dendrogram
# t is the colour threshold passed to dendrogram() below.
t = 0.8
# NOTE(review): mat is a square matrix; linkage() may interpret a 2-D input
# as observation vectors rather than distances - confirm.
Z = linkage(mat, 'single')
dendrogram(Z, color_threshold=t)

# Save the current figure (the dendrogram) to disk.
import pylab
pylab.savefig("hcluster.png", dpi=800)


#########################################
# extract our clusters
#########################################
def extract_clusters(Z, threshold, n):
    clusters = {}
    ct = n
    for row in Z:
        if row[2] < threshold:
            n1 = int(row[0])
            n2 = int(row[1])
예제 #36
0
 def drawDendrogram(self, dist):
     """Compute the linkage of `dist` and show its dendrogram."""
     hcluster.dendrogram(hcluster.linkage(dist))
     show()
예제 #37
0
    for x in range(50):
        plt.text(50,x,'%.2f'%meancorr[x],size=6)
        for y in range(50):
            if not x==y:  # skip diagonal
                plt.text(x-0.3,y,'%.2f'%cc_pcorr[x,y],size=6,color='red')

    plt.savefig(basedir+'9_correlation_analysis/pcorr_corrcoefs.pdf',format='pdf')


# Do clustering: complete-linkage dendrogram of data_pcorr without its
# first two rows, labelled with tasknames_pcorr and saved as a PDF.

# Always-true guard; set the condition to false to skip this step.
if 1==1:
    dst=pdist(data_pcorr[2:,:])
    Z=linkage(dst,method='complete')
    plt.figure(figsize=(14,12))
    dendrogram(Z,labels=tasknames_pcorr)
    plt.savefig(basedir+'9_correlation_analysis/pcorr_task_cluster.pdf',format='pdf')

# decompose connections using ICA and save adjacency matrices

data_pcorr_fmri=data_pcorr[2:,:]

# Always-false guard: the ICA decomposition below is currently disabled.
if 1==0:
    ica = FastICA(n_components=20)
    S_ = ica.fit(data_pcorr_fmri.T).transform(data_pcorr_fmri.T)  # Get the estimated sources
    A_ = ica.get_mixing_matrix()  # Get estimated mixing matrix


                                  #ncomps=20
                                  #nmf=decomposition.ProjectedGradientNMF(n_components=ncomps,sparseness='components',init='nndsvd')
                                  #nmf.fit(data_pcorr_fmri+100)
예제 #38
0
# Number of conditions counted so far.
nconds = ctr

# Voxels inside the "good voxel" mask.
mask = nib.load(os.path.join(dataprepdir, 'goodvoxmask.nii.gz'))
maskvox = N.where(mask.get_data() > 0)

# One row per condition, one column per masked voxel.
data = N.zeros((nconds, len(maskvox[0])))

ctr = 0
for ds in contrasts_to_use:
    for task in contrasts_to_use[ds]:
        for contrast in contrasts_to_use[ds][task]:
            zstat_name = 'mean_%s_task%03d_zstat%d_run1.nii.gz' % (
                ds, task, contrast)
            tmp = nib.load(os.path.join(datadir, zstat_name)).get_data()
            data[ctr, :] = tmp[maskvox]
            ctr += 1

# Hierarchical clustering of the conditions on euclidean distance.
l = fastcluster.linkage(data, method=clustering_type, metric='euclidean')

plot_data = True
if plot_data:
    figure_name = 'cluster_figure_%s.pdf' % clustering_type
    plt.figure(figsize=(16, 10))
    plt.hold(True)
    dendrogram(l, labels=contrast_labels, orientation='right')
    plt.savefig(os.path.join(outdir, figure_name), format='pdf')
        for contrast in contrasts_to_use[ds][task]:
            contrast_labels.append(ds+'_task%d:%s'%(task,contrasts[ds]['task%03d'%task][contrast]))
            contrast_labels_short.append(ds+'_t%d_z%d'%(task,contrast))
            
            ctr+=1
            
# Total number of conditions counted above.
nconds = ctr

# Indices of the voxels selected by the "good voxel" mask.
mask = nib.load(os.path.join(dataprepdir, 'goodvoxmask.nii.gz'))
maskvox = N.where(mask.get_data() > 0)

# Condition-by-voxel matrix, filled from the mean z-stat images.
data = N.zeros((nconds, len(maskvox[0])))

ctr = 0
for ds in contrasts_to_use:
    for task in contrasts_to_use[ds]:
        for contrast in contrasts_to_use[ds][task]:
            image_path = os.path.join(
                datadir,
                'mean_%s_task%03d_zstat%d_run1.nii.gz' % (ds, task, contrast))
            tmp = nib.load(image_path).get_data()
            data[ctr, :] = tmp[maskvox]
            ctr += 1

# Cluster the conditions on euclidean distance between their voxel rows.
l = fastcluster.linkage(data, method=clustering_type, metric='euclidean')

plot_data = True
if plot_data:
    plt.figure(figsize=(16, 10))
    plt.hold(True)
    dendrogram(l, labels=contrast_labels, orientation='right')
    output_pdf = os.path.join(outdir, 'cluster_figure_%s.pdf' % clustering_type)
    plt.savefig(output_pdf, format='pdf')
예제 #40
0
from math import *
import numpy as np

# dendrogram
import hcluster
# Symmetric pairwise similarity matrix of six items (1.00 on the diagonal).
similarity = [
    [1.00, 0.93, 0.86, 0.84, 0.69, 0.65],
    [0.93, 1.00, 0.79, 0.83, 0.64, 0.67],
    [0.86, 0.79, 1.00, 0.75, 0.82, 0.54],
    [0.84, 0.83, 0.75, 1.00, 0.57, 0.79],
    [0.69, 0.64, 0.82, 0.57, 1.00, 0.36],
    [0.65, 0.67, 0.54, 0.79, 0.36, 1.00],
]

# The linkage routines expect a condensed *distance* matrix; a square 2-D
# input is read as six observation vectors instead.  Turn the similarities
# into distances (1 - s) and condense them before clustering.
distance = 1 - np.array(similarity)
Z = hcluster.single(hcluster.squareform(distance))
hcluster.dendrogram(Z)

# k-means: one-dimensional sample points and the two starting centres
data = np.array([6, 12, 18, 24, 30, 42, 48])

centroids = np.array([18, 45])
# centroids = np.array([15, 40])


def dist(x, y):
    """Return the absolute difference between x and y."""
    difference = x - y
    return abs(difference)


# Every centroid starts out with an empty member list.
clusters = {center: [] for center in centroids}
예제 #41
0
def plot_clusters(Dr, ct):
    """Show the single-linkage dendrogram of Dr under the cosine metric.

    Dr is handed to linkage() unchanged; ct is forwarded to dendrogram()
    as its color_threshold.
    """
    dendrogram(linkage(Dr, method='single', metric='cosine'),
               color_threshold=ct)
    pylab.show()
예제 #42
0
from matplotlib.pyplot import show

from hcluster import pdist, linkage, dendrogram
import numpy
from numpy.random import rand

# Random 10 x 100 sample matrix; the first five rows are doubled in place.
X = rand(10, 100)
X[0:5, :] *= 2
# Pairwise distances between the rows, then the linkage built from them
# (both calls use the library defaults).
Y = pdist(X)
Z = linkage(Y)
# Draw the tree.
dendrogram(Z)

# Display the figure.
show()
# now build the n x n matrix of pairwise
# cosine distances between the feature vectors
# (every ordered pair (i, j) is computed)
#########################################
import numpy
mat = numpy.empty((n, n))
for i in xrange(0,n):
    for j in xrange(0,n):
       mat[i][j] = nltk.cluster.util.cosine_distance(feature_vectors[i],feature_vectors[j])

#########################################
# now hierarchically cluster mat (single linkage)
#########################################
from hcluster import linkage, dendrogram
# Threshold used to colour the dendrogram branches.
t = 0.8
# NOTE(review): linkage receives the full square matrix here, not a condensed
# distance vector -- confirm how hcluster interprets a 2-D argument.
Z = linkage(mat, 'single')
dendrogram(Z, color_threshold=t)

# Write the current figure to disk.
import pylab
pylab.savefig( "hcluster.png" ,dpi=800)

#########################################
# extract our clusters
#########################################
def extract_clusters(Z,threshold,n):
   clusters={}
   ct=n
   for row in Z:
      if row[2] < threshold:
          n1=int(row[0])
          n2=int(row[1])
예제 #44
0
 def dendrogram(self):
     """Draw the dendrogram of the stored linkage, if there is one.

     Does nothing while ``self.linkage`` is None.  The leaves are labelled
     with the unique values of ``self._dataset.labels``.
     """
     #import pylab as p
     # Identity test on purpose: ``== None`` on a NumPy array compares
     # element-wise, and the truth value of that result is ambiguous.
     if self.linkage is not None:
         hcluster.dendrogram(self.linkage,
                             labels=np.unique(self._dataset.labels))
예제 #45
0
# Load columns 1..29 of the tab-separated expression table as a structured
# array whose field names come from the header row.
data = np.genfromtxt("../../data/ExpRawData-E-TABM-84-A-AFFY-44.tab",
                     names=True,
                     usecols=tuple(range(1, 30)),
                     dtype=float,
                     delimiter="\t")

# Reinterpret the structured array as a plain 2-D float array.  The builtin
# float is used because the np.float alias has been removed from NumPy.
data_array = data.view((float, len(data.dtype.names)))
# Keep rows 1..999 and transpose, so every original column becomes one row.
data_array = data_array[1:1000].transpose()

data_dist = pdist(data_array)  # computing the distance

data_link = linkage(data_dist)  # computing the linkage

# just plot the dendrogram.
dendrogram(data_link, labels=data.dtype.names)
plt.savefig('../../results/dendrogram.png')

# or plot the heatmap too!

# Compute and plot first dendrogram.
fig = plt.figure(figsize=(8, 8))
# add_axes takes [left, bottom, width, height] as fractions of the figure.
ax1 = fig.add_axes([0.05, 0.1, 0.2, 0.6])
Y = linkage(data_dist, method='single')
Z1 = dendrogram(Y, orientation='right',
                labels=data.dtype.names)  # adding/removing the axes
ax1.set_xticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
예제 #46
0
        ("http://www.gutenberg.org/files/21593/21593-0.txt",      "Urteil"),
        ("http://www.gutenberg.org/cache/epub/22367/pg22367.txt", "Verwand.")]

# Download the raw text behind every (url, name) pair.
catalogue = []

# The request headers are the same for every URL, so build them once.
headers = {'User-Agent': 'Mozilla/5.0'}
for url, name in urls:
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        catalogue.append(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()

# Compare every pair of documents with the Jaccard measure.
M = all_pairs(catalogue,
              distance=distances.jaccard,
              dist_kwargs=dict(mode=1),
              parallel=True)

# plot similarity matrix
pylab.figure(1)
pylab.title("similarity matrix")
pylab.imshow(M, aspect='auto', interpolation="nearest", cmap="Reds")
pylab.colorbar()

# plot complete linkage; each leaf is labelled with its entry's short name
pylab.figure(2)
pylab.title("complete linkage clustering")
hcluster.dendrogram(cluster(M, method='complete'),
                    leaf_label_func=lambda i: urls[i][1])

# finally show
pylab.show()
예제 #47
0
def main(argv):

    print argv
    if (len(argv) > 0):
        params = argv[::2]
        param_values = argv[1::2]

    crit_func = squared_criterion
    merge_func = d_min
    for i in range(0, len(argv), 2):
        if params[i] == "--criterium":
            if param_values[i + 1] == "silhoette":
                crit_func = silhouette_criterion
            elif param_values[i + 1] == "squared":
                crit_func = squared_criterion
            else:
                crit_func = silhouette_criterion
        elif params[i] == "--merge":
            if param_values[i + 1] == "de":
                merge_func = d_e
            elif param_values[i + 1] == "dmax":
                merge_func = d_max
            else:
                merge_func = d_e

    Cluster.clusters = []
    Cluster.squared_criterion_values = []
    Cluster.silhouette_criterion_values = []

    my_data = np.genfromtxt('./data.csv', delimiter=',', dtype=float)
    #Make only clusterization params in array
    data_list = my_data[1:].tolist()
    maximum = 0

    data_list = data_list[:]
    etalon = data_list[:]

    for i in range(len(data_list)):
        data_list[i] = data_list[i][2:]

    #normalize all lists:
    data_list = np.array(data_list)

    #count all distances
    print "Precounting distances"
    for i in range(len(data_list)):
        for j in range(len(data_list)):
            print ".",
            Cluster.counted_distances[(tuple(data_list[i]), tuple(
                data_list[j]))] = hexic_euqlid_distance(
                    data_list[i], data_list[j])

    print "Distances Counted"

    for i in range(len(data_list)):
        Cluster.etalon_clasters[tuple(data_list[i][2:])] = etalon[i][1]

    print Cluster.etalon_clasters.values()
    #Make each element = 1 cluster

    for x in data_list:
        Cluster.clusters.append(Cluster(x))

    print(len(Cluster.clusters))
    K_num = 1
    swo(K_num, merge_func, crit_func)
    Y = Cluster.merge_history[1:]
    Z = linkage(Y)
    plt.subplot(121)
    dendrogram(Z, labels=range(len(data_list)))
    squared_criterion_values = Cluster.squared_criterion_values[::-1]
    silhouette_criterion_values = Cluster.silhouette_criterion_values[::-1]
    plt.subplot(122)
    if (crit_func == silhouette_criterion):
        plt.plot(range(len(silhouette_criterion_values)),
                 silhouette_criterion_values)
        plt.axis([
            K_num, 30,
            min(silhouette_criterion_values),
            max(silhouette_criterion_values)
        ])
    else:
        plt.plot(range(len(squared_criterion_values)),
                 squared_criterion_values)
        plt.axis([
            K_num, 30,
            min(squared_criterion_values),
            max(squared_criterion_values)
        ])

    plt.show()

    for x in Cluster.clusters:
        x.etalon_to_current_mapping()
        print x.etalon_map
예제 #48
0
import hcluster
import matplotlib.pyplot as plt
import pickle
import urllib

url = "http://examples.obspy.org/dissimilarities.pkl"
# NOTE(security): unpickling data fetched over plain HTTP runs whatever the
# payload contains; only use this with a source you trust.
dissimilarity = pickle.load(urllib.urlopen(url))

# Left panel: the matrix shown as similarity (1 - dissimilarity).
plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest")

# Condense the square matrix before handing it to the linkage routine.
dissimilarity = hcluster.squareform(dissimilarity)
# One threshold drives both the flat clustering and the branch colouring.
threshold = 0.3
linkage = hcluster.linkage(dissimilarity, method="single")
clusters = hcluster.fcluster(linkage, threshold, criterion="distance")

# Right panel: the dendrogram, coloured at the same threshold.
plt.subplot(122)
hcluster.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
예제 #49
0
                                  p['data_label'], 'data.pickle')))

# Build one three-panel figure per entry of `data` and save it as
# <key>.png in output_dir.
for key, val in data.iteritems():
# for bla in [1]:
#     key, val = 'eagle', data['eagle']
    

    fig = plt.figure()
    # Clicks on pickable artists are routed to onpick.
    fig.canvas.mpl_connect('pick_event', onpick)
    plt.subplot(3, 1, 1)
    plt.title(key)

    # Project val['vecs'] onto the first two columns of val['U'].
    # NOTE(review): presumably U holds the leading basis vectors of a
    # decomposition -- confirm where `data` is produced.
    proj = np.dot(val['U'][:, 0:2].T, val['vecs'])
    # Top panel: dendrogram over the projected columns.
    Y = pdist(proj.T)
    Z = linkage(Y)
    dendrogram(Z)
    ax = plt.subplot(3, 1, 2)


    # Middle panel: one pickable point per projected column.
    for i in range(proj.shape[1]):
        # Map the rating (divided by 100) to a value in steps of 0.7; it is
        # passed to plot() as a numeric colour string.
        col = (1 - (val['ratings'][i] / 100.0)) * 0.7
        pt, = ax.plot(proj[0, i], proj[1, i],
                       '.',
                       color=('%f' % col),
                       picker=3)
        # Annotate the point with its column index.
        ax.text(proj[0, i], proj[1, i], i)
        # Attach the entry's key to the artist (presumably read by onpick).
        pt.name = val['keys'][i]

    # Bottom panel: plot of val['d'].
    plt.subplot(3, 1, 3)
    plt.plot(val['d'])
    plt.savefig(path.join(output_dir, key + ".png"))
예제 #50
0
        ("http://www.gutenberg.org/cache/epub/22367/pg22367.txt", "Verwand.")]

# Download the raw text behind every (url, name) pair.
catalogue = []

# The request headers are the same for every URL, so build them once.
headers = {'User-Agent': 'Mozilla/5.0'}
for url, name in urls:
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        catalogue.append(response.read())
    finally:
        # Release the connection even when reading fails.
        response.close()

# Compare every pair of documents with the Jaccard measure.
M = all_pairs(catalogue,
              distance=distances.jaccard,
              dist_kwargs=dict(mode=1),
              parallel=True)

# plot similarity matrix
pylab.figure(1)
pylab.title("similarity matrix")
pylab.imshow(M, aspect='auto', interpolation="nearest", cmap="Reds")
pylab.colorbar()

# plot complete linkage; each leaf is labelled with its entry's short name
pylab.figure(2)
pylab.title("complete linkage clustering")
hcluster.dendrogram(cluster(M, method='complete'),
                    leaf_label_func=lambda i: urls[i][1])

# finally show
pylab.show()