Example No. 1
    def cluster_data(self, row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Performs clustering according to the given parameters.
        @datatype - numeric/binary
        @row_distance/column_distance - see the DISTANCES variable
        @row_linkage/column_linkage - see the LINKAGES variable
        @axis - row/both
        """
        print("Clustering rows:", row_distance, row_linkage)
        self.clustering_axis = axis
        row_linkage = str(row_linkage)
        
        if row_linkage in RAW_LINKAGES:
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)

        else:
            self.distance_vector = fastcluster.pdist(self.data, row_distance)

            if self.datatype == "numeric" and not row_distance in DISTANCES[self.datatype]:
                raise Exception("".join(["When clustering numeric data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))
            elif (self.datatype == "binary" or self.datatype == "nominal") and not row_distance in DISTANCES[self.datatype]:
                raise Exception("".join(["When clustering binary or nominal data you must choose from these distance measures: ", ", ".join(DISTANCES[self.datatype])]))

            self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))


        if not self.missing_value is False:
            self.data = self.__return_missing_values__(self.data, self.missing_values_indexes)
        self.column_clustering = []

        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)
        
        if self.write_original or self.datatype == "nominal":
            self.data = self.original_data
Example No. 2
    def cluster_data(self, data_type="numeric", row_distance="euclidean", row_linkage="single", axis="row", column_distance="euclidean", column_linkage="ward"):
        """Performs clustering according to the given parameters.
        @data_type - numeric/binary
        @row_distance/column_distance - see the DISTANCES variable
        @row_linkage/column_linkage - see the LINKAGES variable
        @axis - row/both
        """
        
        print("Clustering rows:", row_distance, row_linkage)
        self.data_type = data_type
        self.clustering_axis = axis
        row_linkage = str(row_linkage)
        
        if row_linkage in RAW_LINKAGES:
            self.clustering = fastcluster.linkage(self.data, method=row_linkage, metric=row_distance)

        else:
            self.distance_vector = fastcluster.pdist(self.data, row_distance)

            if data_type in DISTANCES and not row_distance in DISTANCES[data_type]:
                raise Exception("".join(["When clustering" , data_type, "data you must choose from these distance measures: ", ", ".join(DISTANCES[data_type])]))
            elif not data_type in DISTANCES.keys():
                raise Exception("".join(["You can choose only from data types: ", ", ".join(DISTANCES.keys())]))

            self.clustering = fastcluster.linkage(self.distance_vector, method=str(row_linkage))

        self.column_clustering = []
        if axis == "both" and len(self.data[0]) > 2:
            print("Clustering columns:", column_distance, column_linkage)
            self.__cluster_columns__(column_distance, column_linkage)

        if self.write_original:
            self.data = self.original_data

        return
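Both cluster_data variants above look up row_linkage in RAW_LINKAGES and row_distance in DISTANCES before deciding whether fastcluster.linkage gets the raw observation matrix or a condensed distance vector from fastcluster.pdist. Those module-level constants are not shown in the snippets; a minimal sketch of what they might look like (the exact contents here are assumptions):

# Hypothetical lookup tables assumed by cluster_data() above.
# "Raw" linkages are applied directly to the observation matrix;
# the remaining linkages expect a condensed distance vector.
RAW_LINKAGES = ["ward", "centroid", "median"]

DISTANCES = {
    "numeric": ["euclidean", "cityblock", "cosine", "correlation"],
    "binary": ["hamming", "jaccard", "dice"],
    "nominal": ["hamming"],
}

LINKAGES = ["single", "complete", "average", "weighted",
            "ward", "centroid", "median"]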
Example No. 3
    def hierarchical(
        self,
        nclusters,
        linkage_method,
        noise=False,
        ):

        if noise:
            matrix = self.distance_matrix.add_noise()
        else:
            matrix = self.distance_matrix

        linkmat = linkage(squareform(matrix), linkage_method)
        linkmat_size = len(linkmat)
        if nclusters <= 1:
            br_top = linkmat[linkmat_size - nclusters][2]
        else:
            br_top = linkmat[linkmat_size - nclusters + 1][2]
        if nclusters >= len(linkmat):
            br_bottom = 0
        else:
            br_bottom = linkmat[linkmat_size - nclusters][2]
        threshold = 0.5 * (br_top + br_bottom)
        T = fcluster(linkmat, threshold, criterion='distance')
        return Partition(T)
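The method above cuts the dendrogram at a distance halfway between the merge that reaches nclusters groups and the next merge that would drop below it. A small self-contained sketch of the same midpoint-cut idea on toy data (the data, seed, and cluster count are illustrative only):

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster

rng = np.random.default_rng(0)
points = rng.normal(size=(20, 2))                  # toy observations
linkmat = fastcluster.linkage(points, method="average")

nclusters = 3
# Height of the first merge that would drop below nclusters groups...
br_top = linkmat[len(linkmat) - nclusters + 1][2]
# ...and of the last merge needed to reach exactly nclusters groups.
br_bottom = linkmat[len(linkmat) - nclusters][2]
threshold = 0.5 * (br_top + br_bottom)             # cut halfway between them

labels = fcluster(linkmat, threshold, criterion="distance")
assert len(np.unique(labels)) == nclusters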
Example No. 4
def saveLinkage(distanceMatrix):
#     link = linkage(distanceMatrix, 'ward')
    link = fastcluster.linkage(distanceMatrix, method='ward') # D-distance matrix
    afile = open(structfolder + 'wardlinkage.pkl', 'wb')
    pickle.dump(link, afile)
    afile.close()
    return link
Example No. 5
    def test_basic_clustering(self):
        data = [
            [1.0, 2.0],
            [2.0, 1.0],
            [2.1, 1.1],
            [2, 1.1],
            [1.0, 2.1],
        ]
        data = np.array(data)

        dist = fastcluster.pdist(data)
        result = fastcluster.linkage(dist).tolist()

        assert_that(int(result[0][0])).is_equal_to(0)
        assert_that(int(result[0][1])).is_equal_to(4)
        assert_that(result[0][2]).is_close_to(0.1, 0.00001)
        assert_that(int(result[0][3])).is_equal_to(2)

        assert_that(int(result[1][0])).is_equal_to(1)
        assert_that(int(result[1][1])).is_equal_to(3)
        assert_that(result[1][2]).is_close_to(0.1, 0.00001)
        assert_that(int(result[1][3])).is_equal_to(2)

        assert_that(int(result[2][0])).is_equal_to(2)
        assert_that(int(result[2][1])).is_equal_to(6)
        assert_that(result[2][2]).is_close_to(0.1, 0.00001)
        assert_that(int(result[2][3])).is_equal_to(3)

        assert_that(int(result[3][0])).is_equal_to(5)
        assert_that(int(result[3][1])).is_equal_to(7)
        assert_that(result[3][2]).is_close_to(1.34536, 0.00001)
        assert_that(int(result[3][3])).is_equal_to(5)
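The assertions above spell out the linkage matrix layout: each row is [left id, right id, merge distance, size of the merged cluster], and ids of len(data) or more refer to clusters created by earlier rows. A short sketch that simply prints those rows for the same toy data:

import numpy as np
import fastcluster

data = np.array([[1.0, 2.0], [2.0, 1.0], [2.1, 1.1], [2.0, 1.1], [1.0, 2.1]])
Z = fastcluster.linkage(fastcluster.pdist(data))   # default single linkage

# Each row: [left id, right id, merge distance, size of new cluster];
# ids >= len(data) refer to clusters formed in earlier rows.
for row in Z:
    print(int(row[0]), int(row[1]), round(row[2], 5), int(row[3]))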
Example No. 6
def run_entity_model(cdev, cprc):
    print '____________________________________________________'
    print 'running entity model'
    hdev, hprc, hmapping, entcorp, er = process_entities(cdev, cprc)
    print 'removed', len(cdev)- len(hdev), 'documents', len(hdev), 'left'
    voc = build_voc(entcorp, 2)
    
    ent_vectorizer = CountVectorizer(vocabulary = voc)
    E = ent_vectorizer.fit_transform(hdev)
    
    Eclean, emapping = filter_rare(E, 0)

    E_dense = np.matrix(Eclean).astype('float')
    E_scaled = preprocessing.scale(E_dense)
    E_normalized = preprocessing.normalize(E_scaled, norm='l2')
    
    EMatrix = pairwise_distances(E_normalized, metric='cosine')
    EL = fastcluster.linkage(EMatrix, method='average')
    flat_eclust = hierarchy.fcluster(EL, 0.5, 'distance')
    ec = organize_clusters(flat_eclust, th = 3)
    
    ecf = []
    for cl in ec:
        ecf.append([hmapping[emapping[t]] for t in cl])
    print 'detected', len(ecf), 'entity clusters'      
    return ecf, voc
Example No. 7
def run_ngram_model(cdev, cprc):
    print '____________________________________________________'
    print 'running n-gram model'
    wcorp = []
    for i in cprc:
        wcorp.append(' '.join(cprc[i]['words']))
        
    vectorizer = CountVectorizer(analyzer='word', binary=True, min_df=max(int(len(wcorp)*0.0005), 5), ngram_range=(2,3))
    X = vectorizer.fit_transform(wcorp)
    Xclean, mapping = filter_rare(X)
    
    Xdense = np.matrix(Xclean).astype('float')
    X_scaled = preprocessing.scale(Xdense)
    X_normalized = preprocessing.normalize(X_scaled, norm='l2')
    
    textMatrix = pairwise_distances(X_normalized, metric='cosine')
    L = fastcluster.linkage(textMatrix, method='average')
    flat_textclust = hierarchy.fcluster(L, 0.5, 'distance')
    ttc = organize_clusters(flat_textclust)
    
    ncf = []
    for cl in ttc:
        ncf.append([mapping[t] for t in cl])
    print 'detected', len(ncf), 'n-gram clusters'     
    return ncf
Example No. 8
    def cluster(self):
        
        # We cluster for each argument independently!        
        retval = ClusterResult()
        
        curOffset = 0
        argNum = 0
        for symbolsForArg in self.contentProvider.getSourceAPISymbols():
            D = self._calculateDistanceMatrix(symbolsForArg)

            curOffset = len(retval.clusterIdToDatapoint.keys())

            if len(symbolsForArg) == 0:
                argNum += 1
                continue
            
            if len(symbolsForArg) == 1:
                retval.register(curOffset, symbolsForArg[0], argNum)
                argNum += 1
                continue
            
            Z = linkage(D, method=self.linkageMethod)
            clustering = fcluster(Z, self.maxDistInCluster, criterion = 'distance')
            
            retval.registerSet(symbolsForArg, clustering, curOffset, argNum)
            argNum += 1
        
        return retval
Example No. 9
 def hclust(self):
     link_file = self.datafile + '.link.npy'
     if os.path.isfile(link_file) and os.path.getmtime(link_file) >= os.path.getmtime(self.datafile):
         self.link_matrix = np.load(link_file)
     else:
         blast_score = self.normalized.as_matrix()
         self.link_matrix = fastcluster.linkage(blast_score, method='average', 
                                                metric='correlation', 
                                                preserve_input=False)
         del blast_score
         np.save(link_file, self.link_matrix)
         
     self.gene_num = self.normalized.shape[0]
     self.node_num = self.gene_num + self.link_matrix.shape[0]
     self.parent_tree = np.array(np.arange(self.node_num))
     self.leaf_num = np.array([1] * self.gene_num + 
                              [0] * (self.node_num - self.gene_num))
     for i in range(self.link_matrix.shape[0]):
         assert(self.parent_tree[self.link_matrix[i, 0]] == int(self.link_matrix[i, 0]))
         assert(self.parent_tree[self.link_matrix[i, 1]] == int(self.link_matrix[i, 1]))
         assert(self.leaf_num[self.gene_num + i] == 0)
         self.parent_tree[self.link_matrix[i, 0]] = self.gene_num + i
         self.parent_tree[self.link_matrix[i, 1]] = self.gene_num + i
         self.leaf_num[i + self.gene_num] = self.leaf_num[self.link_matrix[i, 0]] + \
                                         self.leaf_num[self.link_matrix[i, 1]]
Example No. 10
def training_predict(X, K):
    """
    Get unique masks and cluster indices on the training set.

    Parameters
    ----------
    X : (N, F) ndarray of boolean

    Returns
    -------
    umasks : (UK, F) ndarray of bool

    cluster_ind : (N,) ndarray of int
        Each cluster ind is [0, K'), with K' <= K,
        or [0, UK) if K == -1 or K >= UK.
    """
    umasks = tc.mask_distribution.get_unique_masks(X)
    UK = umasks.shape[0]
    if K < 0 or K >= UK:
        cluster_ind = np.zeros(X.shape[0], dtype=int)
        for i in range(1, UK):
            cluster_ind[(X == umasks[i]).all(1)] = i
    else:
        Z = fastcluster.linkage(X, method="single", metric="hamming")
        cluster_ind = fcluster(Z, K, criterion="maxclust") - 1
    return umasks, cluster_ind
Example No. 11
def writeClusters(results):
    threshold = 0.9
    results = numpy.fromiter(results, dtype=[('pairs', 'i8', 2), ('score', 'f4', 1,)])
    i_to_id, condensed_distances, N = condensedDistance(results)
    linkages = fastcluster.linkage(condensed_distances, method='ward')
    partition = hcluster.fcluster(linkages, threshold, criterion='inconsistent')
    clusters = {}
    for (i, cluster_id) in enumerate(partition):
        clusters.setdefault(cluster_id, []).append(i_to_id[i])
    i = 0
    for cluster in clusters.values():
        images = []
        for index in cluster:
            image_name = all_images[index]
            image_path = os.path.join(imagedir, image_name)
            cluster_path = 'clustered_images/{0}'.format(str(i))

            # There must be a better way to do this
            try:
                os.mkdir(cluster_path)
            except OSError:
                for f in os.listdir(cluster_path):
                    try:
                        os.remove(os.path.join(cluster_path, f))
                    except OSError:
                        pass
            print('writing %s' % image_name)
            with open(image_path, 'rb') as inp:
                with open(os.path.join('clustered_images', str(i), image_name), 'wb') as outp:
                    outp.write(inp.read())
        i += 1
Example No. 12
def test_all():
  D2 = D.copy()
  for method in ['single', 'complete', 'average', 'weighted', 'ward', 'centroid', 'median']:
    Z2 = fc.linkage(D, method)
    if np.any(D2!=D):
      raise AssertionError('Input array was corrupted.')
    test(Z2, method)
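This test depends on fastcluster.linkage copying its input by default, which is why D is still intact afterwards. Several examples below pass preserve_input=False to skip that copy and save memory; a brief sketch of the difference (the distances here are toy values):

import numpy as np
import fastcluster

D = np.random.rand(10 * 9 // 2)    # condensed distances for 10 points
D_copy = D.copy()

# Default behaviour: the input array is preserved.
fastcluster.linkage(D, method='average')
assert np.array_equal(D, D_copy)

# With preserve_input=False the array may be overwritten while clustering,
# so keep your own copy if the distances are still needed.
fastcluster.linkage(D, method='average', preserve_input=False)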
Example No. 13
def hist_per_stagione(start=1992, end=2012):
	stagione=(all_labels > start) & (all_labels < end)
	dist_selected=dist[ix_(stagione,stagione)]
	Z=linkage(squareform(dist_selected),method='complete')
	n=choose_p(Z)
	c=fcluster(Z,n,criterion='maxclust')-1

	label_anni=all_labels[stagione]
	#order by first appearance!
	first_appearance=[]
	for i in range(0,n):
	    first_appearance.append(min(label_anni[c==i]))

	order1=[index for key,index in sorted(zip(first_appearance,range(0,n)))]
	order2=[index for key,index in sorted(zip(order1,range(0,n)))]
	order=array(order2)
	c=order[c]

	#draw scatter plot
	scatter(label_anni,c,s=100,c=c)
	#grid(b=True,axis='y')
	yticks(range(0,n+1))
	xlim((min(label_anni)-0.5,max(label_anni)+0.5))
	ax=gca()
	for i in range(1993,2011+1):
		ax.add_line(Line2D([i+7./12,i+7./12],[0,n+1],linestyle='--'))
	show()
Example No. 14
def hierarchical_clust(d, cluster_method='average'):
    if VERBOSE:
        print 'Doing hierarchical clustering using fastcluster!'
    # some might say this function is redundant
    # d should be a distance vector
    Z = fastcluster.linkage(d, method=cluster_method)
    return Z
Example No. 15
def clusterAndDendrogram(Metrix):
    out = fastcluster.linkage(Metrix, method="single", metric="euclidean", preserve_input=True)
    plt.plot()
    dend = augmented_dendrogram(
        out,
        p=30,
        truncate_mode=None,
        color_threshold=None,
        get_leaves=True,
        orientation="top",
        labels=None,
        count_sort=False,
        distance_sort=False,
        show_leaf_counts=True,
        no_plot=False,
        no_labels=False,
        color_list=None,
        leaf_font_size=None,
        leaf_rotation=None,
        leaf_label_func=None,
        no_leaves=False,
        show_contracted=False,
        link_color_func=None,
    )
    plt.show()
    return out
Example No. 16
def clusterHeatmap(df, title, row_label_map, col_label_map, colormap=my_cmap, 
                   cluster_rows=False, cluster_columns=False, cluster_data=None,
                   row_dendrogram=False, column_dendrogram=False, width=30, height=20, vmin=-3, vmax=3, distmethod="correlation", colorbar=True, colorbar_shrink=0.2, label_values=False):

    cm = pylab.get_cmap(colormap)
    cm.set_bad("0.9")

    # do clustering 
    if cluster_data is None:
        cluster_data = df # cluster the same data that we are plotting    

    matplotlib.rcParams['figure.figsize'] = [width, height]    
    #    pylab.figsize(20, 10)
    pylab.title(title)
#    pylab.text(0,-5,str(datetime.date.today()))
    
    # ylabels = [genesym[geneid] for geneid in pt.axes[0][Z['leaves']]]
    #  xlabels = pt.axes[1][cZ['leaves']]
    
    orderedVal = df
    
    if cluster_rows:
        distances = scipy.cluster.hierarchy.distance.pdist(cluster_data.values, distmethod)
        rowY = fastcluster.linkage(distances)
        rowZ = scipy.cluster.hierarchy.dendrogram(rowY, orientation='right', no_plot=True)
        orderedVal = df.reindex(index=df.axes[0][rowZ['leaves']])

        
    if cluster_columns:
        coldist = scipy.cluster.hierarchy.distance.pdist(df.values.transpose(), distmethod)
        cY = scipy.cluster.hierarchy.linkage(coldist)
        cZ = scipy.cluster.hierarchy.dendrogram(cY, no_plot=True)    
        orderedVal = orderedVal.reindex(columns=df.axes[1][cZ['leaves']])
    
    # row labels 
    if row_label_map is not None:
        pylab.yticks(range(0, len(orderedVal.index)), [row_label_map[i] for i in orderedVal.index])        
    else:
        pylab.yticks(range(0, len(orderedVal.index)), orderedVal.index)
    pylab.xticks(range(0, len(orderedVal.columns)), orderedVal.columns, rotation=90)

    
    if col_label_map is not None:
        pylab.xticks(range(0, len(orderedVal.columns)), [col_label_map[i] for i in orderedVal.columns])                
    

    if label_values:
        cmatrix = orderedVal.as_matrix()
        for x in range(cmatrix.shape[0]):
            for y in range(cmatrix.shape[1]):
                if cmatrix[x, y] >= 0:
                    pylab.text(y, x, "%.1f" % cmatrix[x,y], horizontalalignment='center',
                 verticalalignment='center')        
    
    #orderedVal = orderedVal[:,]
    pylab.tick_params(direction="out")
    pylab.imshow(orderedVal, interpolation="nearest", cmap=cm, aspect='auto', norm=None, vmin=vmin, vmax=vmax)
    if colorbar:
        pylab.colorbar(shrink=colorbar_shrink)
Example No. 17
    def test_fastcluster_other_method(self):
        import fastcluster

        kws = self.default_kws.copy()
        kws["method"] = "average"
        linkage = fastcluster.linkage(self.x_norm.T, method="average", metric="euclidean")
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)
Example No. 18
def heatmap_cor( x, vec, minval, maxval ):


# Computes and plots the heatmap & dendrogram.
  norm,corr,dist=analyse.all_corr(vec)

  print 'starting to cluster...'
  fig = plt.figure(figsize=(8,8))
  ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
  z=fastcluster.linkage(dist, method='complete')
  #z=fastcluster.linkage(norm,metric='euclidean', method='ward')
  print 'clustering done, drawing the dendrogram'
  Z1 = sch.dendrogram(z, labels=x,orientation='right')
  del norm
  del dist

  plt.yticks(fontsize=8)
  #ax1.set_yticks([])
  ticks = ax1.get_xticks() #/ max(ax1.get_xticks())
  ticks=map(float,ticks)
  ticks = ['%.2f' % (a/2.) for a in ticks]
  ax1.set_xticklabels(ticks)
  
# Plot distance matrix.
  axmatrix = fig.add_axes([0.4,0.1,0.5,0.6])
  axmatrix.set_xticks([])
  axmatrix.set_yticks([])

  axmatrix.xaxis.tick_top()
  axmatrix.set_frame_on(False)
  idx1 = Z1['leaves']
  idx2 = Z1['leaves']
  xx=[]
  for i in idx1:
      xx.append(x[int(i)]) 
  D = corr[idx1,:]
  D = D[:,idx2]

  print 'heatmap' 
  im = axmatrix.pcolor(D,  cmap=plt.cm.RdYlBu,edgecolor='k',)
  plt.xticks(fontsize=5)
  plt.yticks([])

  xx=[]
  for i in idx1:
      xx.append(x[int(i)])   
   
  
  #plt.yticks(np.arange(len(x)),xx,fontsize = 12)
  plt.xticks(np.arange(len(x)),xx)
  plt.xticks(rotation=90)
  plt.xticks(fontsize=8)

  axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
  plt.colorbar(im, cax=axcolor)
  fig.show()  

  return xx
Example No. 19
 def __cluster_columns__(self, column_distance, column_linkage):
     columns = zip(*self.data)
     self.column_clustering = fastcluster.linkage(columns, method=column_linkage, metric=column_distance)
     self.data_order = hcluster.leaves_list(self.column_clustering)
     self.data = self.__reorder_data__(self.data, self.data_order)
     self.original_data = self.__reorder_data__(self.original_data, self.data_order)
     if self.header:
         self.header = self.__reorder_data__([self.header], self.data_order)[0]
     return
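The helper above orders the columns by the dendrogram leaf order returned by leaves_list. A small standalone sketch of the same reordering trick on a plain NumPy array (the array and parameters are illustrative only):

import numpy as np
import fastcluster
from scipy.cluster import hierarchy as hcluster

data = np.random.rand(8, 5)                             # rows = items, columns = features
column_clustering = fastcluster.linkage(data.T, method='ward', metric='euclidean')
data_order = hcluster.leaves_list(column_clustering)    # leaf order of the column dendrogram
reordered = data[:, data_order]                         # columns permuted into dendrogram order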
Example No. 20
def test():
    n = np.random.randint(2,100)

    # Part 1: distance matrix input

    N = n*(n-1)//2
    D = np.random.rand(N)
    # Insert a single NaN value
    pos = np.random.randint(N)
    D[pos] = np.nan

    for method in ['single', 'complete', 'average', 'weighted', 'ward',
                   'centroid', 'median']:
        try:
            fastcluster.linkage(D, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Next: the original array does not contain a NaN, but a NaN occurs
    # as an updated distance.
    for method in ['average', 'weighted', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage([np.inf,-np.inf,-np.inf], method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    # Part 2: vector input

    dim = np.random.randint(2,13)
    X = np.random.rand(n,dim)
    pos = (np.random.randint(n), np.random.randint(dim))
    # Insert a single NaN coordinate
    X[pos] = np.nan

    for method in ['single', 'ward', 'centroid', 'median']:
        try:
            fastcluster.linkage_vector(X, method=method)
            raise AssertionError('fastcluster did not detect a NaN value!')
        except FloatingPointError:
            pass

    return True
Example No. 21
    def __init__(self, metric, trajectories, method='single', precomputed_values=None):
        """Initialize a hierarchical clusterer using the supplied distance
        metric and method.

        Method should be one of the fastcluster linkage methods,
        namely 'single', 'complete', 'average', 'weighted', 'centroid', 'median',
        or 'ward'.

        Parameters
        ----------
        metric : msmbuilder.metrics.AbstractDistanceMetric
            A metric capable of handling `ptraj`
        trajectories : Trajectory or list of Trajectory objects
            data to cluster
        method : {'single', 'complete', 'average', 'weighted', 'centroid',
                  'median', 'ward'}
        precomputed_values :
            used internally to implement load_from_disk()

        Notes
        -----
        This is implemented with the fastcluster library, which can be downloaded
        from CRAN: http://cran.r-project.org/web/packages/fastcluster/
        """

        if precomputed_values is not None:
            precomputed_z_matrix, traj_lengths = precomputed_values
            if isinstance(precomputed_z_matrix, np.ndarray) and precomputed_z_matrix.shape[1] == 4:
                self.Z = precomputed_z_matrix
                self.traj_lengths = traj_lengths
                return
            else:
                raise Exception('Something is wrong')

        if not isinstance(metric, metrics.AbstractDistanceMetric):
            raise TypeError('%s is not an abstract distance metric' % metric)
        if not method in self.allowable_methods:
            raise ValueError("%s not in %s" % (method, str(self.allowable_methods)))
        if isinstance(trajectories, md.Trajectory):
            trajectories = [trajectories]
        elif isinstance(trajectories, types.GeneratorType):
            trajectories = list(trajectories)


        self.traj_lengths = np.array([len(t) for t in trajectories])
        # self.ptrajs = [self.metric.prepare_trajectory(traj) for traj in self.trajectories]

        logger.info('Preparing...')
        flat_trajectory = concatenate_trajectories(trajectories)
        pflat_trajectory = metric.prepare_trajectory(flat_trajectory)

        logger.info('Getting all to all pairwise distance matrix...')
        dmat = metric.all_pairwise(pflat_trajectory)
        logger.info('Done with all2all')
        self.Z = fastcluster.linkage(dmat, method=method, preserve_input=False)
        logger.info('Got Z matrix')
Example No. 22
def cluster(data_vecs, method='average', metric='cosine', save=True):
    print "Calculating the linkage matrix, metric = {0}, method = {1}".format(metric, method)
    links = fc.linkage(data_vecs, metric=metric,method=method)

    # if save:
    #     print "Saving the model to: results/" + filename + "/linkage"
    #     file = open('results/' + filename + '_linkage', 'wb')
    #     pickle.dump(links, file)
    #     file.close()
    return links
Example No. 23
    def test_fastcluster_non_euclidean(self):
        import fastcluster

        kws = self.default_kws.copy()
        kws['metric'] = 'cosine'
        kws['method'] = 'average'
        linkage = fastcluster.linkage(self.x_norm.T, method=kws['method'],
                                      metric=kws['metric'])
        p = mat._DendrogramPlotter(self.x_norm, **kws)
        npt.assert_array_equal(p.linkage, linkage)
Example No. 24
 def cluster(self, cluster_count = None, cluster_radius = 10.0):
     x = self.x
     nx = x.shape[0]
     D=pdist(x)
     l = fc.linkage(D,'single')
     l0 = numpy.hstack((x,x, numpy.zeros((nx,1)), numpy.ones((nx,1))))
         
     self._ct = ClusterTree(l0, l)
     self._ct.find_groups(cluster_radius)
     self._ct.sort_groups()
Example No. 25
def buildClusters(featMatrix):
    distanceMatrix = pdist(featMatrix[:,:], metric='jaccard')
    pickleSave(structsfolder + 'distanceMatrix.pkl', distanceMatrix)
#     ed = euclidean_distances(featMatrix[1:100,:], featMatrix[1:100,:])
    linkage = fastcluster.linkage(distanceMatrix, method='ward') # D-distance matrix
#     fc = fcluster(link, 30, criterion='maxclust')
    #R = dendrogram(link, color_threshold=0.3, leaf_font_size=6)
    #pylab.savefig( "/home/rojosewe/Dropbox/MAI90/tesis/images/wordClustering/featMatrix.png" )
    distanceMatrix = None
    pickleSave(structsfolder + 'linkage.pkl', linkage)
Example No. 26
    def perform_clustering(self, kwargs):
        """
        Performs the hierarchical clustering step and the clustering step. If the hierarchical
        matrix is given, then it just calculates the clusters for a given cutoff. If we call the algorithm
        a second time it will use the last matrix.
        """
        """
        Gets a condensed matrix and calculates the clustering. One can use
        diverse methodologies to do this clustering...
        With preserve_input=False the matrix is destroyed while clustering, but it saves
        memory.
        The metric is not needed in this case, as we are giving the function the calculated
        matrix.
        The method determines how distances between clusters are computed when fusing them. The available methods are described in:
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
        """
        try:
            cutoff = kwargs["cutoff"]
        except KeyError:
            cutoff = None

        try:
            hie_mat = kwargs["hie_mat"]
        except KeyError:
            hie_mat = None

        try:
            method = kwargs["method"]
        except KeyError:
            method = 'complete'

        if hie_mat is not None:
            self.hie_mat = hie_mat
#            print "[HIERARCHICAL] Matrix provided."
        else:
            if self.hie_mat is None:
                #self.hie_mat = fast_hcluster.linkage(condensed_matrix, method='centroid', metric='euclidean', preserve_input=False)
#                print "[HIERARCHICAL] Calculating Matrix"
                #self.hie_mat = fastclust.linkage(self.condensed_matrix.get_data(), method = method)
                self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method = method)
#            else:
#                print "[HIERARCHICAL] Matrix was already stored"

        algorithm_details = "Hierarchical with "+method+" method (cutoff = " +str(cutoff)+")"

        if cutoff is not None:
            # Then apply the cutoff, this doesn't work much as expected
#            print "[HIERARCHICAL] getting clustering."+algorithm_details
            group_list = hcluster.fcluster(self.hie_mat,cutoff)
#            print "[HIERARCHICAL] Clustering done."+algorithm_details
            # Then let's generate the clusters
            clusters = gen_clusters_from_class_list(group_list)
            return Clustering(clusters,details = algorithm_details)
        else:
            return None
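The point of caching self.hie_mat above is that the expensive linkage step runs once and different cutoffs can then be tried cheaply with fcluster. A minimal standalone sketch of that pattern, using fastcluster.linkage in place of the hcluster_fast alias (the condensed matrix here is a random stand-in):

import numpy as np
import fastcluster
from scipy.cluster import hierarchy as hcluster

condensed = np.random.rand(30 * 29 // 2)                      # stand-in condensed matrix
hie_mat = fastcluster.linkage(condensed, method='complete')   # computed once

# Re-cut the same tree at several cutoffs without re-clustering.
for cutoff in (0.3, 0.5, 0.7):
    group_list = hcluster.fcluster(hie_mat, cutoff)
    print(cutoff, len(np.unique(group_list)))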
Example No. 27
def complete_linkage(dm):
    """
    Perform complete linkage hierarchical clustering on a distance matrix.

    Args:
        dm (numpy.array): Distance matrix

    Returns:
        (object): fastcluster complete linkage hierarchical clustering object
    """
    return linkage(dm, 'complete')
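One thing to watch with this wrapper: fastcluster.linkage (like SciPy's) treats a 2-D input as a matrix of observation vectors, not as a square distance matrix, so a precomputed square matrix is normally condensed with scipy.spatial.distance.squareform first, as several other examples here do. A hedged sketch of that conversion:

import numpy as np
from scipy.spatial.distance import squareform
from fastcluster import linkage

# dm is assumed to be a symmetric square distance matrix with a zero diagonal.
dm = np.array([[0.0, 1.0, 4.0],
               [1.0, 0.0, 2.0],
               [4.0, 2.0, 0.0]])

condensed = squareform(dm, checks=False)   # upper triangle as a 1-D condensed vector
Z = linkage(condensed, 'complete')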
Example No. 28
def define_clusts(similarity_matrix, threshold=0.05, max_iter=200,
                  method='ap'):
    """Define clusters given the similarity matrix and the threshold."""
    n, labels = connected_components(similarity_matrix, directed=False)
    prev_max_clust = 0
    print("connected components: %d" % n)
    clusters = labels.copy()

    if method == 'dbscan':
        ap = DBSCAN(metric='precomputed', min_samples=1, eps=.2, n_jobs=-1)
    if method == 'ap':
        ap = AffinityPropagation(affinity='precomputed', max_iter=max_iter,
                                 preference='median')

    for i in range(n):
        idxs = np.where(labels == i)[0]
        if idxs.shape[0] > 1:
            sm = similarity_matrix[idxs][:, idxs]
            sm += sm.T + scipy.sparse.eye(sm.shape[0])

            # Hierarchical clustering
            if method == 'hc':
                dists = squareform(1 - sm.toarray())
                links = fastcluster.linkage(dists, method='ward')
                try:
                    clusters_ = fcluster(links, threshold, 'distance')
                except ValueError as err:
                    logging.critical(err)
                    clusters_ = np.zeros(1, dtype=int)

            # DBSCAN
            elif method == 'dbscan':
                db = ap.fit(1. - sm.toarray())
                # Number of clusters in labels, ignoring noise if present.
                clusters_ = db.labels_
                # n_clusters_ = len(set(clusters_)) - int(0 in clusters_)

            # AffinityPropagation
            # ap = AffinityPropagation(affinity='precomputed')
            elif method == 'ap':
                db = ap.fit(sm)
                clusters_ = db.labels_
            else:
                raise ValueError("clustering method %s unknown" % method)

            if np.min(clusters_) == 0:
                clusters_ += 1
            clusters_ += prev_max_clust
            clusters[idxs] = clusters_
            prev_max_clust = max(clusters_)
        else:  # connected component contains just 1 element
            prev_max_clust += 1
            clusters[idxs] = prev_max_clust
    return np.array(extra.flatten(clusters))
Example No. 29
def cluster(dupes, threshold=.5, max_components=30000):
    '''
    Takes in a list of duplicate pairs and clusters them in to a
    list records that all refer to the same entity based on a given
    threshold

    Keyword arguments:
    threshold -- number between 0 and 1 (default is .5). Lowering the
                 number will increase precision; raising it will increase
                 recall.
    '''
    threshold = 1 - threshold

    dupe_sub_graphs = connected_components(dupes, max_components)

    clustering = {}
    cluster_id = 0
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:

            (i_to_id, condensed_distances) = condensedDistance(sub_graph)
            N = max(i_to_id) + 1

            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid', 
                                          preserve_input=False)

            partition = hcluster.fcluster(linkage, 
                                          threshold,
                                          criterion='distance')

            clusters = {}

            for (i, sub_cluster_id) in enumerate(partition):
                clusters.setdefault(cluster_id + sub_cluster_id, []).append(i)

            cophenetic_distances = hcluster.cophenet(linkage)

            for cluster_id, items in clusters.iteritems() :
                if len(items) > 1 :
                    score = clusterConfidence(items, cophenetic_distances, N)
                    clustering[cluster_id] = (tuple(i_to_id[item] 
                                                    for item in items),
                                              1 - score)

            cluster_id += max(partition) + 1
        else:
            ids, score = sub_graph[0]
            clustering[cluster_id] = tuple(ids), score
            cluster_id += 1
            

    return clustering.values()
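Per the docstring, the user-facing threshold is a similarity, so it is flipped to a cophenetic-distance cutoff (1 - threshold) before fcluster is applied with criterion='distance'. A small standalone sketch of that conversion, with made-up pair distances in place of the condensedDistance() helper:

import numpy as np
import fastcluster
from scipy.cluster import hierarchy as hcluster

# Condensed "distances" between 4 records, where distance = 1 - duplicate score.
condensed_distances = np.array([0.1, 0.8, 0.9, 0.7, 0.85, 0.05])

threshold = 0.5            # user-facing similarity threshold
threshold = 1 - threshold  # converted to a distance cutoff, as in cluster() above

linkage = fastcluster.linkage(condensed_distances, method='centroid',
                              preserve_input=False)
partition = hcluster.fcluster(linkage, threshold, criterion='distance')
print(partition)           # records sharing a label are treated as duplicates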
Example No. 30
    def _hclust(self, nclusters, method, noise=False):
        """
        :param nclusters: Number of clusters to return
        :param method: single, complete, average, ward, weighted, centroid or median
                       (http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html)
        :param noise: Add Gaussian noise to the distance matrix prior to clustering (bool, default=False)
        :return: Partition object describing clustering
        """
        matrix = self.get_dm(noise)

        linkmat = fastcluster.linkage(squareform(matrix), method)
        return _hclust(linkmat, nclusters)
Example No. 31
from machinelearning import datasetselection, featureselection
import machinelearning.dataclasses as dc
import pickle
from operator import itemgetter
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hierarchy
import scipy.spatial.distance as distance
import fastcluster

# load distance matrix
with open('gene_gene_matrix_euclidean_distance_from_projection.pickle',
          'rb') as fr:
    gene_gene = pickle.load(fr)

# prefer ward linkage for euclidean distance or at least this case
lnk = fastcluster.linkage(distance.squareform(gene_gene.matrix, checks=False),
                          'ward')
si = hierarchy.leaves_list(lnk).astype('int64')

# load projection
with open('gene_atb_matrix_2d_dnn_projection.pickle', 'rb') as fr:
    gene_proj = pickle.load(fr)
if not (gene_proj.rowlabels == gene_gene.rowlabels).all():
    raise ValueError('genes not aligned')
gene_proj.reorder(si, 0)
ordered_genes = gene_proj.rowlabels.copy()
del gene_gene, lnk, si

# select datasets
dataset_info = datasetselection.finddatasets(getalllevels=True)
included_datasetabbrevs = {
    'clinvar', 'dbgap_cleaned', 'gad', 'gadhighlevel_cleaned', 'gobp', 'gocc',
Example No. 32
def diff_exp_clusters(cluster_expression_df, cluster_sizes, file_format):
    n_clusters = len(cluster_sizes)

    cluster_sum_umi = np.vstack([
        cluster_sizes[c] *
        cluster_expression_df[f'Cluster {c} mean UMI'].values
        for c in range(n_clusters)
    ])

    cluster_ssq_umi = np.vstack([
        cluster_sizes[c] *
        (cluster_expression_df[f'Cluster {c} std UMI'].values**2 +
         cluster_expression_df[f'Cluster {c} mean UMI'].values**2)
        for c in range(n_clusters)
    ])

    Z = fastcluster.linkage(cluster_sum_umi, method='average', metric='cosine')

    fig = matplotlib.figure.Figure(figsize=(12, 12))

    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    scipy.cluster.hierarchy.dendrogram(Z,
                                       ax=ax,
                                       color_threshold=0,
                                       above_threshold_color='grey')

    ax.set_title('Hierarchical structure of cell-type clusters')
    ax.set_xlabel('Cluster Label')
    ax.tick_params(labelleft='off')

    FigureCanvasAgg(fig).print_figure(file_format.format('dendrogram', 'png'))

    root, rd = scipy.cluster.hierarchy.to_tree(Z, rd=True)

    def de(lbl_1, lbl_2, group1, group2):
        print(f'Comparing {group1} to {group2}')

        group1_n_cells = sum(cluster_sizes[c] for c in group1)
        group2_n_cells = sum(cluster_sizes[c] for c in group2)

        group1_mean = cluster_sum_umi[group1, :].sum(axis=0) / group1_n_cells
        group2_mean = cluster_sum_umi[group2, :].sum(axis=0) / group2_n_cells

        mean_diff = group1_mean - group2_mean

        group1_var = (cluster_ssq_umi[group1, :].sum(axis=0) / group1_n_cells -
                      group1_mean**2)
        group2_var = (cluster_ssq_umi[group2, :].sum(axis=0) / group2_n_cells -
                      group2_mean**2)

        pooled_sd = np.sqrt(group1_var / group1_n_cells +
                            group2_var / group2_n_cells)

        z_scores = np.zeros_like(pooled_sd)
        nz = pooled_sd > 0
        z_scores[nz] = np.nan_to_num(mean_diff[nz] / pooled_sd[nz])

        # two-sided z-test p-values with a Bonferroni correction
        p_vals = np.clip(
            (1 - stats.norm.cdf(np.abs(z_scores))) * 2 * z_scores.shape[0], 0,
            1)

        df = pd.DataFrame(OrderedDict([('z', z_scores), ('p', p_vals),
                                       ('group1', group1_mean),
                                       ('group2', group2_mean)]),
                          index=cluster_expression_df.index)

        df = df[df['p'] < 0.001]
        df['diff'] = df['group1'] - df['group2']

        df.sort_values('diff', ascending=False, inplace=True)

        name = f'differential_gene_expression_{lbl_1}_v_{lbl_2}'

        df.to_csv(file_format.format(name, 'csv'))

    for i in range(0, 2 * n_clusters - 1):
        if i >= n_clusters:
            left_child = rd[i].get_left()
            left_clusters = (left_child.pre_order(lambda x: x.id))

            right_child = rd[i].get_right()
            right_clusters = (right_child.pre_order(lambda x: x.id))

            # don't calculate if it's redundant with a 1-vs-all comp
            if i == 2 * n_clusters - 2 and (len(left_clusters) == 1
                                            or len(right_clusters) == 1):
                continue

            de(left_child.id, right_child.id, left_clusters, right_clusters)

        if i < 2 * n_clusters - 2:
            below = rd[i].pre_order(lambda x: x.id)
            above = [j for j in range(len(cluster_sizes)) if j not in below]

            # don't calculate redundant comparison
            if len(above) == 1:
                continue

            de(i, 'all', below, above)

    group_list = [(i, rd[i].pre_order(lambda x: x.id))
                  for i in range(0, 2 * n_clusters - 1)]
    group_list[-1] = ('total', group_list[-1][1])

    return group_list
Example No. 33
  def test_adjusted_rand_performance(self):

    # Arrange
    n = 100
    np.random.seed(seed = 8455624)
    x = np.random.normal(n, 2, (n, 2))
    A = linkage(x, 'centroid')
    B = linkage(x, 'ward')
    
    # Act
    
    similarity_times = []
    sklearn_times = []
    fcluster_times = []
    
    for repetition in range(100):
    
      start = perf_counter()
    
      metrics = similarity_metrics(A, B)
      ar_similarity = metrics.adjusted_rand()
      
      end = perf_counter()
      
      similarity_times.append(end-start)
      
      ar_sklearn = []
      
      sklearn_time = 0
      fcluster_time = 0
      
      excluded_results = 0
      for i in range(n - 1, 1, -1):
      
        start = perf_counter()
        
        fcluster_a = fcluster(A, i, 'maxclust')
        fcluster_b = fcluster(B, i, 'maxclust')
        
        end = perf_counter()
        
        fcluster_time += (end - start)
        
        start = perf_counter()
        
        ar = adjusted_rand_score(fcluster_a, fcluster_b)
        
        end = perf_counter()
        
        sklearn_time += (end - start)
        
        # fcluster takes maxclust rather than an exact number of clusters;
        # most of the time it will create exactly maxclust, but on the occasions
        # that it doesn't, the results are not comparable, so ignore them
        if (len(np.unique(fcluster_a)) != i) or (len(np.unique(fcluster_b)) != i):
          excluded_results += 1
          ar_sklearn.append(ar_similarity[len(ar_sklearn)])
          
        else:
          ar_sklearn.append(ar)
       
      sklearn_times.append(sklearn_time)
      fcluster_times.append(fcluster_time)
      
      ar_sklearn = np.array(ar_sklearn)
       
      idx = ar_sklearn != np.nan
      
      # Assert
      self.assertEqual(len(ar_sklearn), len(ar_similarity))
      assert_almost_equal(ar_similarity, ar_sklearn)
      self.assertEqual(4, excluded_results) # double-check that we haven't excluded everything
    
    print("\nSimilarity average time: ", np.average(similarity_times))
    print("\nSklearn average time: ", np.average(sklearn_times))
    print("\nFCluster average time: ", np.average(fcluster_times))
Example No. 34
    plt.title('Image show of the correlation lattice - longitudinal')

    plt.figure()
    plt.imshow(dfC[0, 1::2, :])
    plt.colorbar()
    plt.title('Image show of the correlation lattice - lattitudinal')
    
#    plt.figure()
#    plt.imshow(y)
#    plt.title('Distance matrix')

#    render_component_single(gfC.d[0, :, :], gfC.lats, gfC.lons, False, None, "Neighbor correlation")

    print("Clustering ...")
    plt.figure()
    Z = fastcluster.linkage(ytri, method = 'single')
    print("Plotting dendrogram ...")
    dendrogram(Z, 7, 'level')
    
    max_d = np.amax(Z[:,2])
    print("Maximum distance is %g" % max_d)
    my_d = max_d / 2
    cont = True
    while cont:
        f = fcluster(Z, my_d, 'distance')
        print(f.shape, my_d)
        if np.amax(f) > 30:
            my_d = (max_d + my_d) * 0.5
        elif np.amax(f) < 10:
            my_d = my_d - (max_d - my_d) / max_d
        else:
Example No. 35
def clusterHeatmap(df,
                   title,
                   row_label_map,
                   col_label_map,
                   colormap=my_cmap,
                   cluster_rows=False,
                   cluster_columns=False,
                   cluster_data=None,
                   row_dendrogram=False,
                   column_dendrogram=False,
                   width=30,
                   height=20,
                   vmin=-3,
                   vmax=3,
                   distmethod="correlation",
                   colorbar=True,
                   colorbar_shrink=0.2,
                   label_values=False):

    cm = pylab.get_cmap(colormap)
    cm.set_bad("0.9")

    # do clustering
    if cluster_data is None:
        cluster_data = df  # cluster the same data that we are plotting

    matplotlib.rcParams['figure.figsize'] = [width, height]
    #    pylab.figsize(20, 10)
    pylab.title(title)
    #    pylab.text(0,-5,str(datetime.date.today()))

    # ylabels = [genesym[geneid] for geneid in pt.axes[0][Z['leaves']]]
    #  xlabels = pt.axes[1][cZ['leaves']]

    orderedVal = df

    if cluster_rows:
        distances = scipy.cluster.hierarchy.distance.pdist(
            cluster_data.values, distmethod)
        rowY = fastcluster.linkage(distances)
        rowZ = scipy.cluster.hierarchy.dendrogram(rowY,
                                                  orientation='right',
                                                  no_plot=True)
        orderedVal = df.reindex(index=df.axes[0][rowZ['leaves']])

    if cluster_columns:
        coldist = scipy.cluster.hierarchy.distance.pdist(
            df.values.transpose(), distmethod)
        cY = scipy.cluster.hierarchy.linkage(coldist)
        cZ = scipy.cluster.hierarchy.dendrogram(cY, no_plot=True)
        orderedVal = orderedVal.reindex(columns=df.axes[1][cZ['leaves']])

    # row labels
    if row_label_map is not None:
        pylab.yticks(range(0, len(orderedVal.index)),
                     [row_label_map[i] for i in orderedVal.index])
    else:
        pylab.yticks(range(0, len(orderedVal.index)), orderedVal.index)
    pylab.xticks(range(0, len(orderedVal.columns)),
                 orderedVal.columns,
                 rotation=90)

    if col_label_map is not None:
        pylab.xticks(range(0, len(orderedVal.columns)),
                     [col_label_map[i] for i in orderedVal.columns])

    if label_values:
        cmatrix = orderedVal.as_matrix()
        for x in range(cmatrix.shape[0]):
            for y in range(cmatrix.shape[1]):
                if cmatrix[x, y] >= 0:
                    pylab.text(y,
                               x,
                               "%.1f" % cmatrix[x, y],
                               horizontalalignment='center',
                               verticalalignment='center')

    #orderedVal = orderedVal[:,]
    pylab.tick_params(direction="out")
    pylab.imshow(orderedVal,
                 interpolation="nearest",
                 cmap=cm,
                 aspect='auto',
                 norm=None,
                 vmin=vmin,
                 vmax=vmax)
    if colorbar:
        pylab.colorbar(shrink=colorbar_shrink)
Example No. 36
    def consensus(self,
                  k,
                  density_threshold_str='0.5',
                  local_neighborhood_size=0.30,
                  show_clustering=False,
                  skip_density_and_return_after_stats=False,
                  close_clustergram_fig=True):
        merged_spectra = load_df_from_npz(self.paths['merged_spectra'] % k)
        norm_counts = sc.read(self.paths['normalized_counts'])

        if skip_density_and_return_after_stats:
            density_threshold_str = '2'
        density_threshold_repl = density_threshold_str.replace('.', '_')
        density_threshold = float(density_threshold_str)
        n_neighbors = int(local_neighborhood_size * merged_spectra.shape[0] /
                          k)

        # Rescale topics such to length of 1.
        l2_spectra = (merged_spectra.T / np.sqrt(
            (merged_spectra**2).sum(axis=1))).T

        if not skip_density_and_return_after_stats:
            # Compute the local density matrix (if not previously cached)
            topics_dist = None
            if os.path.isfile(self.paths['local_density_cache'] % k):
                local_density = load_df_from_npz(
                    self.paths['local_density_cache'] % k)
            else:
                #   first find the full distance matrix
                topics_dist = squareform(fast_euclidean(l2_spectra.values))
                #   partition based on the first n neighbors
                partitioning_order = np.argpartition(topics_dist, n_neighbors +
                                                     1)[:, :n_neighbors + 1]
                #   find the mean over those n_neighbors (excluding self, which has a distance of 0)
                distance_to_nearest_neighbors = topics_dist[
                    np.arange(topics_dist.shape[0])[:,
                                                    None], partitioning_order]
                local_density = pd.DataFrame(
                    distance_to_nearest_neighbors.sum(1) / (n_neighbors),
                    columns=['local_density'],
                    index=l2_spectra.index)
                save_df_to_npz(local_density,
                               self.paths['local_density_cache'] % k)
                del (partitioning_order)
                del (distance_to_nearest_neighbors)

            density_filter = local_density.iloc[:, 0] < density_threshold
            l2_spectra = l2_spectra.loc[density_filter, :]

        kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=1)
        kmeans_model.fit(l2_spectra)
        kmeans_cluster_labels = pd.Series(kmeans_model.labels_ + 1,
                                          index=l2_spectra.index)

        # Find median usage for each gene across cluster
        median_spectra = l2_spectra.groupby(kmeans_cluster_labels).median()

        # Normalize median spectra to probability distributions.
        median_spectra = (median_spectra.T / median_spectra.sum(1)).T

        # Compute the silhouette score
        stability = silhouette_score(l2_spectra.values,
                                     kmeans_cluster_labels,
                                     metric='euclidean')

        # Obtain the reconstructed count matrix by re-fitting the usage matrix and computing the dot product: usage.dot(spectra)
        refit_nmf_kwargs = yaml.load(open(self.paths['nmf_run_parameters']),
                                     Loader=yaml.FullLoader)
        refit_nmf_kwargs.update(
            dict(n_components=k, H=median_spectra.values, update_H=False))

        _, rf_usages = self._nmf(norm_counts.X, nmf_kwargs=refit_nmf_kwargs)
        rf_usages = pd.DataFrame(rf_usages,
                                 index=norm_counts.obs.index,
                                 columns=median_spectra.index)
        rf_pred_norm_counts = rf_usages.dot(median_spectra)

        # Compute prediction error as a frobenius norm
        if sp.issparse(norm_counts.X):
            prediction_error = ((norm_counts.X.todense() -
                                 rf_pred_norm_counts)**2).sum().sum()
        else:
            prediction_error = ((norm_counts.X -
                                 rf_pred_norm_counts)**2).sum().sum()

        consensus_stats = pd.DataFrame(
            [k, density_threshold, stability, prediction_error],
            index=[
                'k', 'local_density_threshold', 'stability', 'prediction_error'
            ],
            columns=['stats'])

        if skip_density_and_return_after_stats:
            return consensus_stats

        save_df_to_npz(
            median_spectra,
            self.paths['consensus_spectra'] % (k, density_threshold_repl))
        save_df_to_npz(
            rf_usages,
            self.paths['consensus_usages'] % (k, density_threshold_repl))
        save_df_to_npz(
            consensus_stats,
            self.paths['consensus_stats'] % (k, density_threshold_repl))
        save_df_to_text(
            median_spectra,
            self.paths['consensus_spectra__txt'] % (k, density_threshold_repl))
        save_df_to_text(
            rf_usages,
            self.paths['consensus_usages__txt'] % (k, density_threshold_repl))

        # Compute gene-scores for each GEP by regressing usage on Z-scores of TPM
        tpm = sc.read(self.paths['tpm'])
        tpm_stats = load_df_from_npz(self.paths['tpm_stats'])

        if sp.issparse(tpm.X):
            norm_tpm = (np.array(tpm.X.todense()) -
                        tpm_stats['__mean'].values) / tpm_stats['__std'].values
        else:
            norm_tpm = (tpm.X -
                        tpm_stats['__mean'].values) / tpm_stats['__std'].values

        usage_coef = fast_ols_all_cols(rf_usages.values, norm_tpm)
        usage_coef = pd.DataFrame(usage_coef,
                                  index=rf_usages.columns,
                                  columns=tpm.var.index)

        save_df_to_npz(
            usage_coef,
            self.paths['gene_spectra_score'] % (k, density_threshold_repl))
        save_df_to_text(
            usage_coef, self.paths['gene_spectra_score__txt'] %
            (k, density_threshold_repl))

        # Convert spectra to TPM units, and obtain results for all genes by running last step of NMF
        # with usages fixed and TPM as the input matrix
        norm_usages = rf_usages.div(rf_usages.sum(axis=1), axis=0)
        refit_nmf_kwargs.update(dict(H=norm_usages.T.values, ))

        _, spectra_tpm = self._nmf(tpm.X.T, nmf_kwargs=refit_nmf_kwargs)
        spectra_tpm = pd.DataFrame(spectra_tpm.T,
                                   index=rf_usages.columns,
                                   columns=tpm.var.index)
        save_df_to_npz(
            spectra_tpm,
            self.paths['gene_spectra_tpm'] % (k, density_threshold_repl))
        save_df_to_text(
            spectra_tpm,
            self.paths['gene_spectra_tpm__txt'] % (k, density_threshold_repl))

        if show_clustering:
            if topics_dist is None:
                topics_dist = squareform(fast_euclidean(l2_spectra.values))
                # (l2_spectra was already filtered using the density filter)
            else:
                # (but the previously computed topics_dist was not!)
                topics_dist = topics_dist[
                    density_filter.values, :][:, density_filter.values]

            spectra_order = []
            for cl in sorted(set(kmeans_cluster_labels)):

                cl_filter = kmeans_cluster_labels == cl

                if cl_filter.sum() > 1:
                    cl_dist = squareform(topics_dist[cl_filter, :][:,
                                                                   cl_filter])
                    cl_dist[
                        cl_dist <
                        0] = 0  #Rarely get floating point arithmetic issues
                    cl_link = linkage(cl_dist, 'average')
                    cl_leaves_order = leaves_list(cl_link)

                    spectra_order += list(
                        np.where(cl_filter)[0][cl_leaves_order])
                else:
                    ## Corner case where a component only has one element
                    spectra_order += list(np.where(cl_filter)[0])

            from matplotlib import gridspec
            import matplotlib.pyplot as plt

            width_ratios = [0.5, 9, 0.5, 4, 1]
            height_ratios = [0.5, 9]
            fig = plt.figure(figsize=(sum(width_ratios), sum(height_ratios)))
            gs = gridspec.GridSpec(len(height_ratios),
                                   len(width_ratios),
                                   fig,
                                   0.01,
                                   0.01,
                                   0.98,
                                   0.98,
                                   height_ratios=height_ratios,
                                   width_ratios=width_ratios,
                                   wspace=0,
                                   hspace=0)

            dist_ax = fig.add_subplot(gs[1, 1],
                                      xscale='linear',
                                      yscale='linear',
                                      xticks=[],
                                      yticks=[],
                                      xlabel='',
                                      ylabel='',
                                      frameon=True)

            D = topics_dist[spectra_order, :][:, spectra_order]
            dist_im = dist_ax.imshow(D,
                                     interpolation='none',
                                     cmap='viridis',
                                     aspect='auto',
                                     rasterized=True)

            left_ax = fig.add_subplot(gs[1, 0],
                                      xscale='linear',
                                      yscale='linear',
                                      xticks=[],
                                      yticks=[],
                                      xlabel='',
                                      ylabel='',
                                      frameon=True)
            left_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape(
                -1, 1),
                           interpolation='none',
                           cmap='Spectral',
                           aspect='auto',
                           rasterized=True)

            top_ax = fig.add_subplot(gs[0, 1],
                                     xscale='linear',
                                     yscale='linear',
                                     xticks=[],
                                     yticks=[],
                                     xlabel='',
                                     ylabel='',
                                     frameon=True)
            top_ax.imshow(kmeans_cluster_labels.values[spectra_order].reshape(
                1, -1),
                          interpolation='none',
                          cmap='Spectral',
                          aspect='auto',
                          rasterized=True)

            hist_gs = gridspec.GridSpecFromSubplotSpec(3,
                                                       1,
                                                       subplot_spec=gs[1, 3],
                                                       wspace=0,
                                                       hspace=0)

            hist_ax = fig.add_subplot(hist_gs[0, 0],
                                      xscale='linear',
                                      yscale='linear',
                                      xlabel='',
                                      ylabel='',
                                      frameon=True,
                                      title='Local density histogram')
            hist_ax.hist(local_density.values, bins=np.linspace(0, 1, 50))
            hist_ax.yaxis.tick_right()

            xlim = hist_ax.get_xlim()
            ylim = hist_ax.get_ylim()
            if density_threshold < xlim[1]:
                hist_ax.axvline(density_threshold, linestyle='--', color='k')
                hist_ax.text(density_threshold + 0.02,
                             ylim[1] * 0.95,
                             'filtering\nthreshold\n\n',
                             va='top')
            hist_ax.set_xlim(xlim)
            hist_ax.set_xlabel(
                'Mean distance to k nearest neighbors\n\n%d/%d (%.0f%%) spectra above threshold\nwere removed prior to clustering'
                % (sum(~density_filter), len(density_filter), 100 *
                   (~density_filter).mean()))

            fig.savefig(self.paths['clustering_plot'] %
                        (k, density_threshold_repl),
                        dpi=250)
            if close_clustergram_fig:
                plt.close(fig)
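
The histogram above plots local_density, which the axis label describes as the mean distance to the k nearest neighbors. A minimal sketch of how such a quantity could be computed from a square pairwise distance matrix (the function name and the num_neighbors parameter are illustrative, not part of the original module):

import numpy as np

def mean_knn_distance(dist_matrix, num_neighbors=30):
    # Sort each row, drop the zero self-distance in the first column,
    # and average the distances to the num_neighbors closest points.
    nearest = np.sort(dist_matrix, axis=1)[:, 1:num_neighbors + 1]
    return nearest.mean(axis=1)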
Exemplo n.º 37
0
def generate_heatmap(
    dataframe: pd.DataFrame,
    color_dataframe: pd.DataFrame,
    histo_dataframe: pd.DataFrame,
    color_map: dict = None,
    cluster: bool = True,
    figsize: tuple = (10, 15)) -> tuple:  # returns (figure, colorbar axes)

    fig = plt.figure(figsize=figsize)

    if cluster:
        linkage = fastcluster.linkage(dataframe.T,
                                      "complete",
                                      metric="correlation",
                                      preserve_input=True)
        dendrogram_row_ratio = 2
    else:
        linkage = None
        # Make row smaller without dendrogram
        dendrogram_row_ratio = 0.1

    max_rows = 4 if color_map is not None else 3

    # The bar plot is actually on the third row
    # FIXME: Handle long labels (ratios)

    if color_map is not None:
        bar_ratio = 0.25 * len(color_map)
        height_ratios = [dendrogram_row_ratio, 0.25, bar_ratio, 15]
    else:
        height_ratios = [dendrogram_row_ratio, 0.25, 15]

    gs = grid.GridSpec(max_rows,
                       2,
                       height_ratios=height_ratios,
                       width_ratios=[0.2, 15])

    dendro_ax = fig.add_subplot(gs[0, 1], facecolor="white")  # Dendrogram

    plt.setp(dendro_ax.get_yticklabels(), visible=False)

    pathway_ax = fig.add_subplot(gs[-1, 0])  # Pathway
    heatmap_ax = fig.add_subplot(gs[-1, 1], sharey=pathway_ax)  # Heatmap

    # With gridspec this is needed so that the Y axes
    # are not visible

    plt.setp(heatmap_ax.get_yticklabels(), visible=False)

    if linkage is not None:
        leaf_ax = fig.add_subplot(gs[1, 1], sharex=dendro_ax)
        dendro = sch.dendrogram(
            linkage,
            ax=dendro_ax,
            no_labels=False,
            labels=dataframe.columns,
            leaf_rotation=90,
        )
        # Reorder dataframe according to the labels in the leaves
        dataframe = dataframe[dendro["ivl"]]  # Leaf node labels

        # Put labels in the right order!
        histo_dataframe = histo_dataframe.loc[dendro["ivl"]]

        # TRICK: since drawing the labels directly breaks the layout (they add an
        # x axis), we create a dedicated axis containing only the text, iterating
        # over the locations of the dendrogram's leaf labels. Once the new text is
        # in place, the labels are removed from the dendrogram.

        for leafname, leafcoord in zip(dendro["ivl"],
                                       dendro_ax.xaxis.get_ticklocs()):
            leaf_ax.text(leafcoord,
                         0.99,
                         leafname,
                         rotation=90,
                         horizontalalignment="center")

    else:
        set_axis_parameters(heatmap_ax, dataframe, False)
        leaf_ax = fig.add_subplot(gs[1, 1], sharex=heatmap_ax)
        dataframe = dataframe.loc[:, histo_dataframe.index]

        set_axis_parameters(leaf_ax, dataframe, False)

        for index, leafcoord in enumerate(leaf_ax.xaxis.get_ticklocs()):
            leaf_ax.text(leafcoord,
                         0.99,
                         dataframe.columns[index],
                         rotation=90,
                         horizontalalignment="center")

    clean_axis(leaf_ax)
    leaf_ax.grid(False)
    leaf_ax.axis('off')
    clean_axis(dendro_ax)

    if color_map is not None:

        subgrids = len(color_map)
        gs_inside = grid.GridSpecFromSubplotSpec(
            subgrids,
            1,
            subplot_spec=gs[2, 1],
            height_ratios=[1 for item in color_map])

        bars = list()
        for index, group in enumerate(sorted(color_map)):

            column = color_map[group]
            bar_ax = fig.add_subplot(gs_inside[index])
            clean_axis(bar_ax)
            create_colorbar(histo_dataframe,
                            bar_ax,
                            False,
                            column,
                            labels=False)
            bar_ax.text(-0.25,
                        0.5,
                        group,
                        horizontalalignment="right",
                        verticalalignment="center")
            bars.append(bar_ax)

    create_colorbar(color_dataframe, pathway_ax)

    cmap, norm = create_colormap()

    dataframe = dataframe.loc[color_dataframe.index]

    heatmap1 = heatmap_ax.pcolor(dataframe,
                                 cmap=cmap,
                                 edgecolors="black",
                                 alpha=1,
                                 norm=norm)

    set_axis_parameters(heatmap_ax, dataframe, False)

    cax = fig.add_axes([-0.05, 1.025, 0.15, 0.025])

    cbar = fig.colorbar(heatmap1,
                        cax=cax,
                        orientation="horizontal",
                        ticks=range(9))

    cbar.solids.set_edgecolor("face")

    gs.tight_layout(fig)

    return fig, cax
Exemplo n.º 38
0
def test_dendrogram_plotting():
    c.Z = linkage(iris['data'], c.algorithm)
    labels = np.random.rand(1, iris['data'].shape[0])[0]
    c.dendrogram = c._calculate_dendrogram(labels)
    assert len(labels) == len(c.dendrogram['ivl'])
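
The test above assumes that _calculate_dendrogram returns a scipy dendrogram dictionary whose 'ivl' list holds one leaf label per observation. A hedged sketch of what such a wrapper might look like (the clusterer class itself is not shown, so this is only an assumption):

from scipy.cluster.hierarchy import dendrogram

def calculate_dendrogram(Z, labels):
    # no_plot=True computes the dendrogram layout, including the list of
    # leaf labels 'ivl', without drawing anything.
    return dendrogram(Z, labels=labels, no_plot=True)

# Usage mirroring the test, given a linkage matrix Z from fastcluster.linkage:
# d = calculate_dendrogram(Z, labels)
# assert len(labels) == len(d['ivl'])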
Exemplo n.º 39
0
					try:
						boosted_wdfVoc[k] = wdfVoc[k] * boost_entity[k]
					except:
						boosted_wdfVoc[k] = wdfVoc[k]

				print "sorted wdfVoc*boost_entity:"
				print sorted( ((v,k) for k,v in boosted_wdfVoc.iteritems()), reverse=True)
				'''
            #Hclust: fast hierarchical clustering with fastcluster
            #X is samples by features
            #distMatrix is a samples-by-samples distance matrix
            distMatrix = pairwise_distances(X_normalized, metric='cosine')

            #cluster tweets
            print "fastcluster, average, cosine"
            L = fastcluster.linkage(distMatrix, method='average')

            #for dt in [0.3, 0.4, 0.5, 0.6, 0.7]:
            #for dt in [0.5]:
            dt = 0.5
            print "hclust cut threshold:", dt
            #				indL = sch.fcluster(L, dt, 'distance')
            indL = sch.fcluster(L, dt * distMatrix.max(), 'distance')
            #print "indL:", indL
            freqTwCl = Counter(indL)
            print "n_clusters:", len(freqTwCl)
            print(freqTwCl)
            #				print "silhoutte: ", metrics.silhouette_score(distMatrix, indL, metric="precomputed")
            allowSiloutte = False
            for freqTwClkey, freqTwClCount in freqTwCl.iteritems():
                if (freqTwClCount > 1):
Exemplo n.º 40
0
def cluster_array_to_k_groups(R, k):
    Z=fastcluster.linkage(R, method='average', metric='euclidean', preserve_input=True)
    import tree
    tr=tree.Tree(Z=Z)
    X=tr.representatives(n_picks=k, l_keep_members=True)
    return X
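
The tree module imported above is not part of this listing. For comparison, a sketch that splits the same fastcluster linkage into k flat groups using only scipy (this is an alternative, not the original helper):

import fastcluster
from scipy.cluster.hierarchy import fcluster

def cluster_array_to_k_labels(R, k):
    # R is an observation matrix (n_samples x n_features).
    Z = fastcluster.linkage(R, method='average', metric='euclidean', preserve_input=True)
    # criterion='maxclust' returns one label in 1..k for every sample.
    return fcluster(Z, t=k, criterion='maxclust')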
Exemplo n.º 41
0
        img_ds.RasterCount),  # number of bands
    gdal_array.GDALTypeCodeToNumericTypeCode(
        img_ds.GetRasterBand(1).DataType))  # data type code
#print img.shape # warning: that assumed that the raster bands were all the same type (should be true)

# reshape the image band by band
for b in range(img.shape[2]):
    img[:, :, b] = img_ds.GetRasterBand(b + 1).ReadAsArray()

# reshape image again to match expected format for scikit-learn
new_shape = (img.shape[0] * img.shape[1], img.shape[2])
X = img[:, :, :img.shape[2]].reshape(new_shape)

# use fastcluster.linkage instead of scipy.cluster.hierarchy.linkage
print "calculating linkage.."
Z = fc.linkage(X, 'average')  # https://en.wikipedia.org/wiki/UPGMA

print "calculating dendrogram.."
fig = plt.figure(figsize=(10, 10))  # 25, 10
plt.title('hierarchical clustering dendrogram')
rotate = False

plt.ylabel('distance' if (not rotate) else 'index')
plt.xlabel('index' if (not rotate) else 'distance')
dn = dendrogram(
    Z,
    #truncate_mode='lastp',
    #p = n_clusters,
    leaf_rotation=0. if rotate else 90.,
    show_contracted=True,
    orientation='right' if rotate else 'top',
Exemplo n.º 42
0
 def cluster(self, method='average', metric='euclidean', l_row=True, l_col=True):
     if l_row:
         self.Zr=fastcluster.linkage(self.data, method=method, metric=metric, preserve_input=True)
         #left_dendrogram=clst.dendrogram(Zr, orientation='left')
     if l_col:
         self.Zc=fastcluster.linkage(self.data.T, method=method, metric=metric, preserve_input=True)
Exemplo n.º 43
0
def prog_linkage(X, n_cluster):
    hclust = linkage(X, method='single')
    labels = fcluster(hclust, t=n_cluster, criterion="maxclust")
    return labels - 1
Exemplo n.º 44
0
    pickle_fp = os.path.join(PICKLE_DATA, "%s.pkl" % run_name)
    [pair_indexs, data_dict, traj_list, gene_pair_names
     ] = convert_data_into_np_array(stage_data_dir_name,
                                    INDEX_RANGE,
                                    pickle_fp,
                                    load=False,
                                    OFF_SET=OFF_SET,
                                    include_sox_and_t=include_sox_and_t,
                                    Filter=False)

    metric = "directed_hausdorff_plus_pair_wise_euclidean"  # pair_wise_euclidean_distance
    distance_fp = os.path.join(NPY_DATA, "%s_%s.npy" % (run_name, metric))
    calc_distance_matrix(distance_fp, traj_list)
    cm = plt.get_cmap('gist_rainbow')
    p_dist = np.load(distance_fp)
    Z = fc.linkage(p_dist, method="ward")
    distance_threshold = 800 if log_transformed else 10
    labels = fcluster(Z, t=distance_threshold, criterion="distance") - 1
    [
        passed_traj_list, passed_labels, passed_pair_indexs,
        passed_gene_pair_names
    ] = filter_cluster(traj_list, labels, pair_indexs, gene_pair_names)
    CLUSTER_PLOT_CMAP = "gist_rainbow"
    passed_labels = plot_cluster(passed_traj_list,
                                 passed_labels,
                                 run_name,
                                 FIGURE_FORMAT,
                                 color_palette=None,
                                 log_transformed=log_transformed,
                                 cmap=CLUSTER_PLOT_CMAP)
Exemplo n.º 45
0
def uhc_cluster(cosmic_list, ref_sig):
    spectra = [list(ref_sig.values())]  # so ref signature is value 0
    for sig in cosmic_list:
        spectra.append(list(cosmic_list[sig].values()))
    return linkage(spectra, method='ward', metric='cosine')
Exemplo n.º 46
0
def hacluster(y):
    """ Wrapper for the Hierarchical Clustering algorithm from fastcluster """
    z = fastcluster.linkage(y, method='single')
    return z
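
A short usage sketch for the wrapper above: fastcluster.linkage accepts either an observation matrix or a condensed distance vector, and the returned linkage matrix can then be cut with scipy's fcluster (the threshold below is illustrative):

import numpy as np
import fastcluster
from scipy.cluster.hierarchy import fcluster

y = np.random.rand(20, 3)                    # 20 observations, 3 features
z = fastcluster.linkage(y, method='single')  # same call hacluster(y) makes
labels = fcluster(z, t=0.5, criterion='distance')
print(labels)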
Exemplo n.º 47
0
def validation(M,df_encoded,results,Z,method,min_K,max_K,automatic=None,pp=None,gap=None,Tp=None):
    ##############################################################################
    # HOW MANY CLUSTERS?
    ###############################################################################
    # bootstrap method - sampling without replacement

    #dictionary to store all computed indexes for each number of clusters K=min_K,...max_K
    nn_history = defaultdict(dict)
    trees = defaultdict(dict)
    dicio_statistics = {k:{} for k in range(min_K,max_K)}

    for k in range(min_K,max_K):
        for index in indexes:
            dicio_statistics[k][index] = []

        c_assignments_original = cut_tree(Z, k)
        # list of clusters for the clustering result with the original data
        partition_original = cluster_indices(c_assignments_original, df_encoded.index.tolist())
        trees[k] = partition_original


    #for each bootstrap sample
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded), int((3/4)*len(df_encoded)), replace = False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(itertools.combinations(df_encoded.loc[idx,'id_patient'],2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,columns = ['patient1','patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results, patient_comb_bootstrap, how='inner', on=['patient1','patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'],method)

        #for each number of clusters k=min_K,...,max_K
        for k, partition in trees.items():

            c_assignments_bootstrap = cut_tree(Z_bootstrap,k)
            #list of clusters for the clustering result with the bootstrap sample
            partition_bootstrap = cluster_indices(c_assignments_bootstrap,idx)
            #compute 4 different cluster external indexes between the partitions
            #computed_indexes = cluster_external_index(partition,partition_bootstrap)
            computed_indexes = clustereval.calculate_external(partition, partition_bootstrap)



            #print(computed_indexes)
            for pos, index in enumerate(external_indexes):
                dicio_statistics[k][index].append(computed_indexes[pos])

    for k, partition in trees.items():
        calc_idx = clustereval.calculate_internal(results[['patient1', 'patient2', 'score']], partition, k, trees[max_K - 1])
        for index in internal_indexes:
            dicio_statistics[k][index].append(calc_idx[index])
    ###########################################################################
    #  DECISION ON THE NUMBER OF CLUSTERS
    # The chosen number of clusters is the k that attains the maximum average value
    # for the largest number of clustering indices.
    # That k should also have a low standard deviation - ideally the minimum over
    # all k's, or at least a value that remains low compared to the others.
    ###########################################################################

    #dataframe that stores the clustering indices averages for each k
    col = indexes.copy()
    col.extend(['k', 'k_score_avg'])
    df_avgs = pd.DataFrame(index = range(min_K,max_K),columns = col, dtype='float')
    #dataframe that stores the AR and AW indices standard deviations for each k
    df_stds = pd.DataFrame(index = range(min_K,max_K),columns = col, dtype = 'float')

    #computing the means and standard deviations
    for k in range(min_K,max_K):
        df_avgs.loc[k]['k'] = k
        df_stds.loc[k]['k'] = k
        for index in indexes:
            if index not in internal_indexes:
                df_avgs.loc[k][index] = mean(dicio_statistics[k][index])
                df_stds.loc[k][index] = stdev(dicio_statistics[k][index])
            else:
                df_avgs.loc[k][index] = dicio_statistics[k][index][0]
                df_stds.loc[k][index] = dicio_statistics[k][index][0]

        df_avgs.loc[k]['k_score_avg'] = 0
        df_stds.loc[k]['k_score_std'] = 0

        #df_stds.loc[k]['k_score_std_2'] = 0

    #weight given to each clustering index; here all indices are weighted equally
    weights = {index: 1/len(indexes) for index in indexes}
    #find the maximum value of each clustering index and locate the k at which it occurs;
    # the score of each k is the sum of the weights of the indices for which that k attains the maximum
    columns = df_avgs.columns
    analyzed_columns = columns[2:-3]
    for column in analyzed_columns:

        if column in min_indexes:
            idx_min = df_avgs[column].idxmin()
            df_avgs.loc[idx_min]['k_score_avg'] = df_avgs.loc[idx_min]['k_score_avg'] + weights[column]
            continue


        idx_max = df_avgs[column].idxmax()
        df_avgs.loc[idx_max]['k_score_avg'] = df_avgs.loc[idx_max]['k_score_avg'] + weights[column]

    #idx_min_s_dbw = df_avgs['s_dbw'].idxmin()
    #idx_min_cvnn = df_avgs['cvnn'].idxmin()
    #df_avgs.loc[idx_min_s_dbw]['k_score_avg'] = df_avgs.loc[idx_min_s_dbw]['k_score_avg'] + weights['s_dbw']
    #df_avgs.loc[idx_min_cvnn]['k_score_avg'] = df_avgs.loc[idx_min_cvnn]['k_score_avg'] + weights['cvnn']

    #final number of clusters chosen by analysing df_avgs
    final_k = df_avgs['k_score_avg'].idxmax()


    if(automatic==0 or automatic==1):

        fig1 = plt.figure(figsize=(10,5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        #colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        colLabels1 = external_indexes.copy()
        colLabels1.append('k')
        cell_text1 = []
        for row in range(len(df_avgs)):
            cell_text1.append(df_avgs.iloc[row,list(range(len(external_indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of eleven external indices \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method))
        the_table = plt.table(cellText=cell_text1, colLabels=colLabels1, loc='center',cellLoc='center')
        #the_table.auto_set_font_size(False)
        #the_table.set_fontsize(4)
        fig1.text(0.1, 0.01, "R = Rand, AR = Adjusted Rand, FM = Fowlkes and Mallows, J = Jaccard, AW = Adjusted Wallace, "
                      "VD = Van Dongen, H = Huberts, H' = Huberts Normalized, F = F-Measure, "
                      "VI = Variation of information, MS = Minkowski", fontsize=5)
        pp.savefig(fig1)



        fig2 = plt.figure(3, figsize=(10, 5))
        ax = plt.gca()
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        ax.axis('tight')
        ax.axis('off')
        # colLabels=df_avgs.loc[:, df_avgs.columns != 'k_score_avg'].columns
        colLabels2 = internal_indexes.copy()
        colLabels2.append('k')
        cell_text2 = []
        for row in range(len(df_avgs)):
            cell_text2.append(df_avgs.iloc[row, list(range(len(external_indexes), len(indexes))) + [-2]].round(decimals=3))
        plt.title('Average values of six internal indices \n gap: %.2f, Tp: %.2f, %s link' % (gap, Tp, method))
        plt.table(cellText=cell_text2, colLabels=colLabels2, loc='center', cellLoc='center', fontsize=20)
        pp.savefig(fig2)


        #bar chart of standard deviation - standard deviation of all measures
        # Create a figure instance
    #    plt.figure(2)
    #    df_stds.loc[:,df_stds.columns != 'k'].plot.bar(figsize=(15,8))
    #    plt.title('Standard deviation of five measures versus number of clusters',fontsize=25)
    #    plt.xlabel('Number of clusters',labelpad=20,fontsize=20)
    #    plt.ylabel('Standard deviation',labelpad=10,fontsize=20)
    #    plt.xticks(size = 20)
    #    plt.yticks(size = 20)
    #    plt.show()


        fig3 = plt.figure(4)
        df_stds.loc[:,'AR'].plot.bar(figsize=(15,8),color='forestgreen')
        plt.title('Standard deviation of Adjusted Rand versus number of clusters \n gap: %.2f, Tp: %.2f, %s link' %(gap,Tp,method),fontsize=25)
        plt.xlabel('Number of clusters',labelpad=20,fontsize=15)
        plt.ylabel('Standard deviation',labelpad=10,fontsize=15)
        plt.xticks(size = 20)
        plt.yticks(size = 20)
        #plt.show()

        pp.savefig(fig3)


    return [df_avgs,df_stds,final_k]
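
The comment blocks above describe the overall scheme: repeatedly subsample the data, recluster each bootstrap sample, and compare the resulting partitions against the original cut at the same k using external indices. A stripped-down sketch of that loop, using scikit-learn's adjusted Rand score in place of the clustereval helpers (function and parameter names here are illustrative):

import numpy as np
from fastcluster import linkage
from scipy.cluster.hierarchy import cut_tree
from sklearn.metrics import adjusted_rand_score

def bootstrap_stability(X, k, n_boot=50, frac=0.75, method='ward'):
    # Reference partition on the full data.
    full_labels = cut_tree(linkage(X, method=method), k).ravel()
    scores = []
    for _ in range(n_boot):
        idx = np.sort(np.random.choice(len(X), int(frac * len(X)), replace=False))
        boot_labels = cut_tree(linkage(X[idx], method=method), k).ravel()
        # Compare the bootstrap partition to the reference labels of the sampled rows.
        scores.append(adjusted_rand_score(full_labels[idx], boot_labels))
    return np.mean(scores), np.std(scores)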
Exemplo n.º 48
0
 def __call__(self, X):
     return linkage(X, method='weighted')
Exemplo n.º 49
0
M[:, 1] = ys

scales = np.exp(np.linspace(np.log(Lmin), np.log(Lmax), Nscales))

del MASS, Observable

print('        Scale Loop        \n')

for scale in scales:
    print('        This scale : %f\t     \n' % scale)
    l = 1.1 * scale / float(ds.length_unit.in_units('pc'))
    ls = 1.1 * scale

    distance = sch.distance.pdist(
        M)  # vector of (100 choose 2) pairwise distances
    Link = fastcluster.linkage(distance, method='complete')
    ind = sch.fcluster(Link, l, 'distance')

    xcm = []
    ycm = []

    for j in set(ind):
        temp_mass = mass[ind == j].sum()
        if temp_mass > 10:
            xcm.append(np.average(xs[ind == j], weights=mass[ind == j]))
            ycm.append(np.average(ys[ind == j], weights=mass[ind == j]))

    del temp_mass

    print('        Number of regions :%d\t     \n' % len(xcm))
Exemplo n.º 50
0
def cluster_validation(M, method, k, partition_found, df_encoded, results):
    #dictionary to store all computed indexes for each cluster
    dicio_cluster_validation = {k: {} for k in range(1, k + 1)}
    for k in range(1, k + 1):
        dicio_cluster_validation[k]['jaccard'] = []
        dicio_cluster_validation[k]['dice'] = []
        dicio_cluster_validation[k]['asymmetric'] = []

    #assess cluster stability for K=k that was the number of clusters chosen
    for i in range(M):
        # sampling rows of the original data
        idx = np.random.choice(len(df_encoded),
                               int((3 / 4) * len(df_encoded)),
                               replace=False)
        idx = np.sort(idx)
        #get all the possible combinations between the sampled patients
        patient_comb_bootstrap = list(
            itertools.combinations(df_encoded.loc[idx, 'id_patient'], 2))
        patient_comb_bootstrap = pd.DataFrame(patient_comb_bootstrap,
                                              columns=['patient1', 'patient2'])
        #extract the scores regarding the previous sampled combinations to be used in hierarchical clustering
        results_bootstrap = pd.merge(results,
                                     patient_comb_bootstrap,
                                     how='inner',
                                     on=['patient1', 'patient2'])
        # Hierarchical Clustering of the bootstrap sample
        Z_bootstrap = linkage(results_bootstrap['score'], method)

        c_assignments_bootstrap = cut_tree(Z_bootstrap, k)
        partition_bootstrap = cluster_indices(c_assignments_bootstrap, idx)

        for k_i in range(1, k + 1):
            aux_jaccard = []
            aux_dice = []
            aux_asymmetric = []
            for i in range(1, k + 1):
                aux = cluster_validation_indexes(partition_found[k_i - 1],
                                                 partition_bootstrap[i - 1])
                aux_jaccard.append(aux[0])
                aux_dice.append(aux[2])
                aux_asymmetric.append(aux[1])

            dicio_cluster_validation[k_i]['jaccard'].append(max(aux_jaccard))
            dicio_cluster_validation[k_i]['dice'].append(max(aux_dice))
            dicio_cluster_validation[k_i]['asymmetric'].append(
                max(aux_asymmetric))

    #obtain the average cluster external indexes for each number of clusters
    jaccard_cluster_median = []
    dice_median = []
    asymmetric_median = []
    jaccard_cluster_avg = []
    dice_avg = []
    asymmetric_avg = []
    jaccard_cluster_std = []
    dice_std = []
    asymmetric_std = []
    table = []
    cluster_sizes = []

    for k in range(1, k + 1):
        jaccard_cluster_median.append(
            round(median(dicio_cluster_validation[k]['jaccard']), 3))
        dice_median.append(
            round(median(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_median.append(
            round(median(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_avg.append(
            round(mean(dicio_cluster_validation[k]['jaccard']), 3))
        dice_avg.append(round(mean(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_avg.append(
            round(mean(dicio_cluster_validation[k]['asymmetric']), 3))
        jaccard_cluster_std.append(
            round(stdev(dicio_cluster_validation[k]['jaccard']), 3))
        dice_std.append(round(stdev(dicio_cluster_validation[k]['dice']), 3))
        asymmetric_std.append(
            round(stdev(dicio_cluster_validation[k]['asymmetric']), 3))

        cluster_sizes.append(len(partition_found[k - 1]))

        table.append([
            str(k) + ' (' + str(len(partition_found[k - 1])) + ')',
            jaccard_cluster_median[k - 1], dice_median[k - 1],
            asymmetric_median[k - 1], jaccard_cluster_avg[k - 1],
            dice_avg[k - 1], asymmetric_avg[k - 1], jaccard_cluster_std[k - 1],
            dice_std[k - 1], asymmetric_std[k - 1]
        ])

    headers = [
        'Cluster Number', 'J_median', 'D_median', 'A_median', 'J_avg', 'D_avg',
        'A_avg', 'J_std', 'D_std', 'A_std'
    ]
    print(tabulate(table, headers))

    cluster_stability = [
        jaccard_cluster_median, dice_median, asymmetric_median,
        jaccard_cluster_avg, dice_avg, asymmetric_avg, jaccard_cluster_std,
        dice_std, asymmetric_std, cluster_sizes
    ]

    return cluster_stability
Exemplo n.º 51
0
def color_palette(frame_bgr,
                  mask=None,
                  mask_index=None,
                  n_merge_steps=100,
                  image_size=400.0,
                  seeds_model=None,
                  n_pixels=400,
                  n_merge_per_lvl=10,
                  mask_inverse=False,
                  normalization_lower_bound=100.0,
                  seeds_input_width=600,
                  use_lab=True,
                  show_seed=False,
                  seed_labels=None) -> PaletteAsset:
    """
    Computes a hierarchical color palette as generated by VIAN; the original tree is not kept.

    :param frame_bgr: A frame in bgr uint8, currently float32 is not allowed since OpenCV may crash on it
    :param mask: An optional mask of labels
    :param mask_index: The label which the palette should be computed on
    :param mask_inverse: If true, all but the given mask_index will be computed.
    :param n_merge_steps: Number of merge steps to return (approximately), this is restricted by the
    :param image_size: image size to compute on
    :param seeds_model: the seeds model can optionally be given as argument to avoid initialization after each image
    :param n_pixels: number of super pixels to compute (approximately)
    :param n_merge_per_lvl: After the first 10 merges, every nth depth to store in the result
    :param normalization_lower_bound: Minimal number of pixels to keep a cluster
    :param seeds_input_width: input for the seeds model
    :param use_lab: if false, RGB will be used for average computation instead of lab
    :param show_seed: if true, the SEEDS output will be shown in OpenCV; make sure to call cv2.waitKey() to see the result
    :return: PaletteAsset
    """

    frame = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2LAB)

    if seeds_input_width < frame.shape[0]:
        rx = seeds_input_width / frame.shape[0]
        frame = cv2.resize(frame, None, None, rx, rx, cv2.INTER_CUBIC)

    if seed_labels is None:
        if seeds_model is None:
            seeds_model = PaletteExtractorModel(frame,
                                                n_pixels=n_pixels,
                                                num_levels=4)
        labels = seeds_model.forward(frame, 200).astype(np.uint8)
    else:
        labels = seed_labels

    if show_seed:
        cv2.imshow(
            "SEED",
            cv2.cvtColor(seeds_model.labels_to_avg_color_mask(frame, labels),
                         cv2.COLOR_LAB2BGR))

    fx = image_size / frame.shape[0]
    frame = cv2.resize(frame, None, None, fx, fx, cv2.INTER_CUBIC)
    labels = cv2.resize(labels, None, None, fx, fx, cv2.INTER_NEAREST)
    frame_bgr = cv2.resize(frame_bgr, None, None, fx, fx, cv2.INTER_CUBIC)

    if mask is not None:
        mask = cv2.resize(mask, (labels.shape[1], labels.shape[0]), None,
                          cv2.INTER_NEAREST)

        if mask_inverse:
            labels[np.where(mask == mask_index)] = 255
        else:
            labels[np.where(mask != mask_index)] = 255

        bins = np.unique(labels)
        bins = np.delete(bins, np.where(bins == 255))
    else:
        bins = np.unique(labels)

    data = []
    hist = np.histogram(labels, bins=bins)

    normalization_f = np.amin(hist[0])
    if normalization_f < normalization_lower_bound:
        normalization_f = normalization_lower_bound
    labels_list = []
    colors_list = []

    all_cols = []

    all_labels = []

    for i, bin in enumerate(hist[0]):
        if bin < normalization_f:
            continue
        lbl = hist[1][i]
        if use_lab:
            avg_color = np.round(
                cv2.cvtColor(
                    np.array(
                        [[np.mean(frame[np.where(labels == lbl)], axis=0)]],
                        dtype=np.uint8),
                    cv2.COLOR_LAB2BGR)[0, 0]).astype(np.uint8)
        else:
            avg_color = np.round(
                np.mean(frame_bgr[np.where(labels == lbl)],
                        axis=0)).astype(np.uint8)

        labels_list.append(lbl)
        colors_list.append(avg_color)

        data.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_cols.extend([avg_color] * int(np.round(bin / normalization_f)) * 2)
        all_labels.extend([lbl] * int(np.round(bin / normalization_f)) * 2)

    data = np.array(data)

    Z = linkage(data, 'ward')
    tree, merge_dists = to_cluster_tree(Z, all_labels, all_cols, n_merge_steps,
                                        n_merge_per_lvl)
    return PaletteAsset(tree, merge_dists)
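
to_cluster_tree is not included in this listing. As an illustration of the last step, the ward linkage Z computed above can also be flattened into a fixed number of palette colors by cutting the tree and averaging the member colors of each flat cluster (a sketch under that assumption, not VIAN's own helper):

import numpy as np
from scipy.cluster.hierarchy import fcluster

def flatten_palette(Z, colors, n_colors=8):
    # colors: (n_samples, 3) uint8 array, one row per weighted superpixel color.
    labels = fcluster(Z, t=n_colors, criterion='maxclust')
    palette = [colors[labels == lbl].mean(axis=0) for lbl in np.unique(labels)]
    return np.round(palette).astype(np.uint8)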
Exemplo n.º 52
0
        if len(corp)>4:
            wordCorps.append(corp)
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''train and pick trained word vec'''
dirs = "C:\\Users\\Administrator.NBJXUEJUN-LI\\Desktop\\project\\Python\\NLP\\savedObject\\CompCorpus\\"
slm = pickle.load(open(dirs+"slm.pkl","rb"))
'''perform k-means clustering without normalization'''
from sklearn.cluster import KMeans
TopicNums = 10
wordNums = slm.wordvec.shape[0]
kmeansFit = KMeans(n_clusters=TopicNums)
kmeansFit.fit(slm.wordvec)

'''perform hierarchical clustering'''
import fastcluster 
result = fastcluster.linkage(X=slm.wordvec, method='single', metric='euclidean', preserve_input=False)
'''compute word depth'''
clustStruct = {}
for ridx in range(result.shape[0]):
    cidx = int(ridx+wordNums)
    clustStruct.setdefault(cidx, np.zeros(wordNums, dtype=int))
    for i in [0,1]:
        code = int(result[ridx][i])
        if code<wordNums:
            clustStruct[cidx][code]+=1
        else:
            clustStruct[cidx]+=(clustStruct[code]+ (clustStruct[code]!=0).astype('int'))
wordDepth = clustStruct[max(clustStruct.keys())]
#-----------------------------------------------------------------------------------------------------------------------------------------------
'''compute the word degree from within-sentence co-occurrence across docs'''
sentWindow = 3
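
For reference, each row of the linkage matrix walked above encodes one merge: columns 0 and 1 are the ids of the two merged clusters (ids below the number of observations are original points, larger ids refer to the cluster created at row id minus n), column 2 is the merge distance and column 3 the size of the new cluster. A tiny standalone sketch that prints the merges in the same order the loop above traverses them:

import numpy as np
import fastcluster

X = np.random.rand(6, 4)
Z = fastcluster.linkage(X, method='single', metric='euclidean')
n = X.shape[0]
for row_idx, (a, b, dist, size) in enumerate(Z):
    print('merge %d: %d + %d -> cluster %d (distance %.3f, size %d)'
          % (row_idx, int(a), int(b), n + row_idx, dist, int(size)))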
Exemplo n.º 53
0
def cluster(dupes: numpy.ndarray,
            cluster_threshold: float = 0.5,
            max_components: int = 30000,
            id_to_match: str = None) -> Clusters:
    """
    Takes in a list of duplicate pairs and clusters them into lists of
    records that all refer to the same entity, based on a given
    threshold

    `https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html`



    Args:
        dupes: (np.array)[tuple(list[str], float)] A list of tuples, where each tuple
            contains an id pair and a probability that they are a match:
                id_pair_tuple: ([record_id_1, record_id_2], prob)
                dtype: np.dtype([('pairs', '<U256', 2),
                                 ('score', 'f4', 1)])
        cluster_threshold: (float) number between 0 and 1 (default is .5). Lowering the
            number will increase precision, raising it will increase recall
    """
    distance_threshold = cluster_threshold
    score_threshold = 1 - cluster_threshold
    dupe_sub_graphs = connected_components(dupes, max_components)
    # logger.info(f"Dupes: {dupes}")
    for sub_graph in dupe_sub_graphs:
        if len(sub_graph) > 1:
            i_to_id, condensed_distances, N = condensed_distance(sub_graph)
            logger.debug(f"{condensed_distances}")
            linkage = fastcluster.linkage(condensed_distances,
                                          method='centroid',
                                          preserve_input=True)
            partition = hcluster.fcluster(linkage,
                                          distance_threshold,
                                          criterion='distance')

            clusters: Dict[int, List[int]] = defaultdict(list)
            logger.debug(f"Partition: {partition}")
            logger.debug(f"Linkage: {linkage}")
            for i, cluster_id in enumerate(partition):
                clusters[cluster_id].append(i)

            logger.info(f"Clusters: {clusters}")
            for cluster in clusters.values():
                if len(cluster) > 1:
                    scores = confidences(cluster, condensed_distances, N)
                    logger.info(
                        f"Cluster Ids and scores: {tuple(i_to_id[i] for i in cluster)}, {scores}"
                    )
                    ids = [i_to_id[i] for i in cluster]
                    if id_to_match in ids and id_to_match is not None:
                        yield tuple(ids), scores
                    elif id_to_match is None:
                        yield tuple(ids), scores

        else:
            (ids, score), = sub_graph
            if score > score_threshold and id_to_match in ids and id_to_match is not None:
                # logger.info(tuple(ids), ((score,) * 2))
                yield tuple(ids), (score, ) * 2
            elif score > score_threshold and id_to_match is None:
                yield tuple(ids), (score, ) * 2
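
The heart of the function above is the pair of calls to fastcluster.linkage and hcluster.fcluster on a condensed distance vector; a self-contained sketch of just that step (connected_components, condensed_distance and the id bookkeeping are omitted):

import fastcluster
from scipy.cluster import hierarchy as hcluster

def threshold_clusters(condensed_distances, distance_threshold=0.5):
    # condensed_distances: 1-D vector of length n*(n-1)/2, as produced by pdist.
    Z = fastcluster.linkage(condensed_distances, method='centroid', preserve_input=True)
    # Every record gets a cluster id; records within the threshold share one.
    return hcluster.fcluster(Z, distance_threshold, criterion='distance')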
Exemplo n.º 54
0
 def __call__(self, X):
     return linkage(X, method='average')
Exemplo n.º 55
0
    fname = os.path.basename(fname)
    if fname.endswith('.distances'):
        pdb = fname[:-10]
        index[pdb] = i
length = len(index)
npArray = np.zeros((length, length))
for filename in glob.glob('/home/lmt72/PDBdistances/*.distances'):
    distanceFile = open(filename)
    filename = os.path.basename(filename)
    if filename.endswith('.distances'):
        pdb = filename[:-10]
    for line in distanceFile:
        data = line.split()
        secondProtein = data[0]
        distance = float(data[1].strip())
        i = index[pdb]
        npArray[i, index[secondProtein]] = distance
        npArray[index[secondProtein], i] = distance
names = ['' for x in range(length)]
for (name, i) in index.items():
    names[i] = name
print(npArray)

Z1 = fcl.linkage(npArray, method='average')
l1 = sch.leaves_list(Z1)
D = (npArray[l1])
Z2 = fcl.linkage(npArray.T, method='average')
l2 = sch.leaves_list(Z2)
D = D[:, l2]
pickle.dump((npArray, D, Z1, names), open("clusterstate.pickle", 'wb'), -1)
Exemplo n.º 56
0
 def __call__(self, X):
     return linkage(X, method='ward')
Exemplo n.º 57
0
 def __call__(self, X):
     return linkage(X, method='complete')
Exemplo n.º 58
0
num_points_per_bb = 50
data_generator = DataGenerator(bounding_boxes, num_points_per_bb, window)
points = data_generator.load_points_from_csv('points.csv')
#points = data_generator.generate_points()

canvas = Canvas(window, width=1024, height=768, bg='white')
colors = ['white', 'yellow', 'cyan', 'red', 'blue', 'brown', 'green']

np_points = np.zeros((len(points), 2))
for i in range(len(points)):
    np_points[i][0] = points[i].x
    np_points[i][1] = points[i].y

start_time = time.time()
np_clusters = fastcluster.linkage(np_points,
                                  method='single',
                                  metric='euclidean')

print(np_clusters)

clusters = []
for i in range(len(points)):
    cluster = Cluster(i)
    cluster.populate([points[i]])
    clusters.append(cluster)


def get_cluster_by_id(id):
    for i in range(len(clusters)):
        if clusters[i].id == id:
            return i
Exemplo n.º 59
0
 def __call__(self, X):
     return linkage(X, method='single')
Exemplo n.º 60
0
    def __initial_match(self,
                        candidate_list: (np.ndarray, np.generic),
                        min_pts=2,
                        t=50,
                        criterion='distance'):
        # TODO group matching for non-grouped user
        # 1 : dbscan algorithm + gps based movement vector alignment -> clear!
        # 2 : acceleration -> let's discuss
        """Performs initial-clustering on cn candidate_list(nT x 2 numpy array) and returns group lists.
        Parameters
        ----------
        candidate_list : array of shape (n_samples, n_time_steps, 2) holding latitude/longitude pairs
        min_pts : minimum number of members per group for the HDBSCAN algorithm
        t : scalar
            For criteria 'inconsistent', 'distance' or 'monocrit',
            this is the threshold to apply when forming flat clusters.
            For 'maxclust' or 'maxclust_monocrit' criteria,
            this would be max number of clusters requested.
        criterion : str, optional
        The criterion to use in forming flat clusters. This can
        be any of the following values:

          ``inconsistent`` :
              If a cluster node and all its
              descendants have an inconsistent value less than or equal
              to `t` then all its leaf descendants belong to the
              same flat cluster. When no non-singleton cluster meets
              this criterion, every node is assigned to its own
              cluster. (Default)

          ``distance`` :
              Forms flat clusters so that the original
              observations in each flat cluster have no greater a
              cophenetic distance than `t`.

          ``maxclust`` :
              Finds a minimum threshold ``r`` so that
              the cophenetic distance between any two original
              observations in the same flat cluster is no more than
              ``r`` and no more than `t` flat clusters are formed.

          ``monocrit`` :
              Forms a flat cluster from a cluster node c
              with index i when ``monocrit[j] <= t``.

              For example, to threshold on the maximum mean distance
              as computed in the inconsistency matrix R with a
              threshold of 0.8 do::

                  MR = maxRstat(Z, R, 3)
                  cluster(Z, t=0.8, criterion='monocrit', monocrit=MR)

          ``maxclust_monocrit`` :
              Forms a flat cluster from a
              non-singleton cluster node ``c`` when ``monocrit[i] <=
              r`` for all cluster indices ``i`` below and including
              ``c``. ``r`` is minimized such that no more than ``t``
              flat clusters are formed. monocrit must be
              monotonic. For example, to minimize the threshold t on
              maximum inconsistency values so that no more than 3 flat
              clusters are formed, do::

                  MI = maxinconsts(Z, R)
                  cluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)
        Returns
        ----------
        groups : list of shape (n_clusters, n_members)

        Examples
        ----------
        >>> candidate_list = np.array([,...,], shape=[5,3,2]) -> labels of candidate_list = [0,1,0,1,0]
        >>> groups = [[0,2,4],[1,3]]
        """
        assert isinstance(candidate_list, (np.ndarray, np.generic))
        num_of_data, num_time_steps, _ = candidate_list.shape
        X = np.array([
            candidate_list[i, num_time_steps - 1, :]
            for i in range(num_of_data)
        ])
        rads = np.radians(X)  # [N,2]
        # Clustering with gps-data of 1-time step.
        # 'haversine' do clustering using distance transformed from (lat, long)
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_pts,
                                    min_samples=2,
                                    metric='haversine')
        labels = clusterer.fit_predict(rads)
        print('Before trajectory clustering, labels are ', labels)
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        groups = []
        for ulb in range(n_clusters_):
            groups.append([])
        for i, lb in enumerate(labels):
            if lb == -1:
                continue
            groups[lb].append(i)
        total_n_clusters = n_clusters_
        # Group refinement considering user's trajectory
        for nc in range(n_clusters_):
            group_member_mask = (labels == nc)
            group_members = candidate_list[group_member_mask]
            pdist = tdist.pdist(group_members.transpose([0, 2, 1]),
                                metric="sspd",
                                type_d="spherical")
            Z = fc.linkage(pdist, method="ward")
            sub_labels = sch.fcluster(Z, t, criterion=criterion) - 1
            unique_sub_labels = len(set(sub_labels))
            if unique_sub_labels == 1:
                continue
            for ad in range(unique_sub_labels - 1):
                groups.append([])
            member_indices = list(
                compress(range(len(group_member_mask)), group_member_mask))
            for sb in range(unique_sub_labels):
                sub_group_mask = (sub_labels == sb)
                sub_member_indices = list(
                    compress(range(len(sub_group_mask)), sub_group_mask))
                # Noise case
                if len(sub_member_indices) == 1:
                    groups[nc].remove(member_indices[sub_member_indices[0]])
                    labels[member_indices[sub_member_indices[0]]] = -1
                    continue
                for m in range(len(sub_member_indices)):
                    # remove from wrong group
                    groups[nc].remove(member_indices[sub_member_indices[m]])
                    # add to refined group
                    groups[total_n_clusters].append(
                        member_indices[sub_member_indices[m]])
                    labels[member_indices[
                        sub_member_indices[m]]] = total_n_clusters
                total_n_clusters += 1
        print('After trajectory clustering, labels are ', labels)
        return groups.copy()
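
As a compact illustration of the fcluster criteria discussed in the docstring above, the same linkage matrix can be cut either by cophenetic distance or by a requested maximum number of clusters (a standalone sketch; the trajectory distances used above come from tdist.pdist):

import numpy as np
import fastcluster
from scipy.cluster import hierarchy as sch

points = np.random.rand(30, 2)
Z = fastcluster.linkage(points, method='ward')

labels_by_distance = sch.fcluster(Z, t=0.8, criterion='distance')  # cut at a tree height
labels_by_count = sch.fcluster(Z, t=4, criterion='maxclust')       # at most 4 flat clusters
print(labels_by_distance, labels_by_count)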