Example #1
# assumed imports for this snippet:
import numpy as np
import scipy.spatial.distance as ssd
import scipy.cluster.hierarchy as sch
import pydendroheatmap as pdh  # assumption: aliases match the calls below

def random_distribution(n):

    #make up some data
    data = np.random.normal(scale=n, size=(n, n))
    data[0:n // 2, 0:n // 2] += 75
    data[n // 2:, n // 2:] = np.random.poisson(lam=n, size=data[n // 2:, n // 2:].shape)
    #cluster the rows
    row_dist = ssd.squareform(ssd.pdist(data))
    row_Z = sch.linkage(row_dist)
    row_idxing = sch.leaves_list(row_Z)

    row_labels = ['bar{}'.format(i) for i in range(n)]

    #cluster the columns
    col_dist = ssd.squareform(ssd.pdist(data.T))
    col_Z = sch.linkage(col_dist)
    col_idxing = sch.leaves_list(col_Z)
    #make the dendrogram

    col_labels = ['foo{}'.format(i) for i in range(n)]

    data = data[:,col_idxing][row_idxing,:]

    heatmap = pdh.DendroHeatMap(heat_map_data=data, left_dendrogram=row_Z,
                                top_dendrogram=col_Z,
                                heatmap_colors=("#ffeda0", "#feb24c", "#f03b20"),
                                window_size="auto", color_legend_displayed=False,
                                label_color="#777777")
    heatmap.row_labels = row_labels
    heatmap.col_labels = col_labels
    heatmap.title = 'An example heatmap'
    heatmap.show()  # or: heatmap.save("example.png")
Example #2
import pylab
import scipy.cluster.hierarchy as sch  # assumed imports for this snippet

def save_mat(c2map, filepath):
	mat = c2map['mat']
	fig = pylab.figure(figsize=(8,8))
	
	# Compute and plot first dendrogram.
	ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
	Y = sch.linkage(mat, method='centroid')
	Z1 = sch.dendrogram(Y, orientation='right')
	ax1.set_xticks([])
	ax1.set_yticks([])

	# Compute and plot second dendrogram.
	ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
	Y = sch.linkage(mat, method='single')
	Z2 = sch.dendrogram(Y)
	ax2.set_xticks([])
	ax2.set_yticks([])

	# Plot distance matrix.
	axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
	idx1 = Z1['leaves']
	idx2 = Z2['leaves']
	mat = mat[idx1,:]
	mat = mat[:,idx2]
	im = axmatrix.matshow(mat, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
	axmatrix.set_xticks([])
	axmatrix.set_yticks([])

	# Plot colorbar.
	axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
	pylab.colorbar(im, cax=axcolor)

	fig.savefig(filepath)
    @classmethod
    def getDistMatrixes(cls, distDict, distMeasure, linkageCriterion):
        """
        Find and return the correlation matrix, linkage matrix and distance matrix for the
        distance/correlation measure given by the distMeasure parameter.
        """
        from scipy.spatial.distance import squareform
        from numpy import ones, fill_diagonal
        from scipy.cluster.hierarchy import linkage

        if distMeasure == cls.CORR_PEARSON or distMeasure == cls.SIM_MCCONNAUGHEY:
            '''As these measures generate values between -1 and 1, need special handling'''

            # Cluster distances, i.e. convert correlation into distance between 0 and 1
            triangularCorrMatrix = distDict[distMeasure]
            triangularDistMatrix = ones(len(triangularCorrMatrix)) - [(x + 1) / 2 for x in triangularCorrMatrix]
            linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

            # Make correlation matrix square
            correlationMatrix = squareform(triangularCorrMatrix)
            fill_diagonal(correlationMatrix, 1)
        else:

            # Cluster distances
            triangularDistMatrix = distDict[distMeasure]
            linkageMatrix = linkage(cls.removeNanDistances(triangularDistMatrix), linkageCriterion)

            # Convert triangular distances into square correlation matrix
            squareDistMatrix = squareform(triangularDistMatrix)
            squareSize = len(squareDistMatrix)
            correlationMatrix = ones((squareSize, squareSize)) - squareDistMatrix

        return correlationMatrix, linkageMatrix, triangularDistMatrix
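The (x + 1) / 2 mapping above rescales correlations from [-1, 1] to similarities in [0, 1], so subtracting from one gives a distance. A minimal standalone sketch of that conversion, on hypothetical data with plain numpy/scipy:

import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

triangular_corr = np.array([0.9, -0.2, 0.4])           # condensed correlations for 3 items
triangular_dist = 1.0 - (triangular_corr + 1.0) / 2.0  # map [-1, 1] into [0, 1] distances
Z = linkage(triangular_dist, 'average')
corr_matrix = squareform(triangular_corr)              # make the correlation matrix square
np.fill_diagonal(corr_matrix, 1)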
def draw_intensity(a, cmap=GREEN_CMAP, metric='euclidean', method='average', sort_x=True, sort_y=True):
    main_axes = plt.gca()
    divider = make_axes_locatable(main_axes)

    if sort_x is True:
        plt.sca(divider.append_axes("top", 0.5, pad=0))
        xlinkage = linkage(pdist(a.T, metric=metric), method=method, metric=metric)
        xdendro = dendrogram(xlinkage, orientation='top', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a[[a.columns[i] for i in xdendro['leaves']]]

    if sort_y is True:
        plt.sca(divider.append_axes("left", 1.0, pad=0))
        ylinkage = linkage(pdist(a, metric=metric), method=method, metric=metric)
        ydendro = dendrogram(ylinkage, orientation='right', no_labels=True,
                             distance_sort='descending',
                             link_color_func=lambda x: 'black')
        plt.gca().set_axis_off()
        a = a.loc[[a.index[i] for i in ydendro['leaves']]]

    plt.sca(main_axes)
    plt.imshow(a, aspect='auto', interpolation='none',
               cmap=cmap, vmin=0.0, vmax=1.0)
    plt.colorbar(pad=0.15)
    plt.gca().yaxis.tick_right()
    plt.xticks(range(a.shape[1]), a.columns, rotation=90, size='small')
    plt.yticks(range(a.shape[0]), a.index, size='x-small')
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.gca().invert_yaxis()

    plt.show()
def hierarchical_clustering(data, skill,  method='single', metric='euclidean', dendrogram=True, concepts=False, cluster_number=3, corr_as_vectors=False):
    pk, level = data.get_skill_id(skill)
    items = data.get_items_df()
    skills = data.get_skills_df()
    corr = compute_corr(data, merge_skills=concepts)
    print("Corr ({}) contain total {} values and from that {} nans".format(corr.shape, corr.size, corr.isnull().sum().sum()))
    corr[corr.isnull()] = 0

    if concepts:
        items = items[items["skill_lvl_" + str(level)] == pk]
        skill_ids = items[~items["skill_lvl_3"].isnull()]["skill_lvl_3"].unique()
        corr = pd.DataFrame(corr, index=skill_ids, columns=skill_ids)
        labels = list(skills.loc[corr.index]["name"])

    else:
        items = items[items["skill_lvl_" + str(level)] == pk]
        items = items[items["visualization"] != "pairing"]
        corr = pd.DataFrame(corr, index=items.index, columns=items.index)
        labels = ["{1} - {0}".format(item["name"], item["visualization"][0]) for id, item in list(items.iterrows())]

    if corr_as_vectors:
        Z = hr.linkage(corr, method=method, metric=metric)
    else:
        Z = hr.linkage(dst.squareform(1 - corr), method=method)
    Z[Z < 0] = 0
    if dendrogram:
        plt.title('{}: method: {}, metric: {}, as vectors: {}'.format(skill, method, metric, corr_as_vectors))
        plt.xlabel('items' if not concepts else "concepts")
        plt.ylabel('distance')
        hr.dendrogram(Z, leaf_rotation=90., leaf_font_size=10., labels=labels)

    return hr.fcluster(Z, cluster_number, "maxclust")
Example #6
def compare_clusters(args):

    ref_df = pd.read_table(args['ref'], sep='\t', skipinitialspace=True, index_col=0).values
    check_symmetry(ref_df)
    linkage_ref = linkage(ref_df, 'average')
    c_ref, coph_dists_ref = cophenet(linkage_ref, pdist(ref_df))

    outfile = open(args['output'],"w")
    outfile.write("Tree_cluster\tMantel_Correlation_Coefficient\tManter_P-value\tCophenetic_Pearson\tCophenetic_P-value\n")

    for i in args['all']:
        fst_df = pd.read_table(i, sep='\t', skipinitialspace=True, index_col=0).values
        check_symmetry(fst_df)
        mantel_coeff = 0.0
        p_value_mantel = 0.0
        cophenetic_pearson = 0.0
        p_value_cophenetic = 0.0
        n = 0
        try:
            mantel_coeff, p_value_mantel, n = mantel(ref_df, fst_df)
            linkage_fst = linkage(fst_df, 'average')
            c_fst, coph_dists_fst = cophenet(linkage_fst, pdist(fst_df))
            cophenetic_pearson, p_value_cophenetic = pearsonr(coph_dists_ref, coph_dists_fst)
        except Exception as e:
            print("Error : %s" % str(e))
            mantel_coeff = "Failed"
            p_value_mantel = "Failed"
            cophenetic_pearson = "Failed"
            p_value_cophenetic = "Failed"

        outfile.write(i+"\t"+str(mantel_coeff)+"\t"+str(p_value_mantel)+"\t"+str(cophenetic_pearson)+"\t"+str(p_value_cophenetic)+"\n")

    outfile.close()
Example #7
    def cluster_fps(self):
        clkg = hcluster.linkage(self.dm, method='average')
        coarse_r = hcluster.fcluster(clkg, 0.3, criterion='distance')
        self.coarse_r = coarse_r

        bcount = np.bincount(coarse_r)
        knum = len(np.nonzero(bcount > 1)[0])

        s = self.density_matrix.shape
        if False and len(s) > 1 and s[0] > 10 and s[1] > 10 and knum < min(s) / 2:  # note: `False and` disables this SVD branch
            (u, s, vt) = la.svds(self.sps_matrixs, k=knum)
            self.u = u
            print('============')
        else:
            
            self.result = self.coarse_r
            return (clkg,clkg)
 

#rankA = npla.matrix_rank(self.sps_matrixs)
#        if rankA < 3:
        a = np.matrix(np.diag(s)) * np.matrix(vt)
        pd = dist.pdist(np.array(a.T), 'cosine')
        pd[np.abs(pd) < 1e-11] = 0
        lkg = hcluster.linkage(pd, method='average')
        self.lkg = lkg

        self.result = hcluster.fcluster(lkg, self.svd_cluster_thr, criterion='distance')

#        self.result = hcluster.fcluster(lkg,1)

# self.result = hcluster.fclusterdata(u,0.7,metric = 'cosine', criterion = 'distance',method = 'average')
        return (lkg,clkg)
def main():
    D = 2 # so we can visualize it more easily
    s = 4 # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])

    N = 900 # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3

    Z = linkage(X, 'ward')
    print "Z.shape:", Z.shape
    # Z has the format [idx1, idx2, dist, sample_count]
    # therefore, its size will be (N-1, 4)
    plt.title("Ward")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'single')
    plt.title("Single")
    dendrogram(Z)
    plt.show()

    Z = linkage(X, 'complete')
    plt.title("Complete")
    dendrogram(Z)
    plt.show()
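A quick standalone check of the linkage-matrix layout described in the comments above (a sketch on random data):

import numpy as np
from scipy.cluster.hierarchy import linkage

X = np.random.randn(10, 2)
Z = linkage(X, 'ward')
assert Z.shape == (len(X) - 1, 4)  # one merge per row: [idx1, idx2, dist, sample_count]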
Example #9
import numpy as np
import pylab
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hrc  # assumed imports for this snippet

def HierarchicalCluster(A):
    # see http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
    Corr = np.corrcoef(A.T)
    fig = plt.figure(figsize=(8,8))
    ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
    Y = hrc.linkage(Corr, method='centroid')
    Z1 = hrc.dendrogram(Y, orientation='right')
    ax1.set_xticks([])
    ax1.set_yticks([])

    ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
    Y = hrc.linkage(Corr, method='centroid')
    Z2 = hrc.dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])

    axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    Corr = Corr[idx1, :]
    Corr = Corr[:, idx2]
    im = axmatrix.matshow(Corr, aspect='auto', origin='lower')

    axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig('dendrogram.png')
Example #10
	def hcluster_cols(self, thresh):
		try:
			link = linkage(self.X.T, method='complete', metric='cosine')
			assignments = fcluster(link, thresh, 'distance')

		except Exception:
			link = linkage(self.X.T, method='complete', metric='euclidean')
			assignments = fcluster(link, thresh, 'distance')

		col_ind = np.arange(len(self.crimes))
		d = pd.DataFrame(list(zip(col_ind, assignments))).groupby(1)[0].aggregate(lambda x: tuple(x))
		df_new = pd.DataFrame(index = np.arange(len(self.names)))
		for i in d:
			cols = []
			for w in i:
				cols.append(w)
			if len(cols) > 1:
				df_new[str(self.crimes[cols])] = np.mean(self.X[:, cols], axis=1)
			else:
				df_new[str(self.crimes[cols[0]])] = self.X[:, cols[0]]

		# plt.figure(figsize=(10,20))
		# dendro = dendrogram(link, color_threshold=thresh, leaf_font_size=13, labels = self.crimes, orientation = 'left')
		# plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
		# plt.show()

		self.df = df_new
		self.crimes = df_new.columns.values
 def starthcc(self):
     print(self.dm, self.lin)
     dataFrame = pd.DataFrame(self.tr, columns=['x', 'y'])
     from scipy.spatial.distance import pdist, squareform

     # not printed as pretty, but the values are correct
     distxy = squareform(pdist(dataFrame, metric=self.dm))
     # the original repeated an identical plotting block for 'single',
     # 'complete' and the default linkage; one block covers all cases
     plt.figure()
     R = dendrogram(linkage(distxy, method=str(self.lin)))

     plt.xlabel('X units')
     plt.ylabel('Y units')
     plt.suptitle('Cluster Dendrogram', fontweight='bold', fontsize=14)

     plt.show()
Example #12
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale, 
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    r = np.arange(10, data_array.shape[0], data_array.shape[0] // 10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
Example #13
def _cluster_idx(df):
    """ sort indices by clusters """
    dcol = pdist(df.T)
    drow = pdist(df)
    lcol = linkage(dcol)
    lrow = linkage(drow)
    cols = dendrogram(lcol, no_plot=True)['leaves']
    rows = dendrogram(lrow, no_plot=True)['leaves']
    return rows,cols
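A short usage sketch for _cluster_idx (hypothetical DataFrame; assumes pdist, linkage and dendrogram are imported as in the snippet above). The returned index lists reorder the frame so clustered rows and columns sit together:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram

df = pd.DataFrame(np.random.rand(6, 4))
rows, cols = _cluster_idx(df)
df_sorted = df.iloc[rows, :].iloc[:, cols]  # rows and columns in dendrogram leaf order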
Example #14
    def check_linkage_q(self, method):
        # Tests linkage(Y, method) on the Q data set.
        Z = linkage(hierarchy_test_data.X, method)
        expectedZ = getattr(hierarchy_test_data, "linkage_X_" + method)
        assert_allclose(Z, expectedZ, atol=1e-06)

        y = scipy.spatial.distance.pdist(hierarchy_test_data.X, metric="euclidean")
        Z = linkage(y, method)
        assert_allclose(Z, expectedZ, atol=1e-06)
Example #15
def plot_clustered_heatmap(df, genes_list, cancer, output_path, scale='binary'):
    # Build nxm matrix (n samples, m genes)
    X = df[genes_list].values.transpose()
    
    if scale == 'binary':
        Z = linkage(X, method='complete', metric='hamming')
        colorscale = [[0, "rgb(111, 168, 220)"], [1, "rgb(5, 10, 172)"]]
        colorbar = {'tick0': 0,'dtick': 1}
    elif scale == 'logarithmic':
        Z = linkage(X, method='ward')
        X_max = X.max()
        colorscale = [[0, 'rgb(250, 250, 250)'],
                      [1./X_max, 'rgb(200, 200, 200)'],
                      [5./X_max, 'rgb(150, 150, 200)'],
                      [20./X_max, 'rgb(100, 100, 200)'],
                      [100./X_max, 'rgb(50, 50, 200)'],
                      [1., 'rgb(0, 0, 200)']]
        colorbar = {'tick0': 0,
                    'tickmode': 'array',
                    'tickvals': [0, 1, 5, 20, 100, X_max]}
    c, coph_dists = cophenet(Z, pdist(X))
    print "Cophenetic Correlation Coefficient:", c
    
    #layout = go.Layout(yaxis=dict(title='%s germline mutations (ordered by samples somatic mutation load)'% cancer, zeroline=False))    
#    fig = pylab.figure(figsize=(8,8))
#    ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
#    ax1.set_xticks([])
#    ax1.set_yticks([])
#    axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
    den = dendrogram(Z, orientation='left')
    idx = den['leaves']
    X = X[idx,:]
    print "X shape:", X.shape
    genes_ordered = [genes_list[i] for i in idx]
    logger.info("ordered genes: %s", str(genes_ordered))
    
#    im = axmatrix.matshow(X, aspect='auto', origin='lower', cmap=pylab.cm.YlGnBu)
#    axmatrix.set_xticks([])
#    axmatrix.set_yticks([])
#    # Plot colorbar.
#    axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
#    pylab.colorbar(im, cax=axcolor)
#    fig.savefig(output_path)
    
    # Plotting the heatmap (without the hirarchy)
    heatmap_trace = go.Heatmap(z=X.tolist(), x=df.patient_id, y=genes_ordered, showscale=True, colorscale=colorscale, colorbar=colorbar)
    mutation_load_trace = go.Bar(x=df.patient_id, y=df.somatic_mutations_count/30.0)
    fig = tls.make_subplots(rows=29, cols=1, specs=[[{'rowspan':5, 'colspan' : 1}]] + [[None]] * 4 + [[{'rowspan' : 24, 'colspan' : 1}]] + [[None]] * 23)
    fig.append_trace(mutation_load_trace, 1, 1)
    fig.append_trace(heatmap_trace, 6, 1)
    fig['layout']['xaxis1'].update(showticklabels = False)
    fig['layout']['xaxis1'].update(zeroline = False, showgrid=False)
    fig['layout']['yaxis1'].update(zeroline = False, showgrid = False, tickfont=dict(family='Arial', size=4))
    fig['layout']['xaxis2'].update(showticklabels = False)
    fig['layout']['xaxis2'].update(zeroline = False, showgrid=False)
    fig['layout']['yaxis2'].update(zeroline = False, showgrid = False, tickfont=dict(family='Arial', size=4))
    plot(fig, auto_open=False, filename="%s_%s_heatmap_clustered.html" % (output_path, cancer))
Example #16
def refineEnsemble(ens, lower=.5, upper=10.):
    """Refine a PDB ensemble based on RMSD criterions.""" 

    from scipy.cluster.hierarchy import linkage, fcluster
    from scipy.spatial.distance import squareform
    from collections import Counter

    ### calculate pairwise RMSDs ###
    RMSD = ens.getRMSDs(pairwise=True)

    # convert the RMSD table to the compressed form
    v = squareform(RMSD)

    ### apply upper threshold ###
    Z_upper = linkage(v, method='complete')
    labels = fcluster(Z_upper, upper, criterion='distance')
    most_common_label = Counter(labels).most_common(1)[0][0]
    I = np.where(labels==most_common_label)[0]

    ### apply lower threshold ###
    Z_lower = linkage(v, method='single')
    labels = fcluster(Z_lower, lower, criterion='distance')
    uniq_labels = np.unique(labels)

    clusters = []
    for label in uniq_labels:
        indices = np.where(labels==label)[0]
        clusters.append(indices)

    J = np.ones(len(clusters), dtype=int) * -1
    rmsd = None
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # find the conformations with the largest coverage 
            # (the weight of the ref should be 1)
            weights = [ens[j].getWeights().sum() for j in cluster]
            js = np.where(weights==np.max(weights))[0]

            # in the case where there are multiple structures with the same weight,
            # the one with the smallest rmsd wrt the ens._coords is selected. 
            if len(js) > 1:
                # rmsd is not calculated unless necessary, for the sake of efficiency
                rmsd = ens.getRMSDs() if rmsd is None else rmsd
                j = js[np.argmin(rmsd[js])]
            else:
                j = js[0]
            J[i] = cluster[j]
        else:
            J[i] = cluster[0]

    ### refine ensemble ###
    K = np.intersect1d(I, J)

    reens = ens[K]

    return reens
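The two-threshold pattern above (complete linkage for the upper RMSD cut, single linkage for the lower one) can be exercised on its own; a minimal sketch on a random symmetric distance matrix:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

rmsd = np.random.rand(8, 8) * 12.
rmsd = (rmsd + rmsd.T) / 2.   # symmetric
np.fill_diagonal(rmsd, 0.)    # hollow, as squareform expects
v = squareform(rmsd)
upper_labels = fcluster(linkage(v, method='complete'), 10., criterion='distance')
lower_labels = fcluster(linkage(v, method='single'), 0.5, criterion='distance')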
Example #17
def clusterData(xdata, rowMethod=True, columnMethod=False, method='average', metric='euclidean'):
    """clusterData clusters the data either by row, by column, or both

    :param xdata: a data dictionary - the one to be transformed
    :type xdata: dict, must contain 'data', 'proteins', 'fractions'
    :param rowMethod: a boolean asking if you want to flip on the rows (proteins get clustered)
    :type rowMethod: bool
    :param columnMethod: a boolean asking if you want to flip on the columns (fractions get clustered)
    :type columnMethod: bool
    :param method: string defining the linkage type, defaults to 'average' - 'ward' might be a good option
    :type method: string
    :param metric: string defining the distance metric, defaults to 'euclidean'
    :type metric: string
    :returns: a data dictionary. 'data', 'proteins', 'fractions', 'fi', 'pi', 'topDendro', 'rightDendro' are updated

    """
        
    xdat = xdata.copy()
    x = xdat['data']
    ind1 = xdat['proteins']
    ind2 = xdat['fractions']
    xt = x
    idx1 = None
    idx2 = None
    
    toReturn = xdat
    Y1 = None
    Y2 = None
    if rowMethod:
        d1 = ssd.pdist(x)
        D1 = ssd.squareform(d1)  # full matrix
        Y1 = sch.linkage(D1, method=method, metric=metric) ### gene-clustering metric - 'average', 'single', 'centroid', 'complete'
        Z1 = sch.dendrogram(Y1, no_plot=True, orientation='right')
        idx1 = Z1['leaves'] ### apply the clustering for the gene-dendrograms to the actual matrix data
        xt = xt[idx1,:]   # xt is transformed x
        newIndex = []
        for i in idx1:
            newIndex.append(ind1[i])
        toReturn['proteins'] = newIndex
        toReturn['pi'] = idx1
    if columnMethod:
        d2 = ssd.pdist(x.T)
        D2 = ssd.squareform(d2)
        Y2 = sch.linkage(D2, method=method, metric=metric) ### array-clustering metric - 'average', 'single', 'centroid', 'complete'
        Z2 = sch.dendrogram(Y2, no_plot=True)
        idx2 = Z2['leaves'] ### apply the clustering for the array-dendrograms to the actual matrix data
        xt = xt[:,idx2]
        newIndex = []
        for i in idx2:
            newIndex.append(ind2[i])
        toReturn['fractions'] = newIndex
        toReturn['fi'] = idx2
    toReturn['data'] = xt
    toReturn['topDendro'] = Y2
    toReturn['rightDendro'] = Y1
    return toReturn
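A minimal usage sketch, assuming the dictionary layout the docstring describes and the ssd/sch imports used above (hypothetical data):

import numpy as np

xdata = {
    'data': np.random.rand(5, 4),
    'proteins': ['p{}'.format(i) for i in range(5)],
    'fractions': ['f{}'.format(j) for j in range(4)],
}
clustered = clusterData(xdata, rowMethod=True, columnMethod=True)
print(clustered['proteins'])  # protein labels in dendrogram leaf order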
Example #18
    def heatmap_v1(self,data_I,row_labels_I,column_labels_I):
        '''Generate a heatmap using pandas and scipy
        DEPRECATED: kept for compatibility with old io methods'''

        """dendrogram documentation:

        Output:
        'color_list': A list of color names. The k-th element represents the color of the k-th link.
        'icoord' and 'dcoord': Each of them is a list of lists. Let icoord = [I1, I2, ..., Ip] where Ik = [xk1, xk2, xk3, xk4] and dcoord = [D1, D2, ..., Dp] where Dk = [yk1, yk2, yk3, yk4], then the k-th link painted is (xk1, yk1) - (xk2, yk2) - (xk3, yk3) - (xk4, yk4).
        'ivl': A list of labels corresponding to the leaf nodes.
        'leaves': For each i, H[i] == j, cluster node j appears in position i in the left-to-right traversal of the leaves, where j < 2n-1 and i < n. If j is less than n, the i-th leaf node corresponds to an original observation. Otherwise, it corresponds to a non-singleton cluster."""

        #parse input into col_labels and row_labels
        #TODO: pandas is not needed for this.
        mets_data = pd.DataFrame(data=data_I, index=row_labels_I, columns=column_labels_I)

        mets_data = mets_data.dropna(how='all').fillna(0.)
        #mets_data = mets_data.replace([np.inf], 10.)
        #mets_data = mets_data.replace([-np.inf], -10.)
        col_labels = list(mets_data.columns)
        row_labels = list(mets_data.index)

        #heatmap data matrix
        heatmap_data = []
        for i,g in enumerate(mets_data.index):
            for j,c in enumerate(mets_data.columns):
                #heatmap_data.append({"col": j+1, "row": i+1, "value": mets_data.loc[g, c]})
                heatmap_data.append({"col": j, "row": i, "value": mets_data.loc[g, c]})

        #perform the custering on the both the rows and columns
        dm = mets_data
        D1 = squareform(pdist(dm, metric='euclidean'))
        D2 = squareform(pdist(dm.T, metric='euclidean'))

        Y = linkage(D1, method='single')
        Z1 = dendrogram(Y, labels=dm.index)

        Y = linkage(D2, method='single')
        Z2 = dendrogram(Y, labels=dm.columns)

        #parse the output
        hccol = Z2['leaves'] # no hclustering; same as heatmap_data['col']
        hcrow = Z1['leaves'] # no hclustering; same as heatmap_data['row']
        hccolicoord = Z2['icoord'] # no hclustering; same as heatmap_data['col']
        hcrowicoord = Z1['icoord'] # no hclustering; same as heatmap_data['row']
        hccoldcoord = Z2['dcoord'] # no hclustering; same as heatmap_data['col']
        hcrowdcoord = Z1['dcoord'] # no hclustering; same as heatmap_data['row']
        
        #hccol = [x+1 for x in hccol]; # hccol index should match heatmap_data index
        #hcrow = [x+1 for x in hcrow];

        return {'hcrow': hcrow, 'hccol': hccol, 'row_labels':row_labels,
                                            'col_labels':col_labels,
                                            'heatmap_data':heatmap_data,
                                            'maxval' : max([x['value'] for x in heatmap_data]),
                                            'minval' : min([x['value'] for x in heatmap_data])}
Example #19
 def test_correspond_4_and_up(self):
     # Tests correspond(Z, y) on linkage and CDMs over observation sets of
     # different sizes. Correspondence should be false.
     for (i, j) in list(zip(list(range(2, 4)), list(range(3, 5)))) + list(zip(list(range(3, 5)), list(range(2, 4)))):
         y = np.random.rand(i * (i - 1) // 2)
         y2 = np.random.rand(j * (j - 1) // 2)
         Z = linkage(y)
         Z2 = linkage(y2)
         assert_equal(correspond(Z, y2), False)
         assert_equal(correspond(Z2, y), False)
def analyzeClusters(n_loops=1, cl=None, sp=None, shuffled=False, spShuff=False):
    results = {}
    n = n_loops

    bins = [i for i in drange(0.0, 1.0, 0.1)]
    total_hist = [0 for i in bins]

    data = win.getData(shuffle=shuffled, class_=cl, spec=sp)
    if spShuff is True:
        win.shuffleIt(data, mode=2)
    Z = hie.linkage(data, method='average', metric='correlation')
    D = hie.dendrogram(Z, orientation='left', no_plot=True)

    total_ys = [0 for d in D['dcoord']]
    total_z = [0 for d in Z[::-1, 2]]
    total_acc = [0 for d in np.diff(Z[::-1, 2], 2)]

    for ii in range(0, n):  # for loop added to average shuffled results
        # data = win.getData(shuffle=True, class_='J')
        # labels = win.getStudents(class_=classes[0])
        # labels = [str(st.class_) + " " + str(st.spec) for st in labels]

        Z = hie.linkage(data, method='average', metric='correlation')
        D = hie.dendrogram(Z, orientation='left', no_plot=True)

        # print(data[40, :])
        # print(data[42, :])

        # freq method
        ys = [d[1] for d in D['dcoord']]
        total_ys = [a + b for a, b in zip(ys, total_ys)]
        hist, bins = np.histogram(ys, bins=bins)
        total_hist = [a + b for a, b in zip(hist, total_hist)]

        # elbow method (sort of)
        z = Z[::-1, 2]
        total_z = [a + b for a, b in zip(z, total_z)]

        # inv elbow
        acceleration = np.diff(Z[::-1, 2], 2)  # 2nd derivative of distances
        total_acc = [a + b for a, b in zip(acceleration, total_acc)]
        if ii < n - 1:  # don't get new data if there won't be another loop
            data = win.getData(shuffle=shuffled, class_=cl, spec=sp)

    total_hist = [a / n for a in total_hist]
    total_ys = [a / n for a in total_ys]
    total_z = [a / n for a in total_z]
    total_acc = [a / n for a in total_acc]

    results['bins'] = (bins[:-1] + bins[1:]) / 2
    results['hist'] = total_hist
    results['ys'] = total_ys
    results['z'] = total_z
    results['acc'] = total_acc
    return results
def heatmap_plot_zscore_bigneuron(df_zscore_features, df_all, output_dir, title=None):

    print "heatmap plot:bigneuron"

    #taiwan
    metric ='nt_type'
    mtypes = np.unique(df_all[metric])
    print(mtypes)
    mtypes_pal = sns.color_palette("hls", len(mtypes))

    mtypes_lut = dict(zip(mtypes, mtypes_pal))
    mtypes_colors = df_all[metric].map(mtypes_lut)



    linkage = hierarchy.linkage(df_zscore_features, method='ward', metric='euclidean')

    data = df_zscore_features.transpose()
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    feature_order = hierarchy.leaves_list(row_linkage)

    #print data.index
    matchIndex = [data.index[x] for x in feature_order]
    #print matchIndex
    data = data.reindex(matchIndex)

    pl.figure()
    g = sns.clustermap(data, row_cluster = False, col_linkage=linkage, method='ward', metric='euclidean',
                       linewidths = 0.0,col_colors = [mtypes_colors],
                       cmap = sns.cubehelix_palette(light=1, as_cmap=True),figsize=(40,10))

    pl.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    pl.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    #g.ax_heatmap.set_xticklabels([])
    pl.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.95)  # !!!!!

    if title:
        pl.title(title)


    location ="best"
    num_cols=1
    # Legend for row and col colors

    for label in mtypes:
         g.ax_row_dendrogram.bar(0, 0, color=mtypes_lut[label], label=label, linewidth=0.0)
         g.ax_row_dendrogram.legend(loc=location, ncol=num_cols,borderpad=0)

    filename = output_dir + '/zscore_feature_heatmap.png'
    pl.savefig(filename, dpi=300)
    #pl.show()
    print("save zscore matrix heatmap figure to :" + filename)
    pl.close()
    print "done clustering and heatmap plotting"
    return linkage
Example #22
def is_distance_and_linkage_compatible(distance, linkage):
    is_linkage_method_OK(linkage)
    is_distance_metric_OK(distance)

    if distance == 'yule' and linkage != 'single':
        raise ConfigError("The distance metric 'yule' will only work with the linkage 'single' :/")

    try:
        hierarchy.linkage([(1, 0), (0, 1), (1, 1)], metric=distance, method=linkage)
    except Exception as exception:
        raise ConfigError("Someone is upset here: %s" % exception)
Example #23
 def test_correspond_2_and_up(self):
     # Tests correspond(Z, y) on linkage and CDMs over observation sets of
     # different sizes.
     for i in range(2, 4):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         self.assertTrue(correspond(Z, y))
     for i in range(4, 15, 3):
         y = np.random.rand(i*(i-1)//2)
         Z = linkage(y)
         self.assertTrue(correspond(Z, y))
Example #24
def hierarchical_clustering(data, distance='correlation', method='ward'):
    """ Perform hierarchical clustering on distance matrix.

    Parameters
    ----------
    data : array_like
        Data matrix to cluster, or precomputed distances.
    distance : str
        Distance metric to use. Passed as the `metric` keyword to
        `scipy.spatial.distance.pdist` if not equal to `'precomputed'`.
    method : str
        Linkage method, passed to `scipy.cluster.hierarchy.linkage`;
        defaults to `"ward"`.

    Returns
    -------
    row_linkage : numpy.ndarray
        Row linkage matrix.
    col_linkage : numpy.ndarray
        Column linkage matrix.
    """
    symmetric = False
    if distance == 'precomputed':
        try:
            symmetric = np.allclose(data, data.T)
        except ValueError:
            symmetric = False

        if not symmetric:
            raise ValueError('precomputed distance not symmetric')

        row_dist = col_dist = data.copy()
    else:
        row_dist = scipy_dist.squareform(
            scipy_dist.pdist(data, metric=distance))
        col_dist = scipy_dist.squareform(
            scipy_dist.pdist(data.T, metric=distance))

    row_linkage = scipy_hc.linkage(row_dist, method=method)

    if symmetric:
        col_linkage = row_linkage
    else:
        col_linkage = scipy_hc.linkage(col_dist, method=method)

    return row_linkage, col_linkage
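A short usage sketch with a precomputed symmetric distance matrix (hypothetical data; aliases as in the function above):

import numpy as np
import scipy.spatial.distance as scipy_dist

X = np.random.rand(6, 3)
D = scipy_dist.squareform(scipy_dist.pdist(X))  # symmetric, zero diagonal
row_Z, col_Z = hierarchical_clustering(D, distance='precomputed')
assert np.allclose(row_Z, col_Z)  # symmetric input shares a single linkage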
Example #25
 def test_correspond_4_and_up_2(self):
     # Tests correspond(Z, y) on linkage and CDMs over observation sets of
     # different sizes. Correspondence should be false.
     for (i, j) in (list(zip(list(range(2, 7)), list(range(16, 21)))) +
                    list(zip(list(range(16, 21)), list(range(2, 7))))):
         y = np.random.rand(i*(i-1)//2)
         y2 = np.random.rand(j*(j-1)//2)
         Z = linkage(y)
         Z2 = linkage(y2)
         self.assertFalse(correspond(Z, y2))
         self.assertFalse(correspond(Z2, y))
Example #26
def cluster(df, metric="euclidean", method="single", row=True, column=True):
    row_linkmat, col_linkmat = None, None
    if row:
        distmat = dist.pdist(df, metric)
        row_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(row_linkmat), :]
    if column:
        df = df.T
        distmat = dist.pdist(df, metric)
        col_linkmat = hier.linkage(distmat, method)
        df = df.iloc[hier.leaves_list(col_linkmat), :].T
    return df, row_linkmat, col_linkmat
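A quick usage sketch, assuming the aliases the function uses (pandas as pd, scipy.spatial.distance as dist, scipy.cluster.hierarchy as hier):

import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier

df = pd.DataFrame(np.random.rand(5, 4))
ordered_df, row_Z, col_Z = cluster(df, metric='euclidean', method='average')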
Example #27
def test_optimal_leaf_ordering():
    # test with the distance vector y
    Z = optimal_leaf_ordering(linkage(hierarchy_test_data.ytdist),
                              hierarchy_test_data.ytdist)
    expectedZ = hierarchy_test_data.linkage_ytdist_single_olo
    assert_allclose(Z, expectedZ, atol=1e-10)

    # test with the observation matrix X
    Z = optimal_leaf_ordering(linkage(hierarchy_test_data.X, 'ward'),
                              hierarchy_test_data.X)
    expectedZ = hierarchy_test_data.linkage_X_ward_olo
    assert_allclose(Z, expectedZ, atol=1e-06)
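The same scipy API on fresh data, as a minimal sketch:

import numpy as np
from scipy.cluster.hierarchy import linkage, optimal_leaf_ordering, leaves_list
from scipy.spatial.distance import pdist

y = pdist(np.random.rand(10, 3))
Z = optimal_leaf_ordering(linkage(y, 'ward'), y)
print(leaves_list(Z))  # leaf order maximizing similarity of adjacent leaves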
Example #28
    def heatmap_fc(self, labels=False, dendro=False):
        """Make heatmap with log2fold change... function not ready yet

        :param labels:
        :param dendro:
        """
        fig = plt.figure(figsize=(7, 9))
        D = self.matrix.values
        if dendro:
            rectangle1 = (0, 0.2, 0.2, 0.69)
            ax1 = fig.add_axes(rectangle1)
            Y = sch.linkage(D, method='centroid')
            Z1 = sch.dendrogram(Y, orientation='right')
            ax1.set_xticks([])
            ax1.set_yticks([])

            # need to transpose the array so you can sort by RPKM
            Dt = np.transpose(D)

            # Compute and plot the top dendrogram.
            ax2 = fig.add_axes([0.4, 0.9, 0.4, 0.1])
            Y = sch.linkage(Dt, method='single')
            Z2 = sch.dendrogram(Y)
            ax2.set_xticks([])
            ax2.set_yticks([])

            # Plot heatmap distance matrix.
            axmatrix = fig.add_axes([0.4, 0.2, 0.4, 0.69])
            idx1 = Z1['leaves']
            idx2 = Z2['leaves']
            D = D[idx1, :]
            D = D[:, idx2]

        color = plt.cm.jet
        colormap = plt.get_cmap(color)
        axmatrix = fig.add_axes([0.4, 0.2, 0.4, 0.69])
        normal = mpl.colors.Normalize(vmin=np.nanmin(D), vmax=np.nanmax(D))
        im = axmatrix.pcolormesh(D, cmap=colormap, norm=normal, clip_on=True)
        if labels:
            if dendro:
                xlabels = list(self.matrix.columns[i].__str__() for i in idx2)
                ylabels = list(self.matrix.index[i].__str__() for i in idx1)
            else:
                ylabels = list(self.matrix.index)
                xlabels = list(self.matrix.columns)
            axmatrix.set_xticklabels(xlabels, rotation=90, minor=False)
            axmatrix.set_xticks(np.arange(xlabels.__len__()) + 0.5, minor=False)
            axmatrix.set_yticklabels(ylabels, fontsize='small', minor=False)
            axmatrix.set_yticks(np.arange(ylabels.__len__()) + 0.5, minor=False)
        plt.ylim(0, self.matrix.shape[0])
        axcolor = fig.add_axes([.85, 0.2, 0.02, 0.6])
        plt.colorbar(im, cax=axcolor)
        axcolor.set_title('FC')
def cluster(data):
    pairwise_dists = distance.squareform(distance.pdist(data))
    # cluster
    sch.set_link_color_palette(['black'])
    row_clusters = sch.linkage(pairwise_dists,method='complete')
    # rename row clusters
    #row_clusters = clusters
    # calculate pairwise distances for columns
    col_pairwise_dists = distance.squareform(distance.pdist(data.T))
    # cluster
    col_clusters = sch.linkage(col_pairwise_dists,method='complete')
    return row_clusters, col_clusters
Example #30
 def get_linkage(self, stat_linkage_method):
     #create the data model which is needed as input for the dendrogram.
     #only the ward method accepts a redundant (square) distance matrix; the others
     #give different results for a redundant matrix than for a flat (condensed)
     #one, and the flat form seems to be the correct input.
     #see https://github.com/scipy/scipy/issues/2614  (not sure this is still an issue)
     if stat_linkage_method == "ward":
         z = sch.linkage(self, method='ward', metric='euclidean')
     else:
         #creating a flat representation of the dist matrix
         deltas_flat = ssd.squareform(self)
         z = sch.linkage(deltas_flat, method=stat_linkage_method, metric='euclidean')
     return z
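As the comments note, scipy's linkage expects the condensed (flat) form for non-ward methods; a standalone sketch of the conversion:

import numpy as np
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage

D = np.array([[0., 2., 4.],
              [2., 0., 6.],
              [4., 6., 0.]])                  # redundant (square) distance matrix
Z = linkage(squareform(D), method='average')  # condensed vector is the supported input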
Example #31
def linkage_tree(X,
                 connectivity=None,
                 n_components=None,
                 n_clusters=None,
                 linkage='complete',
                 affinity="euclidean",
                 return_distance=False):
    """Linkage agglomerative clustering based on a Feature matrix.

    The inertia matrix uses a Heapq-based representation.

    This is the structured version, that takes into account some topological
    structure between samples.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        feature matrix representing n_samples samples to be clustered

    connectivity : sparse matrix (optional).
        connectivity matrix. Defines for each sample the neighboring samples
        following a given structure of the data. The matrix is assumed to
        be symmetric and only the upper triangular half is used.
        Default is None, i.e, the Ward algorithm is unstructured.

    n_components : int (optional)
        Number of connected components. If None the number of connected
        components is estimated from the connectivity matrix.

    n_clusters : int (optional)
        Stop early the construction of the tree at n_clusters. This is
        useful to decrease computation time if the number of clusters is
        not small compared to the number of samples. In this case, the
        complete tree is not computed, thus the 'children' output is of
        limited use, and the 'parents' output should rather be used.
        This option is valid only when specifying a connectivity matrix.

    linkage : {"average", "complete"}, optional, default: "complete"
        Which linkage criteria to use. The linkage criterion determines which
        distance to use between sets of observation.
            - average uses the average of the distances of each observation of
              the two sets
            - complete or maximum linkage uses the maximum distances between
              all observations of the two sets.

    affinity : string or callable, optional, default: "euclidean".
        which metric to use. Can be "euclidean", "manhattan", or any
        distance known to paired distances (see sklearn.metrics.pairwise)

    return_distance : bool, default False
        whether or not to return the distances between the clusters.

    Returns
    -------
    children : 2D array, shape (n_nodes, 2)
        The children of each non-leaf node. Values less than `n_samples`
        correspond to leaves of the tree which are the original samples.
        A node `i` greater than or equal to `n_samples` is a non-leaf
        node and has children `children_[i - n_samples]`. Alternatively
        at the i-th iteration, children[i][0] and children[i][1]
        are merged to form node `n_samples + i`

    n_components : int
        The number of connected components in the graph.

    n_leaves : int
        The number of leaves in the tree.

    parents : 1D array, shape (n_nodes, ) or None
        The parent of each node. Only returned when a connectivity matrix
        is specified; otherwise 'None' is returned.

    distances : ndarray, shape (n_nodes,)
        Returned when return_distance is set to True.

        distances[i] refers to the distance between children[i][0] and
        children[i][1] when they are merged.

    See also
    --------
    ward_tree : hierarchical clustering with ward linkage
    """
    X = np.asarray(X)
    if X.ndim == 1:
        X = np.reshape(X, (-1, 1))
    n_samples, n_features = X.shape

    linkage_choices = {
        'complete': _hierarchical.max_merge,
        'average': _hierarchical.average_merge,
    }
    try:
        join_func = linkage_choices[linkage]
    except KeyError:
        raise ValueError('Unknown linkage option, linkage should be one '
                         'of %s, but %s was given' %
                         (linkage_choices.keys(), linkage))

    if connectivity is None:
        from scipy.cluster import hierarchy  # imports PIL

        if n_clusters is not None:
            warnings.warn(
                'Partial build of the tree is implemented '
                'only for structured clustering (i.e. with '
                'explicit connectivity). The algorithm '
                'will build the full tree and only '
                'retain the lower branches required '
                'for the specified number of clusters',
                stacklevel=2)

        if affinity == 'precomputed':
            # for the linkage function of hierarchy to work on precomputed
            # data, provide as first argument an ndarray of the shape returned
            # by pdist: it is a flat array containing the upper triangular of
            # the distance matrix.
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        elif affinity == 'l2':
            # Translate to something understood by scipy
            affinity = 'euclidean'
        elif affinity in ('l1', 'manhattan'):
            affinity = 'cityblock'
        elif callable(affinity):
            X = affinity(X)
            i, j = np.triu_indices(X.shape[0], k=1)
            X = X[i, j]
        out = hierarchy.linkage(X, method=linkage, metric=affinity)
        children_ = out[:, :2].astype(int)  # np.int was removed from NumPy

        if return_distance:
            distances = out[:, 2]
            return children_, 1, n_samples, None, distances
        return children_, 1, n_samples, None

    connectivity = _fix_connectivity(X,
                                     connectivity,
                                     n_components=n_components)

    connectivity = connectivity.tocoo()
    # Put the diagonal to zero
    diag_mask = (connectivity.row != connectivity.col)
    connectivity.row = connectivity.row[diag_mask]
    connectivity.col = connectivity.col[diag_mask]
    connectivity.data = connectivity.data[diag_mask]
    del diag_mask

    # FIXME We compute all the distances, while we could have only computed
    # the "interesting" distances
    distances = paired_distances(X[connectivity.row],
                                 X[connectivity.col],
                                 metric=affinity)
    connectivity.data = distances

    if n_clusters is None:
        n_nodes = 2 * n_samples - 1
    else:
        assert n_clusters <= n_samples
        n_nodes = 2 * n_samples - n_clusters

    if return_distance:
        distances = np.empty(n_nodes - n_samples)
    # create inertia heap and connection matrix
    A = np.empty(n_nodes, dtype=object)
    inertia = list()

    # LIL seems to be the best format to access the rows quickly,
    # without the numpy overhead of slicing CSR indices and data.
    connectivity = connectivity.tolil()
    # We are storing the graph in a list of IntFloatDict
    for ind, (data, row) in enumerate(zip(connectivity.data,
                                          connectivity.rows)):
        A[ind] = IntFloatDict(np.asarray(row, dtype=np.intp),
                              np.asarray(data, dtype=np.float64))
        # We keep only the upper triangular for the heap
        # Generator expressions are faster than arrays on the following
        inertia.extend(
            _hierarchical.WeightedEdge(d, ind, r) for r, d in zip(row, data)
            if r < ind)
    del connectivity

    heapify(inertia)

    # prepare the main fields
    parent = np.arange(n_nodes, dtype=np.intp)
    used_node = np.ones(n_nodes, dtype=np.intp)
    children = []

    # recursive merge loop
    for k in range(n_samples, n_nodes):
        # identify the merge
        while True:
            edge = heappop(inertia)
            if used_node[edge.a] and used_node[edge.b]:
                break
        i = edge.a
        j = edge.b

        if return_distance:
            # store distances
            distances[k - n_samples] = edge.weight

        parent[i] = parent[j] = k
        children.append((i, j))
        # Keep track of the number of elements per cluster
        n_i = used_node[i]
        n_j = used_node[j]
        used_node[k] = n_i + n_j
        used_node[i] = used_node[j] = False

        # update the structure matrix A and the inertia matrix
        # a clever 'min', or 'max' operation between A[i] and A[j]
        coord_col = join_func(A[i], A[j], used_node, n_i, n_j)
        for l, d in coord_col:
            A[l].append(k, d)
            # Here we use the information from coord_col (containing the
            # distances) to update the heap
            heappush(inertia, _hierarchical.WeightedEdge(d, k, l))
        A[k] = coord_col
        # Clear A[i] and A[j] to save memory
        A[i] = A[j] = 0

    # Separate leaves in children (empty lists up to now)
    n_leaves = n_samples

    # # return numpy array for efficient caching
    children = np.array(children)[:, ::-1]

    if return_distance:
        return children, n_components, n_leaves, parent, distances
    return children, n_components, n_leaves, parent
import random
import pandas as pd

users = range(10000)
products = range(100)  # assumption: `products` is never defined in the original
purchases = []

for _ in range(100000):
	u = random.choice(users)
	p = random.choice(products)
	purchases.append((u, p))

# the original indexed columns 3 and 4 of a larger dataset; with only
# (user, product) pairs available, the two existing columns are used
purchases = pd.DataFrame(purchases, columns=['user', 'product'])
X = purchases.iloc[:, [0, 1]].values
y = purchases.iloc[:, 0].values

# sklearn.cross_validation was removed; model_selection is its replacement
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

import scipy.cluster.hierarchy as sch
dendrogram = sch.dendrogram(sch.linkage(X, method = 'ward'))
plt.title('Dendrogram')
plt.xlabel('Products')
plt.ylabel('Users')
plt.show()

from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters = 5, affinity = 'euclidean', linkage = 'ward')
y_hc = hc.fit_predict(X)

plt.scatter(X[y_hc == 0, 0], X[y_hc == 0, 1], s = 100, c = 'red', label = 'Cluster 1')
plt.scatter(X[y_hc == 1, 0], X[y_hc == 1, 1], s = 100, c = 'blue', label = 'Cluster 2')
plt.scatter(X[y_hc == 2, 0], X[y_hc == 2, 1], s = 100, c = 'green', label = 'Cluster 3')
plt.scatter(X[y_hc == 3, 0], X[y_hc == 3, 1], s = 100, c = 'cyan', label = 'Cluster 4')
plt.scatter(X[y_hc == 4, 0], X[y_hc == 4, 1], s = 100, c = 'magenta', label = 'Cluster 5')
# the original also plotted a 'Cluster 6', but fit_predict with n_clusters=5
# only produces labels 0-4, so that call could never draw anything
Example #33
plt.scatter(df.total_salaries, df.total_wins, s=60, c=labels)
# This one looks better, in my opinion, with 4 clusters, one of which is only NY

# Of K-means and DBSCAN, DBSCAN is better at identifying outliers

############
# Dendrogram
############
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, fclusterdata

distanceMatrix = pdist(data)

# print dendrogram
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=1,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())
# This gives us 7 clusters

# let's set the cutoff at 2 for 4 clusters
dend = dendrogram(linkage(distanceMatrix, method='complete'),
                  color_threshold=2,
                  leaf_font_size=10,
                  labels=df.teamID.tolist())

# get cluster assignments
assignments = fcluster(linkage(distanceMatrix, method='complete'), 2,
                       'distance')
Example #34
for i in range(len(x)):
    X = x[i][0]
    Y = x[i][1]
    plt.scatter(X, Y)
    plt.xlabel('x axis')
    plt.ylabel('y axis')
    plt.title('The raw dataset')
Labels = range(1, 11)  #Labeling the points

#Let's plot the dendrogram for our data points using the SciPy library

from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

linked = linkage(
    x, 'single'
)  #Determine whether this is a single_linkage, complete_linkage or average clustering

labelList = range(1, 11)

plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.xlabel('point labels')
plt.ylabel('The distance and the cluster trees')
plt.show()

from sklearn.cluster import AgglomerativeClustering
'''We are going to continue the investigation into the sightings of legendary Pokémon from the previous exercise. Remember that in the scatter plot of the previous exercise, you identified two areas where Pokémon sightings were dense. This means that the points seem to separate into two clusters. In this exercise, you will form two clusters of the sightings using hierarchical clustering.

'x' and 'y' are columns of X and Y coordinates of the locations of sightings, stored in a Pandas data frame, df. The following are available for use: matplotlib.pyplot as plt, seaborn as sns, and pandas as pd.'''

import pandas as pd

x = [9, 6, 2, 3, 1, 7, 1, 6, 1, 7, 23, 26, 25, 23, 21, 23, 23, 20, 30, 23]
y = [8, 4, 10, 6, 0, 4, 10, 10, 6, 1, 29, 25, 30, 29, 29, 30, 25, 27, 26, 30]


df = pd.DataFrame({'x':x,'y':y})

# Import linkage and fcluster functions
from scipy.cluster.hierarchy import linkage, fcluster

# Use the linkage() function to compute distances
Z = linkage(df, 'ward')

# Generate cluster labels
df['cluster_labels'] = fcluster(Z, 2, criterion='maxclust')

# Plot the points with seaborn
import matplotlib.pyplot as plt
import seaborn as sns
sns.scatterplot(x='x', y='y', hue='cluster_labels', data=df)
plt.show()
Example #36
        z = tmp[:, 2] + (rxyz[i, 2] * csize)
        tmp = np.column_stack([x, y, z])
        cls = np.vstack([cls, tmp])
    return cls


# Generate a cluster of clusters and distance matrix.
cls = clusters()

D = pdist(cls[:, 0:2])
D = squareform(D)

# Compute and plot first dendrogram.
fig = mpl.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y1 = hy.linkage(D, method='complete')
cutoff = 0.3 * np.max(Y1[:, 2])
Z1 = hy.dendrogram(Y1, orientation='right', color_threshold=cutoff)
ax1.xaxis.set_visible(False)
ax1.yaxis.set_visible(False)
# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y2 = hy.linkage(D, method='average')
cutoff = 0.3 * np.max(Y2[:, 2])
Z2 = hy.dendrogram(Y2, color_threshold=cutoff)
ax2.xaxis.set_visible(False)
ax2.yaxis.set_visible(False)
# Plot distance matrix.
ax3 = fig.add_axes([0.3, 0.1, 0.6, 0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
Example #37
        s2 = samples[j]
        #a[i][j] = a[j][i] = len(s2snps[s1].intersection(s2snps[s2]))
        a[i][j] = a[j][i] = len(s2snps[s1].symmetric_difference(s2snps[s2]))

np.savetxt(sys.stdout,
           a,
           delimiter="\t",
           header="\t".join(samples[:len(s2snps)]),
           fmt='%i')

sys.stderr.write("Plotting...\n")
D = a
# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8, 8))
ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
Y = sch.linkage(D, method='centroid')
Z1 = sch.dendrogram(Y, orientation='right')
ax1.set_xticks([])
#ax1.set_yticks([])

# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
Y = sch.linkage(D, method='single')
Z2 = sch.dendrogram(Y)
#ax2.set_xticks([])
ax2.set_yticks([])

fig.savefig('dendrogram.svg')

# Plot distance matrix.
axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
Example #38
def sample_from_corrgan(model_loc, dim=10, n_samples=1):
    # pylint: disable=import-outside-toplevel, disable=too-many-locals
    """
    Samples correlation matrices from the pre-trained CorrGAN network.

    It is reproduced with modifications from the following paper:
    `Marti, G., 2020, May. CorrGAN: Sampling Realistic Financial Correlation Matrices Using
    Generative Adversarial Networks. In ICASSP 2020-2020 IEEE International Conference on
    Acoustics, Speech and Signal Processing (ICASSP) (pp. 8459-8463). IEEE.
    <https://arxiv.org/pdf/1910.09504.pdf>`_

    It loads the appropriate CorrGAN model for the required dimension, generates a matrix
    output from this network, symmetrizes the matrix, and finds the nearest correlation matrix
    that is positive semi-definite. Finally, it maximizes the sum of the similarities between
    adjacent leaves to arrange it with hierarchical clustering.

    The CorrGAN network was trained on the correlation profiles of the S&P 500 stocks. Therefore
    the output retains these properties. In addition, the final output retains the following
    6 stylized facts:

    1. Distribution of pairwise correlations is significantly shifted to the positive.

    2. Eigenvalues follow the Marchenko-Pastur distribution, but for a very large first
    eigenvalue (the market).

    3. Eigenvalues follow the Marchenko-Pastur distribution, but for a couple of other
    large eigenvalues (industries).

    4. Perron-Frobenius property (first eigenvector has positive entries).

    5. Hierarchical structure of correlations.

    6. Scale-free property of the corresponding Minimum Spanning Tree (MST).

    :param model_loc: (str) Location of folder containing CorrGAN models.
    :param dim: (int) Dimension of correlation matrix to sample.
        In the range [2, 200].
    :param n_samples: (int) Number of samples to generate.
    :return: (np.array) Sampled correlation matrices of shape (n_samples, dim, dim).
    """
    # Import here needed to prevent unnecessary imports in other parts of code.
    import tensorflow as tf

    # Validate dimension.
    if not (1 < dim <= 200):
        raise ValueError("Dimension not supported, {}".format(dim))

    # Resulting correlation matrices.
    nearest_corr_mats = []

    # Load generator model closest to the required dimension by looking at the models folder.
    dimension_from_folder = [
        int(f.split("_")[1][:-1]) for f in listdir(model_loc)
        if not path.isfile(path.join(model_loc, f))
    ]
    all_generator_dimensions = np.sort(dimension_from_folder)
    closest_dimension = next(
        filter(lambda i: i >= dim, all_generator_dimensions))

    # Load model.
    generator = tf.keras.models.load_model("{}/generator_{}d".format(
        model_loc, closest_dimension),
                                           compile=False)

    # Sample from generator. Input dimension based on network.
    noise_dim = generator.layers[0].input_shape[1]
    noise = tf.random.normal([n_samples, noise_dim])
    generated_mat = generator(noise, training=False)

    # Get the indices of an upper triangular matrix.
    tri_rows, tri_cols = np.triu_indices(dim, k=1)

    # For each sample generated, make them strict correlation matrices
    # by projecting them on the nearest correlation matrix using Higham’s
    # alternating projections method.
    for i in range(n_samples):
        # Grab only the required dimensions from generated matrix.
        corr_mat = np.array(generated_mat[i, :dim, :dim, 0])

        # Set diagonal to 1 and symmetrize.
        np.fill_diagonal(corr_mat, 1)
        corr_mat[tri_cols, tri_rows] = corr_mat[tri_rows, tri_cols]
        # Get nearest correlation matrix that is positive semi-definite.
        nearest_corr_mat = corr_nearest(corr_mat)

        # Set diagonal to 1 and symmetrize.
        np.fill_diagonal(nearest_corr_mat, 1)
        nearest_corr_mat[tri_cols, tri_rows] = nearest_corr_mat[tri_rows,
                                                                tri_cols]

        # Arrange with hierarchical clustering by maximizing the sum of the
        # similarities between adjacent leaves.
        dist = 1 - nearest_corr_mat
        linkage_mat = hierarchy.linkage(dist[tri_rows, tri_cols],
                                        method="ward")
        optimal_leaves = hierarchy.optimal_leaf_ordering(
            linkage_mat, dist[tri_rows, tri_cols])
        optimal_ordering = hierarchy.leaves_list(optimal_leaves)
        ordered_corr = nearest_corr_mat[optimal_ordering, :][:,
                                                             optimal_ordering]
        nearest_corr_mats.append(ordered_corr)

    return np.array(nearest_corr_mats)
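
A minimal usage sketch for the sampler above, assuming the function is named sample_from_corrgan (its def line falls outside this excerpt) and that model_loc points at a folder of pretrained generators with subfolders named like "generator_100d", as the loader expects:

# Hypothetical call; "corrgan_models" is an assumed path.
corr_mats = sample_from_corrgan(model_loc="corrgan_models", dim=50, n_samples=4)
print(corr_mats.shape)  # (4, 50, 50)
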
Пример #39
0
        row += 1
    col += 1
joint_num = 0
Dis_all = np.zeros((motion_num, motion_num))
for dis_Mat in Dis_Mat_list:
    Dis_all += dis_Mat
    df_dis = pd.DataFrame(dis_Mat, columns=namelist, index=namelist)
    df_dis.to_csv("/home/kei/document/experiments/Master2/AJ_result/" +
                  OpenPoseJoint[joint_num] + "_dis.csv")
    joint_num += 1
Dis_all = pd.DataFrame(Dis_all, columns=namelist, index=namelist)
Dis_all.to_csv("/home/kei/document/experiments/Master2/AJ_result/Distance.csv")
Distance = Dis_all.values
print(Distance)
darray = distance.squareform(Distance)
result = linkage(darray, method="average")

plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams['font.size'] = 14  #フォントサイズを設定

dendrogram(result, labels=namelist)
plt.ylabel("distance")
#plt.show()
#plt.savefig("/home/kei/document/experiments/Master/UJ_result/elder.png")
plt.cla()
NUM_CLUSTERS_RANGE = range(2, 24)
silhouette_coefficient = []
davies_bouldin_index = []
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Coefficient')
plt.rcParams["ytick.direction"] = "in"
Пример #40
0
    print("Read expression in")

    ase_rows, ase_cols = ase.shape
    ase = ase.loc[np.sum(np.isfinite(ase), axis=1) > .75 * ase_cols]
    ase = ase.loc[ase.index.intersection(all_expr.index)].dropna(axis='columns',
                                                                 how='all')
    all_expr = all_expr.loc[ase.index]

    all_expr_lognorm = np.log(all_expr + 1).divide(
        np.log(all_expr.max(axis=1) + 1), axis=0)

    print("Precalculating distances")
    metric = DistributionDifference.earth_mover_multi
    dist_mat = DistributionDifference.mp_pandas_pdist(ase + eps, metric)

    Z = hierarchy.linkage(dist_mat, method='weighted')

    make_treeview_files(
        "analysis/results/all_log_normed_" + is_sparse + metric.__name__,
        all_expr_lognorm, Z)

    make_treeview_files("analysis/results/all_" + is_sparse + metric.__name__,
                        all_expr, Z)

    make_treeview_files("analysis/results/ase_" + is_sparse + metric.__name__,
                        ase, Z)

    make_treeview_files(
        "analysis/results/all_maxnorm_" + is_sparse + metric.__name__,
        all_expr.divide(all_expr.max(axis=1) + 1, axis=0), Z)
Пример #41
0
                            col_colors=colors_list,
                            xticklabels=False,
                            yticklabels=False)
plotitle = str(total_samples) + ' samples clustered by ' + str(
    len(att_IDS)) + ' attractors' + ', genes = ' + str(total_genes)
att_heatmap.fig.suptitle(plotitle, fontsize=18)
# saves figure
plt.savefig('attractors_heatmap.png', format='png', dpi=300)
plt.clf()

# plots attractors dendrogram:
from scipy.cluster import hierarchy
# computes distance between samples
datts = hierarchy.linkage(U_attractors.T, metric='euclidean')
# plots the dendrogram
plt.title('Hierarchical Clustering Dendrogram')
plt.ylabel('attractors')
plt.xlabel('distance [Euclidean]')
hierarchy.set_link_color_palette(None)
hierarchy.dendrogram(datts,
                     labels=U_attractors.columns,
                     leaf_rotation=0,
                     orientation='left')
plt.savefig('attractors_dendrogram.png', format='png', dpi=300)
plt.clf()

# plots attractor stacked bar plot for sample type content
vl_title = 'Attractor/type' + ', ARI = ' + str(ARI) + ', AMI = ' + str(AMI)
att_content = samples_to_attractors[['type', 'attractor']]
Пример #42
0
def DrawSHC(samples, labels):
    plt.title("Customer Dendrograms")
    dend = shc.dendrogram(shc.linkage(samples, method='ward'), labels=labels)
    plt.show()
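
A quick call of DrawSHC with toy data (the array values here are illustrative only):

import numpy as np
samples = np.array([[1.0, 2.0], [1.1, 2.1], [8.0, 9.0]])
DrawSHC(samples, labels=['c1', 'c2', 'c3'])
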
Пример #43
0
def makeDendro(flatDist, labels, meta):
    clusters = sciHi.linkage(flatDist, metric=distMetric, method='average')
    print("Linkage:")
    print(clusters)

    plt.subplot(20, 1, (1, 15))
    plt.rcParams['lines.linewidth'] = 0.6
    dendro = sciHi.dendrogram(clusters, labels=labels)
    for i in dendro:
        print(i, dendro[i])
    ax = plt.gca()
    plt.setp(ax.get_xticklabels(), visible=False)

    plt.subplot(20, 1, 16)
    genders = [
        genderToNum(meta.loc[meta['Sample_Name'] == ind,
                             'Delivery_Sex'].values[0])
        for ind in dendro['ivl']
    ]
    plt.imshow([genders] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 17)
    smoking = [
        smokingToNum(meta.loc[meta['Sample_Name'] == ind,
                              'Patient_tobacco_now'].values[0])
        for ind in dendro['ivl']
    ]
    plt.imshow([smoking] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 18)
    ga = [
        m.trunc(meta.loc[meta['Sample_Name'] == ind,
                         'Delivery_week_at_delivery'].values[0])
        for ind in dendro['ivl']
    ]
    plt.imshow([ga] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 19)
    gd = [
        dgToNum(meta.loc[meta['Sample_Name'] == ind, 'DG'].values[0])
        for ind in dendro['ivl']
    ]
    plt.imshow([gd] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.subplot(20, 1, 20)
    baby_weight = [
        weightToNum(
            meta.loc[meta['Sample_Name'] == ind,
                     'SGA, AGA ou LGA (par rapport au poids)'].values[0])
        for ind in dendro['ivl']
    ]
    plt.imshow([baby_weight] * 4)
    plt.yticks([])
    plt.xticks([])
    plt.axis('off')

    plt.savefig('%sdendro_test_%s.svg' %
                (savePath, distDataPath.split('/')[-1]),
                format='svg')
    # dendro = sciHi.dendrogram(clusters, truncate_mode='level', p=20)
    # plt.savefig('%sdendro_p20_%s.svg' % (savePath, distDataPath.split('/')[-1]), format='svg')
    plt.close()

    return dendro
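
Each metadata strip above relies on scipy returning the dendrogram's leaf labels in plotted order under the 'ivl' key; in isolation, and independent of the sample metadata used above:

import numpy as np
import scipy.cluster.hierarchy as sciHi

Z = sciHi.linkage(np.array([[0.0], [0.2], [5.0]]), method='average')
d = sciHi.dendrogram(Z, labels=['a', 'b', 'c'], no_plot=True)
print(d['ivl'])  # leaf labels in left-to-right plot order, e.g. ['c', 'a', 'b']
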
Пример #44
0
def cluster_ssh(sla, lat, lon, nclusters, distthres=3000, returnall=False):
    # Remove All NaN Points
    ntime, nlat, nlon = sla.shape
    slars = sla.reshape(ntime, nlat * nlon)
    okdata, knan, okpts = proc.find_nan(slars, 0)
    npts = okdata.shape[1]

    # ---------------------------------------------
    # Calculate Correlation and Covariance Matrices
    # ---------------------------------------------
    srho = np.corrcoef(okdata.T)
    scov = np.cov(okdata.T)

    # --------------------------
    # Calculate Distance Matrix
    # --------------------------
    lonmesh, latmesh = np.meshgrid(lon, lat)
    coords = np.vstack([lonmesh.flatten(), latmesh.flatten()]).T
    coords = coords[okpts, :]
    coords1 = coords.copy()
    coords2 = np.zeros(coords1.shape)
    coords2[:, 0] = np.radians(coords1[:, 1])  # first column is latitude
    coords2[:, 1] = np.radians(coords1[:, 0])  # second column is longitude
    sdist = haversine_distances(coords2, coords2) * 6371  # Earth radius in km

    # --------------------------
    # Combine the Matrices
    # --------------------------
    a_fac = np.sqrt(
        -distthres /
        (2 * np.log(0.5)))  # Calculated so the exponential term is 0.5 at distthres
    expterm = np.exp(-sdist / (2 * a_fac**2))
    distance_matrix = 1 - expterm * srho

    # --------------------------
    # Do Clustering (scipy)
    # --------------------------
    cdist = squareform(distance_matrix, checks=False)
    linked = linkage(cdist, 'weighted')
    clusterout = fcluster(linked, nclusters, criterion='maxclust')

    # -------------------------
    # Calculate the uncertainty
    # -------------------------
    uncertout = np.zeros(clusterout.shape)
    for i in range(len(clusterout)):
        covpt = scov[i, :]  # covariances of point i with all points
        cid = clusterout[i]  # cluster id of point i
        covin = covpt[np.where(clusterout == cid)]
        covout = covpt[np.where(clusterout != cid)]
        uncertout[i] = np.mean(covin) / np.mean(covout)

    # Apply rules from Thompson and Merrifield (Do this later)
    # if uncert > 2, set to 2
    # if uncert <0.5, set to 0
    #uncertout[uncertout>2]   = 2
    #uncertout[uncertout<0.5] = 0

    # -----------------------
    # Replace into full array
    # -----------------------
    clustered = np.zeros(nlat * nlon) * np.nan
    clustered[okpts] = clusterout
    clustered = clustered.reshape(nlat, nlon)
    cluster_count = []
    for i in range(nclusters):
        cid = i + 1
        cnt = (clustered == cid).sum()
        cluster_count.append(cnt)
        print("Found %i points in cluster %i" % (cnt, cid))
    uncert = np.zeros(nlat * nlon) * np.nan
    uncert[okpts] = uncertout
    uncert = uncert.reshape(nlat, nlon)

    if returnall:
        return clustered, uncert, cluster_count, srho, scov, sdist, distance_matrix
    return clustered, uncert, cluster_count
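
As a sanity check on the decay constant in cluster_ssh: a_fac is chosen so that the exponential weighting equals exactly 0.5 at distthres, e.g. for the default 3000 km:

import numpy as np
distthres = 3000
a_fac = np.sqrt(-distthres / (2 * np.log(0.5)))
print(np.exp(-distthres / (2 * a_fac**2)))  # 0.5
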
Пример #45
0
            elif ((truth[i] != truth[j]) and (predicted[i] != predicted[j])):
                disagree_same += 1
            count += 1
    return (agree_same + disagree_same) / float(count)


# Code Sample
import scipy.cluster.hierarchy as sch
import numpy as np
import pylab as pl

# Plot dendrogram and cut the tree to find resulting clusters
fig = pl.figure()
data = np.array([[1, 2, 3], [1, 1, 1], [5, 5, 5]])
datalable = ['first', 'second', 'third']
hClsMat = sch.linkage(data, method='complete')  # Complete linkage clustering
sch.dendrogram(hClsMat, labels=datalable, leaf_rotation=45)
fig.savefig("thing.pdf")
resultingClusters = sch.fcluster(hClsMat, t=3, criterion='distance')
print(resultingClusters)

# Your code starts from here ....

# 1.
# Scaling min max
# STUDENT CODE TODO

# 2.
# K-means http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
# STUDENT CODE TODO
Пример #46
0
def step5(max_d):
    global eventL, notCombineRDDL, resultEventL, resultRDDL, outputPath, specialNum

    #vectorize the text
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=my_tokenizer,
                                 preprocessor=None,
                                 stop_words=['*'],
                                 max_features=10000)

    train_data_features = vectorizer.fit_transform(eventL)
    train_data_features = train_data_features.toarray()

    #hierarchical clustering
    Z = linkage(train_data_features, 'complete', 'cityblock')
    #c, coph_dists = cophenet(Z, pdist(train_data_features))
    #print 'The goodness of cluster result:', c

    clusters = fcluster(Z, max_d, criterion='distance')

    #initialize RDD list and Event list
    resultEventLL = []
    resultRDDLL = []
    numCombinedEvents = max(clusters)
    for i in range(numCombinedEvents):
        resultRDDLL.append([])
        resultEventLL.append([])

    #Put event/RDD that belong to the same cluster into the same list
    currentEventNum = 0
    for clusterNum in clusters:
        resultRDDLL[clusterNum - 1].append(notCombineRDDL[currentEventNum])
        resultEventLL[clusterNum - 1].append(eventL[currentEventNum])
        currentEventNum += 1

    #Merge the event/RDD in the same list
    for sameEventL in resultEventLL:

        if len(sameEventL) == 1:
            resultEventL.append(sameEventL[0])
        else:
            combinedEvent = sameEventL[0].strip().split()
            count = 0
            for currentEvent in sameEventL:
                if count == 0:
                    count += 1
                    continue
                else:
                    combinedEvent = LCS(combinedEvent,
                                        currentEvent.strip().split())
                    count += 1
            resultEventL.append(' '.join(combinedEvent))

    for sameRDDL in resultRDDLL:
        if len(sameRDDL) == 1:
            resultRDDL.append(sameRDDL[0])
        else:
            resultRDDL.append(sc.union(sameRDDL))

        resultRDDL[-1].map(lambda id_log: id_log[0]).saveAsTextFile(
            outputPath + str(len(resultRDDL) + specialNum))
Пример #47
0
def batch_process_aggregate(folder_path: str, group_criteria: float) -> List[dict]:
    """
    This function reads all the labeled defects from ./rect.json, aggregates close ones, and writes the aggregated
    defect information to ./defects.json.
    :param folder_path: folder_path of the raw ir images
    :param group_criteria: in meters, if two defects are closer than this, they will be aggregated
    :return: list of information about defects
    """
    with open(join(folder_path, "exif.json"), "r") as f:
        exif = json.load(f)

    with open(join(folder_path, "rect.json"), "r") as f:
        rect_info = json.load(f)

    rects = list()
    for d in rect_info:
        for rect in d.get("rects"):
            rect.update({"height": d.get("height"),
                         "width": d.get("width"),
                         "image": d.get("image")})
            rects.append(rect)

    group_ids = set([x.get("panel_group_id") for x in rects])

    defect_num = 0
    defects = list()

    for group_id in group_ids:
        rects_match_id = [x for x in rects if x.get("panel_group_id") == group_id]

        if len(rects_match_id) == 1:
            cluster = [0]
        else:
            pixel_location_table = np.array([[x.get("easting"), x.get("northing")] for x in rects_match_id])
            linkage_matrix = linkage(pixel_location_table, method='single', metric='chebyshev')

            ctree = cut_tree(linkage_matrix, height=[group_criteria])
            cluster = np.array([x[0] for x in ctree])

        for i in range(len(rects_match_id)):
            rects_match_id[i].update({"defectId": "DEF{:05d}".format(cluster[i] + defect_num)})
        defect_num += max(cluster) + 1

        defect_id_set = set([x.get("defectId") for x in rects_match_id])
        for defect_id in defect_id_set:
            defect = {"defectId": defect_id, "panelGroupId": group_id, "category": DefectCategory.UNCONFIRMED}
            rect_match_defect = [x for x in rects_match_id if x.get("defectId") == defect_id]

            easting = float(np.mean([x.get("easting") for x in rect_match_defect]))
            northing = float(np.mean([x.get("northing") for x in rect_match_defect]))
            severity = float(np.mean([x.get("severity") for x in rect_match_defect]))
            utm_zone = rects_match_id[0].get("utm_zone")
            lat, lng = utm.to_latlon(easting, northing, utm_zone, northern=True)
            defect.update({"lat": lat, "lng": lng, "utmEasting": easting, "utmNorthing": northing,
                           "utmZone": utm_zone, "severity": severity})
            defect.update({"rects": [x for x in rect_match_defect]})

            defects.append(defect)

    with open(join(folder_path, "defects.json"), "w") as f:
        json.dump(defects, f)

    return defects
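
A toy check of the aggregation rule in batch_process_aggregate: with single linkage and the Chebyshev metric, rects whose coordinates differ by less than group_criteria end up in the same defect cluster. The coordinates here are made up:

import numpy as np
from scipy.cluster.hierarchy import linkage, cut_tree

pts = np.array([[0.0, 0.0], [0.4, 0.3], [5.0, 5.0]])  # eastings/northings in meters
Z = linkage(pts, method='single', metric='chebyshev')
print(cut_tree(Z, height=[1.0]).ravel())  # [0 0 1]: the first two points are aggregated
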
Пример #48
0
	def link_clusters(self, distances: numpy.ndarray, num: int) -> pandas.DataFrame:
		Z = hierarchy.linkage(distances, method = self.linkage_method, optimal_ordering = True)

		return format_linkage_matrix(Z, num)
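
The format_linkage_matrix helper is not shown in this excerpt; a hypothetical stand-in that simply names scipy's four linkage columns could look like the following (num, presumably the number of observations, is accepted for signature compatibility but unused here):

import numpy
import pandas

def format_linkage_matrix(Z: numpy.ndarray, num: int) -> pandas.DataFrame:
	# Each row of Z records one merge: the two cluster ids, the distance at
	# which they merge, and the size of the newly formed cluster.
	return pandas.DataFrame(Z, columns=['left', 'right', 'distance', 'observations'])
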
Пример #49
0
def evaluate_distance_matrix(distanceMatrix, trueClusters, clusteringType,
                             **kwargs):

    # TODO: 1. clear blackList dependency
    #       2. clustering type is an unlucky name for betaCV and the like.

    trueClusterNum = len(np.unique(trueClusters))
    #     distanceMatrixCopy = np.copy(distanceMatrix)

    if clusteringType == 'all' or 'betaCV' in clusteringType:
        res = beta_cv(distanceMatrix,
                      trueClusters,
                      blackList=None,
                      ranks=False)
        print "Beta-CV = %f" % (res, )

    if clusteringType == 'all' or 'cIndex' in clusteringType:
        res = c_index(distanceMatrix, trueClusters, blackList=None)
        print "C-Index = %f" % (res, )

    if clusteringType == 'all' or 'silhouette' in clusteringType:
        print "Silhouette = %f" % (metrics.silhouette_score(
            distanceMatrix, trueClusters, metric='precomputed'), )

    if clusteringType == 'all' or 'hierarchical' in clusteringType:
        print "\nEvaluating **Hierarchical Clustering**"
        distArray = ssd.squareform(distanceMatrix)
        try:
            linkageFunction = kwargs['linkage']
        except KeyError:
            linkageFunction = "complete"

        print("Linkage = " + linkageFunction)
        Z = hierarchy.linkage(distArray, method=linkageFunction)
        T = hierarchy.fcluster(Z, trueClusterNum, criterion="maxclust")
        if len(np.unique(T)) != trueClusterNum:
            print "!Clusters found: " + str(len(np.unique(T)))

        res = evaluate_unsup_clustering(trueClusters, T, None, verbose=True)

    if clusteringType == 'all' or 'affinity' in clusteringType:
        print "\nEvaluating **Affinity Propagation**"
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        cluster_centers_indices, labels = sklearn_cluster.affinity_propagation(
            affinities, copy=False, verbose=True)
        res = evaluate_unsup_clustering(trueClusters,
                                        labels,
                                        len(cluster_centers_indices),
                                        verbose=True)

    if clusteringType == 'all' or "dbscan" in clusteringType:
        print "\nEvaluating **DBScan Clustering**"
        # TODO maybe adapt eps
        eps = np.percentile(distanceMatrix, 5)
        predictedLabels = sklearn_cluster.DBSCAN(
            eps, metric='precomputed').fit_predict(distanceMatrix)
        print "Predicted as Noise: " + str(np.sum(predictedLabels == -1))
        res = evaluate_unsup_clustering(trueClusters,
                                        predictedLabels,
                                        len(np.unique(predictedLabels)),
                                        verbose=True)

    if clusteringType == 'all' or "spectral" in clusteringType:
        print "\nEvaluating **Spectral (with Normalized Laplacian) Clustering**"
        affinities = np.exp(-(distanceMatrix**2) /
                            (2 * (np.median(distanceMatrix)**2)))
        # arpack was chosen for stability reasons.
        classifier = sklearn_cluster.SpectralClustering(
            n_clusters=trueClusterNum,
            affinity='precomputed',
            assign_labels='kmeans',
            eigen_solver='arpack')
        classifier.fit(affinities)
        res = evaluate_unsup_clustering(trueClusters,
                                        classifier.labels_,
                                        None,
                                        verbose=True)

#     assert(np.all(distanceMatrixCopy == distanceMatrix))
    return res
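
The distance-to-affinity conversion used twice above (for affinity propagation and for spectral clustering) is a Gaussian kernel with the median pairwise distance as bandwidth; factored out for clarity as a small sketch:

import numpy as np

def distances_to_affinities(distance_matrix):
    # Gaussian kernel; the median distance is a common bandwidth heuristic.
    sigma = np.median(distance_matrix)
    return np.exp(-(distance_matrix ** 2) / (2 * sigma ** 2))
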
Пример #50
0
k = np.max([
    np.where(pacf(consommation.loc[:, colname]) < 0)[0][0]
    for colname, col in consommation.items()
])

DM_GCC = np.zeros((consommation.shape[1], consommation.shape[1]))
for i, j in itertools.combinations(range(consommation.shape[1]), 2):
    DM_GCC[i, j] = DM_GCC[j, i] = 1 - helpers.get_GCC(
        consommation.iloc[:, i], consommation.iloc[:, j], k)
DM_GCC = pd.DataFrame(DM_GCC,
                      index=consommation.columns,
                      columns=consommation.columns)

# sns.clustermap(consommation, col_linkage=hcl.linkage(squareform(DM_GCC)))
Z_GCC = hcl.linkage(squareform(DM_GCC), method="average")

plt.figure()
hcl.dendrogram(Z_GCC)

plt.figure()
plt.plot(
    np.arange(.1, 1.1, .1),
    np.array([
        np.unique(hcl.fcluster(Z_GCC, t=t, criterion="distance")).shape[0]
        for t in np.arange(0.1, 1.1, 0.1)
    ]))

hcl.fcluster(Z_GCC, t=0.4, criterion="distance")
Пример #51
0
#SciPy >> spatial.distance module >> pdist function
from scipy.spatial.distance import pdist, squareform
row_dist = pd.DataFrame(squareform(pdist(df, metric="euclidean")),
                        columns=labels,
                        index=labels)
row_dist

# In[ ]:

#agglomerative >>scipy.cluster.hierarchy submodule >> linkage function
from scipy.cluster.hierarchy import linkage
help(linkage)

# In[ ]:

row_clusters = linkage(pdist(df, metric="euclidean"), method="complete")
pd.DataFrame(row_clusters,
             columns=["row label 1", "row label 2", "distance",
                      "no. of items in clust."],
             index=[
                 "cluster of {}".format(i + 1)
                 for i in range(row_clusters.shape[0])
             ])

# In[ ]:

from scipy.cluster.hierarchy import dendrogram
row_dendr = dendrogram(row_clusters, labels=labels)
plt.ylabel("Euclidean distance")
plt.tight_layout()
plt.show()
Пример #52
0
labelList = range(len(x))

plt.figure(figsize=(10, 7))
dendrogram(centr,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram using the centroid method')
plt.show()

# =============================================================================
# WARNING! There are more clustering techniques
# =============================================================================

linked = linkage(df, 'single')
labelList = range(len(x))
plt.figure(figsize=(10, 7))
dendrogram(linked,
           orientation='top',
           labels=labelList,
           distance_sort='descending',
           show_leaf_counts=True)
plt.title('Dendrogram using the single-linkage method')
plt.show()

# =============================================================================
# DATA with different units --- NOT NORMALIZED
# =============================================================================
# Change the values of one axis
Пример #53
0

def hecheng(a, b):
    # Max-min composition of two fuzzy relation matrices.
    m, N = a.shape
    n = b.shape[1]
    c = zeros((m, n))
    for i in range(m):
        for j in range(n):
            c[i, j] = max([min(a[i, k], b[k, j]) for k in range(N)])
    return c


a = array([[5, 5, 3, 2], [2, 3, 4, 5], [5, 5, 2, 3], [1, 5, 3, 1],
           [2, 4, 5, 1]])
d = array([[sum(abs(a[i] - a[j])) for i in range(5)] for j in range(5)])
r = 1 - 0.1 * d
print(r)
tr = hecheng(r, r)
while abs(r - tr).sum() > 0.00001:
    r = tr
    tr = hecheng(r, r)
print('\n------------------------\n', tr)
d2 = 1 - tr  # convert back to distances for plotting
d2 = triu(d2, 1)
d2 = d2[d2 != 0]  # extract the nonzero entries of the upper triangle
z = sch.linkage(d2)
s = ['I', 'II', 'III', 'IV', 'V']
sch.dendrogram(z, labels=s)  # plot the dendrogram
plt.yticks([])  # hide the y axis
plt.show()
Пример #54
0
# Remove the x ticks, y ticks, x and y axis
plt.xticks([])
plt.yticks([])
#plt.axis('off')



# Display the plot of the original data before clustering
plt.scatter(X1[:, 0], X1[:, 1], marker='.')
# Display the plot
plt.show()

dist_matrix = distance_matrix(X1, X1)
print(dist_matrix)

# scipy's linkage expects condensed distances (or raw observations), so the
# square matrix is condensed with squareform first.
from scipy.spatial.distance import squareform
Z = hierarchy.linkage(squareform(dist_matrix), 'complete')
dendro = hierarchy.dendrogram(Z)

filename = 'cars_clus.csv'

#Read csv
pdf = pd.read_csv(filename)

print ("Shape of dataset before cleaning: ", pdf.size)
pdf[[ 'sales', 'resale', 'type', 'price', 'engine_s',
       'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
       'mpg', 'lnsales']] = pdf[['sales', 'resale', 'type', 'price', 'engine_s',
       'horsepow', 'wheelbas', 'width', 'length', 'curb_wgt', 'fuel_cap',
       'mpg', 'lnsales']].apply(pd.to_numeric, errors='coerce')
pdf = pdf.dropna()
pdf = pdf.reset_index(drop=True)
Пример #55
0
    print('  %g%% of total patterns' % (100*len(inds)/len(ids_clusters)))
    for real_class in unique_y:
        clustered = (list(y[inds])).count(real_class)
        total = len(y)
        print(real_class,":", (clustered/total)*100 )


# ### Hierarchical clustering using ward

# In[87]:


from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# Hierarchical clustering, Ward linkage:
ward_cluster = linkage(NX, 'ward')

unique_y = np.unique(y)
ids_clusters = fcluster(ward_cluster, 5, # number of final clusters
                    criterion='maxclust') - 1
for i in np.unique(ids_clusters):
    inds = (np.where(np.array(ids_clusters) == i))[0]
    print('\033[1m'+'- Cluster %d' % i + '\033[0m')
    print('  %g%% of total patterns' % (100*len(inds)/len(ids_clusters)))
    for real_class in unique_y:
        clustered = (list(y[inds])).count(real_class)
        total = len(y)
        print(real_class, ":" ,(clustered/total)*100 )
    print()

Пример #56
0
def hierarchical(data=None,
                 k=0,
                 linkage='average',
                 metric='euclidean',
                 metric_args=None):
    """Perform clustering using hierarchical agglomerative algorithms.

    Parameters
    ----------
    data : array
        An m by n array of m data samples in an n-dimensional space.
    k : int, optional
        Number of clusters to extract; if 0 uses the life-time criterion.
    linkage : str, optional
        Linkage criterion; one of 'average', 'centroid', 'complete', 'median',
        'single', 'ward', or 'weighted'.
    metric : str, optional
        Distance metric (see 'biosppy.metrics').
    metric_args : dict, optional
        Additional keyword arguments to pass to the distance function.

    Returns
    -------
    clusters : dict
        Dictionary with the sample indices (rows from 'data') for each found
        cluster; outliers have key -1; clusters are assigned integer keys
        starting at 0.

    Raises
    ------
    TypeError
        If 'metric' is not a string.
    ValueError
        When the 'linkage' is unknown.
    ValueError
        When 'metric' is not 'euclidean' when using 'centroid', 'median',
        or 'ward' linkage.
    ValueError
        When 'k' is larger than the number of data samples.

    """

    # check inputs
    if data is None:
        raise TypeError("Please specify input data.")

    if linkage not in [
            'average', 'centroid', 'complete', 'median', 'single', 'ward',
            'weighted'
    ]:
        raise ValueError("Unknown linkage criterion '%r'." % linkage)

    if not isinstance(metric, six.string_types):
        raise TypeError("Please specify the distance metric as a string.")

    N = len(data)
    if k > N:
        raise ValueError("Number of clusters 'k' is higher than the number" \
                          " of input samples.")

    if metric_args is None:
        metric_args = {}

    if linkage in ['centroid', 'median', 'ward']:
        if metric != 'euclidean':
            raise ValueError("Linkage '{}' requires the distance metric to be" \
                             " 'euclidean'.".format(linkage))
        Z = sch.linkage(data, method=linkage)
    else:
        # compute distances
        D = metrics.pdist(data, metric=metric, **metric_args)

        # build linkage
        Z = sch.linkage(D, method=linkage)

    if k < 0:
        k = 0

    # extract clusters
    if k == 0:
        # life-time
        labels = _life_time(Z, N)
    else:
        labels = sch.fcluster(Z, k, 'maxclust')

    # get cluster indices
    clusters = _extract_clusters(labels)

    return utils.ReturnTuple((clusters, ), ('clusters', ))
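
The _life_time helper used above is private to the library and not shown; the usual life-time criterion cuts the dendrogram where the gap between consecutive merge distances is largest. A hypothetical sketch along those lines (N, the number of samples, is unused in this simplified version):

import numpy as np
import scipy.cluster.hierarchy as sch

def life_time_cut(Z, N):
    # Gaps ("life times") between successive merge distances.
    merge_dists = Z[:, 2]
    gaps = np.diff(merge_dists)
    # Cut just above the merge that precedes the largest gap.
    threshold = merge_dists[np.argmax(gaps)]
    return sch.fcluster(Z, threshold, criterion='distance')
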
Пример #57
0
modelo = AgglomerativeClustering(n_clusters=17)
grupos = modelo.fit_predict(generos_escalados)
grupos

tsne = TSNE()
visualizacao = tsne.fit_transform(generos_escalados)
visualizacao

sns.scatterplot(x=visualizacao[:, 0],
                y=visualizacao[:, 1],
                hue=grupos)

from scipy.cluster.hierarchy import dendrogram, linkage

modelo = KMeans(n_clusters=17)
modelo.fit(generos_escalados)

grupos = pd.DataFrame(modelo.cluster_centers_,
                      columns=generos.columns)

grupos.transpose().plot.bar(subplots=True,
                            figsize=(25, 50),
                            sharex=False,
                            rot=0)

matriz_de_distancia = linkage(grupos)
matriz_de_distancia

dendrograma = dendrogram(matriz_de_distancia)

Пример #58
0
import pylab
from matplotlib import pyplot as plt
import scipy.spatial.distance as dist
import scipy.cluster.hierarchy as hier
import cv2
import numpy as np

NUM_CLUST = 6

distSqMat = np.loadtxt('/home/brinstongonsalves/Documents/PyCharm/CV/mat.txt')
link_mat = hier.linkage(distSqMat,'single')
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram : Full')
hier.dendrogram(link_mat)
plt.savefig("dendogram.jpg")

plt.clf()
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram : Truncated')
hier.dendrogram(link_mat, truncate_mode='lastp', p=NUM_CLUST)
plt.savefig("dendogram1.jpg")
Пример #59
0
def make_figure(df, pa):
    """Generates figure.

    Args:
        df (pandas.core.frame.DataFrame): Pandas DataFrame containing the input data.
        pa (dict): A dictionary of the style { "argument":"value"} as outputted by `figure_defaults`.

    Returns:
        A Plotly figure.
        A Pandas DataFrame with columns clusters.
        A Pandas DataFrame with rows clusters.
        A Pandas DataFrame as displayed in the Plotly figure.

    """

    #fig = go.Figure( )
    #fig.update_layout( width=pa_["fig_width"], height=pa_["fig_height"] ) #  autosize=False,

    tmp = df.copy()
    tmp.index = tmp[pa["xvals"]].tolist()
    tmp = tmp[pa["yvals"]]

    if pa["add_constant"] != "":
        tmp = tmp + float(pa["add_constant"])

    if pa["log_transform_value"] == "log2":
        tmp = np.log2(tmp)
    elif pa["log_transform_value"] == "log10":
        tmp = np.log10(tmp)

    pa_ = {}

    checkboxes = [
        "row_cluster", "col_cluster", "xticklabels", "yticklabels",
        "row_dendogram_dist", "col_dendogram_dist", "reverse_color_scale"
    ]  # "robust"
    for c in checkboxes:
        if (pa[c] == "on") | (pa[c] == ".on"):
            pa_[c] = True
        else:
            pa_[c] = False

    for v in [
            "col_color_threshold", "row_color_threshold", "upper_value",
            "center_value", "lower_value"
    ]:
        if pa[v] == "":
            pa_[v] = None
        else:
            pa_[v] = float(pa[v])

    if pa_["reverse_color_scale"]:
        pa_["colorscale_value"] = pa["colorscale_value"] + "_r"
    else:
        pa_["colorscale_value"] = pa["colorscale_value"]

    selfdefined_cmap = True
    for value in [
            "lower_value", "center_value", "upper_value", "lower_color",
            "center_color", "upper_color"
    ]:
        if pa[value] == "":
            selfdefined_cmap = False
            break
    if selfdefined_cmap:
        range_diff = float(pa["upper_value"]) - float(pa["lower_value"])
        center = float(pa["center_value"]) - float(pa["lower_value"])
        center = center / range_diff

        color_continuous_scale=[ [0, pa["lower_color"]],\
            [center, pa["center_color"]],\
            [1, pa["upper_color"] ]]

        pa_["colorscale_value"] = color_continuous_scale

    if pa["zscore_value"] == "row":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=1, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())
    elif pa["zscore_value"] == "columns":
        tmp = pd.DataFrame(stats.zscore(tmp, axis=0, ddof=1),
                           columns=tmp.columns.tolist(),
                           index=tmp.index.tolist())

    if len(pa["findrow"]) > 0:
        rows_to_find = pa["findrow"]

        possible_rows = tmp.index.tolist()
        not_found = [s for s in rows_to_find if s not in possible_rows]
        if len(not_found) > 0:
            message = "˜The following rows could not be found: %s. Please check your entries for typos." % (
                ", ".join(not_found))
            flash(message, 'error')

        rows_to_plot = [] + rows_to_find

        if (pa["findrowup"] != "") | (pa["findrowdown"] != ""):

            d = scs.distance.pdist(tmp, metric=pa["distance_value"])
            d = squareform(d)
            d = pd.DataFrame(d,
                             columns=tmp.index.tolist(),
                             index=tmp.index.tolist())
            d = d[rows_to_find]

            for r in rows_to_find:
                dfrow = d[[r]]

                if pa["findrowtype_value"] == "percentile":

                    row_values = dfrow[r].tolist()

                    if pa["findrowup"] != "":
                        upperc = np.percentile(row_values,
                                               float(pa["findrowup"]))
                        upperc = dfrow[dfrow[r] >= upperc]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()

                    if pa["findrowdown"] != "":
                        downperc = np.percentile(row_values,
                                                 float(pa["findrowdown"]))
                        downperc = dfrow[dfrow[r] <= downperc]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()

                if pa["findrowtype_value"] == "n rows":
                    dfrow = dfrow.sort_values(by=[r], ascending=True)
                    row_values = dfrow.index.tolist()

                    if pa["findrowdown"] != "":
                        rows_to_plot = rows_to_plot + row_values[:int(
                            pa["findrowdown"])]

                    if pa["findrowup"] != "":
                        rows_to_plot = rows_to_plot + row_values[
                            -int(pa["findrowup"]):]

                if pa["findrowtype_value"] == "absolute":

                    if pa["findrowup"] != "":
                        upperc = dfrow[dfrow[r] >= float(pa["findrowup"])]
                        rows_to_plot = rows_to_plot + upperc.index.tolist()

                    if pa["findrowdown"] != "":
                        downperc = dfrow[dfrow[r] <= float(pa["findrowdown"])]
                        rows_to_plot = rows_to_plot + downperc.index.tolist()

                rows_to_plot = list(set(rows_to_plot))

        tmp = tmp[tmp.index.isin(rows_to_plot)]

    data_array = tmp.values
    data_array_ = tmp.transpose().values
    labels = tmp.columns.tolist()
    rows = tmp.index.tolist()

    # # Initialize figure by creating upper dendrogram
    if pa_["col_cluster"]:
        fig = ff.create_dendrogram(data_array_, orientation='bottom', labels=labels, color_threshold=pa_["col_color_threshold"],\
                                distfun = lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),\
                                linkagefun= lambda x: sch.linkage(x, pa["method_value"]))
        for i in range(len(fig['data'])):
            fig['data'][i]['yaxis'] = 'y2'
        dendro_leaves_y_labels = fig['layout']['xaxis']['ticktext']
        #dendro_leaves_y = [ labels.index(i) for i in dendro_leaves_y_labels ]

        #for data in dendro_up['data']:
        #    fig.add_trace(data)

        if pa_["col_color_threshold"]:
            d = scs.distance.pdist(data_array_, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  #linkagefun(d)
            max_d = pa_["col_color_threshold"]
            clusters_cols = fcluster(Z, max_d, criterion='distance')
            clusters_cols = pd.DataFrame({
                "col": tmp.columns.tolist(),
                "cluster": list(clusters_cols)
            })
        else:
            clusters_cols = pd.DataFrame({"col": tmp.columns.tolist()})

    else:
        fig = go.Figure()
        dendro_leaves_y_labels = tmp.columns.tolist()
    dendro_leaves_y = [labels.index(i) for i in dendro_leaves_y_labels]

    # Create Side Dendrogram
    if pa_["row_cluster"]:
        dendro_side = ff.create_dendrogram(data_array, orientation='right', labels=rows, color_threshold=pa_["row_color_threshold"],\
                                            distfun = lambda x: scs.distance.pdist(x, metric=pa["distance_value"]),\
                                            linkagefun= lambda x: sch.linkage(x, pa["method_value"] ))
        for i in range(len(dendro_side['data'])):
            dendro_side['data'][i]['xaxis'] = 'x2'
        dendro_leaves_x_labels = dendro_side['layout']['yaxis']['ticktext']
        #dendro_leaves_x = [ rows.index(i) for i in dendro_leaves_x_labels ]

        if pa_["row_color_threshold"]:
            d = scs.distance.pdist(data_array, metric=pa["distance_value"])
            Z = sch.linkage(d, pa["method_value"])  #linkagefun(d)
            max_d = pa_["row_color_threshold"]
            clusters_rows = fcluster(Z, max_d, criterion='distance')
            clusters_rows = pd.DataFrame({
                "col": tmp.index.tolist(),
                "cluster": list(clusters_rows)
            })
        else:
            clusters_rows = pd.DataFrame({"col": tmp.index.tolist()})

        #if pa_["col_cluster"]:
        # Add Side Dendrogram Data to Figure
        #print(dendro_side['data'][0])
        for data in dendro_side['data']:
            fig.add_trace(data)
        #else:
        #    fig=dendro_side

    else:
        dendro_leaves_x_labels = tmp.index.tolist()
    dendro_leaves_x = [rows.index(i) for i in dendro_leaves_x_labels]

    if pa["robust"] != "":
        vals = tmp.values.flatten()
        up = np.percentile(vals, 100 - float(pa["robust"]))
        down = np.percentile(vals, float(pa["robust"]))
        tmp[tmp > up] = up
        tmp[tmp < down] = down
        data_array = tmp.values

    # Create Heatmap
    heat_data = data_array
    heat_data = heat_data[dendro_leaves_x, :]
    heat_data = heat_data[:, dendro_leaves_y]

    heatmap = [
        go.Heatmap(x=dendro_leaves_x_labels,
                   y=dendro_leaves_y_labels,
                   z=heat_data,
                   zmax=pa_["upper_value"],
                   zmid=pa_["center_value"],
                   zmin=pa_["lower_value"],
                   colorscale=pa_['colorscale_value'],
                   colorbar={
                       "title": {
                           "text": pa["color_bar_label"],
                           "font": {
                               "size": float(pa["color_bar_font_size"])
                           }
                       },
                       "lenmode": "pixels",
                       "len": float(pa["fig_height"]) / 4,
                       "xpad": float(pa["color_bar_horizontal_padding"]),
                       "tickfont": {
                           "size": float(pa["color_bar_ticks_font_size"])
                       }
                   })
    ]

    if pa_["col_cluster"]:
        heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
    else:
        heatmap[0]['x'] = dendro_leaves_y_labels

    if pa_["row_cluster"]:
        heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']
    else:
        fake_vals = []
        i = 0
        for f in range(len(dendro_leaves_x_labels)):
            fake_vals.append(i)
            i += 1
        #dendro_leaves_x_labels=tuple(fake_vals)
        heatmap[0]['y'] = tuple(fake_vals)  #dendro_leaves_x_labels

    # Add Heatmap Data to Figure
    # if (pa_["col_cluster"]) | (pa_["row_cluster"]):
    for data in heatmap:
        fig.add_trace(data)
    # else:
    #     fig = go.Figure(data=heatmap[0])

    # Edit Layout
    fig.update_layout({
        'width': float(pa["fig_width"]),
        'height': float(pa["fig_height"]),
        'showlegend': False,
        'hovermode': 'closest',
        "yaxis": {
            "mirror": "allticks",
            'side': 'right',
            'showticklabels': pa_["xticklabels"],
            'ticktext': dendro_leaves_x_labels
        },
        "xaxis": {
            "mirror": "allticks",
            'side': 'right',
            'showticklabels': pa_["yticklabels"],
            'ticktext': dendro_leaves_y_labels
        }
    })

    # Edit xaxis
    fig.update_layout(xaxis={'domain': [ float(pa["row_dendogram_ratio"]), 1],
                                    'mirror': False,
                                    'showgrid': False,
                                    'showline': False,
                                    'zeroline': False,
                                    'showticklabels': pa_["yticklabels"],
                                    "tickfont":{"size":float(pa["yaxis_font_size"])},
                                    'ticks':"",\
                                    'ticktext':dendro_leaves_y_labels})

    # Edit xaxis2
    if pa_["row_cluster"]:
        fig.update_layout(
            xaxis2={
                'domain': [0, float(pa["row_dendogram_ratio"])],
                'mirror': False,
                'showgrid': False,
                'showline': False,
                'zeroline': False,
                'showticklabels': pa_["row_dendogram_dist"],
                'ticks': ""
            })

    # Edit yaxis
    fig.update_layout(yaxis={'domain': [0, 1-float(pa["col_dendogram_ratio"]) ],
                                    'mirror': False,
                                    'showgrid': False,
                                    'showline': False,
                                    'zeroline': False,
                                    'showticklabels': pa_["xticklabels"],
                                    "tickfont":{"size":float(pa["xaxis_font_size"])} ,
                                    'ticks': "",\
                                    'tickvals':heatmap[0]['y'],\
                                    'ticktext':dendro_leaves_x_labels})
    #'tickvals':dendro_side['layout']['yaxis']['tickvals'],\
    # Edit yaxis2 showticklabels
    if pa_["col_cluster"]:
        fig.update_layout(
            yaxis2={
                'domain': [1 - float(pa["col_dendogram_ratio"]), 1],
                'mirror': False,
                'showgrid': False,
                'showline': False,
                'zeroline': False,
                'showticklabels': pa_["col_dendogram_dist"],
                'ticks': ""
            })

    fig.update_layout(template='plotly_white')

    fig.update_layout(
        title={
            "text": pa["title"],
            "yanchor": "top",
            "font": {
                "size": float(pa["title_size_value"])
            }
        })

    cols = list(fig['layout']['xaxis']['ticktext'])
    rows = list(fig['layout']['yaxis']['ticktext'])
    df_ = pd.DataFrame({"i": range(len(rows))}, index=rows)
    df_ = df_.sort_values(by=["i"], ascending=False)
    df_ = df_.drop(["i"], axis=1)
    df_ = pd.merge(df_, tmp, how="left", left_index=True, right_index=True)
    df_ = df_[cols]

    clusters_cols_ = pd.DataFrame({"col": cols})
    if pa_["col_cluster"]:
        clusters_cols = pd.merge(clusters_cols_,
                                 clusters_cols,
                                 on=["col"],
                                 how="left")
    else:
        clusters_cols = clusters_cols_

    clusters_rows_ = pd.DataFrame({"col": df_.index.tolist()})
    if pa_["row_cluster"]:
        clusters_rows = pd.merge(clusters_rows_,
                                 clusters_rows,
                                 on=["col"],
                                 how="left")
    else:
        clusters_rows = clusters_rows_

    df_.reset_index(inplace=True, drop=False)
    cols = df_.columns.tolist()
    cols[0] = "rows"
    df_.columns = cols

    return fig, clusters_cols, clusters_rows, df_
Пример #60
0
rcParams['font.sans-serif'] = [
    'Linux Biolinum', 'Tahoma', 'DejaVu Sans', 'Lucida Grande', 'Verdana'
]

for i, path in enumerate(paths):

    if path.name != 'tf_doc_symbol_matrix.npy':
        continue

    feature_matrix = np.load(path)

    methods = ['ward']
    for method in methods:
        # Consider whether Euclidean (L2) distance is appropriate for the word-to-vec model
        links = hierarchy.linkage(feature_matrix,
                                  method=method,
                                  optimal_ordering=True)

        fig = plt.figure(figsize=((3.33 * 2) + 0.33, 3.25), dpi=220)

        hierarchy.dendrogram(
            links,
            leaf_label_func=get_leaf_label,
            orientation='left',
            leaf_rotation=0.,  # rotates the x axis labels
            color_threshold=0.4,
            above_threshold_color='xkcd:light grey',
        )

        # plt.title('Hierarchical Clustering of MSWE Document Representations', x=0.2, y = 1.04)
        plt.xlabel('Euclidean Distance Between Clusters',