Пример #1
0
    def test_dendrogram_colors(self):
        # Tests dendrogram plots with alternate colors
        Z = linkage(hierarchy_test_data.ytdist, 'single')

        set_link_color_palette(['c', 'm', 'y', 'k'])
        R = dendrogram(Z, no_plot=True,
                       above_threshold_color='g', color_threshold=250)
        set_link_color_palette(['g', 'r', 'c', 'm', 'y', 'k'])

        color_list = R['color_list']
        assert_equal(color_list, ['c', 'm', 'g', 'g', 'g'])
def cluster(data):
    """Hierarchically cluster the rows and the columns of ``data``.

    Complete linkage is applied to the pairwise distances (``pdist``
    default metric) between rows, and separately between columns.

    Side effect: the global scipy link colour palette is set to
    ``['black']``.

    Parameters
    ----------
    data : 2-D array-like exposing ``.T`` (ndarray or DataFrame)

    Returns
    -------
    (row_clusters, col_clusters) : tuple of linkage matrices
    """
    sch.set_link_color_palette(['black'])
    # linkage() expects the *condensed* distance vector returned by pdist.
    # A square matrix (squareform output) would be interpreted as a set of
    # observation vectors and clustered on the wrong quantities.
    row_clusters = sch.linkage(distance.pdist(data), method='complete')
    col_clusters = sch.linkage(distance.pdist(data.T), method='complete')
    return row_clusters, col_clusters
    def hierarchical(self,lst,fulldataset):
        """Average-linkage clustering: save the dendrogram and score it.

        The transposed ``fulldataset`` is clustered, the dendrogram is
        written to 'Average Hierarchical Clustering.pdf' with tick labels
        coloured by sample type, and a silhouette score is stored in
        ``self.score`` (the 0/1 labels are stored in ``self.labels``).

        Every tick label text must be a key produced by
        ``self.numbering(self.classLabel(lst))`` that starts with 'cancer'
        or 'normal'; otherwise the colour lookup raises ``KeyError``.
        """
        # Samples are coloured according to their sample type:
        # red for cancer, blue for normal, anything else gets no entry.
        label_color = {}
        for name in self.numbering(self.classLabel(lst)):
            if name[0:6] == 'cancer':
                label_color[name] = 'r'
            elif name[0:6] == 'normal':
                label_color[name] = 'b'

        # zip() returns a one-shot iterator on Python 3; pdist needs a
        # concrete sequence, so materialise the transposed data.
        samples = list(zip(*fulldataset))
        Y = pdist(samples)
        # Average linkage is applied.
        Z = linkage(Y, method='average')
        sch.set_link_color_palette(['black'])
        sch.dendrogram(Z, leaf_font_size=6, labels=self.newlist)

        # Colour the leaf labels of the dendrogram that was just drawn.
        ax = plt.gca()
        for lbl in ax.get_xmajorticklabels():
            lbl.set_color(label_color[lbl.get_text()])
        plt.title("Average Hierarchical Clustering Algorithm")
        plt.savefig('Average Hierarchical Clustering.pdf',dpi=500)
        plt.close()

        # Silhouette test: samples are converted into 1 (cancer) or
        # 0 (anything else) for validation.
        self.labels = array(
            [1 if kind == 'cancer' else 0 for kind in self.classLabel(lst)],
            dtype=float)

        # NOTE(review): this removes the element at index 0 or 1 (the value
        # of the last label), which makes the label vector one element
        # shorter -- presumably to match the number of rows of Z. The index
        # is cast to int because recent numpy rejects float indices.
        # TODO confirm the intent.
        self.labels = np.delete(self.labels, int(self.labels[-1]))
        # NOTE(review): the linkage matrix Z, not the sample data, is passed
        # as the observations here -- verify this is intended.
        self.score = metrics.silhouette_score(Z, self.labels, metric='euclidean')
Пример #4
0
    def dendrogramClusteringPlot(cls, linkageMatrix, labels, fileLocation):
        """Draw a black dendrogram for ``linkageMatrix`` and save it.

        The figure is handed to ``cls.saveFigure`` together with
        ``fileLocation``. The global scipy link colour palette is set to
        ``['black']`` as a side effect.
        """
        import numpy as np
        import scipy.cluster.hierarchy as sch
        from matplotlib import pyplot as plt

        sch.set_link_color_palette(['black'])

        fig, axes = plt.subplots()
        # Leave room below the axes for the rotated leaf labels.
        fig.subplots_adjust(bottom=0.4)
        axes.set_title('Hierarchical Clustering Dendrogram')
        axes.set_ylabel('Distance')

        dendrogram_options = {
            'labels': labels,
            'leaf_rotation': 270,        # rotates the x axis labels
            'color_threshold': np.inf,   # one colour for the whole tree
        }
        sch.dendrogram(linkageMatrix, **dendrogram_options)
        plt.plot()
        cls.saveFigure(fig, fileLocation)
Пример #5
0
    def plot_dendrogram(self, nolabels=True):
        '''
        Plot the dendrogram visualization of ``self.Z`` and return
        the dendrogram object 'dendro'.

        nolabels -- when True, leaf labels are not drawn.

        Side effect: replaces the global scipy link colour palette.
        '''
        from scipy.cluster.hierarchy import dendrogram, set_link_color_palette
        import matplotlib.pyplot as plt

        cpool = ["#1F78B4", "#E31A1C", "#A6CEE3", "#FB9A99", "#7BCCC4",
                 "#B2DF8A", "#33A02C", "#02818A", "#FF7F00", "#FDBF6F",
                 "#CAB2D6", "#6A3D9A", "#BFD3E6", "#8C96C6"]
        set_link_color_palette(cpool)
        h_clustering = self.Z

        dendro = dendrogram(h_clustering, no_labels=nolabels,
                            count_sort=True, orientation="left")
        # Raw string: "\G" is not a valid escape sequence and triggers a
        # warning in a normal string literal (the text is unchanged).
        plt.xlabel(r"Coincidence Metric ($\Gamma$)", fontsize=14)
        plt.ylabel("Clusters", fontsize=14)
        # Relabel the distance axis in reverse (position 1 is labelled 0, etc.).
        plt.xticks([1, 0.8, 0.6, 0.4, 0.2, 0], [0, 0.2, 0.4, 0.6, 0.8, 1],
                   fontsize=14)
        return dendro
Пример #6
0
    def test_dendrogram_colors(self):
        # Tests dendrogram plots with alternate colors
        Z = linkage(hierarchy_test_data.ytdist, "single")

        set_link_color_palette(["c", "m", "y", "k"])
        R = dendrogram(Z, no_plot=True, above_threshold_color="g", color_threshold=250)
        set_link_color_palette(["g", "r", "c", "m", "y", "k"])

        color_list = R["color_list"]
        assert_equal(color_list, ["c", "m", "g", "g", "g"])

        # reset color palette (global list)
        set_link_color_palette(None)
Пример #7
0
def hierarchical(df, cluster_cols=True, cluster_rows=False, n_col_clusters=False, n_row_clusters=False, fcol=None, z_score=True, method='ward'):
    """Draw a heatmap of ``df`` with optional row/column dendrograms.

    Parameters
    ----------
    df : DataFrame
        Data to plot. NOTE: NaN and infinite cells of ``df`` itself are
        overwritten with 0 in place (behaviour kept from the original).
    cluster_cols, cluster_rows : bool
        Whether to cluster (and reorder) the columns / rows.
    n_col_clusters, n_row_clusters : int or False
        When truthy and the matching axis is clustered, separator lines are
        drawn between the requested number of clusters.
    fcol : mapping or None
        Colour for each value of the 'Group' index level; used for the row
        colour bar when the index has a level named 'Group'.
    z_score : bool
        Centre every column on its median and divide by its standard
        deviation before plotting.
    method : str
        Linkage method passed to ``scipy.cluster.hierarchy.linkage``.

    Returns
    -------
    The matplotlib figure.
    """

    # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
    def clean_axis(ax):
        """Remove ticks, tick labels, and frame from axis"""
        ax.get_xaxis().set_ticks([])
        ax.get_yaxis().set_ticks([])
        # set_axis_bgcolor was removed from matplotlib; prefer its
        # replacement and fall back only on very old versions.
        if hasattr(ax, 'set_facecolor'):
            ax.set_facecolor('#ffffff')
        else:
            ax.set_axis_bgcolor('#ffffff')
        for sp in ax.spines.values():
            sp.set_visible(False)

    def optimize_clusters(clusters, denD, target_n):
        """Search for a distance cut giving ``target_n`` clusters.

        Returns the positions (in dendrogram leaf order) after which the
        flat cluster id changes.
        """
        target_n = target_n - 1 # We return edges; not regions
        # NOTE(review): the maximum is taken over the whole linkage matrix
        # (indices and counts included), not only the distance column.
        threshold = np.max(clusters)
        max_iterations = threshold

        edges = []
        i = 0
        while i < max_iterations:
            cc = sch.fcluster(clusters, threshold, 'distance')
            cco = cc[ denD['leaves'] ]
            edges = [n for n in range(cco.shape[0]-1) if cco[n] != cco[n+1]  ]
            n_clusters = len(edges)

            if n_clusters == target_n:
                break

            if n_clusters < target_n:
                threshold = threshold // 2

            elif n_clusters > target_n:
                threshold = int( threshold * 1.5 )

            i += 1

        return edges

    dfc = df.copy()

    if z_score:
        dfc = (dfc - dfc.median(axis=0)) / dfc.std(axis=0)

    # Remove nan/infs
    dfc[np.isinf(dfc)] = 0
    dfc[np.isnan(dfc)] = 0

    # make norm: symmetric colour scale around zero
    vmin = dfc.min().min()
    vmax = dfc.max().max()
    vmax = max([vmax, abs(vmin)])  # choose larger of vmin and vmax
    vmin = vmax * -1
    my_norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)

    # NOTE(review): this mutates the caller's DataFrame; kept because
    # callers may rely on it.
    df[np.isnan(df)] = 0
    df[np.isinf(df)] = 0

    # dendrogram single color
    sch.set_link_color_palette(['black'])

    # cluster -- linkage() needs the condensed pdist vector; a square
    # distance matrix would be treated as raw observation vectors.
    if cluster_rows:
        row_clusters = sch.linkage(distance.pdist(dfc), method=method)

    if cluster_cols:
        col_clusters = sch.linkage(distance.pdist(dfc.T), method=method)

    # heatmap with row names
    fig = plt.figure(figsize=(12, 12))
    heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0, width_ratios=[0.25, 1], height_ratios=[0.25, 1])

    if cluster_cols:
        # col dendrogram
        col_denAX = fig.add_subplot(heatmapGS[0, 1])
        col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
        clean_axis(col_denAX)
    else:
        # No column clustering: keep the original column order.
        col_denD = {'leaves': list(range(dfc.shape[1]))}

    rowGSSS = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=heatmapGS[1, 0], wspace=0.0, hspace=0.0, width_ratios=[1, 0.05])

    if cluster_rows:
        # row dendrogram
        row_denAX = fig.add_subplot(rowGSSS[0, 0])
        row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf, orientation='right')
        clean_axis(row_denAX)
    else:
        # No row clustering: keep the original row order. (Previously this
        # identity order also overwrote the clustered order.)
        row_denD = {'leaves': list(range(dfc.shape[0]))}

    # row colorbar
    if fcol and 'Group' in dfc.index.names:
        class_idx = dfc.index.names.index('Group')

        classcol = [fcol[x] for x in dfc.index.get_level_values(class_idx)[row_denD['leaves']]]
        classrgb = np.array([colorConverter.to_rgb(c) for c in classcol]).reshape(-1, 1, 3)
        row_cbAX = fig.add_subplot(rowGSSS[0, 1])
        row_cbAX.imshow(classrgb, interpolation='nearest', aspect='auto', origin='lower')
        clean_axis(row_cbAX)

    # heatmap, reordered to match the dendrogram leaves
    heatmapAX = fig.add_subplot(heatmapGS[1, 1])

    heatmapAX.imshow(dfc.iloc[row_denD['leaves'], col_denD['leaves']], interpolation='nearest', aspect='auto', origin='lower',
                     norm=my_norm, cmap=cm.PuOr_r)
    clean_axis(heatmapAX)

    # row labels
    if dfc.shape[0] <= 100:
        heatmapAX.set_yticks(range(dfc.shape[0]))
        heatmapAX.yaxis.set_ticks_position('right')
        ylabels = [" ".join([str(t) for t in i]) if isinstance(i, tuple) else str(i) for i in dfc.index[row_denD['leaves']]]
        heatmapAX.set_yticklabels(ylabels)

    # col labels
    if dfc.shape[1] <= 100:
        heatmapAX.set_xticks(range(dfc.shape[1]))
        xlabels = [" ".join([str(t) for t in i]) if isinstance(i, tuple) else str(i) for i in dfc.columns[col_denD['leaves']]]
        xlabelsL = heatmapAX.set_xticklabels(xlabels)
        # rotate labels 90 degrees
        for label in xlabelsL:
            label.set_rotation(90)

    # remove the tick lines
    for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
        l.set_markersize(0)

    # The string 'off' is truthy, so it does not reliably disable the grid.
    heatmapAX.grid(False)

    if cluster_cols and n_col_clusters:
        edges = optimize_clusters(col_clusters, col_denD, n_col_clusters)
        for edge in edges:
            heatmapAX.axvline(edge +0.5, color='k', lw=3)

    if cluster_rows and n_row_clusters:
        edges = optimize_clusters(row_clusters, row_denD, n_row_clusters)
        for edge in edges:
            heatmapAX.axhline(edge +0.5, color='k', lw=3)

    return fig
Пример #8
0
    # Replace the data points with their respective cluster value 
    # (ex. 0) and is color coded with a colormap (plt.cm.spectral)
	plt.text(X1[i, 0], X1[i, 1], str(y1[i]), #(X1[i, 0] n_samples, X1[i, 1] n_features, str(y1[i]) The integer labels for cluster membership of each sample.
			 color=plt.cm.nipy_spectral(agglom.labels_[i]/10), #/10 (or any number) change color of lables, idk why.. 
			 fontdict={'weight': 'bold', 'size': 9})      #This only plot the numbers (labels) of data
    
# Remove the x ticks and y ticks
plt.xticks([])
plt.yticks([])
#plt.axis('off')



# Display the plot of the original data before clustering
plt.scatter(X1[:, 0], X1[:, 1], marker='.')  # Combines the labels and the data points.
plt.show()

# Distance between every pair of samples (rows and columns are both X1,
# so the diagonal is 0).
dist_matrix = distance_matrix(X1,X1)
print(dist_matrix)
# linkage() treats a 2-D input as observation vectors, so the square
# distance matrix must not be passed to it. Clustering X1 directly uses the
# same Euclidean distances in the condensed form linkage expects.
Z = hierarchy.linkage(X1, "complete")
dendro = hierarchy.dendrogram(Z)
print(Z)

# Side-by-side dendrograms with a custom link colour palette
hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
                           orientation='top')
dn2 = hierarchy.dendrogram(Z, ax=axes[1], above_threshold_color='#bcbddc', orientation='right')
hierarchy.set_link_color_palette(None)  # reset to default after use
#plt.show()
Пример #9
0
    ax.get_yaxis().set_ticks([])
    for sp in ax.spines.values():
        sp.set_visible(False)


# make norm: symmetric colour scale around zero
vmin = input_data.min().min()
vmax = input_data.max().max()
print("Range in data %f...%f" % (vmin, vmax))
vmax = max([vmax, abs(vmin)])  # choose larger of vmin and vmax
vmin = vmax * -1
print("Normalised to %f...%f" % (vmin, vmax))
my_norm = mpl.colors.Normalize(vmin, vmax)

# dendrogram single color
sch.set_link_color_palette(['black'])

# cluster -- linkage() needs the condensed pdist vector; a square distance
# matrix would be treated as raw observation vectors. The square matrices
# are still built in case later code uses them.
row_condensed_dists = distance.pdist(input_data)
row_pairwise_dists = distance.squareform(row_condensed_dists)
row_clusters = sch.linkage(row_condensed_dists, method=config['method'])

col_condensed_dists = distance.pdist(input_data.T)
col_pairwise_dists = distance.squareform(col_condensed_dists)
col_clusters = sch.linkage(col_condensed_dists, method=config['method'])

progress(0.25)

# heatmap with row names
View = plt.figure(figsize=(12, 8))
heatmapGS = gridspec.GridSpec(2,
                              2,
                              wspace=0.0,
Пример #10
0
def clean_select_neighb(df_feature2):
    """Scale neighbourhood features and attach three cluster labelings.

    Rows with ``Count`` >= 10 are kept, missing values are median-imputed,
    features are min-max scaled, and the frame is re-indexed by
    '<index>, <loc_City>'. Dendrogram colours (complete linkage, cut at
    1.0), K-Means labels and Gaussian-mixture labels (both on a 5-component
    PCA projection, 7 clusters) are added as columns.

    Side effect: replaces the global scipy link colour palette.

    Returns the scaled DataFrame with ``labels_GMM``, ``labels_KMeans`` and
    ``labels_dendo`` columns.
    """
    # Keep only neighborhoods with 10 or more known coffee shops
    data_temp=df_feature2[df_feature2['Count']>=10]

    X=np.asarray(data_temp.loc[:,~data_temp.columns.isin(['loc_City','Avg_Utility_Score','Count'])])

    columns=[col for col in data_temp.columns if col not in ['Avg_Utility_Score', 'Count','loc_City']]

    # NOTE(review): Imputer was removed from recent scikit-learn releases;
    # confirm the pinned version or migrate to SimpleImputer.
    imputer = Imputer(missing_values = 'NaN', strategy = 'median', axis = 0)
    imputer = imputer.fit(X)
    X = imputer.transform(X)
    scaler = preprocessing.MinMaxScaler()
    minmax_scaled_df = scaler.fit_transform(X)
    minmax_scaled_df = pd.DataFrame(minmax_scaled_df,
                                    columns=columns,index=data_temp.index)
    minmax_scaled_df=pd.concat([minmax_scaled_df,pd.DataFrame(data_temp['loc_City'],index=data_temp.index)],axis=1)
    minmax_scaled_df['loc_id'] = minmax_scaled_df.index + ', ' + minmax_scaled_df.loc_City
    minmax_scaled_df.drop(columns=['loc_City'],inplace=True)
    minmax_scaled_df.set_index('loc_id',drop=True,inplace=True)
    minmax_scaled_df = minmax_scaled_df.loc[~minmax_scaled_df.index.duplicated(keep='first')]

    samples=minmax_scaled_df.values
    labs = minmax_scaled_df.index
    set_link_color_palette(['teal','sandybrown', 'steelblue', 'firebrick', 'forestgreen', 'darkviolet', 'crimson', 'darkcyan', 'peru', 'indigo', 'darkorange'])
    mergings = linkage(samples, method='complete')

    # Apply the dendrogram before applying PCA.
    # above_threshold_color is pinned to 'b' because the filter below
    # relies on it; the scipy default for this colour is not 'b' in every
    # version.
    dendo = dendrogram(mergings,
                       labels=labs,
                       leaf_rotation=0,
                       leaf_font_size=14,
                       color_threshold=1.0,
                       above_threshold_color='b',
                       orientation='right',
                       no_plot=True)
    a = pd.DataFrame(dendo['color_list'])
    val_dict = dict(a.iloc[:,0].value_counts())
    colors = a.iloc[:,0].unique()
    # A cluster drawn with k links has k + 1 leaves, so each colour is
    # repeated (link count + 1) times to get one entry per leaf.
    # NOTE(review): this assumes every leaf belongs to a coloured cluster
    # and that no palette colour is reused -- verify for the data at hand.
    color_list = []
    for color in colors:
        if color != 'b':
            color_list.extend([color] * (val_dict[color] + 1))

    neighb_colors = pd.DataFrame([color_list, dendo['ivl'], dendo['leaves']]).T
    neighb_colors.rename({0:'color', 1:'loc_id', 2:'leaf'}, axis=1,inplace=True)
    neighb_colors = neighb_colors.set_index('loc_id', drop=True)

    # Apply PCA, keeping the top 5 components
    pca = PCA(5)
    projected = pca.fit_transform(minmax_scaled_df.values)

    # Now, after PCA, apply K-Means and GMM clustering
    n_clusters = 7
    kmeans = KMeans(n_clusters, random_state=42)
    labels_kmeans = kmeans.fit(projected).predict(projected)
    gmm = GaussianMixture(n_components=n_clusters).fit(projected)
    labels_GMM = gmm.predict(projected)

    minmax_scaled_df['labels_GMM']=labels_GMM
    minmax_scaled_df['labels_KMeans']=labels_kmeans
    neighb_colors = neighb_colors[['color']]
    neighb_colors = neighb_colors.sort_index()
    # Merge with the dendrogram output
    minmax_scaled_df=minmax_scaled_df.merge(neighb_colors, on='loc_id',how='outer')
    minmax_scaled_df.rename(columns={'color': 'labels_dendo'}, inplace=True)
    return minmax_scaled_df
Пример #11
0
# Votes per player, then parallel lists of names and vote counts.
VOTES_RAW = {player: PLYRS[player]['votes'] for player in PLYRS.keys()}
NAMES = list(VOTES_RAW)
VOTES = list(VOTES_RAW.values())
mat = np.genfromtxt(path.join(PT_DTA, FN_DST), delimiter=',')
if cst.ANONYMIZE:
    # Shuffling the labels breaks the link between names and leaves.
    shuffle(NAMES)
###############################################################################
# Process
###############################################################################
print('(5) Plotting Dendrogram')
# The CSV holds a square matrix; linkage needs its condensed form.
dists = squareform(mat)
linkage_matrix = linkage(dists, 'ward')
###############################################################################
# Plot
###############################################################################
fig, ax = plt.subplots()
set_link_color_palette(['#ff006e', '#2614ed', 'k'])
dend = dendrogram(
    linkage_matrix,
    orientation='right',
    labels=NAMES,
    count_sort='descending',
    above_threshold_color='#bcbddcA0',
)
ax.set_aspect(.0025)
plt.xticks([])
# Hide every frame edge; the left one is painted white instead of hidden.
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_color('#ffffff')
plt.title('Hierarchical Clustering\n', fontdict={'size': 18})
fig.savefig(
    path.join(PT_PLT, 'DN.png'), 
def plot_dendrograms(data, labels):
  """Save four Ward dendrograms (three gene-level, one tumor-level) as SVG.

  Output files: dendrogram-total.svg, dendrogram-0.svg, dendrogram-1.svg and
  dendrogram-t.svg. The global scipy link colour palette is replaced by the
  ten "tab10" colours.

  NOTE(review): the ``data`` argument is never read; the feature matrices
  come from the module-level ``data_dict`` -- confirm this is intended.
  """
  tab10 = mpl.cm.get_cmap("tab10")
  hierarchy.set_link_color_palette(
    [mpl.colors.to_hex(tab10(slot)) for slot in range(10)]
  )

  def ward(features):
    # Ward linkage with leaves reordered to minimise neighbour distances.
    return linkage(features, 'ward', optimal_ordering=True)

  def draw(tree, threshold, leaf_labels, out_file, cut_line=True, fixed_xlim=False):
    # One figure per dendrogram; optional dashed line at the colour cut.
    fig, ax = plt.subplots(figsize=(5, 10))
    dendrogram(tree,
               orientation='left',
               above_threshold_color="grey",
               color_threshold=threshold,
               labels=leaf_labels,
               show_leaf_counts=True,
               ax=ax)
    if cut_line:
      ax.axvline(threshold, color="grey", linestyle="--")
    if fixed_xlim:
      ax.set_xlim(12, 0)
    plt.savefig(out_file, dpi=600)

  # All linkages are computed up front, before any figure is created.
  linkage_gene_total = ward(data_dict["crc"]["gene"]["f"])
  linkage_gene_0 = ward(data_dict["crc"]["gene"]["fhb0"])
  linkage_gene_1 = ward(data_dict["crc"]["gene"]["fhb1"])
  linkage_tumor = ward(data_dict["common"]["tumor"]["f"])

  draw(linkage_gene_total, linkage_gene_total[-2, 2],
       labels["crc"]["gene"], "dendrogram-total.svg")

  draw(linkage_gene_0, linkage_gene_0[-2, 2],
       labels["crc"]["gene"], "dendrogram-0.svg", fixed_xlim=True)

  # NOTE(review): the threshold for the fhb1 tree is taken from the fhb0
  # linkage -- possibly deliberate for comparable plots; confirm.
  draw(linkage_gene_1, linkage_gene_0[-2, 2],
       labels["crc"]["gene"], "dendrogram-1.svg", fixed_xlim=True)

  # NOTE(review): [-1, 1] reads a cluster-index column of the linkage
  # matrix, not the distance column (2) used above; confirm.
  tumor_labels = (["CRC" for _ in labels["crc"]["tumor"]]
                  + ["EC" for _ in labels["ec"]["tumor"]])
  draw(linkage_tumor, linkage_tumor[-1, 1],
       tumor_labels, "dendrogram-t.svg", cut_line=False)
Пример #13
0
                                  bubble_data['cluster'])
# The y-coordinate is the negated donor id (the index cast to int).
bubble_data['y'] = -bubble_data['max % ADCC'].index.map(int)


#%%
### FIGURE ###
fig = plt.figure(figsize=(8, 3.5))

### DENDROGRAM
ax1 = plt.subplot(122)

# Custom link colours instead of the scipy default palette.
hierarchy.set_link_color_palette(
    ['purple', 'cornflowerblue', 'limegreen', 'gold', 'tomato']
)

# The line width can only be changed through an rc context.
with plt.rc_context({'lines.linewidth': 2}):
    dend = dendrogram(
        Z,
        ax=ax1,
        orientation='right',
        labels=data_cluster.index,
        leaf_rotation=0,
        leaf_font_size=7,
        color_threshold=32,
        above_threshold_color='lightgrey',
    )

# a few more aesthetics
Пример #14
0
def my_clustermap(matrix,
                  thrs_row=1,
                  thrs_col=1,
                  distM=None,
                  row_cls=False,
                  col_cls=True,
                  return_fig=False,
                  method='average',
                  fig_sz=(8, 8),
                  colnames=None,
                  rownames=None,
                  cls_info=False):
    """Plot a masked heatmap of ``matrix`` with row and/or column dendrograms.

    The figure is saved as "Dec_corr_MASKING_sort1_<method>.png".

    Parameters
    ----------
    matrix : DataFrame
        Data to plot. Rows and columns named in the built-in mask list are
        set to 0 *in place* before plotting (so ``.loc`` must be available).
    thrs_row, thrs_col : float
        Colour / flat-cluster distance thresholds for rows and columns.
    distM : optional
        Precomputed distances passed unchanged to ``linkage`` for both
        rows and columns; when None, ``pdist`` is used.
    row_cls, col_cls : bool
        Which dendrograms to draw; at least one must be true.
    return_fig : bool
        NOTE(review): accepted but never used.
    method : str
        Linkage method.
    fig_sz : tuple
        Figure size.
    colnames, rownames : array of str, optional
        Labels; default to the matrix columns / index, or positions.
    cls_info : bool
        When true, return a dict with leaf orders and flat cluster ids.

    Returns
    -------
    dict when ``cls_info`` is true, otherwise None.

    Raises
    ------
    ValueError
        If both ``row_cls`` and ``col_cls`` are false.
    """
    colors = sns.color_palette("Set2", 25)
    colors = [mat_col.rgb2hex(color) for color in colors]
    set_link_color_palette(colors)
    if colnames is None:
        try:
            colnames = np.array(matrix.columns, str)
        except AttributeError:
            colnames = np.array(range(0, matrix.shape[1]), str)
    if rownames is None:
        try:
            rownames = np.array(matrix.index, str)
        except AttributeError:
            rownames = np.array(range(0, matrix.shape[0]), str)
    if row_cls:
        if distM is None:
            D_row = scipy.spatial.distance.pdist(matrix)
        else:
            D_row = distM
    if col_cls:
        if distM is None:
            D_col = scipy.spatial.distance.pdist(matrix.T)
        else:
            D_col = distM
    fig = plt.figure(figsize=fig_sz)
    # Layout fractions of the figure.
    lef = 0.01
    bot = 0.05
    h_sep = 0.2
    v_sep = 0.7
    row_leg = 0.01  #space for the legend of the rows plotted on the right side of the matrix
    if row_cls:
        if col_cls:  #if want both row and column dendrogram
            mat_h = v_sep - 0.005 - bot
            mat_w = 0.9 - row_leg - h_sep
            den_h = 1 - v_sep - 0.005
            den_w = h_sep - 0.005 - lef
            #plot dendrogram for column clusters
            ax_col = fig.add_axes([h_sep, v_sep, mat_w, den_h])
            g_col = scipy.cluster.hierarchy.linkage(D_col, method=method)
            den_col = scipy.cluster.hierarchy.dendrogram(
                g_col, color_threshold=thrs_col, above_threshold_color='black')
            idx_col = den_col['leaves']
            ax_col.set_xticklabels([''])
        else:  #if only want row dendrogram
            mat_h = 1 - bot * 2
            mat_w = 0.9 - 0.01 - h_sep
            den_w = h_sep - 0.005 - lef
            idx_col = list(range(0, matrix.shape[1]))

        # plot dendrogram for row clusters
        ax_row = fig.add_axes([lef, bot, den_w, mat_h])
        g_row = scipy.cluster.hierarchy.linkage(D_row, method=method)
        den_row = scipy.cluster.hierarchy.dendrogram(
            g_row,
            color_threshold=thrs_row,
            orientation='left',
            above_threshold_color='black')
        idx_row = den_row['leaves']
        ax_row.set_yticklabels([''])
        ax_mat = fig.add_axes([h_sep, bot, mat_w, mat_h])

    else:
        if col_cls:  #if only want column clusters
            lef = lef + 0.04
            mat_h = v_sep - 0.005 - bot
            mat_w = 0.9 - row_leg - lef
            den_h = 1 - v_sep - 0.005
        else:
            plt.close()
            raise ValueError(
                "At least one of row_cls and col_cls has to be Ture.")

        #plot dendrogram for column clusters
        ax_col = fig.add_axes([lef, v_sep, mat_w, den_h])
        g_col = scipy.cluster.hierarchy.linkage(D_col, method=method)
        den_col = scipy.cluster.hierarchy.dendrogram(
            g_col, color_threshold=thrs_col, above_threshold_color='black')
        idx_col = den_col['leaves']
        idx_row = list(range(0, matrix.shape[0]))
        ax_col.set_xticklabels([''])
        ax_mat = fig.add_axes([lef, bot, mat_w, mat_h])

    #plot data matrix as a heatmap
    # Colour scale maximum: the largest value once entries above 0.9999
    # are ignored, taken before masking.
    matrix2 = np.array(matrix)
    matrix2[matrix2 > 0.9999] = np.nan
    maxval = np.nanmax(matrix2)
    list_to_mask = [
        'c11orf87', 'dgki', 'sept3', 'ppp1r16b', 'sez6l2', 'immp2l', 'fxr1',
        'otud7b', 'zswim6', 'ptn', 'ncan', 'tmtc1', 'nab2', 'kcnv1', 'r3hdm2',
        'chrna5', 'cyp17a1', 'gtdc1', 'srpk2', 'cacna1i', 'epc2', 'satb2',
        'srr', 'slc32a1', 'glt8d1', 'ftcdnl1', 'ogfod2', 'adamtsl3', 'sdccag8',
        'srebf2', 'plch2a', 'slc38a7', 'slc39a8', 'nfkb1', 'kif5c', 'nxph4',
        'dpyd', 'ngef', 'hapln4', 'apopt1', 'ina', 'mbd5', 'sybu', 'kctd13',
        'lrriq3', 'arl6ip4', 'klc1', 'c2orf82', 'nrgn', 'gatad2a', 'nck1',
        'grm3', 'fut9a', 'fes', 'galnt10', 'anp32e', 'slc35g2', 'snx19',
        'plcl1', 'c12orf65', 'bag5', 'tbc1d5', 'mdk', 'negr1', 'pak6b',
        'cntn4', 'ca8', 'man2a1', 'kcnj13', 'tcf20', 'stat6', 'cnksr2', 'ckb',
        'tcf4', 'shmt2', 'znf804a', 'sipa1l1', 'arl3', 'tle1', 'doc2a',
        'c10orf32', 'mad1l1'
    ]
    print(matrix)
    # NOTE(review): this zeroes rows/columns of the caller's object in place.
    matrix.loc[list_to_mask] = 0
    matrix.loc[:, list_to_mask] = 0
    print(matrix)
    matrix = np.array(matrix)
    # Reorder by dendrogram leaves, then rotate for display.
    D = matrix[idx_row, :]
    D = D[:, idx_col]
    D2 = np.rot90(D)
    # NOTE(review): the y labels reuse the reversed *column* names, which
    # presumably assumes a square, symmetric matrix -- confirm.
    revlist = np.flipud(colnames[idx_col])
    im = ax_mat.pcolormesh(D2, cmap=plt.cm.YlGnBu, vmin=0, vmax=maxval)
    ax_mat.xaxis.set_ticks_position('bottom')
    ax_mat.yaxis.set_ticks_position('right')
    # Ticks sit at cell centres.
    ax_mat.set_xticks(list(np.asarray(list(range(0, matrix.shape[1]))) + 0.5))
    ax_mat.set_yticks(list(np.asarray(list(range(0, matrix.shape[0]))) + 0.5))
    ax_mat.set_xticklabels(colnames[idx_col], rotation=90, size=4)
    ax_mat.set_yticklabels(revlist, size=4)
    ax_mat.grid(False)

    # Plot colorbar (once; the original created the same axes and
    # colorbar twice on top of each other).
    axcolor = fig.add_axes([0.94, bot, 0.02, mat_h])
    plt.colorbar(im, cax=axcolor)
    plt.savefig("Dec_corr_MASKING_sort1_" + method + ".png",
                bbox_inches='tight',
                dpi=600)
    if cls_info:
        cls_dic = {}
        if col_cls:
            cls_dic['col_ind'] = den_col['leaves']
            cls_dic['col_cls'] = scipy.cluster.hierarchy.fcluster(
                g_col, t=thrs_col, criterion='distance')
        if row_cls:
            cls_dic['row_ind'] = den_row['leaves']
            cls_dic['row_cls'] = scipy.cluster.hierarchy.fcluster(
                g_row, t=thrs_row, criterion='distance')
        return cls_dic
Пример #15
0
def heatmap(Mat, label, bool_sort, filename, threshold, heatmap_threshold,
            make_histograms=False):
    """Draw a (optionally clustered) heatmap of ``Mat`` and save it to ``filename``.

    Parameters
    ----------
    Mat : matrix-like
        Pairwise matrix. It is handed to ``mat_to_vector`` (clustered mode) or
        ``mat_to_mat`` (plain mode), both defined elsewhere in this file.
        NOTE(review): the axis limits use ``n`` for both axes, so the matrix
        is presumably square -- confirm against callers.
    label : list of str
        One label per system; used for tick labels and the printed summary.
        The list is not modified.
    bool_sort : bool
        If true, run single-linkage clustering, draw a dendrogram on the left
        and on top, and reorder the matrix by the dendrogram leaf order.
    filename : str
        Output path of the heatmap image (saved at 600 dpi).
    threshold : float
        Distance at which the linkage tree is cut (``fcluster`` with the
        ``'distance'`` criterion) and at which the dashed cut line is drawn.
    heatmap_threshold : float
        Value that is rendered white in the red-white-blue colormap. The
        colour limits are fixed to 0.5..3.0 and the white point must fall
        between 10% and 90% of that range (i.e. 0.75..2.75).
    make_histograms : bool, optional
        If true (and ``bool_sort`` is true) also write the two per-cluster
        histogram figures. Defaults to ``False``, which matches the previous
        behaviour where that code sat behind an unconditional ``return``.

    Notes
    -----
    Prints progress information to stdout, changes scipy's global link colour
    palette, and calls ``exit()`` when ``heatmap_threshold`` is out of range.
    """
    m = len(Mat)
    n = len(Mat[0])
    print("%s %s" % (m, n))

    # Private copies: the caller's ``label`` list must never be mutated.
    xlabel = list(label)
    ylabel = list(label)

    fig = pylab.figure(figsize=(8, 8))

    if bool_sort:
        Mat, Matvec = mat_to_vector(Mat)
        Mat_copy = copy.copy(Mat)  # unsorted copy, needed for the histograms
        # Single linkage; thresholds of roughly 0.5-1.0 were noted as working
        # well for it (1.5 for 'average', 2.0-3.0 for 'complete').
        Y = sch.linkage(Matvec, method='single')
        clusters = sch.fcluster(Y, threshold, 'distance')
        print(clusters)
        for name, cluster_id in zip(label, clusters):
            print(name + " " + str(cluster_id))

        # Left dendrogram.
        ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.6])
        sch.set_link_color_palette(['k', 'k', 'k', 'k', 'c', 'm', 'g'])
        Z1 = sch.dendrogram(Y, orientation='right', color_threshold=threshold)
        # Dashed line where the dendrogram is cut.
        matplotlib.pyplot.plot([threshold, threshold], [0, 10 * len(label)],
                               'k--')
        ax1.set_xticks([])
        ax1.set_yticks([])

        # Top dendrogram.
        ax2 = fig.add_axes([0.3, 0.71, 0.6, 0.2])
        Z2 = sch.dendrogram(Y, color_threshold=threshold)
        matplotlib.pyplot.plot([0, 10 * len(label)], [threshold, threshold],
                               'k--')
        ax2.set_xticks([])
        ax2.set_yticks([])

        # Reorder the matrix to follow the dendrogram leaves.
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])
        idx1 = Z1['leaves']
        idx2 = Z2['leaves']
        print("#### index ")
        for i in idx1:
            print(i)
        print("####")
        Mat = Mat[idx1, :]
        Mat = Mat[:, idx2]
        xlabel = [label[k] for k in idx2]
        clusters_new = [clusters[k] for k in idx2]

        # Collect the members of every cluster as one space-separated string.
        cluster_dic = {}
        print("systems sorted:")
        for name, cluster_id in zip(xlabel, clusters_new):
            print(name + " " + str(cluster_id))
            if cluster_id in cluster_dic:
                cluster_dic[cluster_id] = cluster_dic[cluster_id] + " " + name
            else:
                cluster_dic[cluster_id] = name
        for key in cluster_dic:
            print("cluster " + str(key) + ":" + cluster_dic[key])
    else:
        Mat = mat_to_mat(Mat)
        axmatrix = fig.add_axes([0.3, 0.1, 0.6, 0.6])

    # Red - white - blue colormap; ``mp`` is the fraction of the colour range
    # at which white appears, ``tol`` the half-width of the pale band around it.
    cmin = 0.5
    cmax = 3.0
    mp = (heatmap_threshold - cmin) / (cmax - cmin)
    tol = 0.02
    if mp > 0.9 or mp < 0.1:
        print("heatmap_threshold = " + str(heatmap_threshold) +
              " is too high or low")
        exit()

    cdict = {
        'red': [(0.0, 1.0, 1.0), (mp - tol, 1.0, 1.0), (mp, 1.0, 1.0),
                (mp + tol, 0.7, 0.7), (1.0, 0.0, 0.0)],
        'green': [(0.0, 0.0, 0.0), (mp - tol, 0.7, 0.7), (mp, 1.0, 1.0),
                  (mp + tol, 0.7, 0.7), (1.0, 0.0, 0.0)],
        'blue': [(0.0, 0.0, 0.0), (mp - tol, 0.7, 0.7), (mp, 1.0, 1.0),
                 (mp + tol, 1.0, 1.0), (1.0, 1.0, 1.0)]
    }
    my_cmap = matplotlib.colors.LinearSegmentedColormap(
        'my_colormap', cdict, 100)

    im = axmatrix.imshow(Mat,
                         aspect='auto',
                         origin='lower',
                         interpolation='nearest',
                         cmap=my_cmap)

    if bool_sort:
        # Mark the diagonal with small yellow dots.
        v = list(range(n))
        axmatrix.plot(v, v, 'yo', markersize=2)

    im.set_clim(cmin, cmax)
    axmatrix.set_xlim(-0.5, n - 0.5)
    axmatrix.set_ylim(-0.5, n - 0.5)
    axmatrix.set_xticks(list(range(m)))
    axmatrix.set_xticklabels(xlabel)

    if not bool_sort:
        axmatrix.set_yticks(list(range(n)))
        axmatrix.set_yticklabels(ylabel)
        for tick_label in axmatrix.get_yticklabels():
            tick_label.set_fontsize(3)
    else:
        # The left dendrogram takes the place of the y labels.
        axmatrix.set_yticks([])

    for tick_label in axmatrix.get_xticklabels():
        tick_label.set_fontsize(3)
        tick_label.set_rotation('vertical')

    # Plot colorbar.
    axcolor = fig.add_axes([0.91, 0.1, 0.02, 0.6])
    pylab.colorbar(im, cax=axcolor)
    fig.show()
    fig.savefig(filename, dpi=600)

    # The histograms need the cluster assignment, so they only exist in
    # clustered mode.
    if make_histograms and bool_sort:
        _plot_cluster_histograms(Mat_copy, clusters, xlabel, filename)
    return


def _plot_cluster_histograms(matrix, clusters, xlabel, filename):
    """Histogram the matrix entries within and between three chosen clusters.

    Writes ``"single_hist1_" + filename`` (six separate histograms) and
    ``"single_hist2_" + filename`` (the six densities overlaid as lines).

    The three cluster ids below were read off one particular heatmap
    (the original note says: threshold 2.0, non-singleton clusters); they
    must be adapted for any other data set.
    """
    clustnum1 = 8  # closed
    clustnum2 = 7  # intermediate
    clustnum3 = 13  # open
    cluster_ids = (clustnum1, clustnum2, clustnum3)
    clustname = ["closed", "intermediate", "open"]

    # Order: 1_1, 2_2, 3_3, 1_2, 1_3, 2_3 (indices into cluster_ids).
    pair_order = [(0, 0), (1, 1), (2, 2), (0, 1), (0, 2), (1, 2)]
    pair_keys = [frozenset((cluster_ids[a], cluster_ids[b]))
                 for a, b in pair_order]
    values = dict((key, []) for key in pair_keys)

    print("Number of sytems = " + str(len(xlabel)))

    # Upper triangle (diagonal included); a pair is unordered, hence frozenset.
    for i in range(len(xlabel)):
        for j in range(i, len(xlabel)):
            key = frozenset((clusters[i], clusters[j]))
            if key in values:
                values[key].append(matrix[i, j])

    inbins = numpy.linspace(0, 4, 50)  # 50 edges -> 49 bins
    pbins = numpy.linspace(0.05, 3.95, 49)  # one x position per bin
    positions = [[0.1, 0.1, 0.3, 0.1], [0.1, 0.3, 0.3, 0.1],
                 [0.1, 0.5, 0.3, 0.1], [0.5, 0.1, 0.3, 0.1],
                 [0.5, 0.3, 0.3, 0.1], [0.5, 0.5, 0.3, 0.1]]

    fig = pylab.figure(figsize=(8, 8))
    densities = []
    for key, position in zip(pair_keys, positions):
        axis = fig.add_axes(position)
        # ``density=True`` is the replacement for the removed ``normed=1``.
        counts = axis.hist(array_to_vector(values[key]),
                           inbins,
                           density=True,
                           histtype='bar')[0]
        densities.append(counts)
        axis.set_xlim(0.0, 4.0)
        axis.set_ylim(0.0, 5.0)
    fig.show()
    fig.savefig("single_hist1_" + filename, dpi=600)

    fig = pylab.figure(figsize=(8, 8))
    fig.add_axes([0.3, 0.1, 0.6, 0.6])
    styles = ['m-', 'c-', 'g-', 'r-', 'b-', 'y-']
    lines = [pylab.plot(pbins, counts, style)[0]
             for counts, style in zip(densities, styles)]
    names = [clustname[a] + '_' + clustname[b] for a, b in pair_order]
    pylab.legend(lines, names, bbox_to_anchor=(0., 1.02, 1., .102), loc=3)

    pylab.xlabel("RMSD (angstroms)")
    pylab.ylabel("Normlized Count")
    fig.show()
    fig.savefig("single_hist2_" + filename, dpi=600)
Пример #16
0
# Save the scaled/imputed feature matrix, with the 'ssid' column of
# analysis_data attached, as a CSV file under save_dir.
data_scaled_imputed_df = pd.DataFrame(data_scaled_imputed,
                                      columns=SDoH_COLS_NEW)
data_scaled_imputed_df['ssid'] = analysis_data['ssid'].values
data_scaled_imputed_df.to_csv(save_dir + 'dev_data_SDoH.csv', index=False)

## ------------ clustering ---------------- #
# presumably sklearn's euclidean_distances (square row-by-row matrix) -- the
# import is outside this fragment, confirm.
dist_mtx = euclidean_distances(data_scaled_imputed)
#dist_mtx = euclidean_distances(X_pca)
# squareform() converts the square matrix to the condensed vector that
# hc.linkage() expects; checks=False skips squareform's input validation.
# NOTE(review): the name `linkage` would shadow scipy's linkage function if
# that was imported unqualified elsewhere in the file.
linkage = hc.linkage(sp.distance.squareform(dist_mtx, checks=False),
                     method='ward')
# The same tree is used to order both the rows and the columns.
ns_plot = sns.clustermap(dist_mtx, row_linkage=linkage, col_linkage=linkage)
plt.savefig(save_dir + '_SDoH_clustergram.png', dpi=300)
plt.close()

# Stand-alone dendrogram: links below color_threshold take colours from the
# palette set here, links above it are drawn in grey ('#808080').
plt.figure(figsize=[8, 6])
hc.set_link_color_palette(['#330066', '#7F00FF', '#CC99FF', 'k'])
d_plot = hc.dendrogram(linkage,
                       orientation='top',
                       color_threshold=100,
                       above_threshold_color='#808080')
plt.savefig(save_dir + '_SDoH_dendrogram.pdf')
plt.close()

# Cut the tree into at most C flat clusters.
C = 3
labels = fcluster(linkage, C, criterion='maxclust')

# rename labels
clust_label_map = {
    1: 1,
    2: 2,
    3: 3,
Пример #17
0
                    level=logging.INFO)

import matplotlib
matplotlib.use('Agg') # Must be before importing matplotlib.pyplot or pylab!

import matplotlib.pyplot as plt

import matplotlib.gridspec as gridspec
import seaborn as sb
matplotlib.rcParams['lines.linewidth'] = 0.8
from matplotlib.colors import rgb2hex
from scipy.cluster.hierarchy import linkage, dendrogram, set_link_color_palette

sb.set_palette('Set1', 10, 0.80)
palette = sb.color_palette()
# set_link_color_palette() rejects anything that is not a list or tuple, and
# on Python 3 map() returns a lazy iterator, so build a real list of hex
# colour strings.
set_link_color_palette([rgb2hex(color) for color in palette])

from verification.verification import Verification
from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score
from verification.evaluation import rank_predict
from verification.plotting import draw_tree
from verification.preprocessing import prepare_corpus, Dataset
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd

# select a data set
dev = "../data/caesar_dev"
test = "../data/caesar_test"

# we prepare the corpus
Пример #18
0
    def create_dendrogram(self, metric, rank, excel_path):
        """Cluster the tool columns of one metric sheet and save a dendrogram.

        Args:
            metric: Name of the Excel sheet to read. The special value
                "Precall" instead combines the "Precision" and "Recall"
                sheets into their harmonic mean, 2*p*r / (p+r).
            rank: Value of the 'rank' column to keep; '' keeps every row.
            excel_path: Path of the workbook (read with the openpyxl engine).

        Returns:
            None. When more than one non-empty tool column remains, a PNG
            named after the metric and rank is written to
            ``self.output_path``; otherwise nothing is plotted.
        """

        def read_sheet(sheet):
            return pd.read_excel(excel_path,
                                 sheet_name=sheet,
                                 engine='openpyxl')

        if metric != "Precall":
            df = read_sheet(metric)
        else:
            # Missing precision/recall values are treated as 1 before combining.
            precision_df = read_sheet("Precision").fillna(1)
            recall_df = read_sheet("Recall").fillna(1)
            # The first three columns are carried over unchanged; the
            # remaining columns are combined element-wise.
            prec = precision_df.iloc[:, 3:]
            rec = recall_df.iloc[:, 3:]
            # p + r == 0 yields NaN, which is mapped to 0.
            harmonic = ((prec * rec * 2) / (prec + rec)).fillna(0)
            df = pd.concat([precision_df.iloc[:, :3], harmonic], axis=1)

        selected = df if rank == '' else df[df['rank'] == rank]

        # Every column that is not metadata is a tool; tools whose values sum
        # to zero are left out.
        metadata_columns = ('Tax ID', 'rank', 'name', 'Aggregate')
        profiles = []
        names = []
        for column in selected.columns:
            if column in metadata_columns:
                continue
            series = selected[column]
            if np.sum(series) == 0:
                continue
            profiles.append(series.tolist())
            names.append(column.split('.')[0])
        tool_array = np.array(profiles)

        # A dendrogram needs at least two tools.
        if len(tool_array) <= 1:
            return

        matplotlib.rcParams['lines.linewidth'] = 3
        distances = distance.pdist(tool_array, 'braycurtis')
        tree = linkage(distances, 'average')
        set_link_color_palette(['y', 'c', 'g', 'm', 'r'])

        plt.figure(figsize=[20.4, 10.4], dpi=480)
        title = metric + ": " + rank.capitalize() + "-Dendrogram"
        plt.suptitle(title, size=36, weight='semibold')
        dendrogram(tree, orientation='right', labels=names)

        plt.xlim(-0.05, 1.05)
        plt.xlabel("Bray Curtis Distance",
                   fontsize=20,
                   weight='semibold',
                   labelpad=15)
        plt.ylabel("Tools", fontsize=20, weight='semibold', labelpad=30)
        plt.tick_params(labelsize=16, labelcolor='#00213E')

        # "Metric: Rank-Dendrogram" -> "Metric-Rank-Dendrogram.png"
        filename = title.replace(": ", "-").replace(" ", "_") + '.png'
        plt.savefig(os.path.join(self.output_path, filename),
                    dpi=480,
                    facecolor='#F5FFFF',
                    transparent=False,
                    bbox_inches='tight')
        plt.close()
        print("\n{} has been saved.".format(filename))
        # TODO: add an argument to create subplots grouped by metric or rank
        # (subplot='none'; 'metric'; 'rank')
        return
ax = axi.get_axes()
clean_axis(ax)
plt.show()

# Pairwise distances for rows. linkage() must be given the *condensed*
# vector returned by pdist(): a square 2-D array would be interpreted as a
# set of observation vectors and clustered by the Euclidean distance between
# its rows instead of by the distances it contains.
row_condensed_dists = distance.pdist(core_df, similarity)
pairwise_dists = distance.squareform(row_condensed_dists)
# cluster
row_clusters = sch.linkage(row_condensed_dists, method="complete")

# Pairwise distances for columns (same treatment on the transposed data).
col_condensed_dists = distance.pdist(core_df.T, similarity)
col_pairwise_dists = distance.squareform(col_condensed_dists)
# cluster
col_clusters = sch.linkage(col_condensed_dists, method="complete")

# make dendrograms black rather than letting scipy color them
sch.set_link_color_palette(["black"])

# plot the results: a 2x2 grid whose small first row/column hold the
# dendrograms (only the two dendrogram cells are filled in this part)
fig = plt.figure(figsize=figure_size)
# fig.suptitle(os.path.split(input_file_path)[1])
heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0, width_ratios=[0.25, 1], height_ratios=[0.25, 1])

### col dendrogram ####
col_denAX = fig.add_subplot(heatmapGS[0, 1])
col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf)
clean_axis(col_denAX)

### row dendrogram ###
row_denAX = fig.add_subplot(heatmapGS[1, 0])
row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf, orientation="right")
clean_axis(row_denAX)
Пример #20
0
# squareform() converts between the condensed and the square form of a
# distance matrix. NOTE(review): D is also passed straight to sch.linkage()
# below, so it is presumably the condensed form and Dm the square one --
# confirm where D is built.
Dm = squareform(D)

# Dendrogram
fig = plt.figure(figsize=(18, 6))
plt.style.use('seaborn-whitegrid')
# One row, three columns with no horizontal gap.
G = gridspec.GridSpec(1,
                      3,
                      wspace=0,
                      hspace=0.1,
                      top=0.86,
                      bottom=0.08,
                      left=0.14,
                      right=0.9)
# The dendrogram spans all but the last grid column.
ax0 = plt.subplot(G[0, :-1])
Z = sch.linkage(D, method='complete')
sch.set_link_color_palette(['r', 'b', 'g', 'm', 'y', 'c'])
# Links below a distance of 110 take palette colours; links above are black.
dn = sch.dendrogram(
    Z,
    orientation='left',
    distance_sort='descending',
    no_labels=True,
    above_threshold_color='k',
    color_threshold=110,
)
#plt.xticks(np.arange(0, 1500, 100))
plt.margins(0.5, 0.1)
plt.title('Dendrogram', fontsize=20)
plt.xlabel('Euclidean distance', fontsize=14)
# Vertical line at the same value as color_threshold above; keep the two in
# sync when changing the cut.
plt.axvline(x=110)
# Generate heatmap
Пример #21
0
if __name__ == '__main__':
    # Command-line arguments:
    #argv[1]: cctable.dat from Kamo outputs
    #argv[2]: file contains coordinates of a specific residue of all structures
    #argv[3]: CLUSTERS.txt from Kamo outputs
    #argv[4]: height cutoff

    # get_cc_matrix / extract_coordinate are defined elsewhere in the file.
    matrix = get_cc_matrix(sys.argv[1])
    z = sch.linkage(matrix, method='ward')

    rs = extract_coordinate(sys.argv[2])

    # Dendrogram figure; links below the height cutoff (argv[4]) are coloured
    # from the palette set here.
    fig_dendro = plt.figure(figsize=(80, 50))
    plt.rc('ytick', labelsize=20)
    plt.ylabel('Height', fontsize=20)
    sch.set_link_color_palette(['g', 'r', 'c', 'm', 'y'])
    d = sch.dendrogram(z, color_threshold=float(sys.argv[4]))
    color_cluster = d['color_list']
    ivl = d['ivl']  # leaf labels in plotting order
    print(ivl)
    # Second figure: 3-D axes for the residue coordinates.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    # Convert the first three fields of every record to float, in place.
    for i in range(len(rs)):
        rs[i][0] = float(rs[i][0])
        rs[i][1] = float(rs[i][1])
        rs[i][2] = float(rs[i][2])
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    # The number of k-means clusters is fixed at 5.
    kmeans = KMeans(n_clusters=5)
    kmeans.fit(rs)
Пример #22
0
import matplotlib

matplotlib.use('Agg')  # Must be before importing matplotlib.pyplot or pylab!

import matplotlib.pyplot as plt

import matplotlib.gridspec as gridspec
import seaborn as sb

matplotlib.rcParams['lines.linewidth'] = 0.8
from matplotlib.colors import rgb2hex
from scipy.cluster.hierarchy import linkage, dendrogram, set_link_color_palette

sb.set_palette('Set1', 10, 0.80)
palette = sb.color_palette()
# set_link_color_palette() rejects anything that is not a list or tuple, and
# on Python 3 map() returns a lazy iterator, so build a real list of hex
# colour strings.
set_link_color_palette([rgb2hex(color) for color in palette])

from verification.verification import Verification
from verification.evaluation import evaluate, evaluate_with_threshold, average_precision_score
from verification.evaluation import rank_predict
from verification.plotting import draw_tree
from verification.preprocessing import prepare_corpus, Dataset
from sklearn.cross_validation import train_test_split
import numpy as np
import pandas as pd

# select a data set
dev = "../data/caesar_dev"
test = "../data/caesar_test"

# we prepare the corpus
Пример #23
0
# convert to boolean (True where the value is positive)
kobool = df > 0
# `distance` and `method` are names defined outside this fragment.
kojacc = spatial.distance.pdist(kobool, metric=distance)
Z = hierarchy.linkage(kojacc, method=method, optimal_ordering=True)
# Column 2 of a linkage matrix holds the merge distances.
maxdist = np.max(Z[:, 2])

# clustering (segmentation): cut the tree at a fraction (args.cutoff) of the
# largest merge distance
clust = hierarchy.fcluster(Z, maxdist * args.cutoff, criterion='distance')
n_clusts = len(np.unique(clust))  # number of clusters

# dendrograms
# colors for clusters: one evenly spaced colour of the jet colormap each
# TODO: Add an option for selecting colormap
dend_colors = cm.jet(np.linspace(0, 1, n_clusts))
hexcolors = [mpl.colors.rgb2hex(rgb[:3]) for rgb in dend_colors]
hierarchy.set_link_color_palette(hexcolors)

# Plot
plt.figure(figsize=(15, 7))
# Same cut height as fcluster() above, so link colours follow the clusters.
dend = hierarchy.dendrogram(Z,
                            color_threshold=maxdist * args.cutoff,
                            no_labels=True)
# Dashed line marking the cut height.
plt.axhline(maxdist * args.cutoff, ls='--', alpha=0.3, c='k')
plt.tight_layout()

# legend and colors
# NOTE(review): labels assume the i-th palette colour is the one drawn for
# "Cluster i+1" -- confirm against the dendrogram's colour assignment.
if legend:
    legend_elements = []
    for i, rgb in enumerate(dend_colors):
        label = f"Cluster {i+1}"
        element = Line2D([0], [0], color=rgb, label=label, lw=3)
Пример #24
0
def main():
    """Draw a doubly-clustered heatmap of a tab-separated table and save it.

    Reads ``args.filename`` (tab separated, first two columns form the row
    index), drops columns that sum to zero, clusters rows and columns with
    average linkage on Bray-Curtis distances, and writes
    ``heatmap.<args.im_format>`` at ``args.im_res`` dpi. The figure contains
    a row dendrogram, a column dendrogram, the reordered matrix, a colorbar,
    a colour strip keyed on the first index level and a legend for it.

    Side effects: changes ``matplotlib.rcParams['lines.linewidth']`` and
    scipy's global link colour palette.
    """
    args = argparser()
    matplotlib.rcParams['lines.linewidth'] = 0.4
    # load data
    print('[INFO] Loading data')
    data = pd.read_csv(args.filename, sep='\t', index_col=[0, 1])

    # TODO: optional min-max scaling, e.g.
    #   data_scaling = (data.T - data.T.min())/(data.T.max() - data.T.min())
    #   data = data_scaling.T
    # TODO: handle all-zero rows as well, or warn the user about them.
    # Columns that sum to zero are removed (presumably because an all-zero
    # profile has no defined Bray-Curtis distance to another one).
    print("[WARN] Fixing all 0s columns")
    data = data.loc[:, data.sum() != 0]

    # Distances -- TODO: expose metric and linkage method as options.
    d_metric = 'braycurtis'
    linkage_m = 'average'
    # Rows
    metadist = sch.distance.pdist(data, metric=d_metric)
    metalink = sch.linkage(metadist, method=linkage_m)
    # Clamp negative entries to 0; the upper bound lies above the maximum,
    # so nothing is clipped from above.
    metalink = metalink.clip(0, metalink.max() + 1)
    # Columns
    profdist = sch.distance.pdist(data.T, metric=d_metric)
    proflink = sch.linkage(profdist, method=linkage_m)
    proflink = proflink.clip(0, proflink.max() + 1)

    ############
    # Plotting #
    ############
    print('[INFO] Plotting ...')
    # - Figure setup
    xf = 6.7
    yf = 8.6
    fig = plt.figure(figsize=(xf, yf))
    # Axes positions as [left, bottom, width, height] in figure fractions;
    # this layout leaves room below the matrix for column labels.
    posm = [0.01, 0.23, 0.2, 0.62]
    posp = [0.24, 0.855, 0.67, 0.14]
    posmat = [0.24, 0.23, 0.67, 0.62]
    posm_colors = [0.215, 0.23, 0.02, 0.62]
    poscbar = [0.92, 0.23, 0.015, 0.30]
    poslegend = [0.01, 0.84, 0.23, 0.15]

    # colors for dendrograms
    sch.set_link_color_palette([
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
        '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ])

    # - rows dendrogram (links coloured below 20% of the largest distance)
    meta_ax = fig.add_axes(posm, frameon=False)
    metadend = sch.dendrogram(metalink,
                              color_threshold=0.2 * metalink[:, 2].max(),
                              orientation='left')
    meta_ax.set_xticks([])
    meta_ax.set_yticks([])

    # - columns dendrogram
    prof_ax = fig.add_axes(posp, frameon=False)
    profdend = sch.dendrogram(proflink,
                              color_threshold=0.2 * proflink[:, 2].max(),
                              orientation='top')
    prof_ax.set_xticks([])
    prof_ax.set_yticks([])

    # - Matrix - HEATMAP, reordered to follow both dendrograms
    matrix_ax = fig.add_axes(posmat)
    # DataFrame.get_values() no longer exists in current pandas; .values is
    # available in every version.
    mat = data.values
    mmask = metadend['leaves']
    pmask = profdend['leaves']
    mat = mat[mmask, :]
    mat = mat[:, pmask]
    im = matrix_ax.matshow(mat, aspect='auto', origin='lower', cmap='viridis')
    # Labels: only the columns are labelled (showing row labels would need
    # different axes positions).
    matrix_ax.set_yticks([])
    matrix_ax.set_xticks(range(len(data.columns)))
    matrix_ax.set_xticklabels(data.columns[pmask],
                              rotation=90,
                              fontsize=5,
                              color='k')
    matrix_ax.xaxis.set_ticks_position('bottom')

    # - Colorbar
    colorbar_ax = fig.add_axes(poscbar)
    cb = plt.colorbar(im, cax=colorbar_ax)
    cb.set_label('Completeness', fontsize='x-small')
    cb.ax.tick_params(labelsize='xx-small')

    # - Color code
    # One tab20 colour per distinct value of the first index level; the
    # values are sorted by the integer that follows their first character.
    general_index = sorted(pd.MultiIndex.to_frame(data.index)[0].unique(),
                           key=lambda x: int(x[1:]))
    n_groups = len(general_index)
    color_as = {
        name: plt.cm.tab20(i / n_groups)
        for i, name in enumerate(general_index)
    }
    # color vector, in the same order as the rows of the heatmap
    color_vec = np.array([color_as[i[0]] for i in data.index])
    color_vec = color_vec[mmask]
    color_ax = fig.add_axes(posm_colors, frameon=False)
    # One horizontal bar of length 1 and thickness 1 per row.
    bar_positions = np.arange(len(color_vec))
    bar_lengths = np.ones(len(color_vec))
    bar_thickness = 1
    metabars = color_ax.barh(bar_positions,
                             bar_lengths,
                             bar_thickness,
                             color=color_vec,
                             edgecolor=color_vec)
    color_ax.set_xticks([])
    color_ax.set_yticks([])
    color_ax.set_ylim((0, len(color_vec)))

    # - Legend
    legend_ax = fig.add_axes(poslegend, frameon=False)
    legend_ax.set_xticks([])
    legend_ax.set_yticks([])
    patches = [
        mpatches.Patch(color=color_, label=name)
        for name, color_ in color_as.items()
    ]
    plt.legend(handles=patches,
               fancybox=True,
               fontsize='xx-small',
               loc=2,
               framealpha=0.75)
    # - Save Figure (plt.show() is intentionally not called)
    figname = 'heatmap.{}'.format(args.im_format)
    fig.savefig(figname, dpi=args.im_res)
# Scatter plot of the two embedding coordinates returned by the reducer
# (umpa_reducer is created outside this fragment).
X_umap = umpa_reducer.fit_transform(val_study_data[CLUSTERING_COLS])
#X_umap = umpa_reducer.fit_transform(X_pca)
plt.scatter(X_umap[:, 0], X_umap[:, 1], s=1, alpha=0.5)
plt.show()
plt.close()

## ------------ clustering ---------------- #
# presumably sklearn's euclidean_distances (square row-by-row matrix) -- the
# import is outside this fragment, confirm.
dist_mtx = euclidean_distances(val_study_data[CLUSTERING_COLS].values)
#dist_mtx = euclidean_distances(X_pca)
# squareform() converts the square matrix to the condensed vector that
# hc.linkage() expects; checks=False skips squareform's input validation.
linkage = hc.linkage(sp.distance.squareform(dist_mtx, checks=False), method='ward')
# The same tree is used to order both the rows and the columns.
ns_plot = sns.clustermap(dist_mtx, row_linkage=linkage, col_linkage=linkage)
# NOTE(review): the output paths are built with hard-coded '\\' separators.
plt.savefig(MAIN_DIR + '\\Results\\' + OUTPUT_FOLDER + '\\clustergram.png', dpi=300)
plt.close()

# Stand-alone dendrogram: links below a distance of 60 take colours from the
# palette set here, links above it are drawn in grey ('#808080').
plt.figure(figsize=[8, 6])
hc.set_link_color_palette([ '#CE4257', '#F9C77E', '#79A3D9', '#7B967A']) 
d_plot = hc.dendrogram(linkage, orientation='top', color_threshold=60, above_threshold_color='#808080')
plt.savefig(MAIN_DIR + '\\Results\\' + OUTPUT_FOLDER + '\\dendrogram.pdf', dpi=300)
plt.close()



# Cut the tree into at most C flat clusters.
C = 4
labels = fcluster(linkage, C, criterion='maxclust')
# Plot colour per cluster label; the four colours are the same ones used in
# the dendrogram palette above.
lable_color = {1:'#79A3D9', 2:'#7B967A', 3:'#F9C77E', 4:'#CE4257'}
               
# Display name per cluster label.
lable_annotation = {1:'Subphenotype I',
                    2:'Subphenotype II',
                    3:'Subphenotype III',
                    4:'Subphenotype IV',
                    }             
Пример #26
0
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import set_link_color_palette
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Draw every dendrogram link in black.
set_link_color_palette(["black"])
pd.set_option('display.max_columns', 500)

# Reproducible toy data: 5 samples x 3 features, uniform in [0, 10).
np.random.seed(123)
variables = ["X", "Y", "Z"]
labels = ["ID_0", "ID_1", "ID_2", "ID_3", "ID_4"]
X = np.random.random_sample([5, 3]) * 10
df = pd.DataFrame(X, columns=variables, index=labels)

# The raw observation matrix is passed in, so linkage() computes the
# Euclidean distances itself.
row_clusters = linkage(df.values, method="complete", metric="euclidean")

# color_threshold=np.inf puts every link below the threshold, so all of them
# take the (single, black) palette colour.
row_dendr = dendrogram(row_clusters,
                       labels=np.asarray(labels),
                       color_threshold=np.inf)

plt.tight_layout()
plt.ylabel("Euclidean Distance")
plt.show()
Пример #27
0
def make_heatmap_png(data_matrix, colLabel_list, rowLabel_list,
                     isRowClustering, isColClustering, ratio, fontSize, outputPATH):
    """Draw a heat map of ``data_matrix``, optionally clustered, and save it as PNG.

    Parameters
    ----------
    data_matrix : 2-D array-like
        Numeric values, rows x columns.
    colLabel_list : sequence
        Column labels, indexed by original column position.
    rowLabel_list : sequence
        Row labels, indexed by original row position. Pass ``[]`` to leave the
        row tick labels untouched.
    isRowClustering, isColClustering : bool
        Whether to cluster (average linkage, Euclidean distance) and reorder
        the rows / columns and draw the corresponding dendrogram.
    ratio : float
        Figure height/width ratio. ``-1`` derives it from the label counts.
    fontSize : float
        Font size of the y tick labels. ``-1`` selects the default of 36.
    outputPATH : str
        If it ends with ``'/'`` the image is written to
        ``outputPATH + "heatmap.png"``, otherwise to ``outputPATH + ".png"``.

    Notes
    -----
    The scipy link colour palette is set to black as a module-wide side effect
    whenever clustering is requested, and the figure is left open after saving.
    """
    data_matrix = numpy.array(data_matrix)
    rowOrder    = list(range(data_matrix.shape[0]))
    colOrder    = list(range(data_matrix.shape[1]))
    fig         = plt.figure()
    if ratio == -1:
        figRatio    = len(rowLabel_list)/float(len(colLabel_list))
    else:
        figRatio    = ratio
    if figRatio >= 1:
        figWidth    = 50.0/figRatio
    else:
        figWidth    = 50
    figHeight    = figWidth*figRatio
    fig.set_size_inches(figWidth,figHeight)
    heatmapGS   = gridspec.GridSpec(2,2,wspace=0.0,hspace=0.0,width_ratios=[10,figWidth],height_ratios=[10,figHeight])

    #clustering
    clusterMethod       = 'average'
    if isRowClustering:
        #row clustering and dendrogram
        rowDendro_ax        = fig.add_subplot(heatmapGS[1,0])
        # linkage() must receive the *condensed* distance vector: a square
        # matrix would be interpreted as a set of observation vectors.
        rowPairwiseDist     = dist.pdist(data_matrix, 'euclidean')
        rowCluster          = sch.linkage(rowPairwiseDist, method=clusterMethod)
        sch.set_link_color_palette(['black'])
        row_dendro          = sch.dendrogram(rowCluster, color_threshold=numpy.inf,
                                             orientation='right', ax=rowDendro_ax)
        rowOrder            = row_dendro['leaves']
        clean_axis(rowDendro_ax)
        data_matrix = data_matrix[rowOrder, :]

    if isColClustering:
        #column clustering and dendrogram
        colDendro_ax        = fig.add_subplot(heatmapGS[0,1])
        colPairwiseDist     = dist.pdist(numpy.transpose(data_matrix), 'euclidean')
        colCluster          = sch.linkage(colPairwiseDist, method=clusterMethod)
        sch.set_link_color_palette(['black'])
        col_dendro          = sch.dendrogram(colCluster, color_threshold=numpy.inf,
                                             ax=colDendro_ax)
        colOrder            = col_dendro['leaves']
        clean_axis(colDendro_ax)
        data_matrix = data_matrix[:, colOrder]

    #depict heatmap
    ax          = fig.add_subplot(heatmapGS[1,1])
    heatmap     = ax.imshow(data_matrix, cmap=plt.cm.PuBuGn,interpolation='nearest',aspect='auto',origin='lower', alpha=1)
    clean_axis(ax)

    #tick and labels
    x_index     = numpy.arange(data_matrix.shape[1])
    y_index     = numpy.arange(data_matrix.shape[0])
    ax.yaxis.tick_left()
    ax.set_xticks(x_index, minor=False)
    ax.set_yticks(y_index, minor=False)
    ax.yaxis.set_ticks_position('right')
    ax.set_xticklabels([colLabel_list[i] for i in colOrder], rotation=90, minor=False)
    if rowLabel_list!=[]:
        ax.set_yticklabels([rowLabel_list[i] for i in rowOrder], minor=False)

    # The x label size is the y label size scaled by the row/column count ratio.
    ylabelsize = 36 if fontSize == -1 else fontSize
    xlabelsize = ylabelsize*len(rowOrder)/float(len(colOrder))
    if figRatio != -1:
        xlabelsize = xlabelsize/figRatio

    ax.tick_params(axis='x', labelsize=xlabelsize)
    ax.tick_params(axis='y', labelsize=ylabelsize)
    plt.setp(ax.get_xticklines()+ax.get_yticklines(), visible=False)

    #for colorbar
    scale_cbGSSS    = gridspec.GridSpecFromSubplotSpec(1,2,subplot_spec=heatmapGS[0,0],wspace=0.0,hspace=0.0)
    scale_cbAX      = fig.add_subplot(scale_cbGSSS[0,0])
    cBar            = fig.colorbar(heatmap, scale_cbAX, drawedges=False)
    cBar.ax.tick_params(labelsize=ylabelsize)
    cBar.outline.set_linewidth(0)
    cBar.ax.yaxis.set_ticks_position('left')
    plt.setp(cBar.ax.get_yticklines(), visible=False)

    # endswith() also copes with an empty outputPATH (no IndexError).
    if outputPATH.endswith('/'):
        fig.savefig(outputPATH+"heatmap.png", format='png')
    else:
        fig.savefig(outputPATH+".png", format='png')
# Fit an agglomerative clustering estimator with its default settings on `points`.
clustering = AgglomerativeClustering().fit(points)

# NOTE(review): this bare constructor call builds an estimator and immediately
# discards it; it looks like a pasted repr of a fitted model rather than
# intended code. Some keyword arguments (e.g. `pooling_func`) may not be
# accepted by the installed scikit-learn version -- confirm.
AgglomerativeClustering(affinity='euclidean',
                        compute_full_tree='auto',
                        connectivity=None,
                        distance_threshold=None,
                        linkage='single',
                        memory=None,
                        n_clusters=1,
                        pooling_func='deprecated')

# Three SciPy linkages of the same input `dc` (defined outside this fragment),
# each followed by a dendrogram and a window title.
# NOTE(review): no new figure is created between the three dendrogram() calls
# and the plt.show() calls are commented out, so within this fragment all
# three trees are drawn on the same current figure and only the last window
# title remains -- confirm whether separate figures were intended.
# NOTE(review): `fig.canvas.set_window_title` is deprecated in recent
# Matplotlib releases (`fig.canvas.manager.set_window_title` replaces it) --
# verify against the installed version.
S = hierarchy.linkage(dc, 'single')
sdn = hierarchy.dendrogram(S)
fig = plt.gcf()
fig.canvas.set_window_title('Single-Linkage Clustering')
#plt.show()

A = hierarchy.linkage(dc, 'average')
adn = hierarchy.dendrogram(A)
fig = plt.gcf()
fig.canvas.set_window_title('Average Linkage Clustering')
#plt.show()

C = hierarchy.linkage(dc, 'complete')
cdn = hierarchy.dendrogram(C)
fig = plt.gcf()
fig.canvas.set_window_title('Complete Linkage Clustering')
#plt.show()

hierarchy.set_link_color_palette(None)  # reset to default after use
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
import numpy as np

# A very basic example:

# `ytdist` is passed to linkage() as a 1-D array, i.e. as a condensed distance
# vector; its 15 entries correspond to the pairwise distances of 6 observations.
ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
                   400., 754., 564., 138., 219., 869., 669.])
Z = hierarchy.linkage(ytdist, 'single')
plt.figure()
dn = hierarchy.dendrogram(Z)

# Now plot in given axes, improve the color scheme and use both vertical and
# horizontal orientations:

# The palette is module-wide state in scipy.cluster.hierarchy, so it is set
# before the two plots and restored to the default right after them.
hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
fig, axes = plt.subplots(1, 2, figsize=(8, 3))
dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
                           orientation='top')
dn2 = hierarchy.dendrogram(Z, ax=axes[1], above_threshold_color='#bcbddc',
                           orientation='right')
hierarchy.set_link_color_palette(None)  # reset to default after use
plt.show()
Пример #30
0
# Standardize the features, then wrap the scaled values in a DataFrame whose
# column labels come from `index` (defined outside this fragment).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_new)
X_new=scaler.transform(X_new)
X_new=pd.DataFrame(X_new,columns=index)


# 2-component PCA projection. In this fragment `X_principal` is only used by
# commented-out code; the linkage below is computed on the scaled features.
pca = PCA(n_components = 2) 
X_principal = pca.fit_transform(X_new)
# Ward linkage over the samples (rows of X_new)
#Z = hierarchy.linkage(X_principal, 'ward')
Z = hierarchy.linkage(X_new, 'ward')

# Set the colour of the cluster here:
hierarchy.set_link_color_palette(['r', 'b'])

# Make the dendrogram and give the colour above threshold
hierarchy.dendrogram(Z, color_threshold=14, above_threshold_color='grey')

# Add horizontal line at the same height as the colour threshold.
plt.axhline(y=14, c='black', lw=2, linestyle='dashed')

#from scipy.cluster.hierarchy import fcluster
#d=shc.linkage(X_principal, method ='ward')

# Two-cluster agglomerative model; it is only constructed here, not fitted.
ac2 = AgglomerativeClustering(n_clusters = 2,compute_full_tree=True)

# Visualizing the clustering 
plt.figure(figsize =(6, 6)) 
Пример #31
0
def main():
    """
    Cluster distance matrix with scipy.cluster.hierarchy

    Reads a square distance matrix from the CSV file given by ``--input``,
    builds a Ward linkage, cuts it into at most 250 flat clusters, logs a few
    cluster-size statistics and shows the dendrograms.

    ``--label`` (optional) is a tab-separated file whose first column holds
    one id per matrix row; without it the row numbers are used as ids.
    ``--output`` (optional) receives one ``id<TAB>cluster`` line per row;
    without it no file is written.
    """
    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        required=True,
                        help='Location of input file'
                        ' that contains the distance matrix as csv')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        required=False,
                        help='Location of id-label mapping file')
    parser.add_argument('--output', '-o', required=False, help='output file')
    args = parser.parse_args()

    logger.info("loading matrix")
    matrix = np.loadtxt(args.input, delimiter=",")
    labels = None
    if args.label:
        # Context manager so the handle is closed even if parsing fails.
        with open(args.label, 'r') as label_file:
            labels = [
                line.rstrip('\n').split('\t')[0] for line in label_file
            ]

    logger.info("clustering")
    # squareform() converts the square matrix to the condensed form that
    # linkage() expects for distance input.
    Z = linkage(squareform(matrix), 'ward')

    logger.info("generating flat clusters")

    clusters = fcluster(Z, 250, 'maxclust')
    if labels is None:
        # --label is optional: fall back to the row number as identifier.
        labels = [str(i) for i in range(len(clusters))]
    elif len(labels) != len(clusters):
        # zip() below silently truncates to the shorter sequence.
        logger.warning("{} labels for {} matrix rows".format(
            len(labels), len(clusters)))

    cluster_map = {}
    for disease_id, cluster_id in zip(labels, clusters):
        cluster_map.setdefault(cluster_id, []).append(disease_id)

    # Output clusters
    if args.output:
        with open(args.output, 'w') as output:
            for disease_id, cluster_id in zip(labels, clusters):
                output.write("{}\t{}\n".format(disease_id, cluster_id))

    # Singletons
    sizes = [len(members) for members in cluster_map.values()]
    singleton_count = sum(1 for size in sizes if size == 1)
    logger.info("{} singletons".format(singleton_count))
    logger.info("Avg cluster size: {}".format(mean(sizes)))
    logger.info("median cluster size: {}".format(median(sizes)))

    # Draw dendrogram
    plt.figure()

    hierarchy.dendrogram(Z)

    hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
    _, axes = plt.subplots(1, 2, figsize=(8, 3))
    hierarchy.dendrogram(Z,
                         ax=axes[0],
                         above_threshold_color='y',
                         orientation='top')
    hierarchy.dendrogram(Z,
                         ax=axes[1],
                         above_threshold_color='#bcbddc',
                         orientation='right')
    hierarchy.set_link_color_palette(None)  # reset to default after use
    plt.show()
ax = fig.add_subplot(111)

cax = ax.matshow(probDf, interpolation='nearest', cmap='hot_r')
fig.colorbar(cax)

ax.set_xticklabels([''] + list(probDf.columns))
ax.set_yticklabels([''] + list(probDf.index))

plt.show()
'''

# Condensed Euclidean distances between the rows of probDf, computed once and
# reused for both the labelled square matrix and the linkage.
rowCondensedDist = pdist(probDf, metric='euclidean')

rowDist = pd.DataFrame(squareform(rowCondensedDist), columns=sortedRowNames, index=sortedRowNames)

rowClusters = linkage(rowCondensedDist, method='complete')

hierarchy.set_link_color_palette(['black'])

fig = plt.figure(figsize = (8,8))
axd = fig.add_axes([0.09,0.1,0.2,0.6])

# color_threshold=np.inf draws the whole tree with the (black) palette colour.
rowDendr = dendrogram(rowClusters, orientation = 'right', color_threshold = np.inf, ax = axd)
# 'leaves' holds integer row *positions*, so positional indexing (.iloc) is
# required; DataFrame.ix was removed from pandas.
dfRowClust = probDf.iloc[rowDendr['leaves'][::-1]]
print(rowDendr['leaves'])

# Hide ticks and frame of the dendrogram axes.
axd.set_xticks([])
axd.set_yticks([])

for i in axd.spines.values():
        i.set_visible(False)

Пример #33
0
    def plot_img_with_dendrograms(self, use_abs_cor=True):
        '''
        Plot an image or correlation matrix along with dendrograms
        Uses methods from:
        http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb

        The matrix is read from ``self.array``; row and column labels are
        taken from ``self.df.index`` and ``self.df.columns``. The column
        dendrogram uses centroid linkage and the row dendrogram single
        linkage, both computed on the rows of the matrix. The scipy link
        colour palette is set to black as a module-wide side effect.

        Parameters
        -----------
        use_abs_cor : {True, False}, optional
            Use the absolute values of correlation matrix

        '''
        import matplotlib.gridspec as gridspec
        import scipy.cluster.hierarchy as sch

        # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
        def clean_axis(ax):
            """Remove ticks, tick labels, and frame from axis"""
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])
            for sp in ax.spines.values():
                sp.set_visible(False)

        fig = plt.figure()
        heatmapGS = gridspec.GridSpec(2,
                                      2,
                                      wspace=0.0,
                                      hspace=0.0,
                                      width_ratios=[1, 0.25],
                                      height_ratios=[0.25, 1])
        # if/else so that D is always bound, also for truthy non-bool flags.
        if use_abs_cor:
            D = np.abs(self.array)
        else:
            D = self.array

        ## Col Dendrogram
        col_denAX = fig.add_subplot(heatmapGS[0, 0])
        clusters1 = sch.linkage(D, method='centroid')
        sch.set_link_color_palette(['black'])
        col_denD = sch.dendrogram(clusters1,
                                  labels=self.df.columns.values,
                                  orientation='top',
                                  color_threshold=np.inf,
                                  ax=col_denAX)
        clean_axis(col_denAX)

        ## Row Dendrogram
        row_denAX = fig.add_subplot(heatmapGS[1, 1])
        clusters2 = sch.linkage(D, method='single')
        row_denD = sch.dendrogram(clusters2,
                                  labels=self.df.index.values,
                                  orientation='left',
                                  color_threshold=np.inf,
                                  ax=row_denAX)
        clean_axis(row_denAX)

        # Heatmap: rows/columns reordered to follow the dendrogram leaves
        heatmapAX = fig.add_subplot(heatmapGS[1, 0])
        idx1 = row_denD['leaves']
        idx2 = col_denD['leaves']
        D_remap = D.copy()
        D_remap = D_remap[idx1, :]
        D_remap = D_remap[:, idx2]
        axi = heatmapAX.imshow(D_remap,
                               interpolation='nearest',
                               aspect='auto',
                               origin='lower',
                               vmin=0,
                               vmax=1)

        def _format_coord(x, y):
            """Status-bar text: matrix value plus row/column label under the cursor."""
            col = int(x + 0.5)
            row = int(y + 0.5)
            if row < 0 or col < 0:
                return ""
            try:
                # 'ivl' lists the leaf labels in plotted order, which is the
                # order of D_remap. The previous `.items()[0]` lookup fails
                # on Python 3, where dict views are not subscriptable.
                # NOTE(review): 'ivl' is assumed to be the intended entry.
                par_row = row_denD['ivl'][row]
                par_col = col_denD['ivl'][col]
                return "%.3f %s | %s" % (D_remap[row, col], par_row, par_col)
            except IndexError:
                return ""

        heatmapAX.format_coord = _format_coord
        clean_axis(heatmapAX)

        ## row labels ##
        heatmapAX.set_yticks(np.arange(self.df.shape[0]))
        heatmapAX.yaxis.set_ticks_position('left')
        heatmapAX.set_yticklabels(self.df.index[row_denD['leaves']])

        ## col labels ##
        heatmapAX.set_xticks(np.arange(self.df.shape[1]))
        heatmapAX.xaxis.set_ticks_position('bottom')
        xlabelsL = heatmapAX.set_xticklabels(
            self.df.columns[col_denD['leaves']])
        # rotate labels 90 degrees
        for label in xlabelsL:
            label.set_rotation(90)
        # remove the tick lines (done once, after both axes have their ticks)
        for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
            l.set_markersize(0)

        ### scale colorbar ###
        scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(
            1, 2, subplot_spec=heatmapGS[0, 1], wspace=0.5, hspace=0.5)
        scale_cbAX = fig.add_subplot(
            scale_cbGSSS[0, 0])  # colorbar for scale in upper corner
        cb = fig.colorbar(
            axi, scale_cbAX
        )  # note that we tell colorbar to use the scale_cbAX axis
        cb.set_label('Abs. Cor.')
        # ticks and label go on the right side of the colorbar
        cb.ax.yaxis.set_ticks_position('right')
        cb.ax.yaxis.set_label_position('right')
        cb.outline.set_linewidth(0)
        # make colorbar labels smaller
        tickL = cb.ax.yaxis.get_ticklabels()
        for t in tickL:
            t.set_fontsize(t.get_fontsize() - 3)

        heatmapGS.tight_layout(fig, h_pad=0.1, w_pad=0.5)
Пример #34
0
# `Artist.get_axes()` is no longer available in current Matplotlib; the
# `axes` attribute is the equivalent.
ax = axi.axes
clean_axis(ax)
plt.show()

# calculate pairwise distances for rows
# linkage() needs the *condensed* distance vector: a square matrix would be
# treated as a set of observation vectors. The square form is kept under its
# original name for any later use.
row_condensed_dists = distance.pdist(core_df, similarity)
pairwise_dists = distance.squareform(row_condensed_dists)
# cluster
row_clusters = sch.linkage(row_condensed_dists, method='complete')

# calculate pairwise distances for columns
col_condensed_dists = distance.pdist(core_df.T, similarity)
col_pairwise_dists = distance.squareform(col_condensed_dists)
# cluster
col_clusters = sch.linkage(col_condensed_dists, method='complete')

# make dendrograms black rather than letting scipy color them
sch.set_link_color_palette(['black'])

# plot the results
fig = plt.figure(figsize=figure_size)
#fig.suptitle(os.path.split(input_file_path)[1])
heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0, width_ratios=[0.25, 1], height_ratios=[0.25, 1])

### col dendrogram ####
col_denAX = fig.add_subplot(heatmapGS[0, 1])
col_denD = sch.dendrogram(col_clusters, color_threshold=np.inf, ax=col_denAX)
clean_axis(col_denAX)

### row dendrogram ###
row_denAX = fig.add_subplot(heatmapGS[1, 0])
row_denD = sch.dendrogram(row_clusters, color_threshold=np.inf, orientation='right', ax=row_denAX)
clean_axis(row_denAX)
Пример #35
0
def dendro(ax,
           dist,
           cut=None,
           labels=None,
           root="top",
           leaf_rotation=90,
           leaf_font_size=10,
           sorting="distance",
           palette_name="LaSalle",
           cluster_colors=True,
           legend_loc="upper right",
           label_colors=False,
           label_color_map=None,
           label_title="",
           labs=("", "", "Distance"),
           font_size=(16, 12, 10)):
    """
    Plot a dendrogram given the artist and the distance matrix at minimum. Can
    produce a refined dendrogram with customized color palette for the clusters,
    and each xtick labelled (even colored if there is a target variable). Also
    enables adding legend for each cluster and the color codes if applicable.

    Inputs:
        - ax (Axes): canvas; the dendrogram, legends and cut line are all
            drawn on this axes
        - dist (ndArray): the hierarchical clustering encoded as a linkage
            matrix
        - cut (float): height at which to cut the tree
        - labels (Pandas.Index): index to use for xtick labels
        - root (str): plots the root at the top with "top", and left with "left"
        - leaf_rotation (float): the angle (in degrees) to rotate the leaf
            labels
        - leaf_font_size (float): the font size (in points) of the leaf labels
        - sorting (str): for each node n, the order (visually, from
            left-to-right) n's two descendent links are plotted is determined
            either by number of objects in its cluster descending ("count"),
            or by distance between its direct descendents descending
            ("distance"); any other value leaves the order to scipy
        - palette_name (str): user-defined palette name for 'Palette' class,
            find more in the 'palette' module
        - cluster_colors (bool): whether to use default or user-defined clusters
            coloring palette
        - legend_loc (str): location for the cluster legend, consistent with
            Matplotlib legend location definitions
        - label_colors (bool): if there is a established target variable,
            whether to color xtick labels according to that variable
        - label_color_map (Pandas.Series): target column if there is an
            established target variable (supervised)
        - label_title (str): title of the label coloring legend, using target
            column name is recommended
        - labs ((str, str, str)): title, x-axis label, y-axis label
        - font_size ((int, int, int)): title, axis label, tick label font
            properties

    Returns:
        ([[int]]) cluster output as color-coded by the dendrogram

    Note: when `cluster_colors` is true the scipy link color palette is
    changed module-wide and is not restored afterwards.
    """
    palette = Palette().getPallete(palette_name, path="../../../palettes/")
    _, axis_font, ticks_font = create_font_setting(font_size)
    if cluster_colors:
        set_link_color_palette(palette.color_lst[::-1])

    dendro_kwargs = {"show_leaf_counts": True, "above_threshold_color": "grey"}

    # Sort child nodes by distance or by count descending, or neither
    if sorting == "distance":
        dendro_kwargs["distance_sort"] = 'descending'
    elif sorting == "count":
        dendro_kwargs["count_sort"] = 'descending'

    # Plotting dendrogram and cut. `ax` is passed explicitly so the tree lands
    # on the requested axes rather than on pyplot's current axes.
    den = dendrogram(dist,
                     labels=labels,
                     orientation=root,
                     color_threshold=cut,
                     leaf_rotation=leaf_rotation,
                     leaf_font_size=leaf_font_size,
                     ax=ax,
                     **dendro_kwargs)

    # Cluster legend: one entry per distinct below-threshold link color, in
    # order of first appearance
    legend_colors = []
    for color in den['color_list']:
        if color != "grey" and color not in legend_colors:
            legend_colors.append(color)
    c_leg = ax.legend(
        [Line2D([0], [0], color=c, lw=6) for c in legend_colors],
        ['Cluster %s' % i for i in range(len(legend_colors))],
        prop=axis_font,
        loc=legend_loc,
        shadow=False)

    # Get color-coded clusters
    color_cluster = {col: cluster for cluster, col in enumerate(legend_colors)}
    # There is one link color fewer than there are leaves, so the last color
    # is repeated; above-threshold (grey) entries take the preceding color.
    col_lst = den['color_list'][:] + [den['color_list'][-1]]
    for i, col in enumerate(col_lst):
        if col == "grey":
            col_lst[i] = col_lst[i - 1]
    clusters = [[row[1]] for row in sorted(zip(
        den['leaves'], [color_cluster[col] for col in col_lst]),
                                           key=lambda x: x[0])]

    # Color the labels by target if applicable
    if label_colors:
        targets = label_color_map.unique()
        if len(targets) == 2:
            target_colors = palette.pair
        else:
            target_colors = palette.color_lst
        label_color_dict = {
            label: target_colors[i]
            for i, label in enumerate(targets)
        }
        for lbl in ax.get_xmajorticklabels():
            lbl.set_color(label_color_dict[label_color_map[lbl.get_text()]])
        # Shrink the axes to make room for the target legend on the right
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

        leg = ax.legend([
            Line2D([0], [0], color='white', lw=0)
            for _ in range(len(targets) + 1)
        ], [label_title.title()] + list(targets),
                        loc='center left',
                        bbox_to_anchor=(1, 0.5),
                        prop=axis_font)
        for i, text in enumerate(leg.get_texts()[1:]):
            text.set_color(target_colors[i])
            text.set_ha('left')
        # The second ax.legend() call replaced the cluster legend; add it back
        ax.add_artist(c_leg)

    # Plot cut
    if cut:
        if root == "left":
            func = ax.axvline
        else:
            func = ax.axhline
        func(cut, ls='--', color='r')

    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=ticks_font)
    ax.set_yticklabels(ax.get_yticks(), fontproperties=ticks_font)
    ax.tick_params(axis='y', direction='in')

    labelTitleAxis(ax, labs, font_size)

    return clusters
Пример #36
0
def _heatmapper_embed(data, methods, n_neighbors, min_dist, pca_comp, cpu,
                      perplexity, n_iter):
    """Reduce ``data`` with the method named by ``methods`` ("umap", "pca" or "tsne")."""
    if methods == "umap":
        return umap.UMAP(n_neighbors=n_neighbors,
                         min_dist=min_dist,
                         metric='euclidean',
                         n_components=2).fit_transform(data)
    if methods == "pca":
        return PCA(n_components=pca_comp).fit_transform(data)
    if methods == "tsne":
        n_jobs = 1 if cpu == 0 else cpu
        tsne = TSNE(n_jobs=n_jobs, perplexity=perplexity, n_iter=n_iter)
        return tsne.fit_transform(data)
    raise ValueError(
        'methods must be "", "umap", "pca" or "tsne", got %r' % (methods, ))


def _heatmapper_leaf_colors(dendro):
    """Return ([[leaf position, color], ...] sorted by position, {color: [positions]})."""
    seen = set()
    leaf_colors = []
    by_color = defaultdict(list)
    for c, ic, dc in zip(dendro['color_list'], dendro['icoord'],
                         dendro['dcoord']):
        # A link leg that ends at height 0 ends on a leaf; scipy places leaf
        # k at coordinate 10 * k + 5 along the leaf axis.
        for height_idx, coord_idx in ((0, 1), (3, 2)):
            if dc[height_idx] == 0.0:
                i = int((ic[coord_idx] - 5.0) / 10.0)
                if i not in seen:
                    seen.add(i)
                    leaf_colors.append([i, c])
                    by_color[c].append(i)
    return sorted(leaf_colors), by_color


def _heatmapper_write_clusters(path, leaf_colors, labels, order):
    """Write one "label<TAB>color[;cluster number]" line per clustered leaf."""
    with open(path, "w") as fo:
        klist = []
        m = 0
        for k, v in leaf_colors:
            _pos = labels[order[k]]
            if v == "b":
                # above-threshold links: not part of any colored cluster
                fo.write(_pos + "\t" + v + "\n")
            else:
                _key = ",".join(map(str, hex_to_rgb(v)))
                # a new cluster number starts whenever the color changes
                if len(klist) == 0 or klist[-1] != _key:
                    _c = ";" + str(m)
                    m += 1
                fo.write(_pos + "\t" + _key + _c + "\n")
                klist.append(_key)


def _heatmapper_tick_labels(labels, order):
    """Return ``labels`` rearranged to follow the leaf ``order`` when they match in length."""
    if order is not None and len(labels) == len(order):
        return [labels[i] for i in order]
    return list(labels)


def heatmapper(X,
               xLabels=[],
               yLabels=[],
               save=os.getcwd() + os.path.sep,
               WRITE_CLUSTER=True,
               methods="pca",
               CPU=os.cpu_count() // 2,
               cluster_both=True,
               SHOW=True,
               tCOLOR='nipy_spectral',
               hCOLOR="YlGnBu",
               _spectral=18,
               _n_neighbors=5,
               _min_dist=0.1,
               _perplexity=50,
               _n_iter=5000,
               _pca_comp=2,
               _color_threshold=0.1):
    """
    Generate a heatmap of X with hierarchical clustering (ward linkage),
    plus a pie chart of the row-cluster sizes.

    X: M x N array.
    xLabels: N array. The labels or names of data X by column.
    yLabels: M array. The labels or names of data X by row.
    save: a saving directory with a prefix; "_" + methods is appended and the
        result is used as the prefix of every output file.
    WRITE_CLUSTER: True or False. choose if cluster information is output or not.
    methods: "", "tsne", "umap", "pca". Dimension reduction methods to apply before hierarchical clustering.
    CPU: CPU number to use. It has effect only when tsne methods is used.
    cluster_both: also cluster (and reorder) the columns.
    SHOW: call plt.show() at the end.
    tCOLOR: colormap name from which the dendrogram link colors are sampled.
    hCOLOR: colormap name of the heatmap.
    _spectral: number of colors sampled from tCOLOR.
    _n_neighbors, _min_dist: passed to umap.UMAP.
    _perplexity, _n_iter: passed to TSNE.
    _pca_comp: number of PCA components.
    _color_threshold: dendrogram color threshold as a fraction of the largest
        linkage distance.

    The xLabels/yLabels defaults are never mutated (only rebound), so the
    shared default lists are harmless.
    """
    plt.rcParams.update({'font.size': 12})
    # Array conversion so that list input supports the fancy indexing below.
    X = np.asarray(X)
    Xshape = X.shape
    assert len(Xshape) == 2, "matrix must be two-dimensional"

    if WRITE_CLUSTER:
        if len(yLabels) == 0:
            print(
                "Warning: y label names are automatically set as serial numbers. Provide yLabels option so that label names make sense."
            )

            yLabels = list(map(str, range(Xshape[0])))
        if cluster_both == True and len(xLabels) == 0:
            print(
                "Warning: x label names are automatically set as serial numbers. Provide xLabels option so that label names make sense."
            )
            xLabels = list(map(str, range(Xshape[1])))

    # From here on `save` is never empty, so the outputs are always written.
    save = save + "_" + methods

    # Link colors shared by both dendrograms.
    _cmap = cm.get_cmap(tCOLOR, _spectral)
    link_palette = [
        mpl.colors.rgb2hex(rgb[:3]) for rgb in _cmap(range(_spectral))
    ]
    sch.set_link_color_palette(link_palette)

    # Compute and plot first dendrogram.
    if methods != "":
        print("reducing X axis dimension with " + methods)
        embeddingX = _heatmapper_embed(X, methods, _n_neighbors, _min_dist,
                                       _pca_comp, CPU, _perplexity, _n_iter)
        np.savez_compressed(save + "_heatmap_array.npz", X=embeddingX)
    else:
        embeddingX = X
    fig = plt.figure(figsize=(8, 20))
    ax1 = fig.add_axes([0.09, 0.1, 0.2, 0.8])
    print("calculating Y axis linkage")
    Y = fcl.linkage(embeddingX, method='ward', metric='euclidean')

    print('drawing dendrogram...')
    # above_threshold_color is pinned to 'b' because the cluster files below
    # recognise unclustered leaves by that value; the library default is
    # understood to differ between SciPy versions.
    Z1 = sch.dendrogram(Y,
                        orientation='left',
                        color_threshold=_color_threshold * max(Y[:, 2]),
                        above_threshold_color='b',
                        ax=ax1)
    idx1 = Z1['leaves']
    idx2 = None
    Z2 = None
    if cluster_both:
        Xt = np.transpose(X)
        if methods != "":
            print("reducing Y axis dimension with " + methods)
            embeddingXt = _heatmapper_embed(Xt, methods, _n_neighbors,
                                            _min_dist, _pca_comp, CPU,
                                            _perplexity, _n_iter)
        else:
            embeddingXt = Xt
        ax2 = fig.add_axes([0.3, 0.9, 0.5, 0.05])
        print("calculating X axis linkage")
        Y2 = fcl.linkage(embeddingXt, method='ward', metric='euclidean')

        print('drawing dendrogram...')
        Z2 = sch.dendrogram(Y2,
                            orientation='top',
                            color_threshold=_color_threshold * max(Y2[:, 2]),
                            above_threshold_color='b',
                            ax=ax2)
        idx2 = Z2['leaves']
    ax1.set_xticks([])
    ax1.set_yticks([])

    # Plot distance matrix.
    axmatrix = fig.add_axes([0.3, 0.1, 0.5, 0.8])

    # Reorder rows (and columns) to follow the dendrogram leaves.
    X2 = X[idx1]
    if cluster_both:
        X2 = X2[:, idx2]
        if WRITE_CLUSTER:
            cluster_list2, _ = _heatmapper_leaf_colors(Z2)
            _heatmapper_write_clusters(save + "_clusters_X_axis.txt",
                                       cluster_list2, xLabels, idx2)

    cluster_list, cluster_idxs = _heatmapper_leaf_colors(Z1)
    if WRITE_CLUSTER:
        _heatmapper_write_clusters(save + "_clusters_Y_axis.txt",
                                   cluster_list, yLabels, idx1)

    # Pie chart input: row-cluster sizes, largest first; each wedge is
    # labelled with its size.
    pie_entries = sorted(((len(v), k) for k, v in cluster_idxs.items()),
                         reverse=True)
    sizes = [size for size, _ in pie_entries]
    colors = [color for _, color in pie_entries]
    labels = sizes

    print("drawing heatmap")
    im = axmatrix.imshow(X2, aspect='auto', origin='lower', cmap=hCOLOR)
    # Tick labels follow the same leaf order as the reordered matrix so each
    # label sits next to its own row/column.
    if len(xLabels) <= 50:
        axmatrix.set_xticks(range(len(xLabels)))
        axmatrix.set_xticklabels(_heatmapper_tick_labels(xLabels, idx2),
                                 rotation=90)
    else:
        axmatrix.set_xticks([])
        axmatrix.set_xticklabels([])

    axmatrix.yaxis.tick_right()
    if len(yLabels) <= 50:
        axmatrix.set_yticks(range(len(yLabels)))
        axmatrix.set_yticklabels(_heatmapper_tick_labels(yLabels, idx1))
    else:
        axmatrix.set_yticks([])
        axmatrix.set_yticklabels([])

    # Plot colorbar.
    axcolor = fig.add_axes([0.5, 0.05, 0.16, 0.02])
    pylab.colorbar(im, cax=axcolor, orientation='horizontal')
    fig2 = pylab.figure(figsize=(8, 8))
    plt.pie(sizes,
            labels=labels,
            colors=colors,
            autopct='%1.1f%%',
            shadow=True,
            startangle=90)

    fig.savefig(save + "_heatmap.png", format="png")
    fig2.savefig(save + "_pie.pdf", format="pdf")
    if SHOW:
        plt.show()
Пример #37
0
def fig_cluster(out: Dataset,
                fontsize: float = 8,
                **fig_kws: dict) -> Tuple[Figure, Axes]:
    """Plots the dendrogram of a hierarchical clustering.

    Parameters
    ----------
    out
        Valid Dataset from :func:`~araucaria.stats.cluster.cluster`.
    fontsize
        Font size for labels.
        The default is 8.
    fig_kws
        Additional arguments to pass to the :func:`~matplotlib.pyplot.subplots`
        routine of ``Matplotlib``.

    Returns
    -------
    figure
        ``Matplotlib`` figure object.
    axes
        ``Matplotlib`` axes object.

    Raises
    ------
    TypeError
        If ``out`` is not a valid Dataset instance.
    KeyError
        If attributes from :func:`~araucaria.stats.cluster.cluster`
        do not exist in ``out``.

    Notes
    -----
    The ``SciPy`` link color palette is global state. It is set to
    ``['c', 'm', 'y', 'k']`` only while the dendrogram is drawn and is
    reset to the ``SciPy`` default afterwards, so that later dendrograms
    drawn in the same process are not affected by this call.

    See also
    --------
    :func:`~araucaria.stats.cluster.cluster` : Performs hierarchical clustering on a collection.

    Example
    -------
    .. plot::
        :context: reset

        >>> import matplotlib.pyplot as plt
        >>> from araucaria.testdata import get_testpath
        >>> from araucaria.xas import pre_edge
        >>> from araucaria.stats import cluster
        >>> from araucaria.io import read_collection_hdf5
        >>> from araucaria.plot import fig_cluster
        >>> fpath      = get_testpath('Fe_database.h5')
        >>> collection = read_collection_hdf5(fpath)
        >>> collection.apply(pre_edge)
        >>> datgroup   = cluster(collection, cluster_region='xanes')
        >>> fig, ax    = fig_cluster(datgroup)
        >>> fig.tight_layout()
        >>> plt.show(block=False)
    """
    check_objattrs(out,
                   Dataset,
                   attrlist=['groupnames', 'Z', 'cluster_pars'],
                   exceptions=True)

    # plotting the results
    fig, ax = plt.subplots(1, 1, **fig_kws)
    hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
    try:
        hierarchy.dendrogram(out.Z,
                             ax=ax,
                             orientation='right',
                             leaf_font_size=fontsize,
                             above_threshold_color='gray',
                             labels=out.groupnames)
    finally:
        # link colors are resolved while drawing, so the global palette
        # can be restored to the SciPy default without changing the plot.
        hierarchy.set_link_color_palette(None)
    ax.set_title(out.cluster_pars['cluster_region'].upper() + ' dendrogram')
    return (fig, ax)
Пример #38
0
    def plot_img_with_dendrograms(self, use_abs_cor = True):
        '''
        Plot an image or correlation matrix along with dendrograms
        Uses methods from:
        http://nbviewer.ipython.org/github/ucsd-scientific-python/user-group/blob/master/presentations/20131016/hierarchical_clustering_heatmaps_gridspec.ipynb

        The figure is laid out on a 2x2 grid: column dendrogram (top left),
        colorbar (top right), heatmap (bottom left) and row dendrogram
        (bottom right). Rows and columns of the heatmap are reordered to
        follow the leaf order of the two dendrograms.

        Parameters
        -----------
        use_abs_cor : {True, False}, optional
            Use the absolute values of correlation matrix. Any truthy value
            selects the absolute values; any falsy value uses ``self.array``
            unchanged.

        Notes
        -----
        The SciPy link color palette is global state. It is set to
        ``['black']`` only while the two dendrograms are drawn and is reset
        to the SciPy default afterwards.
        '''
        import matplotlib.gridspec as gridspec
        import scipy.cluster.hierarchy as sch

        # helper for cleaning up axes by removing ticks, tick labels, frame, etc.
        def clean_axis(ax):
            """Remove ticks, tick labels, and frame from axis"""
            ax.get_xaxis().set_ticks([])
            ax.get_yaxis().set_ticks([])
            for sp in ax.spines.values():
                sp.set_visible(False)

        fig = plt.figure()
        heatmapGS = gridspec.GridSpec(2, 2, wspace=0.0, hspace=0.0,
                                      width_ratios=[1, 0.25],
                                      height_ratios=[0.25, 1])
        # A single expression guarantees D is always bound, even when the
        # flag is neither exactly True nor exactly False.
        D = np.abs(self.array) if use_abs_cor else self.array

        # NOTE(review): both linkages are computed on the rows of D, which is
        # only equivalent to clustering the columns when D is symmetric
        # (e.g. a correlation matrix) -- confirm before using on other data.
        sch.set_link_color_palette(['black'])
        try:
            ## Col Dendrogram
            col_denAX = fig.add_subplot(heatmapGS[0, 0])
            clusters1 = sch.linkage(D, method='centroid')
            col_denD = sch.dendrogram(clusters1,
                                      ax=col_denAX,
                                      labels=self.df.columns.values,
                                      orientation='top',
                                      color_threshold=np.inf)
            clean_axis(col_denAX)

            ## Row Dendrogram
            row_denAX = fig.add_subplot(heatmapGS[1, 1])
            clusters2 = sch.linkage(D, method='single')
            row_denD = sch.dendrogram(clusters2,
                                      ax=row_denAX,
                                      labels=self.df.index.values,
                                      orientation='left',
                                      color_threshold=np.inf)
            clean_axis(row_denAX)
        finally:
            # link colors are resolved while drawing; restore the default
            # palette so later dendrograms are not forced to black.
            sch.set_link_color_palette(None)

        # Heatmap
        heatmapAX = fig.add_subplot(heatmapGS[1, 0])
        idx1 = row_denD['leaves']
        idx2 = col_denD['leaves']
        D_remap = D.copy()
        D_remap = D_remap[idx1, :]
        D_remap = D_remap[:, idx2]
        # NOTE(review): the color scale is fixed to [0, 1] even when the
        # signed matrix is plotted (use_abs_cor falsy) -- confirm intended.
        axi = heatmapAX.imshow(D_remap, interpolation='nearest', aspect='auto',
                               origin='lower', vmin=0, vmax=1)

        # Leaf labels in plotting order. The previous code indexed
        # ``dict.items()[0]``, which fails on Python 3 and depends on dict
        # ordering; 'ivl' (the ordered leaf labels) is presumably what the
        # status bar was meant to show.
        row_labels = row_denD['ivl']
        col_labels = col_denD['ivl']

        def _format_coord(x, y):
            """Status-bar text: cell value plus its row and column labels."""
            col = int(x + 0.5)
            row = int(y + 0.5)
            # negative indices would silently wrap around to the other edge
            if row < 0 or col < 0:
                return ""
            try:
                return "%.3f %s | %s" % (D_remap[row, col],
                                         row_labels[row],
                                         col_labels[col])
            except IndexError:
                return ""
        heatmapAX.format_coord = _format_coord
        clean_axis(heatmapAX)

        ## row labels ##
        heatmapAX.set_yticks(np.arange(self.df.shape[0]))
        heatmapAX.yaxis.set_ticks_position('left')
        heatmapAX.set_yticklabels(self.df.index[row_denD['leaves']])

        ## col labels ##
        heatmapAX.set_xticks(np.arange(self.df.shape[1]))
        heatmapAX.xaxis.set_ticks_position('bottom')
        xlabelsL = heatmapAX.set_xticklabels(self.df.columns[col_denD['leaves']])
        # rotate labels 90 degrees
        for label in xlabelsL:
            label.set_rotation(90)

        # remove the tick lines (done once, after both axes have their ticks)
        for l in heatmapAX.get_xticklines() + heatmapAX.get_yticklines():
            l.set_markersize(0)

        ### scale colorbar ###
        scale_cbGSSS = gridspec.GridSpecFromSubplotSpec(1, 2,
                                                        subplot_spec=heatmapGS[0, 1],
                                                        wspace=0.5, hspace=0.5)
        scale_cbAX = fig.add_subplot(scale_cbGSSS[0, 0]) # colorbar for scale in upper corner
        cb = fig.colorbar(axi, cax=scale_cbAX) # draw the colorbar into the scale_cbAX axis
        cb.set_label('Abs. Cor.')
        cb.ax.yaxis.set_ticks_position('right') # keep ticks on the right side of the colorbar
        cb.ax.yaxis.set_label_position('right') # keep label on the right side of the colorbar
        cb.outline.set_linewidth(0)
        # make colorbar labels smaller
        tickL = cb.ax.yaxis.get_ticklabels()
        for t in tickL:
            t.set_fontsize(t.get_fontsize() - 3)

        heatmapGS.tight_layout(fig, h_pad = 0.1, w_pad = 0.5)



        
Пример #39
0
def scatter(X,
            xLabels=[],
            yLabels=[],
            save=os.getcwd() + os.path.sep,
            WRITE_CLUSTER=True,
            methods="tsne",
            CPU=os.cpu_count() // 2,
            SHOW=True,
            COLOR='nipy_spectral',
            _spectral=18,
            _n_neighbors=5,
            _min_dist=0.1,
            _perplexity=50,
            _n_iter=5000,
            _color_threshold=0.1,
            s=0.5**2):
    """Reduce X to 2-D, cluster it hierarchically and plot dendrogram + scatter.

    X: M x N array.
    xLabels: N array. The labels or names of data X by column. Currently not
        used by this function.
    yLabels: M array. The labels or names of data X by row. When empty and
        WRITE_CLUSTER is true, serial numbers are used instead.
    save: a saving directory with a prefix. "_" + methods is appended to it
        and the result prefixes every output file.
    WRITE_CLUSTER: True or False. choose if cluster information is output or not.
    methods: "", "tsne", "umap", "pca". Dimension reduction methods to apply
        before hierarchical clustering. With "", X must already be M x 2.
    CPU: CPU number to use. It has effect only when tsne methods is used
        (0 is treated as 1).
    SHOW: if true, call plt.show() at the end.
    COLOR: colormap name from which the cluster colors are sampled.
    _spectral: number of colors sampled from COLOR for the link palette.
    _n_neighbors, _min_dist: passed to umap.UMAP.
    _perplexity, _n_iter: passed to TSNE.
    _color_threshold: fraction of the largest linkage distance used as the
        dendrogram color threshold.
    s: marker size passed to the scatter plot.

    Output files (all prefixed by save + "_" + methods):
    "_scatter_array.npz" (only when a reduction method is used),
    "_clusters_on_scatter_plot.txt" (only when WRITE_CLUSTER is true),
    "_dendro.png" and "_scatter.png".

    The process exits through sys.exit() when methods is not one of the
    accepted values, or when methods is "" and X is not M x 2.

    The mutable list defaults are kept for interface compatibility; they are
    never modified in place.
    """

    Xshape = np.shape(X)
    n_samples = Xshape[0]
    yind = list(map(str, range(n_samples)))
    plt.rcParams.update({'font.size': 12})

    if WRITE_CLUSTER and len(yLabels) == 0:
        print(
            "Warning: y label names are automatically set as serial numbers. Provide yLabels option so that label names make sense."
        )
        yLabels = list(yind)

    save = save + "_" + methods
    # Compute and plot first dendrogram.
    if methods != "":
        print("reducing X axis dimension with " + methods)
        if methods == "umap":
            embeddingX = umap.UMAP(n_neighbors=_n_neighbors,
                                   min_dist=_min_dist,
                                   metric='euclidean',
                                   n_components=2).fit_transform(X)
        elif methods == "pca":
            embeddingX = PCA(n_components=2).fit_transform(X)
        elif methods == "tsne":
            if CPU == 0:
                CPU = 1
            tsne = TSNE(n_jobs=CPU, perplexity=_perplexity, n_iter=_n_iter)
            embeddingX = tsne.fit_transform(X)
        else:
            sys.exit("methods options can only accept umap, pca, tsne or ''.")
        np.savez_compressed(save + "_scatter_array.npz", X=embeddingX)
    else:
        print("skipping dimensionality reduction")
        if Xshape[1] != 2:
            sys.exit(
                "if methods is '', then the shape of the matrix must be N x 2."
            )
        # asarray so that nested lists also support the [:, 0] slicing below
        embeddingX = np.asarray(X)
    fig, ax = plt.subplots(figsize=(8, 8))

    print("calculating Y axis linkage")
    Y = fcl.linkage(embeddingX, method='ward', metric='euclidean')
    # plt.get_cmap accepts the same (name, lut) arguments as the
    # matplotlib.cm.get_cmap helper, which recent Matplotlib no longer ships.
    _cmap = plt.get_cmap(COLOR, _spectral)
    cmap = _cmap(range(_spectral))

    print('drawing dendrogram...')
    sch.set_link_color_palette([mpl.colors.rgb2hex(rgb[:3]) for rgb in cmap])
    try:
        # above_threshold_color is pinned to 'b' because the cluster writer
        # below recognises unclustered leaves by that exact color string;
        # SciPy's own default for it has changed between releases.
        Z1 = sch.dendrogram(Y,
                            ax=ax,
                            orientation='left',
                            color_threshold=_color_threshold * Y[:, 2].max(),
                            above_threshold_color='b')
    finally:
        # the palette is global SciPy state; restore the default so later
        # dendrograms in this process are unaffected.
        sch.set_link_color_palette(None)

    #ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title('Hierarchical clustering ')

    # Second figure: the scatter plot colored by cluster.
    fig2, ax2 = plt.subplots(figsize=(8, 8))
    idx1 = Z1['leaves']

    # Recover the color of every leaf from the dendrogram links: a link end
    # with height 0 touches a leaf, whose position along the leaf axis is
    # 5, 15, 25, ... (hence (coord - 5) / 10).
    seen = set()
    cluster_list = []
    for c, ic, dc in zip(Z1['color_list'], Z1['icoord'], Z1['dcoord']):
        for d_end, i_end in ((0, 1), (3, 2)):
            if dc[d_end] != 0.0:
                continue
            i = int((ic[i_end] - 5.0) / 10.0)
            if i in seen:
                print(c, ic, dc)
                continue
            seen.add(i)
            cluster_list.append([i, c])
    cluster_list.sort()

    # One RGBA color per sample. Sized from X rather than from yLabels,
    # which is empty when WRITE_CLUSTER is false and no labels were given.
    _color_list = [""] * n_samples
    for k, v in cluster_list:
        _color_list[idx1[k]] = list(mpl.colors.hex2color(v)) + [1.0]

    if WRITE_CLUSTER:
        with open(save + "_clusters_on_scatter_plot.txt", "w") as fo:
            prev_key = None
            m = 0
            for k, v in cluster_list:
                _pos = str(yLabels[idx1[k]])
                _ind = str(yind[idx1[k]])
                if v == "b":
                    fo.write(_ind + "\t" + _pos + "\t" + v + "\n")
                else:
                    _key = ",".join(map(str, hex_to_rgb(v)))
                    # a new cluster number starts whenever the color changes
                    # between consecutive clustered leaves
                    if _key != prev_key:
                        _c = ";" + str(m)
                        m += 1
                    fo.write(_ind + "\t" + _pos + "\t" + _key + _c + "\n")
                    prev_key = _key

    print("drawing scatter plot")

    ax2.scatter(embeddingX[:, 0], embeddingX[:, 1], color=_color_list, s=s)
    ax2.set_title('Scatter plot colored by clusters')
    fig.savefig(save + "_dendro.png", format="png")
    fig2.savefig(save + "_scatter.png", format="png")
    if SHOW:
        plt.show()
Пример #40
0
# Download a precomputed dissimilarity matrix, show it as an image and draw
# the single-linkage dendrogram of the events next to it.
url = "https://examples.obspy.org/dissimilarities.npz"
with io.BytesIO(urlopen(url).read()) as fh, np.load(fh) as data:
    dissimilarity = data["dissimilarity"]

plt.subplot(121)
plt.imshow(1 - dissimilarity, interpolation="nearest", cmap=obspy_sequential)

# linkage() is fed the condensed form produced by squareform().
dissimilarity = distance.squareform(dissimilarity)
threshold = 0.3
linkage = hierarchy.linkage(dissimilarity, method="single")
clusters = hierarchy.fcluster(linkage, threshold, criterion="distance")

# A little nicer set of colors.
cmap = plt.get_cmap("Paired", lut=6)
# "%x" only accepts integers on Python 3, so each 0..1 channel is scaled
# and converted to int before formatting.
colors = ["#%02x%02x%02x" % tuple(int(col * 255) for col in cmap(i)[:3])
          for i in range(6)]
try:
    hierarchy.set_link_color_palette(colors[1:])
except AttributeError:
    # Old version of SciPy
    pass

plt.subplot(122)
try:
    hierarchy.dendrogram(linkage, color_threshold=threshold,
                         above_threshold_color=cmap(0))
except TypeError:
    # Old version of SciPy
    hierarchy.dendrogram(linkage, color_threshold=threshold)
plt.xlabel("Event number")
plt.ylabel("Dissimilarity")
plt.show()
    '1-1-1', '2-1-2', '2-2-1', '3-2-1', '4-2-2', '3-2-2', '3-1-1', '1-2-2'
])
# Report the per-well standard errors, export the distance table and draw
# its average-linkage dendrogram.
print(distDF)
print("**********Lowest Level***************")
totalStd = 0
for winnerWells in final:
    well_err = stderror[winnerWells]
    print(f"{winnerWells} with a standard err of {well_err}")
    totalStd += well_err
average_err = round(totalStd / (len(final) - 1), 2)
print(f"Average Standard Err: {average_err}")
confidence_pct = round(norm.cdf(totalStd / len(final)) * 100, 2)
print(f"Final confidence of {confidence_pct}%")
# if SYSTEM["SHOW_DENDROGRAM"]:
# distDF = distDF.replace(np.nan,1)
# print(z)
# print(dn)
# plt.savefig("static/results.jpg")
# return (returnString,distDF)

# The CSV name encodes the cutoffs and the recovery efficiency in percent.
length_lo = LENGTH_CUTOFF[0]
length_hi = LENGTH_CUTOFF[1]
recovery_pct = int(RECOVERY_EFFICIENCY * 100)
csv_name = (f'jaccard_darkgreenLT_C{COUNT_CUTOFF!s}'
            f'_L{length_lo!s}-{length_hi!s}'
            f'_Recovery_{recovery_pct!s}.csv')
distDF.to_csv(csv_name)

# 'average' linkage is the standard UPGMA algorithm; other methods such as
# 'ward' are also available.
z = hierarchy.linkage(distDF, 'average')
# plt.figure(figsize=(14,6),dpi=100)
hierarchy.set_link_color_palette(['k'])
dn = hierarchy.dendrogram(z,
                          labels=distDF.index,
                          above_threshold_color='#bbbbbb',
                          orientation='left')

plt.show()