예제 #1
0
def master_loop():
    """Collect per-gene depth values from every ``.vcf`` file and plot them.

    Every file in ``source_folder`` whose name ends in ``.vcf`` is passed
    through ``pull_depth_from_vcf`` and ``pull_average_depth_for_genes``.
    Column 0 of the result is used as the gene names and column 1 as the
    values. The raw per-file arrays are written to ``my_pipe.dmp`` with
    ``dump`` and the values are shown as a heat map and two cluster maps.

    File stems are sorted by ``int(stem[1:])``, so each stem must be one
    leading character followed by an integer.
    """
    gene_names = []
    master_store = []
    time_stamps = []
    dump_dict = {}
    for file_name in os.listdir(source_folder):
        if not file_name.endswith('.vcf'):
            continue
        print(file_name)
        stem = file_name[:-4]
        master_mix = np.array(pull_average_depth_for_genes(pull_depth_from_vcf(file_name)))
        time_stamps.append(stem)
        # The names of the last file read win.
        # NOTE(review): assumes every file lists the same genes in the same order - confirm.
        gene_names = master_mix[:, 0]
        master_store.append(master_mix[:, 1])
        dump_dict[stem] = [master_mix]

    print(dump_dict)
    # Close the dump file deterministically instead of leaking the handle.
    with open('my_pipe.dmp', 'w') as dump_file:
        dump(dump_dict, dump_file)

    # Order the columns by the integer embedded in each file stem.
    arg_sorter = np.argsort(np.array([int(val[1:]) for val in time_stamps]))
    time_stamps = np.array(time_stamps)[arg_sorter]
    print(time_stamps)
    master_store = np.array(master_store)
    master_store = master_store[arg_sorter, :].astype(np.float32)
    # Missing values are plotted as zero.
    master_store[np.isnan(master_store)] = 0
    print(master_store)
    dataframe = pd.DataFrame(data=master_store.T, index=gene_names, columns=time_stamps)
    sns.heatmap(dataframe)
    sns.plt.show()
    sns.clustermap(dataframe)
    sns.plt.show()
    sns.clustermap(dataframe, col_cluster=False)
    sns.plt.show()
예제 #2
0
def plot_filter_seq_heat(filter_outs, out_pdf, whiten=True, drop_dead=True):
    """Plot a clustered heat map of mean filter outputs and save it.

    Args:
        filter_outs: 3-D array that is averaged over axis 2.
            NOTE(review): presumably (sequences, filters, positions) - confirm.
        out_pdf: path handed to ``plt.savefig``.
        whiten: if true, standardise the averaged matrix with
            ``preprocessing.scale`` before plotting.
        drop_dead: if true, drop rows (after the transpose) whose standard
            deviation is zero.
    """
    # One value per (axis-0, axis-1) pair.
    filter_seqs = filter_outs.mean(axis=2)

    if whiten:
        filter_seqs = preprocessing.scale(filter_seqs)

    # From here on each row corresponds to one entry of the original axis 1.
    filter_seqs = np.transpose(filter_seqs)

    if drop_dead:
        filter_stds = filter_seqs.std(axis=1)
        filter_seqs = filter_seqs[filter_stds > 0]

    # 500 column indices drawn with replacement, so a column may repeat.
    seqs_i = np.random.randint(0, filter_seqs.shape[1], 500)
    sampled = filter_seqs[:, seqs_i]

    # Clip the colour range to the 0.1 / 99.9 percentiles of the sample.
    hmin = np.percentile(sampled, 0.1)
    hmax = np.percentile(sampled, 99.9)

    sns.set(font_scale=0.3)

    # clustermap() creates its own figure; opening another one first would
    # only leave a blank figure behind that plt.close() never reaches.
    sns.clustermap(sampled, row_cluster=True, col_cluster=True, linewidths=0, xticklabels=False, vmin=hmin, vmax=hmax)
    plt.savefig(out_pdf)
    plt.close()
예제 #3
0
def plot_centrimo(centrimo_in, figure_output):
    """Cluster the columns of a CentriMo table and save the figure.

    Args:
        centrimo_in: path of a tab-separated table; its first column becomes
            the index and it must contain an ``Average`` column.
        figure_output: path the figure is written to.
    """
    centrimo_table = pd.read_table(centrimo_in, index_col=0)
    # DataFrame.sort() no longer exists in pandas; sort_values() replaces it.
    centrimo_table.sort_values(by="Average", axis=0, ascending=False, inplace=True)
    # Rows keep the sorted order (row_cluster=False); only columns are clustered.
    sns.clustermap(centrimo_table, method='single', metric="euclidean",
                   z_score=None, row_cluster=False, col_cluster=True)
    f = plt.gcf()
    f.savefig(figure_output, bbox_inches='tight')
예제 #4
0
def plot_target_corr(filter_outs, seq_targets, filter_names, target_names, out_pdf, seq_op='mean'):
    """Plot a cluster map of Spearman correlations between filters and targets.

    Args:
        filter_outs: 3-D array reduced over axis 2; axis 0 indexes sequences
            and axis 1 indexes filters.
        seq_targets: 2-D array of targets; only the first
            ``filter_outs.shape[0]`` rows are used.
        filter_names: filter labels; must support boolean-mask indexing
            (for example a NumPy array).
        target_names: target labels, one per column of ``seq_targets``.
        out_pdf: path handed to ``plt.savefig``.
        seq_op: ``'mean'`` averages over axis 2; any other value takes the
            maximum.
    """
    num_seqs = filter_outs.shape[0]
    num_targets = len(target_names)

    if seq_op == 'mean':
        filter_outs_seq = filter_outs.mean(axis=2)
    else:
        filter_outs_seq = filter_outs.max(axis=2)

    # Drop filters whose output never varies across sequences; their rank
    # correlation is undefined.
    filter_seqs_std = filter_outs_seq.std(axis=0)
    live = filter_seqs_std > 0
    filter_outs_seq = filter_outs_seq[:, live]
    filter_names_live = filter_names[live]

    filter_target_cors = np.zeros((len(filter_names_live), num_targets))
    for fi in range(len(filter_names_live)):
        for ti in range(num_targets):
            # Only the coefficient is kept; the p-value is not used.
            cor, _ = spearmanr(filter_outs_seq[:, fi], seq_targets[:num_seqs, ti])
            filter_target_cors[fi, ti] = cor

    cor_df = pd.DataFrame(filter_target_cors, index=filter_names_live, columns=target_names)

    sns.set(font_scale=0.3)
    # clustermap() opens its own figure, so no plt.figure() beforehand: the
    # extra blank figure would never be closed.
    sns.clustermap(cor_df, cmap='BrBG', center=0, figsize=(8, 10))
    plt.savefig(out_pdf)
    plt.close()
def visualizeConsensus(consensusMat, connectivityMatrices, clusters, colNames):
    """Draw the consensus matrix as hierarchical cluster map(s) and save as PNG.

    When ``colNames`` is the literal ``'noXLabels'`` a single annotated map
    without sample labels is written. Otherwise ``colNames`` is read as a text
    file with one sample name per line, and both an annotated and a
    non-annotated map are written. File names are built from the module-level
    ``matrixPath``, the number of connectivity matrices and ``clusters``.
    """
    if colNames != 'noXLabels':
        # One sample name per line; only the trailing newline is stripped.
        with open(colNames) as names_file:
            sampleNames = [row.rstrip('\n') for row in names_file]

        frame = pd.DataFrame(data=consensusMat, index=sampleNames, columns=sampleNames)
        # Rows and columns are both clustered; the annotated variant prints
        # each cell's value.
        annotated = sns.clustermap(frame, col_cluster=True, row_cluster=True, annot=True)
        plain = sns.clustermap(frame, col_cluster=True, row_cluster=True, annot=False)

        # Horizontal row labels, vertical column labels on both maps.
        plt.setp(annotated.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.setp(plain.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
        plt.setp(annotated.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
        plt.setp(plain.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)

        tail = str(len(connectivityMatrices)) + '_runs_at_k=' + str(clusters) + '.png'
        annotated.savefig(str(matrixPath) + 'consensus_Matrix_over_' + tail)
        plain.savefig(str(matrixPath) + 'non_annotated_consensus_Matrix_over_' + tail)
        return

    # No label file: plot with the default integer labels.
    frame = pd.DataFrame(data=consensusMat)
    annotated = sns.clustermap(frame, col_cluster=True, row_cluster=True, annot=True)
    tail = str(len(connectivityMatrices)) + '_runs_at_k=' + str(clusters) + '.png'
    annotated.savefig(str(matrixPath) + 'consensus_Matrix_over_' + tail)
def produce_clustermap(rankings, fname, metric):
    """Write a row-clustered cluster map of ``rankings`` to a PDF file.

    Args:
        rankings: data passed straight to ``sns.clustermap``.
        fname: path of the PDF to create.
        metric: distance metric forwarded to ``sns.clustermap``.
    """
    sns.set()
    sns.set_context("paper")

    pp = PdfPages(fname)
    try:
        sns.clustermap(rankings, col_cluster=False, metric=metric)
        # savefig() without arguments stores the current figure, which is the
        # one clustermap() has just created.
        pp.savefig()
    finally:
        # Always release the PDF file, even when plotting fails.
        pp.close()
예제 #7
0
def drawClustermap(df, output):
    """Draw a cluster map of ``df`` and save it to ``output`` as a PDF.

    All styling comes from the module-level ``args`` namespace plus the
    ``cluster`` and ``figSize`` globals. ``args.scaling`` selects row-wise
    z-scoring (``'z_score'``), row-wise standard scaling (``'standard'``) or
    no scaling (anything else).
    """
    scaling = args.scaling
    options = dict(
        method=args.cluster_method,
        metric=args.distance_metric,
        linewidths=0.5,
        cmap=args.color,
        col_cluster=cluster,
    )
    # Only one of the two scaling keywords is ever passed.
    if scaling == 'z_score':
        options['z_score'] = 0
    elif scaling == 'standard':
        options['standard_scale'] = 0
    options['figsize'] = figSize

    g = sns.clustermap(df, **options)

    heat_axes = g.ax_heatmap
    plt.setp(heat_axes.get_yticklabels(), rotation=0, size=int(args.yaxis_fontsize), family=args.font)
    plt.setp(heat_axes.get_xticklabels(), rotation=90, size=int(args.xaxis_fontsize), family=args.font, weight='bold')
    g.savefig(output, format='pdf', dpi=1000, bbox_inches='tight')
예제 #8
0
def plot_filter_seg_heat(filter_outs, out_pdf, whiten=True, drop_dead=True):
    """Plot a clustered heat map of filter outputs pooled over segments.

    The last axis of ``filter_outs`` is cut into equal-length segments, each
    segment is reduced to its maximum, and every (instance, segment) pair is
    treated as a separate instance before clustering.

    Args:
        filter_outs: 3-D array whose axes are read as ``(b, f, l)``.
            NOTE(review): presumably (sequences, filters, positions) - confirm.
        out_pdf: path handed to ``plt.savefig``.
        whiten: if true, standardise with ``preprocessing.scale`` and cluster
            with the euclidean metric; otherwise cluster with cosine.
        drop_dead: if true, drop rows (after the transpose) whose standard
            deviation is zero.

    Raises:
        ValueError: if the last axis of ``filter_outs`` is empty.
    """
    b = filter_outs.shape[0]
    f = filter_outs.shape[1]
    l = filter_outs.shape[2]

    if l < 1:
        raise ValueError('filter_outs has an empty last axis; nothing to segment')

    # Smallest segment count >= 5 that divides l evenly. Starting at
    # min(5, l) guarantees termination for l < 5, where counting up from 5
    # would never find a divisor.
    s = min(5, l)
    while l % s:
        s += 1
    seg_len = l // s  # explicit floor division: same on Python 2 and 3
    print('%d segments of length %d' % (s, seg_len))

    # split into multiple segments
    filter_outs_seg = np.reshape(filter_outs, (b, f, s, seg_len))

    # maximum within each segment
    seg_max = filter_outs_seg.max(axis=3)

    # break each segment into a new instance
    filter_seqs = np.reshape(np.swapaxes(seg_max, 2, 1), (s*b, f))

    if whiten:
        filter_seqs = preprocessing.scale(filter_seqs)

    filter_seqs = np.transpose(filter_seqs)

    if drop_dead:
        filter_stds = filter_seqs.std(axis=1)
        filter_seqs = filter_seqs[filter_stds > 0]

    # 500 column indices drawn with replacement, so a column may repeat.
    seqs_i = np.random.randint(0, filter_seqs.shape[1], 500)
    sampled = filter_seqs[:, seqs_i]

    hmin = np.percentile(sampled, 0.1)
    hmax = np.percentile(sampled, 99.9)

    sns.set(font_scale=0.3)
    dist = 'euclidean' if whiten else 'cosine'

    # clustermap() creates its own figure; a plt.figure() beforehand would
    # only leave a blank figure open.
    sns.clustermap(sampled, metric=dist, row_cluster=True, col_cluster=True, linewidths=0, xticklabels=False, vmin=hmin, vmax=hmax)
    plt.savefig(out_pdf)
    plt.close()
예제 #9
0
def get_seaborn_clustermap(dfr, params, title=None, annot=True):
    """Build a Seaborn clustermap of ``dfr`` styled by ``params`` and return it.

    ``params`` supplies the colour map, value limits, colour bar (used for
    both row and column colours), square figure size, line widths and tick
    labels. When ``title`` is truthy it becomes the colour-bar axis label.
    """
    plot_options = {
        'cmap': params.cmap,
        'vmin': params.vmin,
        'vmax': params.vmax,
        'col_colors': params.colorbar,
        'row_colors': params.colorbar,
        'figsize': (params.figsize, params.figsize),
        'linewidths': params.linewidths,
        'xticklabels': params.labels,
        'yticklabels': params.labels,
        'annot': annot,
    }
    grid = sns.clustermap(dfr, **plot_options)

    # Put the colour-bar label on the left-hand side.
    colour_axis = grid.cax
    colour_axis.yaxis.set_label_position("left")
    if title:
        colour_axis.set_ylabel(title)

    # Vertical column labels, horizontal row labels.
    heat_axes = grid.ax_heatmap
    heat_axes.set_xticklabels(heat_axes.get_xticklabels(), rotation=90)
    heat_axes.set_yticklabels(heat_axes.get_yticklabels(), rotation=0)

    return grid
예제 #10
0
def clust_heatmap(gene_list, df_by_gene, num_to_plot=None, title='', plot=False, label_map=False):
    """Cluster cells on the first ``num_to_plot`` genes and save the heat map.

    Args:
        gene_list: column labels of ``df_by_gene`` to consider, in order.
        df_by_gene: DataFrame indexed by cell with one column per gene.
        num_to_plot: number of leading genes to plot. Defaults to
            ``len(gene_list)`` of the list passed in. The previous default was
            evaluated once at definition time against a module-level
            ``gene_list``.
        title: heat-map title; its first two space-separated words also name
            the output file.
        plot: if true, call ``plt.show()`` before saving.
        label_map: optional mapping from cell name to a sequence whose first
            item is the tick-label colour.

    Returns:
        Tuple ``(cell_linkage, plotted_frame, col_order)``: the column linkage
        matrix, the sub-frame that was plotted (before transposing) and the
        clustered column order.

    The clustering ``metric`` and ``method`` and the output directory
    ``filename`` are read from module-level names.
    """
    if num_to_plot is None:
        num_to_plot = len(gene_list)

    if num_to_plot > 175:
        # NOTE(review): `num_to_plot/100` floors under Python 2 for ints and
        # is true division under Python 3 - confirm which scaling is wanted.
        sns.set(context='poster', font_scale=0.65/(num_to_plot/100))
    else:
        sns.set(context='poster', font_scale=.80, font='Verdana')
    sns.set_palette('RdBu', 4, 0.1)

    cell_list = df_by_gene.index.tolist()
    plotted = df_by_gene[gene_list[0:num_to_plot]]
    # Genes become rows and cells become columns; z-score each gene (row).
    cg = sns.clustermap(plotted.transpose(), metric=metric, method=method, z_score=0, figsize=(30, 25))
    col_order = cg.dendrogram_col.reordered_ind
    cg.ax_heatmap.set_title(title)

    if label_map:
        # Colour each column label by its cell, following the clustered order.
        Xlabs = [cell_list[i] for i in col_order]
        colors = [label_map[cell][0] for cell in Xlabs]
        for xtick, color in zip(cg.ax_heatmap.get_xticklabels(), colors):
            xtick.set_color(color)
            xtick.set_rotation(270)
    if plot:
        plt.show()

    cell_linkage = cg.dendrogram_col.linkage

    if title != '':
        save_name = '_'.join(title.split(' ')[0:2])
        cg.savefig(os.path.join(filename, save_name+'_heatmap.pdf'), bbox_inches='tight')
    else:
        cg.savefig(os.path.join(filename, 'Non_group_heatmap_z1_deleted.pdf'), bbox_inches='tight')
    plt.close()
    return cell_linkage, plotted, col_order
def plot_clustermap(dat, cmap='purple', save_fig=False, save_name='Clustermap', db=None):
    """Plot clustermap.

    Parameters
    ----------
    dat : pandas.DataFrame
        Data to create clustermap from.
    cmap : str or colormap
        ``'purple'`` and ``'blue'`` select built-in cubehelix palettes; any
        other value is passed to seaborn unchanged.
    save_fig : bool
        Whether to save the figure as an SVG file.
    save_name : str
        File name (without extension) used when saving.
    db : object, optional
        Database object passed through ``check_db``; its ``figs_path``
        attribute is the output directory.
        NOTE(review): presumably ``check_db(None)`` supplies a default
        database - confirm.
    """

    # Set up plotting and aesthetics
    sns.set()
    sns.set_context("paper", font_scale=1.5)

    # Set colourmap
    if cmap == 'purple':
        cmap = sns.cubehelix_palette(as_cmap=True)
    elif cmap == 'blue':
        cmap = sns.cubehelix_palette(as_cmap=True, rot=-.3, light=0.9, dark=0.2)

    # Create the clustermap
    cg = sns.clustermap(dat, cmap=cmap, method='complete', metric='cosine', figsize=(12, 10))

    # Fix axes
    cg.cax.set_visible(True)
    _ = plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=60, ha='right')
    _ = plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    # Save out - if requested
    if save_fig:

        # `db` is a parameter now: assigning it here while reading it as an
        # undefined local used to raise UnboundLocalError on every save.
        db = check_db(db)
        s_file = os.path.join(db.figs_path, save_name + '.svg')

        cg.savefig(s_file, transparent=True)
def plot_heatmap_with_dendrogram(similarity_matrix, plot_name, show_link):
    """
    Draw an annotated heat map with dendrograms and write it to a PNG file.

    ``similarity_matrix`` is a square similarity matrix (Pandas data frame),
    ``plot_name`` is the output path without the ``.png`` extension, and
    ``show_link`` prints the linkage matrix when true.
    """
    # Turn similarities into distances, then condense the square matrix
    # into the form linkage() expects.
    distances = 100 - similarity_matrix
    Z = linkage(pdist.squareform(distances), 'average')
    if show_link:
        print(Z)

    sns.set(font='sans-serif', font_scale=0.7)

    # Cells are annotated with whole numbers.
    shown_values = np.round(similarity_matrix, decimals=0).astype(int)

    # The same precomputed linkage orders both rows and columns.
    grid = sns.clustermap(
        shown_values,
        annot=True,
        cmap='inferno_r',
        vmax=100,
        fmt='d',
        col_linkage=Z,
        row_linkage=Z,
    )
    heat_axes = grid.ax_heatmap
    plt.setp(heat_axes.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(heat_axes.xaxis.get_majorticklabels(), rotation=45)

    file_name = plot_name + '.png'
    click.echo('Writing ' + file_name, err=True)
    plt.savefig(file_name)
예제 #13
0
파일: plot.py 프로젝트: jrderuiter/ngs-tk
def plot_heatmap(data, columns=None, chrom="chrom", position="position", vline_color="black", **kwargs):
    """Plots a (clustered) CNV heatmap for multiple samples.

    Args:
        data: DataFrame with one row per genomic position, a chromosome
            column, a position column and one column per sample.
        columns: sample columns to plot; defaults to every column except
            ``chrom`` and ``position``.
        chrom: name of the chromosome column.
        position: name of the position column.
        vline_color: colour of the vertical lines drawn between chromosomes.
        **kwargs: forwarded to ``sns.clustermap``.

    Returns:
        The seaborn cluster grid.
    """

    # Select all columns by default.
    if columns is None:
        columns = [c for c in data if c not in {chrom, position}]

    # Sort data by position (DataFrame.sort() no longer exists in pandas).
    data = data.sort_values(by=[chrom, position], ascending=True)

    # Plot heatmap: samples are rows, positions stay in genomic order.
    g = sns.clustermap(data[columns].T, linewidths=0, col_cluster=False, **kwargs)
    g.ax_heatmap.set_xticks([])

    # The last row of each chromosome marks a break; `keep='last'` replaces
    # the removed `take_last=True` keyword.
    breaks = np.where(~data[chrom].duplicated(keep='last'))[0]
    breaks += 1

    for loc in breaks[:-1]:
        g.ax_heatmap.axvline(loc, color=vline_color)

    # Put each chromosome label midway between its two breaks.
    label_pos = np.concatenate([[0], breaks])
    label_pos = (label_pos[:-1] + label_pos[1:]) / 2

    g.ax_heatmap.set_xticks(label_pos)
    g.ax_heatmap.set_xticklabels(data[chrom].unique(), rotation=0)

    # Label axes.
    g.ax_heatmap.set_xlabel(chrom)

    return g
예제 #14
0
파일: clustering.py 프로젝트: subkar/msda
def plot_clustermap(df, output_path, cmap=None, legend_label='',
                    z_score=None, xticklabels=False, yticklabels=True,
                    colors_dict=None, col_colors=None, row_colors=None):
    """Make clustermap figure

    Parameters
    ----------
    df
        Data to cluster and plot.
    output_path
        Path the figure is saved to (300 dpi).
    cmap
        Colour map forwarded to seaborn.
    legend_label
        Text written next to the colour bar.
    z_score
        Forwarded to ``sns.clustermap``.
    xticklabels, yticklabels
        Forwarded to ``sns.clustermap``.
    colors_dict
        Optional mapping ``category -> {label: colour}`` used to build a
        legend for the colour annotations.
    col_colors, row_colors
        Colour annotations forwarded to ``sns.clustermap``.

    Returns
    -------
    cg
        The seaborn cluster grid.
    """

    # Forward row_colors as given; it used to be hard-coded to None, which
    # silently dropped the caller's row annotations.
    cg = sns.clustermap(df, col_colors=col_colors, row_colors=row_colors,
                        cmap=cmap, z_score=z_score,
                        yticklabels=yticklabels, xticklabels=xticklabels)

    if colors_dict:
        # Zero-size bars exist only to give the legend one handle per label.
        for cat in colors_dict:
            for label in colors_dict[cat]:
                cg.ax_col_dendrogram.bar(0, 0, color=colors_dict[cat][label],
                                         label=label, linewidth=0)
        cg.ax_col_dendrogram.legend(loc=(-0.7, -2), ncol=1)

    plt.subplots_adjust(top=1, bottom=0.02, left=0.3, right=0.8)
    fig = plt.gcf()
    fig.set_size_inches([10, 7.5])
    cg.cax.set_position((.025, .1, 0.025, .15))
    cg.cax.text(-0.3, -0.2, legend_label, fontsize=9)
    plt.savefig(output_path, dpi=300)
    return cg
예제 #15
0
    def __call__(self, data, path):
        """Render ``data`` as a seaborn cluster map and wrap it in a result block.

        Row colours come from ``self.getColorBar(data)``. Only the first ``n``
        columns are plotted, where ``n`` is the number of rows of ``data``.
        ``path`` is not used by this method.
        """
        row_colours = self.getColorBar(data)
        row_count = data.shape[0]
        grid = seaborn.clustermap(data.iloc[:, :row_count], row_colors=row_colours)
        # The placeholder text refers to the matplotlib figure by its number.
        figure_number = grid.cax.figure.number
        block = ResultBlock("""#$mpl %i$#\n""" % figure_number, title="ClusterMapPlot")
        return ResultBlocks(block)
예제 #16
0
파일: enrich.py 프로젝트: BennerLab/atg
    def plot_enrichment_multiple(self, multi_gene_list, output_filename):
        """Plot a cluster map of enrichment results for several gene lists.

        Args:
            multi_gene_list: passed to ``self.iterative_enrichment_multilist``.
            output_filename: path to save the figure to; when falsy the
                figure is shown interactively instead.
        """
        enrichment_results_df = self.iterative_enrichment_multilist(multi_gene_list)

        # replace accession numbers with GO term descriptions
        formatted_enrichment_df = enrichment_results_df.merge(self.term_definition, left_index=True,
                                                              right_on='GO term accession')
        formatted_enrichment_df['GO term'] = (formatted_enrichment_df['GO term name'] + '\n[' +
                                              formatted_enrichment_df['GO term accession'] + ']')
        # Keep only the enrichment columns, indexed by the readable label.
        formatted_enrichment_df.drop(self.term_definition.columns, axis=1, inplace=True)
        formatted_enrichment_df.set_index('GO term', inplace=True)

        sns.set(style='whitegrid')
        # clustermap() creates its own figure; an extra plt.figure() would
        # leak and appear as an empty window in the plt.show() branch.
        plot = sns.clustermap(formatted_enrichment_df, cmap="Reds_r")
        plt.setp(plot.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

        if output_filename:
            plot.savefig(output_filename)
        else:
            plt.show()
예제 #17
0
def plot_dist_matrix(matrix, fasta_names, heatmap_out, dendrogram_out):
    """Hierarchically cluster a distance matrix and plot it with seaborn.

    Writes a clustered heat map to ``heatmap_out`` and a stand-alone
    dendrogram to ``dendrogram_out``. Both use average linkage on euclidean
    distances between the rows of the matrix.
    """
    # Plotting modules are imported lazily; the backend must be selected
    # before pyplot is imported.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import seaborn as sns
    import pandas as pd
    from scipy.cluster.hierarchy import dendrogram, linkage

    n_names = len(fasta_names)
    pdm = pd.DataFrame(matrix, index=fasta_names, columns=fasta_names)

    # Heat map: square figure that grows with the number of sequences.
    figsizex = max(10, n_names / 4)
    heat_grid = sns.clustermap(
        pdm,
        metric='euclidean',
        method='average',
        figsize=(figsizex, figsizex),
    )
    heat_grid.savefig(heatmap_out)

    # Dendrogram on its own axes, without any spines.
    sns.set_style('white')
    figsizey = max(10, n_names / 8)
    f, ax = plt.subplots(figsize=(figsizex, figsizey))
    tree = linkage(pdm, metric='euclidean', method='average')
    dendrogram(tree, labels=pdm.index, ax=ax)
    sns.despine(left=True, bottom=True, right=True, top=True)
    plt.xticks(rotation=90)
    f.tight_layout()
    plt.savefig(dendrogram_out)
예제 #18
0
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    """Cluster genes over pseudotime, plot them, and return the gene clusters.

    Args:
        data_array: 2-D array; it is transposed before plotting, so axis 0
            becomes the x axis of the heat map.
            NOTE(review): presumably (time points, genes) - confirm.
        gene_names: row labels of the heat map, one per gene.
        pseudotimes: array of pseudotime values indexed by tick position.
        n_clusters: maximum number of flat clusters cut from the row linkage.
        gradient: if true, cluster the absolute z-scored gradient along
            axis 1 with the standardised euclidean metric; otherwise cluster
            the raw values by correlation and standard-scale each row.

    Returns:
        ``defaultdict(list)`` mapping each ``settings.STATE_COLORS`` entry to
        the names of the genes assigned to that cluster.
    """
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')

    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale,
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])

    # About ten evenly spaced x ticks. Floor division keeps the positions
    # integral (they index `pseudotimes`), and max(1, ...) avoids a zero step
    # for fewer than ten points.
    n_points = data_array.shape[0]
    r = np.arange(10, n_points, max(1, n_points // 10))
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')

    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
예제 #19
0
파일: corr.py 프로젝트: crowy415/sysbio
def CorrFig(data,outname):
    """Write the column correlation matrix of ``data`` and plot it clustered.

    The matrix is saved tab-separated to ``<outname>.corr`` and the annotated
    cluster map to ``<outname>.corr.pdf``.
    """
    correlations = data.corr()
    correlations.to_csv(outname + '.corr', index=True, header=True, sep='\t')

    seaborn.set_context('notebook', font_scale=1.2)
    grid = seaborn.clustermap(
        correlations,
        method='average',
        metric='euclidean',
        figsize=(12, 12),
        cmap='YlGnBu',
        annot=True,
    )
    # Horizontal row labels, vertical column labels.
    heat_axes = grid.ax_heatmap
    plt.setp(heat_axes.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(heat_axes.xaxis.get_majorticklabels(), rotation=90)
    plt.savefig(outname + '.corr.pdf')
예제 #20
0
def plot_clustermap(df):
    """Draw a cluster map of the column correlations of ``df``.

    Row tick labels are set horizontal. Nothing is returned or saved.
    """
    correlations = df.corr()
    grid = sns.clustermap(correlations)
    row_labels = grid.ax_heatmap.yaxis.get_majorticklabels()
    plt.setp(row_labels, rotation=0)
예제 #21
0
def clustermap(corpus, distance_matrix=None, color_leafs=True,
               outputfile=None, fontsize=5,
               save=False, show=False, return_svg=False):
    """
    Draw a square clustermap of the corpus using seaborn's
    `clustermap`.
    
    Parameters
    ----------
    corpus : `Corpus` instance
        The corpus to be plotted.
    distance_matrix : array-like, [n_texts, n_texts]
        A square distance matrix holding the 
        pairwise distances between all the texts in 
        the corpus.
    color_leafs: boolean, default=True,
        If true, will color the text labels on the
        axis according to their category.
    outputfile : str
        The path where the plot should be saved.
    fontsize : int, default=5
        The fontsize of the labels on the axes.
    save : boolean, default=False
        Whether to save the plot to `outputfile`.
    show : boolean, default=False
        Whether to display the plot with `plt.show()`.
    return_svg : boolean, default=False
        Whether to return the result of `plt_fig_to_svg`
        for the plot.

    Returns
    -------
    The value of `plt_fig_to_svg(cm)` when `return_svg` is
    true, otherwise None.

    """
    plt.clf()
    # convert to pandas dataframe:
    labels = corpus.titles
    df = pd.DataFrame(data=distance_matrix, columns=labels)
    # the plot shows correlations between the scaled, truncated distance columns
    df = df.applymap(lambda x:int(x*1000)).corr()

    # `spectral` was removed from matplotlib in favour of `nipy_spectral`;
    # fall back to the old name only where the new one is missing.
    leaf_cmap = getattr(plt.cm, 'nipy_spectral', None)
    if leaf_cmap is None:
        leaf_cmap = plt.cm.spectral

    # clustermap plotting:
    cm = sns.clustermap(df)
    ax = cm.ax_heatmap
    # NOTE(review): labels are coloured by tick position, while clustering
    # reorders the leaves - verify that `corpus.target_ints[idx]` really
    # belongs to the label drawn at position `idx`.
    # xlabels:
    for idx, label in enumerate(ax.get_xticklabels()):
        label.set_rotation('vertical')
        label.set_fontname('Arial')
        label.set_fontsize(fontsize)
        if color_leafs:
            label.set_color(leaf_cmap(corpus.target_ints[idx] / 10.))

    # ylabels:
    for idx, label in enumerate(ax.get_yticklabels()):
        label.set_rotation('horizontal')
        label.set_fontname('Arial')
        label.set_fontsize(fontsize)
        if color_leafs:
            label.set_color(leaf_cmap(corpus.target_ints[-idx-1] / 10.)) # watch out: different indexing on this axis
    if save:
        if outputfile:
            outputfile = os.path.expanduser(outputfile)
        cm.savefig(outputfile)
    if show:
        plt.show()
    if return_svg:
        return plt_fig_to_svg(cm)
예제 #22
0
def main():
    """Plot nested major/minor-index heat maps for the most variable groups.

    Reads a delimited table indexed by (major, minor), optionally
    log2-transforms and shifts it, keeps the major-index groups that have at
    least ``args.minor_cutoff`` rows, ranks them by mean variance and writes
    eleven PNG heat maps of ten groups each. In every heat map the minor-index
    rows are listed under the summed row of their major-index group.
    """
    args = parser.parse_args()
    import numpy as np
    import pandas as pd
    import seaborn as sns
    major_index = args.major_index
    minor_index = args.minor_index
    df = pd.read_table(args.tsv, index_col=[major_index, minor_index], sep=args.delimiter)
    df = np.log2(df) if args.log_normalize else df
    # set our undetected samples to our lowest detection
    df[df==-1*np.inf] = df[df!=-1*np.inf].min().min()
    # translate our data so we have no negatives (which would screw up our addition and makes no biological sense)
    if args.translate:
        df+=abs(df.min().min())
    major_counts = df.groupby(level=[major_index]).count()
    # we only want to plot samples with multiple values in the minor index
    cutoff = args.minor_cutoff
    multi = df[df.index.get_level_values(major_index).isin(major_counts[major_counts>=cutoff].dropna().index)]

    # Let's select the most variable major-index groups
    # (Series.order() no longer exists in pandas; sort_values() replaces it)
    most_variable = multi.groupby(level=major_index).var().mean(axis=1).sort_values(ascending=False)
    # and group them by tens: eleven figures of ten groups each
    for i in range(11):
        dat = multi[multi.index.get_level_values(major_index).isin(most_variable.index[10*i:10*(i+1)])]
        # we want to cluster by our major index, and then under these plot the values of our minor index
        major_dat = dat.groupby(level=major_index).sum()
        seaborn_map = sns.clustermap(major_dat, row_cluster=True, col_cluster=True)
        # now we keep this clustering, but recreate our data to fit the above clustering, with our minor
        # index below the major index (you can think of transcript levels under gene levels if you are
        # a biologist)
        merged_dat = pd.DataFrame(columns=[seaborn_map.data2d.columns])
        for major_val in seaborn_map.data2d.index:
            minor_rows = multi[multi.index.get_level_values(major_index)==major_val][seaborn_map.data2d.columns]
            major_row = major_dat.loc[major_val, ][seaborn_map.data2d.columns]
            # append() returns a new frame, so the result has to be kept
            merged_dat = merged_dat.append(major_row).append(minor_rows)
        merged_map = sns.clustermap(merged_dat, row_cluster=False, col_cluster=False)

        # recreate our dendrogram, this is undocumented and probably a hack but it works
        seaborn_map.dendrogram_col.plot(merged_map.ax_col_dendrogram)

        # for rows, I imagine at some point it will fail to fall within the major axis but fortunately
        # for this dataset it is not true
        seaborn_map.dendrogram_row.plot(merged_map.ax_row_dendrogram)
        merged_map.savefig('{}_heatmap_{}.png'.format(os.path.split(args.tsv.name)[1], i))
예제 #23
0
def make_heatmap_w2vrelated(model, rel_wds):
    """
    Draw a square cluster map of pairwise word similarities.

    ``model`` must provide ``similarity(word1, word2)`` (e.g. a word2vec
    model). ``rel_wds`` is a list of entries whose first item is the word.
    The diagonal is forced to zero.
    """
    names = [entry[0] for entry in rel_wds]
    size = len(rel_wds)
    data_mat = np.zeros((size, size))
    for row, first in enumerate(names):
        for col, second in enumerate(names):
            score = model.similarity(first, second)
            # A word's similarity to itself is blanked out.
            data_mat[row, col] = 0 if row == col else score

    df = pd.DataFrame(data=data_mat, columns=names, index=names)
    sb.clustermap(df, linewidths=.5)
def graphDendrogram(csv):
    """Plot a cluster map of the square table stored in a CSV file.

    Args:
        csv: path of a CSV file whose first (unnamed) column holds the row
            labels; the literal ``n/a`` is read as missing.

    The figure is saved to ``<outputfile>_dendrogram.pdf``, where
    ``outputfile`` is a module-level name.
    """
    dend_fn = csv
    dend_data = pd.read_csv(dend_fn, na_values = 'n/a')
    dend_data = dend_data.rename(columns = {'Unnamed: 0':'Project1'})
    # Long form and back to wide, which leaves rows and columns sorted by label.
    dend_data = pd.melt(dend_data, id_vars=['Project1'])
    dend_data = dend_data.rename(columns = {'variable':'project2','value':'ANI'})
    # Keyword arguments: current pandas rejects positional pivot() arguments.
    dend_data = dend_data.pivot(index="Project1", columns="project2", values="ANI")
    g = sns.clustermap(dend_data)
    g.savefig(outputfile + "_dendrogram.pdf")
def heatmap_plot_zscore_bigneuron(df_zscore_features, df_all, output_dir, title=None):
    """Plot a z-score feature heat map with samples clustered by Ward linkage.

    Args:
        df_zscore_features: DataFrame of z-scored features, one row per
            sample and one column per feature.
        df_all: DataFrame with an ``nt_type`` column that colours the samples.
        output_dir: directory that receives ``zscore_feature_heatmap.png``.
        title: optional plot title.

    Returns:
        The Ward linkage matrix computed on the rows of
        ``df_zscore_features``.
    """
    print("heatmap plot:bigneuron")

    # One colour per distinct value of the annotation column.
    metric = 'nt_type'
    mtypes = np.unique(df_all[metric])
    print(mtypes)
    mtypes_pal = sns.color_palette("hls", len(mtypes))

    mtypes_lut = dict(zip(mtypes, mtypes_pal))
    mtypes_colors = df_all[metric].map(mtypes_lut)

    # Sample (column) clustering for the heat map.
    col_linkage = hierarchy.linkage(df_zscore_features, method='ward', metric='euclidean')

    # Features become rows; they are put in dendrogram leaf order by hand,
    # so seaborn is told not to cluster rows again.
    data = df_zscore_features.transpose()
    row_linkage = hierarchy.linkage(data, method='ward', metric='euclidean')
    feature_order = hierarchy.leaves_list(row_linkage)
    matchIndex = [data.index[x] for x in feature_order]
    data = data.reindex(matchIndex)

    # clustermap() creates its own figure; a pl.figure() beforehand would
    # leave a blank figure that pl.close() below never reaches.
    g = sns.clustermap(data, row_cluster = False, col_linkage=col_linkage, method='ward', metric='euclidean',
                       linewidths = 0.0,col_colors = [mtypes_colors],
                       cmap = sns.cubehelix_palette(light=1, as_cmap=True),figsize=(40,10))

    pl.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    pl.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    pl.subplots_adjust(left=0.1, bottom=0.1, right=0.9, top=0.95)

    if title:
        pl.title(title)

    # Legend for the column colours: zero-size bars supply the handles, and
    # the legend is built once after all of them exist.
    for label in mtypes:
        g.ax_row_dendrogram.bar(0, 0, color=mtypes_lut[label], label=label, linewidth=0.0)
    if len(mtypes):
        g.ax_row_dendrogram.legend(loc="best", ncol=1, borderpad=0)

    filename = output_dir + '/zscore_feature_heatmap.png'
    pl.savefig(filename, dpi=300)
    print("save zscore matrix heatmap figure to :" + filename)
    pl.close()
    print("done clustering and heatmap plotting")
    return col_linkage
예제 #26
0
def cluster_all_map(mat, ylab, out):
    """Cluster all tissues together and save the figure as a PDF.

    Args:
        mat (array): Fold-change matrix with 26 sample columns in a fixed
            order: 9 heart, 9 brain and 8 quad samples.
        ylab (list): Y axis labels.
        out (str): Path of the PDF to write.

    Returns:
        None. The column colour bars mark genotype and tissue, and a legend
        explaining both is attached to the column dendrogram.
    """
    heart, brain, quad = "#3498db", "#e74c3c", "#2ecc71"
    wt, het, aso, ko = "#8FBC8F", "#B22222", "#FFD700", "#FF69B4"
    white = '#FFFFFF'

    # The white entry is a blank spacer between the two legend groups.
    color_leg = [heart, brain, quad, white, wt, het, aso, ko]
    legend_lab = ["heart", "brain", "quad", " ", "wt", "het", "aso", "ko"]
    xlabel = array(["wt", "wt", "wt", "Het", "Het", "aso", "aso", "ko", "ko",
                    "wt", "wt", "wt", "Het", "Het", "aso", "aso", "ko", "ko",
                    "wt", "wt", "Het", "Het", "aso", "aso", "ko", "ko", ])

    def _swatches(layout):
        """Expand (colour, count) pairs into one flat list of colours."""
        colours = []
        for hex_code, count in layout:
            colours.extend(sns.color_palette([hex_code], count))
        return colours

    # Genotype of each column, tissue by tissue.
    sample_type = _swatches([
        (wt, 3), (het, 2), (aso, 2), (ko, 2),
        (wt, 3), (het, 2), (aso, 2), (ko, 2),
        (wt, 2), (het, 2), (aso, 2), (ko, 2),
    ])
    # Tissue of each column.
    season_colors = _swatches([(heart, 9), (brain, 9), (quad, 8)])

    g = sns.clustermap(mat, annot=False, method='weighted', metric='euclidean',
                       col_colors=[sample_type, season_colors],
                       col_cluster=True, xticklabels=xlabel, yticklabels=ylab)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    # Zero-size bars exist only to give the legend its handles.
    for swatch, caption in zip(color_leg, legend_lab):
        g.ax_col_dendrogram.bar(0, 0, color=swatch, label=caption, linewidth=0)
    g.ax_col_dendrogram.legend(loc="upper right", ncol=2)
    plt.suptitle('Fold change filter across Tissues +/- 1 FC')
    plt.savefig(out, format="pdf", dpi=1000)
예제 #27
0
def heatmap(output_dir, table: pd.DataFrame,
            metadata: qiime2.CategoricalMetadataColumn=None,
            normalize: bool=True, title: str=None, metric: str='euclidean',
            method: str='average', cluster: str='both',
            color_scheme: str='rocket') -> None:
    """Render a clustered feature-table heat map into ``output_dir``.

    Writes ``feature-table-heatmap.png`` and ``feature-table-heatmap.svg``
    and renders the ``index.html`` template next to them.

    Raises:
        ValueError: if ``table`` is empty.
    """
    if table.empty:
        raise ValueError('Cannot visualize an empty table.')

    if metadata is not None:
        table = _munge_metadata(metadata, table, cluster)

    if normalize:
        table = table.apply(lambda x: np.log10(x + 1))
        cbar_label = 'log10 frequency'
    else:
        cbar_label = 'frequency'

    # Hard-coded values for reasonable plots
    scaletron = 50
    labelsize = 8
    dpi = 100
    sns.set(rc={'xtick.labelsize': labelsize, 'ytick.labelsize': labelsize,
                'figure.dpi': dpi})
    n_rows, n_cols = table.shape[0], table.shape[1]
    width = n_cols / scaletron
    height = n_rows / scaletron

    grid = sns.clustermap(table, method=method, metric=metric,
                          **_clustering_map[cluster],
                          cmap=color_scheme,
                          xticklabels=True, yticklabels=True,
                          cbar_kws={'label': cbar_label})
    if title is not None:
        grid.fig.suptitle(title)

    # Read every position before any axes is moved.
    hm = grid.ax_heatmap.get_position()
    cbar = grid.cax.get_position()
    row = grid.ax_row_dendrogram.get_position()
    col = grid.ax_col_dendrogram.get_position()

    # Resize the plot to set cell aspect-ratio to square
    top_edge = hm.y0 + height
    grid.ax_heatmap.set_position([hm.x0, hm.y0, width, height])
    grid.cax.set_position([cbar.x0, top_edge, cbar.width, cbar.height])
    grid.ax_row_dendrogram.set_position([row.x0, row.y0, row.width, height])
    grid.ax_col_dendrogram.set_position([col.x0, top_edge, width, col.height])

    # https://stackoverflow.com/a/34697479/3776794
    plt.setp(grid.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    plt.setp(grid.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    for ext in ('png', 'svg'):
        grid.savefig(os.path.join(output_dir, 'feature-table-heatmap.%s' % ext))

    index_fp = os.path.join(TEMPLATES, 'index.html')
    q2templates.render(index_fp, output_dir, context={'normalize': normalize})
예제 #28
0
def plot_nn_weights(w, x_labels, y_labels, fig_path, row_linkage=None, fig_size=(10, 3)):
    """Save a row-clustered heatmap of the weight matrix ``w`` to ``fig_path``.

    Parameters
    ----------
    w : array-like
        Matrix handed to ``pd.DataFrame``; its columns are named ``x_labels``.
    x_labels : sequence
        Column names of the heatmap.
    y_labels : sequence
        Row tick labels passed to seaborn as ``yticklabels``.
    fig_path : str
        Destination of the saved figure.
    row_linkage : array-like, optional
        Precomputed row linkage forwarded to ``sns.clustermap``.
    fig_size : tuple, optional
        Figure size in inches, forwarded to ``sns.clustermap``.
    """
    # clustermap always creates its own figure, so the size has to be given to
    # it directly; a separate plt.figure(figsize=...) would stay empty and
    # never be closed.
    clmap = sns.clustermap(pd.DataFrame(w, columns=x_labels),
                           method='average', metric='cosine', row_linkage=row_linkage,
                           col_cluster=False, robust=True, yticklabels=y_labels,
                           figsize=fig_size)
    plt.setp(clmap.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(clmap.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    clmap.cax.set_visible(False)
    clmap.fig.savefig(fig_path)
    # release the figure explicitly instead of relying on "current figure" state
    plt.close(clmap.fig)
예제 #29
0
파일: de.py 프로젝트: dmnfarrell/mirnaseq
def cluster_map(data, names):
    """Cluster map of genes.

    Selects the rows labelled ``names`` from ``data``, log-transforms them,
    centres every row on its mean and draws a seaborn clustermap.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose index contains the labels in ``names``.
    names : list-like
        Index labels of the rows to plot; a label missing from the index
        raises ``KeyError``.

    Returns
    -------
    The object returned by ``sns.clustermap``.
    """

    import seaborn as sns
    import pylab as plt
    # DataFrame.ix was removed in pandas 1.0; label-based selection is .loc
    data = data.loc[names]
    # NaNs produced by the log (e.g. from missing values) are replaced by 0
    X = np.log(data).fillna(0)
    # centre each row (axis=1) on its own mean
    X = X.apply(lambda x: x-x.mean(), 1)
    cg = sns.clustermap(X,cmap='RdYlBu_r',figsize=(8,10),lw=.5,linecolor='gray')
    plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
    return cg
예제 #30
0
def plot_gene_clustermap_by_membership(data_array, memberships):
    """Draw a clustermap of per-group mean profiles.

    Parameters
    ----------
    data_array : numpy.ndarray
        2-D array; rows are averaged per group, columns end up as heatmap rows.
    memberships : array-like
        One group label per row of ``data_array``.

    Returns
    -------
    The object returned by ``sns.clustermap``. Columns are not clustered and
    appear in order of first occurrence of each group label.
    """
    # a plain list would make ``memberships == group`` a single bool
    memberships = np.asarray(memberships)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the column
    # order no longer depends on set/hash ordering
    groups = list(dict.fromkeys(memberships))
    group_means = np.zeros([len(groups), data_array.shape[1]])
    for gi, group in enumerate(groups):
        group_means[gi, :] = data_array[memberships == group, :].mean(axis=0)
    # shift every column of group_means so that its minimum is zero
    cm = sns.clustermap((group_means - group_means.min(axis=0)).T, col_cluster=False)
    # tick positions at the centre of each heatmap cell
    r = np.arange(len(groups)) + 0.5
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(groups)
    cm.ax_heatmap.set_xlabel('Cell type')
    cm.ax_heatmap.set_ylabel('Gene')
    return cm
예제 #31
0
# Two transformed copies of ``z`` are clustered: a log-scaled one and one in
# which every row is divided by its own maximum.
# print(...) with a single argument runs on both Python 2 and Python 3.
print("standardizing by log scale...")
log_df = z + 0.01  # the offset keeps the logarithm finite for zero entries
log_df = np.log(log_df)

print("standardizing columns to the max in each row...")
std_df = z.div(z.max(axis=1), axis=0)

# figure width and height come from the 2nd and 3rd command-line arguments
x = float(sys.argv[2])
y = float(sys.argv[3])

print("making log heat map...")
sns.set(font_scale=1.5)
g = sns.clustermap(log_df,
                   figsize=(x, y),
                   cmap="magma",
                   yticklabels=False,
                   xticklabels=False,
                   col_colors=lut)
plt.savefig("heatmap_log.png", bbox_inches="tight", dpi=600)
plt.clf()

print("making standard heat map...")
g = sns.clustermap(std_df,
                   figsize=(x, y),
                   vmin=0,
                   vmax=1,
                   cmap="magma",
                   yticklabels=False,
                   xticklabels=False,
                   col_colors=lut)
plt.savefig("heatmap_std.png", bbox_inches="tight", dpi=600)
예제 #32
0
import numpy as np
import pandas as pd
from numpy.random import randn
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Pivot the long-form flights table into a year x month matrix and save four
# clustermap variants of it.
df = sns.load_dataset('flights')
# DataFrame.pivot only accepts keyword arguments in pandas >= 2.0
df2 = df.pivot(index='year', columns='month', values='passengers')
print(df2)
sns.clustermap(df2).savefig('cl1.png')
sns.clustermap(df2, col_cluster=False).savefig('cl2.png')
sns.clustermap(df2, standard_scale=0).savefig('cl3.png')
sns.clustermap(df2, standard_scale=1).savefig('cl4.png')
예제 #33
0
def jacobian_kinetics(
    adata,
    basis='umap',
    regulators=None,
    effectors=None,
    mode="pseudotime",
    tkey="potential",
    color_map="bwr",
    gene_order_method='raw',
    show_colorbar=False,
    cluster_row_col=[False, True],
    figsize=(11.5, 6),
    standard_scale=1,
    save_show_or_return='show',
    save_kwargs={},
    **kwargs
):
    """Plot the Jacobian element dynamics over time (pseudotime or inferred real time) in a heatmap.

    Note that by default `potential` estimated with the diffusion graph built from reconstructed vector field will be
    used as the measure of pseudotime.

    Parameters
    ----------
        adata: :class:`~anndata.AnnData`
            an Annodata object.
        basis: `str` or `None` (default: `umap`)
            The reduced dimension basis; the Jacobian is read from `adata.uns["jacobian_" + basis]`
            (or `adata.uns["jacobian"]` when basis is `None`).
        regulators: `list` or `None` (default: `None`)
            The list of genes that will be used as regulators for plotting the Jacobian heatmap, only limited to genes
            that have already performed Jacobian analysis.
        effectors: `List` or `None` (default: `None`)
            The list of genes that will be used as targets for plotting the Jacobian heatmap, only limited to genes
            that have already performed Jacobian analysis.
        mode: `str` (default: `pseudotime`)
            Forwarded to the half max ordering routine; only used when `gene_order_method` is `half_max_ordering`.
        tkey: `str` (default: `potential`)
            The .obs column that will be used for timing each cell. If it is `potential` and that column is missing,
            `ddhodge` is run first.
        color_map: `str` (default: `bwr`)
            Color map that will be used to color the heatmap. The heatmap is centered at 0, so a divergent color map
            such as `bwr`, `BrBG`, `RdBu_r` or `coolwarm` is appropriate.
        gene_order_method: `str` (default: `raw`) [`half_max_ordering`, `maximum`, `raw`]
            How rows (regulator->effector pairs) are prepared: `half_max_ordering` orders them with the half max
            ordering algorithm (HA Pliner, et. al, Molecular cell 71 (5), 858-871. e8), `maximum` orders them by the
            position of their highest value, `raw` keeps them as stored and scales each row by its maximum
            absolute value.
        show_colorbar: `bool` (default: `False`)
            Whether to show the color bar.
        cluster_row_col: `[bool, bool]` (default: `[False, True]`)
            The first entry is passed as `col_cluster`, the second as `row_cluster` (rows are only clustered when
            more than two rows are plotted).
        figsize: `tuple` (default: `(11.5, 6)`)
            Size of figure
        standard_scale: `int` or `None` (default: 1)
            Axis used for min/max scaling in the `maximum` ordering; `None` disables the scaling.
        save_show_or_return: {'show', 'save_fig', 'return'} (default: `show`)
            Whether to save_fig, show or return the figure.
        save_kwargs: `dict` (default: `{}`)
            A dictionary that will passed to the save_fig function. By default it is an empty dictionary and the save_fig function
            will use the {"path": None, "prefix": 'jacobian_kinetics', "dpi": None, "ext": 'pdf', "transparent": True, "close":
            True, "verbose": True} as its parameters. Otherwise you can provide a dictionary that properly modify those keys
            according to your needs.
        kwargs:
            Other keyword arguments merged (via `update_dict`) into the clustermap defaults
            `xticklabels=False, yticklabels=1`.
    Returns
    -------
        The seaborn cluster grid when `save_show_or_return` is `return`, otherwise nothing.

    Raises
    ------
        ValueError
            If none of the requested regulators or effectors has a stored Jacobian, or if `gene_order_method` is
            not one of the supported values.

    Examples
    --------
    >>> import dynamo as dyn
    >>> adata = dyn.sample_data.hgForebrainGlutamatergic()
    >>> adata = dyn.pp.recipe_monocle(adata)
    >>> dyn.tl.dynamics(adata)
    >>> dyn.vf.VectorField(adata, basis='pca')
    >>> valid_gene_list = adata[:, adata.var.use_for_transition].var.index[:2]
    >>> dyn.vf.jacobian(adata, regulators=valid_gene_list[0], effectors=valid_gene_list[1])
    >>> dyn.pl.jacobian_kinetics(adata)
    """

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    Jacobian_ = "jacobian" if basis is None else "jacobian_" + basis
    jacobian_store = adata.uns[Jacobian_]
    Der = jacobian_store.get('jacobian')
    cell_indx = jacobian_store.get('cell_idx')
    regulators_ = jacobian_store.get('regulators')
    effectors_ = jacobian_store.get('effectors')
    if tkey == "potential" and "potential" not in adata.obs_keys():
        ddhodge(adata)

    adata_ = adata[cell_indx, :]
    time = adata_.obs[tkey]
    # one row per (effector, regulator) pair, one column per cell
    jacobian_mat = Der.reshape((-1, Der.shape[2])) if Der.ndim == 3 else Der[None, :]
    if Der.ndim == 3:
        n_source_targets_ = Der.shape[0] * Der.shape[1]
        targets_ = np.repeat(effectors_, Der.shape[1])
        sources_ = np.tile(regulators_, Der.shape[0])
    else:
        n_source_targets_ = 1
        targets_ = np.repeat(effectors_, Der.shape[0])
        # sources are the regulators; labelling them with the effectors made the
        # row label disagree with the "regulator->effector" labels built below
        sources_ = np.repeat(regulators_, Der.shape[0])
    source_targets_ = [sources_[i] + '->' + targets_[i] for i in range(n_source_targets_)]

    regulators = regulators_ if regulators is None else regulators
    effectors = effectors_ if effectors is None else effectors
    if isinstance(regulators, str):
        regulators = [regulators]
    if isinstance(effectors, str):
        effectors = [effectors]
    # remember what the caller asked for so the error message can report it
    requested_regulators, requested_effectors = regulators, effectors
    regulators = list(set(regulators_).intersection(regulators))
    effectors = list(set(effectors_).intersection(effectors))
    if len(regulators) == 0 or len(effectors) == 0:
        raise ValueError(f"Jacobian related to source genes {requested_regulators} and target genes "
                         f"{requested_effectors} you provided do not exist. Available source genes include "
                         f"{regulators_} while available target genes include {effectors_}")
    n_source_targets = len(regulators) * len(effectors)
    targets, sources = np.repeat(effectors, len(regulators)), np.tile(regulators, len(effectors))
    source_targets = [sources[i] + '->' + targets[i] for i in range(n_source_targets)]
    requested_pairs = set(source_targets)

    # NOTE(review): only the matrix columns are reordered here; `time` itself is
    # passed on unsorted -- confirm that the helpers below expect that.
    jacobian_mat = jacobian_mat[:, np.argsort(time)]

    if gene_order_method == "half_max_ordering":
        time, ordered_mat, valid_ind, gene_idx = _half_max_ordering(
            jacobian_mat, time, mode=mode, interpolate=True, spaced_num=100
        )
        finite_rows = np.isfinite(ordered_mat.sum(1))
        # label every surviving row with the pair it belongs to (rows follow gene_idx)
        row_labels = np.array(source_targets_)[gene_idx][finite_rows]
        df = pd.DataFrame(ordered_mat[finite_rows, :], index=row_labels)
        # keep the computed ordering, restricted to the requested pairs
        source_targets = [pair for pair in df.index if pair in requested_pairs]
    elif gene_order_method == 'maximum':
        jacobian_mat = lowess_smoother(time, jacobian_mat, spaced_num=100)
        finite_rows = np.isfinite(jacobian_mat.sum(1))
        jacobian_mat = jacobian_mat[finite_rows, :]
        # drop the labels of the removed rows as well, so names stay aligned
        row_labels = np.array(source_targets_)[finite_rows]

        if standard_scale is not None:
            exprs = (jacobian_mat - np.min(jacobian_mat, axis=standard_scale)[:, None]) / np.ptp(
                jacobian_mat, axis=standard_scale
            )[:, None]
        else:
            # without scaling, order the smoothed values directly
            exprs = jacobian_mat
        max_sort = np.argsort(np.argmax(exprs, axis=1))
        df = pd.DataFrame(exprs[max_sort, :], index=row_labels[max_sort])
        # keep the computed ordering, restricted to the requested pairs
        source_targets = [pair for pair in df.index if pair in requested_pairs]
    elif gene_order_method == "raw":
        # scale every row to [-1, 1] by its largest absolute value
        jacobian_mat /= np.abs(jacobian_mat).max(1)[:, None]
        df = pd.DataFrame(jacobian_mat, index=np.array(source_targets_))
    else:
        raise ValueError("gene_order_method can only be one of half_max_ordering, maximum or raw")

    heatmap_kwargs = dict(xticklabels=False, yticklabels=1)
    if kwargs is not None:
        heatmap_kwargs = update_dict(heatmap_kwargs, kwargs)

    sns_heatmap = sns.clustermap(
        df.loc[source_targets, :],
        col_cluster=cluster_row_col[0],
        row_cluster=cluster_row_col[1] if len(source_targets) > 2 else False,
        cmap=color_map,
        figsize=figsize,
        center=0,
        **heatmap_kwargs
    )
    if not show_colorbar:
        sns_heatmap.cax.set_visible(False)

    if save_show_or_return == "save_fig":
        s_kwargs = {"path": None, "prefix": 'jacobian_kinetics', "dpi": None,
                    "ext": 'pdf', "transparent": True, "close": True, "verbose": True}
        s_kwargs = update_dict(s_kwargs, save_kwargs)

        save_fig(**s_kwargs)
    elif save_show_or_return == "show":
        if show_colorbar:
            plt.subplots_adjust(right=0.85)
        plt.tight_layout()
        plt.show()
    elif save_show_or_return == "return":
        return sns_heatmap
예제 #34
0
### Number of calls per (day of week, hour): heatmap and clustermap
byDayofWeekHour = (
    call_data
    .groupby(by=['Day_of_Week', 'Hour'])['twp']
    .count()
    .unstack(level=-1)
)

fig8 = plt.figure(figsize=(10, 6))
ax8 = fig8.add_axes([.1, .1, .8, .8])
# the axes just added is the current one, so name it explicitly
ax8 = sns.heatmap(byDayofWeekHour, cmap="coolwarm", ax=ax8)
fig8.suptitle('Heatmap: Hour by Day of the Week')

# clustermap builds its own figure; keep a handle on it to set the title
clm = sns.clustermap(byDayofWeekHour, cmap="coolwarm", figsize=(10, 6))
fig9 = clm.fig
fig9.suptitle('Clustermap: Hour by Day of the Week')

### Number of calls per (day of week, month): heatmap
byDayofWeekMonth = (
    call_data
    .groupby(by=['Day_of_Week', 'Month'])['twp']
    .count()
    .unstack(level=-1)
)

fig8 = plt.figure(figsize=(10, 6))
ax8 = fig8.add_axes([.1, .1, .8, .8])
ax8 = sns.heatmap(byDayofWeekMonth, cmap="coolwarm", ax=ax8)
예제 #35
0
# One cubehelix colour per network, keyed by the network name as a string
network_pal = sns.cubehelix_palette(n_colors=len(used_networks),
                                    start=1, rot=-2,
                                    light=.9, dark=.1, reverse=True)
network_lut = {
    str(network): colour
    for network, colour in zip(used_networks, network_pal)
}

# Look up the colour of every column's network; these vectors are drawn along
# the sides of the matrix
networks = df.columns.get_level_values("network")
network_colors = pd.Series(networks, index=df.columns).map(network_lut)

# Diverging colormap used for the heatmap cells
cmap = sns.diverging_palette(210, 350, s=90, l=30, as_cmap=True)

# Clustered correlation matrix with the network colours on rows and columns
sns.clustermap(df.corr(),
               cmap=cmap, linewidths=.5, figsize=(13, 13),
               row_colors=network_colors, col_colors=network_colors)


'''Scatterplot with categorical variables

https://seaborn.pydata.org/examples/scatterplot_categorical.html
'''

sns.set(style="whitegrid", palette="muted")

# Example iris dataset shipped with seaborn
iris = sns.load_dataset("iris")

# Reshape to long form: one row per (species, measurement, value)
iris = pd.melt(iris, id_vars="species", var_name="measurement")
 
예제 #36
0
# Scale every row by its own maximum, then discard the 'total' column
df = df.apply(lambda row: row / row.max(), axis=1).drop('total', axis=1)

# Order the columns by decreasing column sum, using a temporary 'sum' row
df.loc['sum'] = df.sum(axis=0)
df = df.sort_values('sum', axis=1, ascending=False).drop('sum', axis=0)

df.to_csv(args.output + ".csv")

# A positive count keeps only the leading columns; otherwise count is set to
# the number of columns available
if args.count > 0:
    df = df.iloc[:, :args.count]
else:
    args.count = df.shape[1]

# Figure size grows with the larger table dimension, capped at 50
sz = min(50, max(args.count, df.shape[0])) // 5
g = clustermap(data=df,
               metric='braycurtis',
               col_cluster=False,
               robust=True,
               figsize=(sz + 5, sz + 5))
if args.count > 50:
    g.ax_heatmap.get_xaxis().set_visible(False)

# Font size shrinks as the number of labels along an axis grows
plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(),
         fontsize=min(100, 40 * sz // args.count),
         rotation=90)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(),
         fontsize=min(100, 40 * sz // df.shape[0]),
         rotation=0, va='center')

# Thicker dendrogram lines
for branch in g.ax_row_dendrogram.collections:
    branch.set_linewidth(2)

for extension in (".svg", ".png"):
    g.savefig(args.output + extension)
예제 #37
0
import pandas as pd
import numpy as np
import seaborn  # used below as ``seaborn.set`` / ``seaborn.clustermap``

# names of columns
drug_conditions = ["marimastat 3 uM", "ibudilast 20 uM", "ibudilast 200 uM", 
                   "cabozantinib 0.1 uM", "cabozantinib 1 uM", "sorafenib 2 uM", "sorafenib 20 uM", 
                   "axitinib 2 uM", "axitinib 20 uM", "tofacitinib 2 uM", "tofacitinib 20 uM", 
                   "thalidomide 0.5 uM", "thalidomide 5 uM", "icatibant 0.1 uM", "icatibant 1 uM"]

# names of rows
cell_lines = ["G523", "G885", "G729", "G564", "G861"]              

# corresponding numerical value for each cell (one row per cell line,
# one column per drug condition)
viability_scores = np.array([[103.2445191,66.64440593,4.739848128,97.90725205,94.42274748,116.9796604,3.906406317,14.56179271,23.33680375,114.614175,117.3194148,92.78114457,104.8391006,99.06876532,99.90973465],
                             [99.54387203,95.89649106,4.679616554,99.55142491,99.892707,98.84118352,3.74480514,5.318624446,6.386772279,99.10143741,66.16415959,99.87811211,99.80021925,103.0671789,103.4136019],
                             [111.7364657,101.337013,13.41437035,115.9142502,109.4053095,116.1193251,9.227651647,84.10387886,55.32228376,115.9941438,115.0353046,93.96947089,112.1451494,94.90497327,109.030808],
                             [100.5542982,100.0080483,54.18581791,95.78376446,100.0588735,98.01859304,2.305208161,57.75735247,33.72282194,98.08674223,98.5647624,116.1383385,100.4812294,112.8439546,100.0594984],
                             [103.7184012,101.7273883,89.18527618,100.145608,61.24720474,103.5849188,2.019981229,46.41365913,4.292586744,101.9055264,89.33413613,102.7615483,100.2794762,111.1432579,105.007406]])
 
# compiles all of the above information into a pandas dataframe
data = pd.DataFrame(data=viability_scores, index=cell_lines, columns=drug_conditions)

print(data)

# make hierarchically clustered heatmap using seaborn
seaborn.set(color_codes=True)
clustered_map = seaborn.clustermap(data)

# save the generated heatmap
clustered_map.savefig("clustered.jpg")
예제 #38
0
# Invert the 'Total Return Index (UBS)' column: its values become the keys and
# the index labels of df_tickers become the values
tr_dict = {
    value: label
    for label, value in df_tickers['Total Return Index (UBS)'].to_dict().items()
}

# Map each classification to the colour drawn beside the heatmap
df_class = df_tickers['Classification'].replace({'DM': '#9FD356', 'EM': '#FA824C'})

# Read Total Return Index and rename its columns through tr_dict
df_tr = pd.read_excel(file_path, index_col=0, sheet_name='Total Return')
df_tr = df_tr.rename(columns=tr_dict)

# Compute one-period returns and keep the 2009-01-01 .. 2020-01-31 window
df = df_tr.pct_change(1)
df = df[df.index >= '2009-01-01']
df = df[df.index <= '2020-01-31']
corr = df.corr()

# Chart
sns.clustermap(data=corr,
               row_colors=df_class,
               col_colors=df_class,
               method='average',
               metric='euclidean',
               cmap='mako',
               linewidths=0,
               figsize=(10, 10))
plt.savefig(
    '/Users/gustavoamarante/Dropbox/CQF/Final Project/figures/Correlation and Dendrogam.pdf',
    pad_inches=0)
plt.show()
예제 #39
0




# Calls per date for the 'Fire' reason
df.loc[df['Reason'] == 'Fire'].groupby('Date')['twp'].count().plot()
plt.title('Fire')
plt.tight_layout()

# Calls per date for the 'EMS' reason
df.loc[df['Reason'] == 'EMS'].groupby('Date')['twp'].count().plot()
plt.title('EMS')
plt.tight_layout()


# Day-of-week x hour table of call counts
dayHour = df.groupby(by=['Day of Week', 'Hour'])['Reason'].count().unstack()
dayHour.head()

plt.figure(figsize=(12, 6))
sns.heatmap(dayHour, cmap='viridis')


sns.clustermap(dayHour, cmap='viridis')

# Day-of-week x month table of call counts
dayMonth = df.groupby(by=['Day of Week', 'Month'])['Reason'].count().unstack()
dayMonth.head()

plt.figure(figsize=(12, 6))
sns.heatmap(dayMonth, cmap='viridis')

sns.clustermap(dayMonth, cmap='viridis')
rois_beta_df["GM_mean"] = rois_avg_df["GM_mean"]

##
# Seaborn
import scipy.cluster.hierarchy as hc
import scipy.spatial as sp

# Clustermap driven by a precomputed distance matrix, see
# https://stackoverflow.com/questions/38705359/how-to-give-sns-clustermap-a-precomputed-distance-matrix
DF = rois_beta_df.copy()

# Correlation between columns, turned into a dissimilarity (1 - r)
DF_corr = DF.corr()
DF_dism = 1 - DF_corr  # ** 2

# hc.linkage takes the condensed form; the same linkage orders rows and columns
linkage = hc.linkage(sp.distance.squareform(DF_dism), method='average')
g = sns.clustermap(DF_corr, row_linkage=linkage, col_linkage=linkage)
plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0)  # horizontal y labels
plt.savefig(os.path.join(WD_CLUST, "cor_rois.pdf"))

# Positive / Negatives
# --------------------

# Masks of the strictly positive and strictly negative entries of comp
beta_pos_msk = comp > 0
beta_neg_msk = comp < 0

print(beta_pos_msk.sum(), beta_neg_msk.sum())

Xscores = np.zeros((X_adni.shape[0], 5))

# Column 0: projection on the positive entries; column 1: their plain mean
Xscores[:, 0] = np.dot(X_adni[:, beta_pos_msk], comp[beta_pos_msk]).ravel()
Xscores[:, 1] = X_adni[:, beta_pos_msk].mean(axis=1)
def _rotate_heatmap_ticklabels(cluster_grid):
    """Make the heatmap's x tick labels vertical and its y tick labels horizontal."""
    heatmap_ax = cluster_grid.ax_heatmap
    plt.setp(heatmap_ax.get_xticklabels(), rotation=90)
    plt.setp(heatmap_ax.get_yticklabels(), rotation=0)


def visualizeConsensus(consensusMat, connectivityMatrices, clusters, colNames,
                       suffix):
    """Plot a consensus matrix as clustermap(s) and write it out as a table.

    Output files are written under the module-level ``matrixPath`` prefix.

    Parameters
    ----------
    consensusMat : array-like
        Square consensus matrix handed to ``pd.DataFrame``.
    connectivityMatrices : sized
        Only its length is used (number of runs, part of the file name).
    clusters
        Value of k, only used in the file name.
    colNames : str
        Either the literal ``'noXLabels'`` or the path of a text file with one
        sample name per line.
    suffix : str
        File extension of the saved figures (without the dot).
    """
    plt.rcParams['font.size'] = '8'
    plt.rcParams['pdf.fonttype'] = 42

    # str() first, then concatenate, so every output path is built the same way
    out_prefix = str(matrixPath)
    table_path = out_prefix + 'consensus_matrix_table.txt'
    figure_tail = ('consensus_Matrix_over_' + str(len(connectivityMatrices)) +
                   '_runs_at_k=' + str(clusters) + '.' + suffix)

    if colNames == 'noXLabels':
        # put consensus matrix into dataframe to build hierarchical clustermap
        dataframe = pd.DataFrame(data=consensusMat)
        dataframe.to_csv(table_path, sep="\t")
        # cluster rows and columns and annotate every cell with its value
        consensusClustered = sns.clustermap(dataframe,
                                            col_cluster=True,
                                            row_cluster=True,
                                            annot=True)
        consensusClustered.savefig(out_prefix + figure_tail)
        return

    # assign sample names to the consensus matrix
    with open(colNames) as names_file:
        sampleNames = [line.rstrip('\n') for line in names_file]
    dataframe = pd.DataFrame(data=consensusMat,
                             index=sampleNames,
                             columns=sampleNames)

    # xticklabels/yticklabels=True makes seaborn label every row and column in
    # clustered order itself; overwriting the labels by hand goes wrong as soon
    # as seaborn thins the ticks of a large matrix.
    consensusClustered = sns.clustermap(dataframe,
                                        col_cluster=True,
                                        row_cluster=True,
                                        annot=True,
                                        xticklabels=True,
                                        yticklabels=True)
    _rotate_heatmap_ticklabels(consensusClustered)

    consensusClustered_non_annt = sns.clustermap(dataframe,
                                                 col_cluster=True,
                                                 row_cluster=True,
                                                 annot=False,
                                                 xticklabels=True,
                                                 yticklabels=True)
    _rotate_heatmap_ticklabels(consensusClustered_non_annt)

    consensusClustered.savefig(out_prefix + figure_tail)
    consensusClustered_non_annt.savefig(
        out_prefix + 'non_annotated_' + figure_tail)

    # write the table with rows and columns in the clustered column order of
    # the non-annotated map; positional selection also works when sample names
    # are not unique
    col_order = list(consensusClustered_non_annt.dendrogram_col.reordered_ind)
    df_ordered_by_clust = dataframe.iloc[col_order, col_order]
    df_ordered_by_clust.to_csv(table_path, sep="\t")
예제 #42
0
def heatmap(
        adata,
        var_names,
        sortby="latent_time",
        layer="Ms",
        color_map="viridis",
        col_color=None,
        palette="viridis",
        n_convolve=30,
        standard_scale=0,
        sort=True,
        colorbar=None,
        col_cluster=False,
        row_cluster=False,
        context=None,
        font_scale=None,
        figsize=(8, 4),
        show=None,
        save=None,
        **kwargs,
):
    """\
    Plot time series for genes as heatmap.

    Arguments
    ---------
    adata: :class:`~anndata.AnnData`
        Annotated data matrix.
    var_names: `str`,  list of `str`
        Names of variables to use for the plot; names that are not in
        `adata.var_names` are dropped.
    sortby: `str` (default: `'latent_time'`)
        Observation key to extract time data from. Cells with a non-finite
        time are left out.
    layer: `str` (default: `'Ms'`)
        Layer key to extract count data from; `adata.X` is used if the layer
        does not exist.
    color_map: `str` (default: `'viridis'`)
        String denoting matplotlib color map.
    col_color: `str` or list of `str` (default: `None`)
        Observation key(s) whose colors are drawn along the columns.
    palette: list of `str` (default: `'viridis'`)
        Colors to use for plotting groups (categorical annotation).
    n_convolve: `int` or `None` (default: `30`)
        If `int` is given, data is smoothed by convolution
        along the x-axis with kernel size n_convolve.
    standard_scale : `int` or `None` (default: `0`)
        Either 0 (rows) or 1 (columns). Whether or not to standardize that dimension
        (each row or column), subtract minimum and divide each by its maximum.
    sort: `bool` (default: `True`)
        Whether to order the genes by the position of their maximum value.
    colorbar: `bool` or `None` (default: `None`)
        Whether to show colorbar.
    {row,col}_cluster : `bool` or `None`
        If True, cluster the {rows, columns}.
    context : `None`, or one of {paper, notebook, talk, poster}
        A dictionary of parameters or the name of a preconfigured set.
    font_scale : float, optional
        Scaling factor to scale the size of the font elements.
    figsize: tuple (default: `(8,4)`)
        Figure size.
    show: `bool`, optional (default: `None`)
        Show the plot, do not return the grid.
    save: `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the default
        filename. Infer the filetype if ending on {'.pdf', '.png', '.svg'}.
    kwargs:
        Arguments passed to seaborns clustermap,
        e.g., set `yticklabels=True` to display all gene names in all rows.
        `tkey` and `xkey` are accepted as aliases for `sortby` and `layer`.

    Returns
    -------
    If `show==False` the object returned by `seaborn.clustermap`
    """

    import seaborn as sns

    # a single name would otherwise be iterated character by character
    if isinstance(var_names, str):
        var_names = [var_names]
    var_names = [name for name in var_names if name in adata.var_names]

    tkey, xkey = kwargs.pop("tkey", sortby), kwargs.pop("xkey", layer)
    time = adata.obs[tkey].values
    # Positions (in adata) of the cells with finite time, ordered by time.
    # Sorting the filtered vector alone would yield positions into the filtered
    # vector, which no longer line up with the rows of X.
    finite_idx = np.flatnonzero(np.isfinite(time))
    cell_order = finite_idx[np.argsort(time[finite_idx])]

    X = (adata[:, var_names].layers[xkey]
         if xkey in adata.layers.keys() else adata[:, var_names].X)
    if issparse(X):
        X = X.toarray()
    df = pd.DataFrame(X[cell_order], columns=var_names)

    if n_convolve is not None:
        weights = np.ones(n_convolve) / n_convolve
        for gene in var_names:
            try:
                df[gene] = np.convolve(df[gene].values, weights, mode="same")
            except Exception:
                # deliberate best effort: a gene that cannot be smoothed
                # (e.g. all-zero counts or nans) keeps its raw values
                pass

    if sort:
        # order genes by the position at which each one peaks
        max_sort = np.argsort(np.argmax(df.values, axis=0))
        df = pd.DataFrame(df.values[:, max_sort], columns=df.columns[max_sort])
    strings_to_categoricals(adata)

    if col_color is not None:
        col_colors = to_list(col_color)
        col_color = []
        for col in col_colors:
            if not is_categorical(adata, col):
                # bin a continuous annotation to two decimals of its maximum
                # and store it as a new categorical obs column
                obs_col = adata.obs[col]
                cat_col = np.round(obs_col / np.max(obs_col),
                                   2) * np.max(obs_col)
                adata.obs[f"{col}_categorical"] = pd.Categorical(cat_col)
                col += "_categorical"
                set_colors_for_categorical_obs(adata, col, palette)
            col_color.append(interpret_colorkey(adata, col)[cell_order])

    if "dendrogram_ratio" not in kwargs:
        kwargs["dendrogram_ratio"] = (
            0.1 if row_cluster else 0,
            0.2 if col_cluster else 0,
        )
    # NOTE(review): with colorbar=True but no explicit cbar_pos the colorbar is
    # still disabled here -- confirm whether that is intended.
    if "cbar_pos" not in kwargs or not colorbar:
        kwargs["cbar_pos"] = None

    kwargs.update(
        dict(
            col_colors=col_color,
            col_cluster=col_cluster,
            row_cluster=row_cluster,
            cmap=color_map,
            xticklabels=False,
            standard_scale=standard_scale,
            figsize=figsize,
        ))

    args = {}
    if font_scale is not None:
        args = {"font_scale": font_scale}
        context = context or "notebook"

    with sns.plotting_context(context=context, **args):
        try:
            cm = sns.clustermap(df.T, **kwargs)
        except Exception:
            # retry without the two keywords this function added itself
            logg.warn("Please upgrade seaborn with `pip install -U seaborn`.")
            kwargs.pop("dendrogram_ratio")
            kwargs.pop("cbar_pos")
            cm = sns.clustermap(df.T, **kwargs)

    savefig_or_show("heatmap", save=save, show=show)
    if show is False:
        return cm
# Share of misclassified outliers, reported as a classification rate
rate = n_error_outliers/y_out.size
print("Classification rate = ",100*(1-rate),"%")

import seaborn as sns
sns.pairplot(df)

type(t_out)

# flatten to 1-D whatever the number of samples is (was hard-coded to 328)
plot_data = pd.DataFrame(np.array(t_out).reshape(-1))

import seaborn as sns
sns.pairplot(plot_data)

sns.distplot(plot_data)

sns.clustermap(X)
# extra work not required.

sns.violinplot([X])
# extra work not required.



"""# Problem 2"""

# Load the digit data
digits = datasets.load_digits()
# View the features of the first observation
digits.data[0:1]
# View the target of the first observation
digits.target[0:1]
예제 #44
0
def kinetic_heatmap(
    adata,
    genes,
    mode="vector_field",
    basis=None,
    layer="X",
    project_back_to_high_dim=True,
    tkey="potential",
    dist_threshold=1e-10,
    color_map="BrBG",
    gene_order_method='half_max_ordering',
    show_colorbar=False,
    cluster_row_col=[False, False],
    figsize=(11.5, 6),
    standard_scale=1,
    save_show_or_return='show',
    save_kwargs={},
    **kwargs
):
    """Plot the gene expression dynamics over time (pseudotime or inferred real time) in a heatmap.

    Note that by default `potential` estimated with the diffusion graph built from reconstructed vector field will be
    used as the measure of pseudotime.

    Parameters
    ----------
        %(kin_curves.parameters.no_ncol|color|c_palette)s
        dist_threshold: `float` or None (default: `1e-10`)
            Only used when `mode` is `vector_field`. Consecutive time points whose squared Euclidean distance in
            expression is not larger than this value are dropped (the first time point is always kept). Use None to
            disable the filtering.
        color_map: `str` (default: `BrBG`)
            Color map that will be used to color the gene expression. If `half_max_ordering` is True, the
            color map need to be divergent, good examples, include `BrBG`, `RdBu_r` or `coolwarm`, etc.
        gene_order_method: `str` (default: `half_max_ordering`) [`half_max_ordering`, `maximum`]
            Supports two different methods for ordering genes when plotting the heatmap: either `half_max_ordering`,
            or `maximum`. For `half_max_ordering`, it will order genes into up, down and transit groups by the half
            max ordering algorithm (HA Pliner, et. al, Molecular cell 71 (5), 858-871. e8). While for `maximum`,
            it will order by the position of the highest gene expression.
        show_colorbar: `bool` (default: `False`)
            Whether to show the color bar.
        cluster_row_col: `[bool, bool]` (default: `[False, False]`)
            Whether to cluster the heatmap. The first element is forwarded as `col_cluster` and the second element
            as `row_cluster` to seaborn's clustermap.
        figsize: `tuple` (default: `(11.5, 6)`)
            Size of figure
        standard_scale: `int` or None (default: 1)
            Only used by the `maximum` ordering. Either 0 or 1: the axis along which the smoothed expression is
            min-max scaled (subtract the minimum, divide by the range). None disables the scaling.
        save_show_or_return: {'show', 'save', 'save_fig', 'return'} (default: `show`)
            Whether to save the figure (`save` and `save_fig` are equivalent), show it or return it.
        save_kwargs: `dict` (default: `{}`)
            A dictionary that will passed to the save_fig function. By default it is an empty dictionary and the save_fig function
            will use the {"path": None, "prefix": 'kinetic_heatmap', "dpi": None, "ext": 'pdf', "transparent": True, "close":
            True, "verbose": True} as its parameters. Otherwise you can provide a dictionary that properly modify those keys
            according to your needs.
        kwargs:
            All other keyword arguments are merged into the heatmap defaults via update_dict(). The defaults are
            `xticklabels=False, yticklabels=1`.

    Returns
    -------
        The object returned by `seaborn.clustermap` when `save_show_or_return` is `return`; otherwise None.

    Raises
    ------
        ValueError
            If `gene_order_method` is neither `half_max_ordering` nor `maximum`.
    """

    import pandas as pd
    import seaborn as sns
    import matplotlib.pyplot as plt

    if tkey == "potential" and "potential" not in adata.obs_keys():
        ddhodge(adata)

    exprs, valid_genes, time = fetch_exprs(
        adata, basis, layer, genes, tkey, mode, project_back_to_high_dim
    )

    # `.toarray()` works for both sparse matrices and sparse arrays, whereas
    # the `.A` shortcut is not available on newer SciPy sparse arrays.
    exprs = exprs.toarray() if issparse(exprs) else exprs

    if dist_threshold is not None and mode == 'vector_field':
        # Keep row 0 plus every row whose squared distance to the previous
        # row exceeds the threshold (the +1 converts diff indices to rows).
        valid_ind = list(
            np.where(np.sum(np.diff(exprs, axis=0) ** 2, axis=1) > dist_threshold)[0]
            + 1
        )
        valid_ind.insert(0, 0)
        exprs = exprs[valid_ind, :]
        time = time[valid_ind]

    if gene_order_method == "half_max_ordering":
        time, ordered_exprs, valid_ind, gene_idx = _half_max_ordering(
            exprs.T, time, mode=mode, interpolate=True, spaced_num=100
        )
        # Drop rows containing non-finite values, together with their labels.
        finite_rows = np.isfinite(ordered_exprs.sum(1))
        gene_labels = np.array(valid_genes)[gene_idx][finite_rows]

        df = pd.DataFrame(ordered_exprs[finite_rows, :], index=gene_labels)
    elif gene_order_method == 'maximum':
        smoothed = lowess_smoother(time, exprs.T, spaced_num=100)
        # Filter the rows AND the gene labels with the same mask so that the
        # labels stay aligned with the rows that survive the filter.
        finite_rows = np.isfinite(smoothed.sum(1))
        smoothed = smoothed[finite_rows, :]
        gene_labels = np.array(valid_genes)[finite_rows]

        if standard_scale is not None:
            # keepdims makes the broadcast correct for axis 0 as well as 1.
            lowest = np.min(smoothed, axis=standard_scale, keepdims=True)
            spread = np.ptp(smoothed, axis=standard_scale, keepdims=True)
            smoothed = (smoothed - lowest) / spread
        # Order genes by the position of their maximum along the columns.
        max_sort = np.argsort(np.argmax(smoothed, axis=1))
        df = pd.DataFrame(smoothed[max_sort, :], index=gene_labels[max_sort])
    else:
        raise ValueError('gene order_method can only be either half_max_ordering or maximum')

    heatmap_kwargs = dict(xticklabels=False, yticklabels=1)
    heatmap_kwargs = update_dict(heatmap_kwargs, kwargs)

    sns_heatmap = sns.clustermap(
        df,
        col_cluster=cluster_row_col[0],
        row_cluster=cluster_row_col[1],
        cmap=color_map,
        figsize=figsize,
        **heatmap_kwargs
    )
    if not show_colorbar:
        sns_heatmap.cax.set_visible(False)

    # The docstring historically advertised 'save_fig' while the code only
    # checked for 'save'; accept both spellings.
    if save_show_or_return in ("save", "save_fig"):
        s_kwargs = {"path": None, "prefix": 'kinetic_heatmap', "dpi": None,
                    "ext": 'pdf', "transparent": True, "close": True, "verbose": True}
        s_kwargs = update_dict(s_kwargs, save_kwargs)

        save_fig(**s_kwargs)
    elif save_show_or_return == "show":
        if show_colorbar:
            plt.subplots_adjust(right=0.85)
        plt.tight_layout()
        plt.show()
    elif save_show_or_return == "return":
        return sns_heatmap
예제 #45
0
# Finish the current figure: add the legend, save it to output_file, show it,
# then close the figure object f.
plt.legend()
plt.savefig(output_file)
plt.show()
plt.close(f)

#

# Plot the clustered correlation matrix between the models' scores
num_models = len(models)  # not used in the visible part of this script
# models[k][1] serves as the display name of model k
model_names = [models[k][1] for k in selected_scores.keys()]
# Stack the first entry of every model's scores (and labels) as one row per model
ss = np.concatenate([v[0][None, :] for v in selected_scores.values()], axis=0)
sl = np.concatenate([v[0][None, :] for v in selected_labels.values()], axis=0)
# Transpose so that each model becomes a column
df = pd.DataFrame(ss.transpose(), columns=model_names)

# Create a categorical palette to identify the networks
network_pal = sns.husl_palette(len(model_names), s=.45)
network_lut = dict(zip(map(str, model_names), network_pal))

# Convert the palette to vectors that will be drawn on the side of the matrix
network_colors = pd.Series(model_names, index=df.columns).map(network_lut)

# Draw the full plot
sns.clustermap(df.corr(),
               center=0,
               cmap="RdBu",
               row_colors=network_colors,
               col_colors=network_colors,
               linewidths=.75,
               figsize=(13, 13))
plt.show()
예제 #46
0
    def cn_heatmap(self,
                   df,
                   cell_font_size=3,
                   max_cn=4,
                   method='ward',
                   cmap='bwr',
                   figsize=(15, 20),
                   xlabel='Contigs',
                   ylabel='Cells',
                   **kwargs):
        """
        Create a heatmap from a copy number matrix

        df: triple indexed dataframe with as columns ('contig', start, end ), as rows cells/samples

        cell_font_size (int): font size of the cell labels
            NOTE(review): this argument is accepted but not used by the body — confirm intent.

        max_cn (int) : upper limit of the colour scale (passed as vmax; the lower limit is 0)

        method (str) : linkage method passed to seaborn.clustermap

        cmap (str) : colormap used

        figsize(tuple) : Size of the figure

        xlabel (str) : Label for the x-axis, by default this is Contigs

        ylabel (str) : Label for the y-axis, by default this is Cells

        **kwargs : Arguments which will be passed to seaborn.clustermap

        Returns the seaborn ClusterGrid, or the Axes returned by seaborn.heatmap
        when clustering failed and the plain heatmap fallback was used.
        """

        # Sort the columns once and restrict them to the contigs of interest;
        # the keyword form of `axis` is required by current pandas.
        ordered = df.sort_index(axis=1)[self.contigs]

        try:
            clmap = sns.clustermap(ordered,
                                   col_cluster=False,
                                   method=method,
                                   cmap=cmap,
                                   vmax=max_cn,
                                   vmin=0,
                                   yticklabels=True,
                                   figsize=figsize,
                                   **kwargs)
            ax_heatmap = clmap.ax_heatmap
        except Exception:
            # Best-effort fallback: draw the same matrix without clustering.
            print('Falling back on heatmap without clustering')
            fig, ax_heatmap = plt.subplots(figsize=figsize)
            clmap = sns.heatmap(ordered,
                                cmap=cmap,
                                vmax=max_cn,
                                vmin=0,
                                yticklabels=True,
                                ax=ax_heatmap)

        # Walk the columns, drawing a separator wherever the contig changes
        # and placing one tick label at the centre of every contig block.
        prev = None
        xtick_pos = []
        xtick_label = []
        last_idx = 0
        n_columns = 0
        for idx, (contig, _start, _end) in enumerate(ordered.columns):
            if prev is not None and prev != contig:
                ax_heatmap.axvline(idx - 0.5, c='k', lw=1.5, zorder=10)
                xtick_pos.append((idx + last_idx) / 2)
                xtick_label.append(prev)
                last_idx = idx
            prev = contig
            n_columns = idx + 1

        # The loop only labels a block when the NEXT contig starts, so the
        # final block has to be labelled here.
        if prev is not None:
            xtick_pos.append((n_columns + last_idx) / 2)
            xtick_label.append(prev)

        ax_heatmap.set_xticks(xtick_pos)
        ax_heatmap.set_xticklabels(xtick_label, rotation=0, fontsize=8)
        ax_heatmap.set_xlabel(xlabel, labelpad=20)
        ax_heatmap.set_ylabel(ylabel, labelpad=20)

        return clmap
plt.tight_layout()
#%%
# Number of rows per Date among rows whose Reason is 'Fire', drawn with the
# default Series plot.
df[df['Reason']=='Fire'].groupby('Date').count()['twp'].plot()
plt.tight_layout()

#%%
# Same plot for rows whose Reason is 'EMS'.
df[df['Reason']=='EMS'].groupby('Date').count()['twp'].plot()
plt.tight_layout()

#%%
# Count rows per (Day of Week, Hour) and pivot Hour into the columns.
day_hour = df.groupby(by=['Day of Week','Hour']).count()['Reason'].unstack()

#%%
plt.figure(figsize=(12,6))
sns.heatmap(day_hour, cmap='viridis')

#%%
# clustermap creates its own figure, so no plt.figure() call is made here.
sns.clustermap(day_hour, cmap='viridis')

#%%
# Count rows per (Day of Week, Month) and pivot Month into the columns.
day_month = df.groupby(by=['Day of Week', 'Month']).count()['Reason'].unstack()

#%%
plt.figure(figsize=(12, 6))
sns.heatmap(day_month, cmap='viridis')

#%%
sns.clustermap(day_month, cmap='viridis')

#%%
예제 #48
0
plt.show()




# Section 3.6, seaborn.heatmap: seaborn.heatmap() draws a heat map, i.e. a
# colour-coded matrix. (The string below is the original section note.)
'''
3.6 seaborn.heatmap

seaborn.heatmap() 主要是用于绘制热力图,也就类似于色彩矩阵。
'''
# Generate a random 10x10 matrix
matrix_data = np.random.rand(10, 10)

# Draw the plot
sns.heatmap(data=matrix_data)

plt.show()




# Section 3.7, seaborn.clustermap: seaborn.clustermap() draws a matrix data
# set as a hierarchically clustered heat map. (The string below is the
# original section note.)
'''
3.7 seaborn.clustermap

seaborn.clustermap() 可以将矩阵数据集绘制为层次聚类热图。
'''
iris_data.pop("species")  # drop the flower species column (in place)

# Draw the plot
sns.clustermap(iris_data)
plt.show()
예제 #49
0
# Append the columns of conds2 to the selected feature matrix.
sfinal_feat = pd.concat([sFeatMatAll[combi], conds2], axis=1)

#make a colormap to assign colours - based on class (ie clozapine10 is separate)
cmap1 = sns.color_palette("tab20", np.unique(sfinal_feat['drug']).shape[0])

#make a clustergram
#1. make lut for drug colors
#2. map the lut onto the clustergram
lut = dict(zip(np.unique(sfinal_feat['drug']), cmap1))

#add in row colors to the dataframe
row_colors = sfinal_feat['drug'].map(lut)  #map onto the feature Matrix

#make clustergram
# Only the columns before the last three are clustered — presumably the last
# three are the condition columns that came from conds2; verify.
cg=sns.clustermap(sfinal_feat.iloc[:,:-3], metric  = 'euclidean', cmap = 'inferno', \
                  row_colors = row_colors)
# Relabel the rows with the drug names, in the order produced by the row dendrogram.
plt.setp(cg.ax_heatmap.yaxis.set_ticklabels\
         (sfinal_feat['drug'][cg.dendrogram_row.reordered_ind]))
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0, fontsize=8)
plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90, fontsize=10)
# Re-apply the column dendrogram position with scale factors of 1 (no size change).
col = cg.ax_col_dendrogram.get_position()
cg.ax_col_dendrogram.set_position(
    [col.x0, col.y0, col.width * 1, col.height * 1])
#save fig
# directoryA[0:-7] strips the last seven characters of directoryA before
# appending the 'Figures' sub-folder.
plt.savefig(os.path.join(directoryA[0:-7], 'Figures', 'Agar_stats_LDA_clustergram1.tif'), \
            dpi =150, bbox_inches = 'tight', pad_inches = 1)
plt.show()

#make list of the final order of the drugs
drug_order = list(sfinal_feat['drug'][cg.dendrogram_row.reordered_ind])
conc_order = list(
예제 #50
0
    louvain = attr_df.loc[i, 'louvain']  #louvain of sample
    cell_type = attr_df.loc[i, 'biosample_cell_type']  #cell type

    l_color = louvain_colors[louvain]
    c_color = cell_type_colors[cell_type]
    attr_df.loc[i, 'louvain_colors'] = l_color
    attr_df.loc[i, 'cell_type_colors'] = c_color
#print(attr_df.head())

#print(len(louvain)) #8 clusters
#print(len(cell_type)) #9 cell)_type
cmap = sns.diverging_palette(220, 20, as_cmap=True)
g = sns.clustermap(features_df,
                   cmap=cmap,
                   row_cluster=True,
                   col_cluster=False,
                   row_colors=attr_df[['louvain_colors', 'cell_type_colors']],
                   linewidths=0,
                   xticklabels=False,
                   yticklabels=False)
#for some reason, it cant take two legends....
#legend_louvain = [mpatches.Patch(color=c, label=l) for c,l in attr_df[['louvain_colors','louvain']].drop_duplicates().values]
#l2=g.ax_heatmap.legend(loc='upper left',bbox_to_anchor=(0.05,1.3),handles=legend_louvain,frameon=True)
#l2.set_title(title='louvain cluster',prop={'size':10})
attr_df['biosample_cell_type'] = encoder.inverse_transform(
    attr_df['biosample_cell_type'].values.tolist())
legend_cell_type = [
    mpatches.Patch(color=k, label=v) for k, v in attr_df[
        ['cell_type_colors', 'biosample_cell_type']].drop_duplicates().values
]
l1 = g.ax_heatmap.legend(loc='upper left',
                         bbox_to_anchor=(1.01, 0.6),
예제 #51
0
def main(_):
  """Print annotation statistics and save the analysis plots.

  Reads every ``.csv`` file in ``FLAGS.data`` into one DataFrame, prints
  summary statistics about the annotations, and writes four PDF figures into
  ``FLAGS.plot_dir`` (created if missing): ``correlations.pdf``,
  ``hierarchical_clustering.pdf``, ``hierarchical_corr.pdf`` and
  ``number_of_labels.pdf``. The emotion names are read from
  ``FLAGS.emotion_file`` and the sentiment grouping from
  ``FLAGS.sentiment_dict``.

  Args:
    _: Unused positional argument.
  """
  print("Loading data...")
  dfs = []
  for filename in os.listdir(FLAGS.data):
    if filename.endswith(".csv"):
      dfs.append(
          pd.read_csv(os.path.join(FLAGS.data, filename), encoding="utf-8"))
  data = pd.concat(dfs)
  print("%d Examples" % (len(set(data["id"]))))
  print("%d Annotations" % len(data))

  if not os.path.isdir(FLAGS.plot_dir):
    os.makedirs(FLAGS.plot_dir)

  with open(FLAGS.emotion_file, "r") as f:
    all_emotions = f.read().splitlines()
  all_emotions_neutral = all_emotions + ["neutral"]
  print("%d emotion Categories" % len(all_emotions))

  print("%d unique raters" % len(data["rater_id"].unique()))
  print("%.3f marked unclear" %
        (data["example_very_unclear"].sum() / len(data)))

  # Since the ones marked as difficult have no labels, exclude those
  data = data[data[all_emotions_neutral].sum(axis=1) != 0]

  print("Distribution of number of labels per example:")
  print(data[all_emotions_neutral].sum(axis=1).value_counts() / len(data))
  print("%.2f with more than 3 labels" %
        ((data[all_emotions_neutral].sum(axis=1) > 3).sum() /
         len(data)))  # more than 3 labels

  print("Label distributions:")
  print((data[all_emotions_neutral].sum(axis=0).sort_values(ascending=False) /
         len(data) * 100).round(2))

  print("Plotting label correlations...")
  # Average the ratings of each example over its annotation rows
  ratings = data.groupby("id")[all_emotions].mean()

  # Compute the correlation matrix
  corr = ratings.corr()

  # Generate a mask for the upper triangle
  # (the builtin `bool` replaces the `np.bool` alias removed from NumPy)
  mask = np.zeros_like(corr, dtype=bool)
  mask[np.triu_indices_from(mask)] = True

  # Set up the matplotlib figure
  fig, _ = plt.subplots(figsize=(11, 9))

  # Generate a custom diverging colormap
  cmap = sns.diverging_palette(220, 10, as_cmap=True)

  # Draw the heatmap with the mask and correct aspect ratio
  sns.heatmap(
      corr,
      mask=mask,
      cmap=cmap,
      vmax=.3,
      center=0,
      square=True,
      linewidths=.5,
      cbar_kws={"shrink": .5})
  fig.savefig(
      FLAGS.plot_dir + "/correlations.pdf",
      dpi=500,
      format="pdf",
      bbox_inches="tight")

  print("Plotting hierarchical relations...")
  # Ward linkage over the correlation distance between emotion columns
  z = linkage(
      pdist(ratings.T, metric="correlation"),
      method="ward",
      optimal_ordering=True)
  fig = plt.figure(figsize=(11, 4), dpi=400)
  plt.xlabel("")
  plt.ylabel("")
  dendrogram(
      z,
      labels=ratings.columns,
      leaf_rotation=90.,  # rotates the x axis labels
      leaf_font_size=12,  # font size for the x axis labels
      color_threshold=1.05,
  )
  fig.savefig(
      FLAGS.plot_dir + "/hierarchical_clustering.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")

  sent_color_map = {
      "positive": "#BEECAF",
      "negative": "#94bff5",
      "ambiguous": "#FFFC9E"
  }
  with open(FLAGS.sentiment_dict) as f:
    sent_dict = json.loads(f.read())
  # Colour every emotion by sentiment group; anything that is neither
  # positive nor negative is treated as ambiguous.
  sent_colors = {}
  for e in all_emotions:
    if e in sent_dict["positive"]:
      sent_colors[e] = sent_color_map["positive"]
    elif e in sent_dict["negative"]:
      sent_colors[e] = sent_color_map["negative"]
    else:
      sent_colors[e] = sent_color_map["ambiguous"]

  # Generate a mask for the diagonal (self-correlations)
  mask = np.zeros_like(corr, dtype=bool)
  mask[np.diag_indices(mask.shape[0])] = True

  # Generate a custom diverging colormap
  cmap = sns.diverging_palette(220, 10, as_cmap=True)

  row_colors = pd.Series(
      corr.columns, index=corr.columns, name="sentiment").map(sent_colors)

  # Draw the clustered heatmap, reusing the linkage computed above for both
  # rows and columns
  g = sns.clustermap(
      corr,
      mask=mask,
      cmap=cmap,
      vmax=.3,
      vmin=-0.3,
      center=0,
      row_linkage=z,
      col_linkage=z,
      col_colors=row_colors,
      linewidths=.1,
      cbar_kws={
          "ticks": [-.3, -.15, 0, .15, .3],
          "use_gridspec": False,
          "orientation": "horizontal",
      },
      figsize=(10, 10))

  g.ax_row_dendrogram.set_visible(False)
  g.cax.set_position([.34, -0.05, .5, .03])

  # Zero-size bars exist only to create legend entries for the sentiment
  # colours
  for label in sent_color_map:
    g.ax_col_dendrogram.bar(
        0, 0, color=sent_color_map[label], label=label, linewidth=0)

  g.ax_col_dendrogram.legend(
      title="Sentiment", loc="center", bbox_to_anchor=(1.1, .5))

  g.savefig(FLAGS.plot_dir + "/hierarchical_corr.pdf", dpi=600, format="pdf")

  print("Calculating agreements...")
  unique_labels = data.groupby("id").apply(CheckAgreement, 1,
                                           all_emotions_neutral).to_dict()
  data["unique_labels"] = data["id"].map(unique_labels)
  agree_dict_2 = data.groupby("id").apply(CheckAgreement, 2,
                                          all_emotions_neutral).to_dict()
  data["agree_2"] = data["id"].map(agree_dict_2)
  agree_dict = data.groupby("id").apply(CheckAgreement, 3,
                                        all_emotions_neutral).to_dict()
  data["agree_3"] = data["id"].map(agree_dict)
  agree_dict = data.groupby("id").apply(CheckAgreement, 1, all_emotions_neutral,
                                        1).to_dict()
  data["no_agree"] = data["id"].map(agree_dict)

  filtered_2 = data[data["agree_2"].str.len() > 0]
  print(
      "%d (%d%%) of the examples have 2+ raters agreeing on at least one emotion label"
      % (len(filtered_2["id"].unique()), (len(filtered_2) / len(data) * 100)))

  filtered_3 = data[data["agree_3"].str.len() > 0]
  print(
      "%d (%d%%) of the examples have 3+ raters agreeing on at least one emotion label"
      % (len(filtered_3["id"].unique()), (len(filtered_3) / len(data) * 100)))

  print("Plotting number of labels...")
  data["num_unique_prefilter"] = data["unique_labels"].apply(CountLabels)
  data["num_unique_postfilter"] = data["agree_2"].apply(CountLabels)
  unique_ex = data.drop_duplicates("id")
  df = pd.DataFrame({
      "count":
          unique_ex["num_unique_prefilter"].tolist() +
          unique_ex["num_unique_postfilter"].tolist(),
      "type": ["pre-filter"] * len(unique_ex) + ["post-filter"] * len(unique_ex)
  })

  fig = plt.figure(dpi=600)
  ax = sns.countplot(
      data=df, x="count", hue="type", palette=["skyblue", "navy"])
  plt.xlim(-.5, 7.5)
  plt.legend(loc="center right", fontsize="x-large")
  plt.ylabel("Number of Examples", fontsize="x-large")
  plt.xlabel("Number of Labels", fontsize="x-large")
  # Draw first so that the tick labels are populated before being rewritten
  plt.draw()
  labels = [item.get_text() for item in ax.get_yticklabels()]
  ax.set_yticklabels(["%dk" % (int(int(label) / 1000)) for label in labels])
  plt.tight_layout()

  fig.savefig(
      FLAGS.plot_dir + "/number_of_labels.pdf",
      dpi=600,
      format="pdf",
      bbox_inches="tight")

  print("Proportion of agreement per label:")
  print(
      filtered_2[all_emotions_neutral].sum(axis=0).sort_values(ascending=False)
      / len(data))
예제 #52
0
    if len(taxon_all) < 2:
        taxon = "unknown"
    else:
        taxon = taxa[index].split(';')[1].split("_")[-1]
    if toi != None:
        if toi not in taxon:
            df.drop([index], inplace=True, axis=0)
            continue

    if taxon in taxa_colors:
        row_colors.append(taxa_colors[taxon])
    else:
        row_colors.append("w")

print "plotting..."
sns.set(font_scale=1)
g = sns.clustermap(df,
                   figsize=(8, 8),
                   col_colors=col_colors,
                   col_cluster=False,
                   yticklabels=False,
                   xticklabels=False,
                   cmap="magma_r")

# adjust axis labels
plt.setp(g.ax_heatmap.get_xticklabels(), rotation=90)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

plt.savefig("figure.png", bbox_inches='tight', dpi=300)
plt.show()
예제 #53
0
            vertex_label_size=5,
            vertex_frame_width=0,
            vertex_size=20,
            edge_width=1.,
            target='%s/reports/Figure_4.pdf' % wd)
print '[INFO] Network exported: ', network_i.summary()

# -- Betas heatmap
cmap = sns.diverging_palette(220, 10, n=9, as_cmap=True)

plot_df = lm_betas_kinases.loc[:, [m in met_name for m in lm_betas_kinases]]
plot_df.columns = [met_name[m] for m in plot_df]
plot_df.index = [acc_name[i].split(';')[0] for i in plot_df.index]

sns.set(style='white', palette='pastel')
sns.clustermap(plot_df.T, figsize=(15, 20), cmap=cmap, linewidth=.5)
plt.savefig('%s/reports/Figure_Supp_4_kinases_dynamic_betas.pdf' % wd,
            bbox_inches='tight')
plt.close('all')

plot_df = lm_betas_tfs.loc[:, [m in met_name for m in lm_betas_tfs]]
plot_df.columns = [met_name[m] for m in plot_df]
plot_df.index = [acc_name[i].split(';')[0] for i in plot_df.index]
plot_df = plot_df[plot_df.std(1) != 0]

sns.set(style='white', palette='pastel')
sns.clustermap(plot_df.T, figsize=(15, 20), cmap=cmap, linewidth=.5)
plt.savefig(
    '%s/reports/Figure_Supp_4_transcription_factors_dynamic_betas.pdf' % wd,
    bbox_inches='tight')
plt.close('all')
예제 #54
0
# Usage (positional arguments read below):
#   argv[1]    tab-separated matrix, first column used as the index
#   argv[2]    figure side length (the figure is square)
#   argv[3..6] colours for samples whose name contains "2014-09", "2015-06",
#              "2016-02" and "2017-02" respectively
z = pd.read_csv(sys.argv[1], sep='\t', index_col=0)

# add colored x-labels
# One colour per column, chosen by the first date tag found in the sample
# name; samples without a known tag are drawn white.
lut = []
for sample in z.columns.values:
    if "2014-09" in sample:
        lut.append(sys.argv[3])
    elif "2015-06" in sample:
        lut.append(sys.argv[4])
    elif "2016-02" in sample:
        lut.append(sys.argv[5])
    elif "2017-02" in sample:
        lut.append(sys.argv[6])
    else:
        lut.append('w')

# make heat map
sns.set(font_scale=0.6)
size = float(sys.argv[2])
# The same colour list is used for rows and columns, which presumably relies
# on z being a square sample-by-sample matrix — verify against the input.
g = sns.clustermap(z, figsize=(size, size), col_colors=lut, row_colors=lut,
                   col_cluster=True, xticklabels=False, yticklabels=False,
                   cmap="magma")


plt.subplots_adjust(left=0, right=1, top=0.99, bottom=0.01)

# Layout tuning constants
ratio = 0.6
h_adjust = 0.08
w_adjust = -0.05

# Current positions of every axes in the cluster grid
hm = g.ax_heatmap.get_position()
xden = g.ax_col_dendrogram.get_position()
yden = g.ax_row_dendrogram.get_position()
col = g.ax_col_colors.get_position()
row = g.ax_row_colors.get_position()
legend = g.cax.get_position()
예제 #55
0
def plot_heatmap(data,
                 vmin=0,
                 vmax=100,
                 cm=None,
                 col_colors=None,
                 row_colors=None,
                 sorted_labels=None,
                 annot=True,
                 col_cluster=False,
                 row_cluster=False):
    """Draw an annotated seaborn clustermap sized from the shape of ``data``.

    The figure size is derived from the number of rows and columns so that
    every cell has room for its tick label. Cells equal to 0 are masked.

    Parameters
    ----------
    data : pandas.DataFrame
        Matrix to plot; its index/column names become the axis labels.
    vmin, vmax : float
        Limits of the colour scale.
    cm : matplotlib colormap, optional
        Colormap to use. When None, a reversed ``gist_heat`` is built with
        light gray for masked cells and blue for values below ``vmin``.
    col_colors, row_colors : optional
        Passed to ``seaborn.clustermap``.
    sorted_labels : list, optional
        Row labels; when given, the rows of ``data`` are selected and
        ordered by these labels.
    annot : bool
        Whether to write the value inside each cell.
    col_cluster, row_cluster : bool
        Whether to cluster the columns / rows.

    Returns
    -------
    The object returned by ``seaborn.clustermap``.
    """
    # seaborn
    dpi = 72.27  # points per inch used to convert font sizes to inches
    fontsize_x_pt = 8
    fontsize_y_pt = 10

    # compute the matrix height in points and inches
    matrix_height_pt = fontsize_y_pt * data.shape[0]
    matrix_height_in = matrix_height_pt / dpi
    matrix_width_pt = fontsize_x_pt * data.shape[1]
    matrix_width_in = matrix_width_pt / dpi

    # compute the required figure height
    top_margin = 0.04  # in percentage of the figure height
    bottom_margin = 0.04  # in percentage of the figure height
    coeff = 2
    usable_fraction = 1 - top_margin - bottom_margin
    figure_height = coeff * matrix_height_in / usable_fraction
    figure_width = coeff * matrix_width_in / usable_fraction
    # Only consumed by the custom seaborn build handled below.
    ccr = 0.8 * col_colors.shape[
        0] / figure_height if col_colors is not None else 0

    if cm is None:
        # reversed() returns a new colormap, so the registered 'gist_heat'
        # instance is not modified by set_bad/set_under. It replaces the
        # removed `plt.cm.revcmap` helper.
        cm = plt.get_cmap('gist_heat').reversed(name='hot_r')
        cm.set_bad('lightgray')
        cm.set_under('blue')
    if sorted_labels is not None:
        # `.ix` was removed from pandas; `.loc` is the label-based accessor.
        data = data.loc[sorted_labels]

    clustermap_kwargs = dict(
        col_cluster=col_cluster,
        row_cluster=row_cluster,
        figsize=(figure_width, figure_height),
        col_colors=col_colors,
        row_colors=row_colors,
        cmap=cm,
        mask=(data == 0),
        vmin=vmin,
        vmax=vmax,
        xticklabels=1,  # print all labels
        annot=annot,
        annot_kws={'fontsize': 3},
        fmt='.2f')
    # `col_colors_ratio` is only passed for this specific development build.
    if sns.__version__ == "0.9.dev0+k":
        clustermap_kwargs['col_colors_ratio'] = ccr
    splot = sns.clustermap(data, **clustermap_kwargs)

    splot.cax.set_visible(False)  # TODO
    plt.setp(splot.ax_row_dendrogram, visible=False)  # TODO
    plt.setp(splot.ax_col_dendrogram, visible=False)  # TODO
    splot.ax_heatmap.yaxis.set_ticks_position('left')
    splot.ax_heatmap.yaxis.set_label_position('left')
    splot.ax_heatmap.set_xlabel(data.columns.name, fontsize=10)
    splot.ax_heatmap.set_ylabel(data.index.name, fontsize=10)

    # One y tick at the centre of every row.
    splot.ax_heatmap.set_yticks(numpy.arange(data.shape[0]) + 0.5, minor=False)
    plt.setp(splot.ax_heatmap.get_yticklabels(), rotation=0)
    plt.setp(splot.ax_heatmap.get_xticklabels(), rotation=90)
    plt.setp(splot.ax_heatmap.get_yticklabels(), fontsize=8)
    plt.setp(splot.ax_heatmap.get_xticklabels(), fontsize=6)
    return splot
예제 #56
0
# Strip plot of total_bill per day, first without and then with jitter.
sns.stripplot(x="day", y="total_bill", data=tips)
sns.stripplot(x="day", y="total_bill", data=tips,jitter=True) # adds jitter to better visualize 

#--- swarm plot
# NOTE(review): the argument-less calls in this file (sns.swarmplot(),
# sns.heatmap(), sns.clustermap(), sns.factorplot()) look like signature
# placeholders rather than runnable examples — confirm before executing.
sns.swarmplot() 
sns.swarmplot(x="day", y="total_bill", data=tips)

#------------------------------------------#
#----------- MATRIX PLOTS -----------------#
#------------------------------------------#
#---- HEATMAP
sns.heatmap()
# Heat map of the pairwise correlations of the tips columns.
sns.heatmap(tips.corr(), cmap='RdBu_r') 

#--- CLUSTER MAP 
sns.clustermap()



#------------------------------------------#
#----------- GENERAL PLOT -----------------#
#------------------------------------------#
sns.factorplot()
sns.factorplot(x='sex',y='total_bill',data=tips,kind='bar') # 'kind' parameter decides type of plot

#----------- GENERAL GRIDS ---------------#
#--- pairgrid
sns.PairGrid(iris) # Just the Grid
g = sns.PairGrid(iris)
# Histograms on the diagonal, scatter plots in the upper triangle.
g.map_diag(plt.hist)
g.map_upper(plt.scatter)
예제 #57
0
def clustermap(adata,
               obs_keys=None,
               use_raw=True,
               show=None,
               save=None,
               **kwargs):
    r"""Hierarchically-clustered heatmap [Waskom16]_.

    Wraps `seaborn.clustermap <https://seaborn.pydata.org/generated/seaborn.clustermap.html>`_ for :class:`~scanpy.api.AnnData`.

    Parameters
    ----------
    adata : :class:`~scanpy.api.AnnData`
        Annotated data matrix.
    obs_keys : `str`
        Categorical annotation to plot with a different color map.
        Currently, only a single key is supported.
    use_raw : `bool`, optional (default: `True`)
        Use `raw` attribute of `adata` if present.
    show : bool, optional (default: `None`)
         Show the plot.
    save : `bool` or `str`, optional (default: `None`)
        If `True` or a `str`, save the figure. A string is appended to the
        default filename. Infer the filetype if ending on \{'.pdf', '.png', '.svg'\}.
    **kwargs : keyword arguments
        Keyword arguments passed to `seaborn.clustermap <https://seaborn.pydata.org/generated/seaborn.clustermap.html>`_.

    Returns
    -------
    If `show == False`, a `seaborn.ClusterGrid` object.

    Raises
    ------
    ValueError
        If `obs_keys` is neither a `str` nor `None`.

    Notes
    -----
    The returned object has a savefig() method that should be used if you want
    to save the figure object without clipping the dendrograms.

    To access the reordered row indices, use:
    clustergrid.dendrogram_row.reordered_ind

    Column indices, use: clustergrid.dendrogram_col.reordered_ind

    Examples
    --------
    Soon to come with figures. In the meanwhile, see
    https://seaborn.pydata.org/generated/seaborn.clustermap.html.

    >>> import scanpy.api as sc
    >>> adata = sc.datasets.krumsiek11()
    >>> sc.pl.clustermap(adata, obs_keys='cell_type')
    """
    # Imported locally so the block does not depend on a module-level import.
    from scipy.sparse import issparse

    if not isinstance(obs_keys, (str, type(None))):
        raise ValueError('Currently, only a single key is supported.')
    sanitize_anndata(adata)
    X = adata.raw.X if use_raw and adata.raw is not None else adata.X
    # A sparse matrix cannot be handed to the DataFrame constructor directly.
    if issparse(X):
        X = X.toarray()
    df = pd.DataFrame(X, index=adata.obs_names, columns=adata.var_names)
    if obs_keys is not None:
        row_colors = adata.obs[obs_keys]
        utils.add_colors_for_categorical_sample_annotation(adata, obs_keys)
        # do this more efficiently... just a quick solution
        # Map every category to the colour stored under '<obs_keys>_colors'.
        lut = dict(
            zip(row_colors.cat.categories, adata.uns[obs_keys + '_colors']))
        row_colors = adata.obs[obs_keys].map(lut)
        g = sns.clustermap(df, row_colors=row_colors, **kwargs)
    else:
        g = sns.clustermap(df, **kwargs)
    # NOTE(review): `save` is accepted and documented but never read in this
    # body — confirm whether saving was meant to be wired in here.
    show = settings.autoshow if show is None else show
    if show:
        pl.show()
    else:
        return g
예제 #58
0
# Use the first column as the index, then drop the first two columns.
data.index = data.iloc[:, 0]
data = data.iloc[:, 2:]
clname = list(data.columns)  # not used in the visible part of this script
# Keep only the nine q_* columns, in this explicit order.
data = data[[
    'q_A',
    'q_A_unmedicated',
    'q_A_medicated',
    'q_B',
    'q_B_unmedicated',
    'q_B_medicated',
    'q_C',
    'q_C_unmedicated',
    'q_C_medicated',
]]

#First create the clustermap figure
# NOTE(review): the row colours are random normal draws and the hard-coded 94
# must match the number of rows in data; values outside [0, 1] are not valid
# RGB components — confirm this placeholder is intended.
g = sns.clustermap(data, row_colors=np.random.randn(94, 3), figsize=(13, 8))
# set the gridspec to only cover half of the figure
#g.gs.update(left=0.05, right=0.45)
#
##create new gridspec for the right part
#gs2 = matplotlib.gridspec.GridSpec(1,1, left=0.6)
## create axes within this new gridspec
#ax2 = g.fig.add_subplot(gs2[0])
## plot boxplot in the new axes
#sns.boxplot(data=iris, orient="h", palette="Set2", ax = ax2)
plt.show()

# Result is discarded when run as a script.
np.random.randint(0, 256, 3)
예제 #59
0
# 
# ## Correlations in data

# In[5]:

# Spearman is recommended for ordinal data.
correlations = df.corr(method='spearman')
sns.heatmap(correlations,
           square=True);


# Note that if we were to scale the data, the correlation matrix would be unchanged.

# In[6]:

# Same correlation matrix, with rows and columns reordered by hierarchical clustering.
cg = sns.clustermap(correlations, square=True)
plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(),
        rotation=0);  # Fix rotation of y-labels.


# The expected clusters emerged. Party ID got grouped with economics more than with moral attitudes. Economics and race line up with one another.

# ## Principal component analysis

# In[7]:

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in newer
# scikit-learn releases (replaced by sklearn.impute.SimpleImputer) — confirm
# the installed version.
from sklearn.preprocessing import Imputer, StandardScaler

# Fill missing values with the mean.
imp = Imputer(strategy='mean')
예제 #60
0
#save the dictionary values to a list
data = list(dict.values())
#Convert the list to an array
an_array = np.array(data, dtype=np.float64)

#Set the array in a dataframe with sample names as columns and annotations as index
df = pd.DataFrame(data=an_array, index=flat_list, columns=args.samples)
#Add a reference sample column to the dataframe only containing 100%
df[args.ref_sample] = [100] * len(df)
#Print the length of the dataframe
print('length dataframe: ', len(df))
#Save the dataframe as csv file
df.to_csv(args.output[:-4] + '.csv')

#Create heatmap of the dataframe
g = sns.clustermap(df, cmap="vlag")
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
#Save and show the heatmap
plt.savefig(args.output, bbox_inches="tight")
plt.show()

#Check the name of the reference sample
#Based on the name of the reference sample, extract the rows where the replicates of the reference are equal to 100 and all the other samples are unequal to 100.
if args.ref_sample == 'PO1':
    new_df = df[(df[args.ref_sample] == 100)
                & (df[args.ref_sample[:-1] + '2'] == 100) &
                (df[args.ref_sample[:-1] + '3'] == 100) & (df['PR1'] != 100) &
                (df['PR2'] != 100) & (df['PR3'] != 100)]
elif args.ref_sample == 'PR1':
    new_df = df[(df[args.ref_sample] == 100)
                & (df[args.ref_sample[:-1] + '2'] == 100) &