def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the consensus isoform read lengths.

    A twin y-axis shows, for each bin, the total number of reads minus the
    cumulative count up to and including that bin.

    :param str mode: "all" pools ``lq_sequence`` and ``hq_sequence``, "lq"
        keeps ``lq_sequence`` only; any other value (e.g. "hq") keeps
        ``hq_sequence`` only.
    :param bins: forwarded to :func:`pylab.hist`
    :param float rwidth: forwarded to :func:`pylab.hist`
    :param str align: forwarded to :func:`pylab.hist`
    :param int fontsize: font size of the axis labels
    :param str edgecolor: edge colour of the bars
    :param kwargs: any other keyword is forwarded to :func:`pylab.hist`
    """
    pylab.clf()
    lq_lengths = [len(entry['sequence']) for entry in self.lq_sequence]
    hq_lengths = [len(entry['sequence']) for entry in self.hq_sequence]

    # high quality reads are the fallback for any unrecognised mode
    lengths = hq_lengths
    if mode == "lq":
        lengths = lq_lengths
    elif mode == "all":
        lengths = lq_lengths + hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axes = pylab.gca()
    main_axes.set_ylim(bottom=0)
    main_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # second axis: reads not yet accounted for after each bin
    twin_axes = pylab.gca().twinx()
    half_bin = (edges[1] - edges[0]) / 2
    twin_axes.plot(edges[0:-1] - half_bin, len(lengths) - pylab.cumsum(counts), "k")
    twin_axes.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus the length of each ortholog.

    One series is drawn per status (Complete, Fragmented, Duplicated), each
    with its own colour and marker. Missing orthologs are not shown.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    :param str filename: if set, the figure is saved under that name
    :param bool hold: if False (default), the current figure is cleared first
    """
    if hold is False:
        pylab.clf()

    series = (
        ("Complete", "green", 'o'),
        ("Fragmented", "orange", 's'),
        ("Duplicated", "red", 'x'),
    )
    for status, color, marker in series:
        selection = self.df.Status == status
        # a status without any entry is skipped
        if sum(selection) > 0:
            self.df[selection].plot(x="Length", y="Score", kind="scatter",
                                    color=color, ax=pylab.gca(),
                                    marker=marker, label=status)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1, max_z=4):
    """Barplot of the normalized coverage with gaussian fitting

    :param str filename: if set, the figure is saved under that name
    :param float binwidth: bin width handed to ``self._set_bins``
    :param max_z: upper limit of the x-axis (also given as ``Xmax`` to the
        mixture plot)
    """
    pylab.clf()
    # if there are a NaN -> can't set up binning
    d = self.df["scale"][self.range[0]:self.range[1]].dropna()
    # remove outlier -> plot crash if range between min and max is too high
    d = d[np.abs(d - d.mean()) <= (4 * d.std())]
    bins = self._set_bins(d, binwidth)
    self.mixture_fitting.data = d
    try:
        self.mixture_fitting.plot(self.gaussians_params, bins=bins,
                                  Xmin=0, Xmax=max_z)
    except ZeroDivisionError:
        # best effort: the axes are still decorated below without the fit
        pass
    pylab.grid(True)
    pylab.xlim([0, max_z])
    pylab.xlabel("Normalised per-base coverage")
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; never let it abort the plot.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        pass
    if filename:
        pylab.savefig(filename)
def get_max_gc_correlation(self, reference):
    """Plot the coverage/GC correlation for a range of GC window sizes.

    The GC content is computed with a moving window of size W, which
    affects the correlation between coverage and GC. The window size is
    searched with :func:`scipy.optimize.fmin`, using the correlation itself
    as objective; every evaluated (window, correlation) pair is plotted.

    :param reference: handed to ``self.bed.compute_gc_content``
    :return: the window size found by the optimizer
    """
    pylab.clf()
    window_sizes = []
    correlations = []

    def objective(params):
        size = int(round(params[0]))
        # windows shorter than 10 are not evaluated
        if size < 10:
            return 0
        self.bed.compute_gc_content(reference, size)
        value = self.get_gc_correlation()
        correlations.append(value)
        window_sizes.append(size)
        return value

    from scipy.optimize import fmin
    # the search starts from a window of 100
    optimum = fmin(objective, 100, xtol=1, disp=False)

    pylab.plot(window_sizes, correlations, "o")
    pylab.xlabel("GC window size")
    pylab.ylabel("Correlation")
    pylab.grid()
    return optimum[0]
def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
    """Bar plot of the summed flag counts found in the BAM.

    .. plot::
        :include-source:

        from sequana import BAM, sequana_data
        b = BAM(sequana_data('test.bam', "testing"))
        b.plot_bar_flags()

    .. seealso:: :class:`SAMFlags` for meaning of each flag

    :param bool logy: log scale on the y-axis; only honoured when it is
        exactly ``True``
    :param int fontsize: font size of the axis labels
    :param str filename: if set, the figure is saved under that name
    :return: the object returned by the pandas ``plot`` call
    """
    totals = self.get_flags_as_df().sum()
    pylab.clf()

    options = {"kind": 'bar', "grid": True}
    if logy is True:
        options["logy"] = logy
    barplot = totals.plot(**options)

    pylab.xlabel("flags", fontsize=fontsize)
    pylab.ylabel("count", fontsize=fontsize)
    pylab.tight_layout()
    if filename:
        pylab.savefig(filename)
    return barplot
def hist_contig_length(self, bins=30, fontsize=16):
    """Histogram of the ``length`` column of ``self.df``.

    :param bins: forwarded to :func:`pylab.hist`
    :param int fontsize: font size of the axis labels
    """
    contig_count = len(self.df)
    pylab.clf()
    pylab.hist(self.df.length, lw=1, ec="k", bins=bins)
    pylab.grid()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("#", fontsize=fontsize)
    pylab.title("Distribution {} contigs".format(contig_count))
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Total read count (in millions) per sample.

    Each bar is coloured according to the condition of its sample.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    :param int fontsize: font size of the labels and title
    :param sample_list: currently not used
    """
    names = self.sample_names
    positions = range(len(names))
    totals = self.df[names].sum()

    pylab.clf()
    bar_colors = [self.colors[self.get_cond_from_sample(name)]
                  for name in self.sample_names]
    pylab.bar(positions, (totals / 1000000).values, color=bar_colors,
              alpha=1, zorder=10, lw=1, ec="k", width=0.9)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True, zorder=0)
    pylab.title("Total read count per sample", fontsize=fontsize)
    pylab.xticks(positions, self.sample_names)
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
        local_th=5, global_th=10):
    """Scan the alignments of a BAM file for a repeated motif.

    For each alignment with a query sequence, the motif is counted in a
    sliding window starting at every position of the sequence. The score of
    the alignment is the number of positions whose window holds strictly
    more than *local_th* instances. An alignment is a hit when its score is
    strictly greater than *global_th*; hits are plotted.

    :param bamfile: input given to ``BAM``
    :param str motif: the motif to count
    :param int window: length of the sliding window
    :param bool savefig: save one PNG per hit, named after the reference
        name, the score and the query name
    :param int local_th: per-window count threshold
    :param int global_th: per-alignment score threshold
    :return: tuple (alignments, hit flags, scores); the three lists are
        aligned and skip the alignments without a query sequence.
    """
    b1 = BAM(bamfile)

    found = []
    Ss = []
    alns = []
    for a in b1:
        seq = a.query_sequence
        if seq is None:
            continue
        X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
        S = sum(x > local_th for x in X1)
        Ss.append(S)
        # the original appended to an undefined name (``als``) -> NameError
        alns.append(a)
        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            start = off + a.reference_start
            pylab.clf()
            pylab.plot(range(start, start + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)

    return alns, found, Ss
def plot_percentage_null_read_counts(self):
    """Percentage of null counts in each sample.

    Bars give the percentage of features with a zero count in each sample.
    The dashed horizontal line is the percentage of features whose counts
    sum to zero across all samples.

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_percentage_null_read_counts()
    """
    positions = range(len(self.sample_names))
    n_features = len(self.df)

    null_per_sample = (self.df[self.sample_names] == 0).sum()
    percent_null = null_per_sample / n_features * 100
    null_everywhere = (self.df[self.sample_names].sum(axis=1) == 0).sum()

    pylab.clf()
    pylab.bar(positions, percent_null)
    pylab.axhline(null_everywhere / n_features * 100, lw=2, ls="--", color="k")
    pylab.xticks(positions, self.sample_names)
    pylab.xlabel("Sample")
def plot_alignment(self, bamfile, motif, window=200, global_th=10, title=None,
        legend=True, legend_fontsize=11, valid_rnames=[], valid_flags=[]):
    """Plot the sliding-window motif count of each alignment of a BAM file.

    :param bamfile: input given to ``BAM``
    :param str motif: motif counted in each window
    :param int window: length of the sliding window
    :param global_th: not used by this method
    :param str title: optional title of the figure
    :param bool legend: show the legend (only if fewer than 15 curves)
    :param int legend_fontsize: font size of the legend
    :param list valid_rnames: if not empty, keep only alignments whose
        ``rname`` is in that list
    :param list valid_flags: if not empty, keep only alignments whose
        ``flag`` is in that list
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    shown = 0
    for aln in bam:
        # empty filters accept everything
        if valid_rnames and aln.rname not in valid_rnames:
            continue
        if valid_flags and aln.flag not in valid_flags:
            continue
        sequence = aln.query_sequence
        if not sequence:
            continue
        shown += 1
        length = len(sequence)
        profile = [sequence[pos:pos + window].count(motif) for pos in range(length)]
        first = aln.reference_start
        pylab.plot(range(first, first + length), profile, label=aln.query_name)

    print("Showing {} entries after filtering".format(shown))

    # y-axis upper bound derived from the window and motif lengths
    pylab.ylim([0, int(1.2 * window / len(motif))])
    if legend and shown < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False, fontsize=12,
        grid=True, title="Repeat length", xlabel="Repeat length", ylabel="#"):
    """Histogram of the repeat lengths.

    The list of lengths is computed on demand when it is not yet available.

    :param bins: forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid (only when exactly ``True``)
    :param str title: title of the figure
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    # lazily populate the lengths
    if self._list_len_repeats is None:
        self._get_list_len_repeats()

    if hold is False:
        pylab.clf()

    pylab.hist(self._list_len_repeats, alpha=alpha, bins=bins)
    pylab.title(title)
    for setter, text in ((pylab.xlabel, xlabel), (pylab.ylabel, ylabel)):
        setter(text, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_idr_vs_peaks(self, filename=None, savefig=False):
    """Plot the IDR as a function of the number of peaks below that IDR.

    Peaks with an IDR below ``self.threshold`` are drawn in red, the others
    in black, shifted by the number of peaks below the threshold. Dashed
    blue lines mark IDR=0.05 and ``self.N_significant_peaks``.

    :param str filename: output file name, used only if *savefig* is True
    :param bool savefig: save the figure in *filename*
    """
    pylab.clf()
    # bound to a local so that the query strings reference a plain variable
    threshold = self.threshold
    X1 = pylab.linspace(0, threshold, 100)
    X2 = pylab.linspace(threshold, 1, 100)

    # split the peaks into two complementary sets around the threshold
    df1 = self.df.query("idr<@threshold")
    df2 = self.df.query("idr>=@threshold")

    pylab.plot([(df1['idr'] < x).sum() for x in X1], X1, '-', color='r', lw=2)
    shift = len(df1)
    pylab.plot([shift + (df2['idr'] < x).sum() for x in X2], X2, "-",
               color='k', lw=2)

    pylab.xlabel('Number of significant peaks')
    pylab.ylabel('IDR')
    pylab.axhline(0.05, color='b', ls='--')
    pylab.axvline(self.N_significant_peaks, color='b', ls='--')
    if savefig:
        pylab.savefig(filename)
def plot_specific_alignment(self, bamfile, query_name, motif, clf=True,
        show_figure=True, authorized_flags=[0, 16],
        windows=[10, 50, 100, 150, 200, 250, 500, 1000], local_threshold=5):
    """Motif count profiles of one alignment for several window lengths.

    The first alignment named *query_name* whose flag is in
    *authorized_flags* is used.

    :param bamfile: input given to ``BAM``
    :param str query_name: name of the alignment to look for
    :param str motif: motif counted in each window
    :param bool clf: clear the current figure first
    :param bool show_figure: draw the profiles and decorate the figure
    :param list authorized_flags: accepted values of the alignment flag
    :param list windows: window lengths to scan
    :param int local_threshold: a position is counted when its window holds
        strictly more instances than this value
    :return: one value per window (number of counted positions minus the
        window length); an empty list if the alignment is not found.
    """
    # several entries may share the same name: keep the first valid one
    found = next((aln for aln in BAM(bamfile)
                  if aln.query_name == query_name
                  and aln.flag in authorized_flags), None)

    sizes = []
    if not found:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    sequence = found.query_sequence
    if clf:
        pylab.clf()
    positions = range(len(sequence))
    for window in windows:
        profile = [sequence[pos:pos + window].count(motif) for pos in positions]
        if show_figure:
            pylab.plot(profile, label=window)
        above = sum([value > local_threshold for value in profile])
        sizes.append(above - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
    return sizes
def plot_and_save_all(self, dpi=100, directory="."):
    """Create and save all the barcoding figures as PNG files.

    The figure is cleared before each plot. ``self`` is printed at the end.

    :param int dpi: resolution given to :func:`pylab.savefig`
    :param str directory: where the PNG files are written
    """
    # (plotting method, output file name), processed in that order
    figures = (
        ("hist_polymerase_per_barcode",
         "barcoding_hist_polymerase_per_barcode.png"),
        ("hist_quality_per_barcode",
         "barcoding_hist_quality_per_barcode.png"),
        ("hist_mean_polymerase_read_length",
         "barcoding_hist_mean_polymerase_read_length.png"),
        ("plot_polymerase_per_barcode",
         "barcoding_polymerase_per_barcode.png"),
        ("plot_subreads_histogram",
         "barcoding_subreads_histogram.png"),
    )
    for method_name, filename in figures:
        pylab.clf()
        getattr(self, method_name)()
        pylab.savefig(directory + os.sep + filename, dpi=dpi)

    print(self)
def plot_ranks(self, filename=None, savefig=False):
    """Scatter plot of the peak ranks in replicate 1 versus replicate 2.

    The *score* column holds the scaled IDR, min(int(-125*log2(IDR)), 1000):
    an IDR of 0 gives 1000, an IDR of 0.05 gives int(-125*log2(0.05)) = 540
    and an IDR of 1 gives 0. Peaks with a score above 540 are in black, the
    others in red; the dashed blue line is the diagonal.

    :param str filename: output file name, used only if *savefig* is True
    :param bool savefig: save the figure in *filename*
    """
    significant = self.df.query('score>540')
    others = self.df.query('score<=540')

    pylab.clf()
    pylab.plot(significant.rep1_rank, significant.rep2_rank, 'ko',
               alpha=0.5, label='<0.05 IDR')
    pylab.plot(others.rep1_rank, others.rep2_rank, 'ro',
               alpha=0.5, label='>=0.05 IDR')
    pylab.xlabel("Peak rank - replicate 1")
    pylab.ylabel("Peak rank - replicate 2")

    n_peaks = len(self.df)
    diagonal = [0, n_peaks]
    pylab.plot(diagonal, diagonal, color='blue', alpha=0.5, ls='--')
    pylab.legend(loc='lower right')
    if savefig:
        pylab.savefig(filename)
def scatterplot(self, enrich, cutoff=0.05, nmax=10, gene_set_size=[]):
    """Scatter plot of the enriched gene sets.

    Each gene set is a dot located at -log10 of its adjusted p-value; size
    and colour of the dot depend on the ``size`` column. The dashed red
    line is drawn at 1.3, which is about -log10(0.05).

    :param enrich: object whose ``results`` attribute is given to
        ``self._get_final_df``
    :param float cutoff: forwarded to ``self._get_final_df``
    :param int nmax: forwarded to ``self._get_final_df``
    :param gene_set_size: not used by this method
    :return: the dataframe that was plotted
    """
    df = self._get_final_df(enrich.results, cutoff=cutoff, nmax=nmax)

    pylab.clf()
    pylab.scatter(-pylab.log10(df['Adjusted P-value']), range(len(df)),
                  s=10 * df['size'], c=df['size'])
    # the x values are -log10(adjusted p-value); the former label
    # ("Odd ratio") did not describe them
    pylab.xlabel("-log10 (adjusted p-value)")
    pylab.ylabel("Gene sets")
    pylab.yticks(range(len(df)), df.name)
    _, xmax = pylab.xlim()
    pylab.xlim([0, xmax])
    pylab.grid(True)
    ax = pylab.gca()

    # three reference markers for the legend of the dot sizes
    M = max(df['size'])
    if M > 100:
        l1, l2, l3 = "10", "100", str(M)
    else:
        l1, l2, l3 = str(round(M / 3)), str(round(M * 2 / 3)), str(M)
    handles = [
        pylab.Line2D([0], [0], marker="o", markersize=5, label=l1, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=10, label=l2, ls=""),
        pylab.Line2D([0], [0], marker="o", markersize=15, label=l3, ls=""),
    ]
    ax.legend(handles=handles, loc="upper left", title="gene-set size")

    pylab.axvline(1.3, lw=2, ls="--", color="r")
    pylab.tight_layout()
    pylab.colorbar(pylab.gci())
    return df
def plot_specific_alignment(self, query_name, motif, clf=True,
        windows=[10, 50, 100, 200, 500, 1000]):
    """Motif count profiles of one alignment for several window lengths.

    The whole BAM (``self.bamfile``) is scanned; if several alignments are
    named *query_name*, the last one is used. For each window, the window
    length and a score are printed.

    :param str query_name: name of the alignment to look for
    :param str motif: motif counted in each window
    :param bool clf: clear the current figure first
    :param list windows: window lengths to scan
    """
    match = None
    for candidate in BAM(self.bamfile):
        if candidate.query_name == query_name:
            match = candidate

    if not match:
        print("Not found")
        return

    sequence = match.query_sequence
    if clf:
        pylab.clf()
    for window in windows:
        profile = [sequence[start:start + window].count(motif)
                   for start in range(len(sequence))]
        pylab.plot(profile, label=window)
        # number of positions whose count exceeds a sixth of the window
        hits = sum([count > window / 6 for count in profile])
        print(window, hits / 3.)

    pylab.legend()
    pylab.ylabel("# {} in a given sliding window".format(motif))
    pylab.title(query_name)
def scatter_plot(self, filename=None, hold=False):
    """Scatter plot of the score versus length of each ortholog

    One series is drawn per status, each with its own colour and marker,
    all on the current axes.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.scatter_plot()

    :param str filename: if set, the figure is saved under that name
    :param bool hold: if False (default), the current figure is cleared first
    """
    if hold is False:
        pylab.clf()

    colors = ["green", "orange", "red", "blue"]
    markers = ['o', 's', 'x', 'o']
    statuses = ["Complete", "Fragmented", "Missing", "Duplicated"]
    for status, color, marker in zip(statuses, colors, markers):
        # select the rows of *this* status (the mask and the label used to
        # be hard-coded to "Complete" for every iteration)
        subset = self.df[self.df.Status == status]
        # rows lacking a Length or a Score have no coordinates to draw
        subset = subset.dropna(subset=["Length", "Score"])
        if len(subset) > 0:
            # ax is given explicitly so that all series share one figure
            subset.plot(x="Length", y="Score", kind="scatter", color=color,
                        ax=pylab.gca(), marker=marker, label=status)

    pylab.legend()
    pylab.grid()
    if filename:
        pylab.savefig(filename)
def hist_length_repeats(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, label="Repeat length", xlabel="Repeat length", ylabel="#"):
    """Histogram of the repeat lengths.

    The list of lengths is computed on demand when it is not yet available.

    :param bins: forwarded to :func:`pylab.hist`. If None, unit-wide bins
        are used, from ``max(0, self.threshold - 1)`` up to the longest
        repeat plus one.
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid (only when exactly ``True``)
    :param str label: legend label of the histogram
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    """
    if self._list_len_repeats is None:
        self._get_list_len_repeats()

    if bins is None:
        lowest = max(0, self.threshold - 1)
        highest = max(self._list_len_repeats) + 2
        bins = range(lowest, highest)

    if hold is False:
        pylab.clf()

    pylab.hist(self._list_len_repeats, alpha=alpha, label=label, bins=bins)
    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the figure
    :param clip_upper_SNR: SNR values above this limit are set to the limit

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot: show a placeholder image instead
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # largest SNR over the four bases, capped by clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        m = self._df.loc[:, "snr_{}".format(letter)].max()
        if m > maxSNR:
            maxSNR = m
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    for letter in "ACGT":
        # Series.clip_upper was removed from pandas; clip(upper=...) is the
        # equivalent call
        values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
        pylab.hist(values, alpha=alpha, label=letter, bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None,
        logy=False, ec="k", hist_kwargs={}):
    """Histogram of the read lengths, drawn with ``HistCumSum``.

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: given to ``HistCumSum``
    :param bool grid: given to ``HistCumSum``
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str label: prefix of the legend label; the mean length and the
        number of reads are appended
    :param str title: if None, a title reporting the mean length is used
    :param bool logy: passed as ``log`` to the plot
    :param str ec: edge colour of the bars
    :param dict hist_kwargs: extra keywords for the plot

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    average = np.mean(self.df.loc[:, 'read_length'])

    if title is None:
        # default title reports the mean length
        title = "Read length \n Mean length : %.2f" % (average)

    if hold is False:
        pylab.clf()

    plotter = HistCumSum(self.df.loc[:, 'read_length'], fontsize=fontsize,
                         grid=grid)
    plotter.title = title
    plotter.xlabel = xlabel
    plotter.ylabel = ylabel

    legend_text = "%s, mean : %.0f, N : %d" % (label, average, len(self))
    plotter.plot(bins=bins, alpha=alpha, edgecolor=ec, label=legend_text,
                 log=logy, **hist_kwargs)

    # both axes start at zero
    pylab.gca().set_ylim(bottom=0)
    pylab.gca().set_xlim(left=0)
def scatter_length_cov_gc(self, min_length=200, min_cov=10):
    """Log-log scatter plot of contig coverage versus length, coloured by GC.

    :param min_length: position of the dashed red vertical line
    :param min_cov: position of the dashed red horizontal line
    """
    threshold_style = {"lw": 2, "c": "r", "ls": '--'}

    pylab.clf()
    pylab.scatter(self.df.length, self.df['cov'], c=self.df.GC)
    pylab.loglog()
    pylab.axvline(min_length, **threshold_style)
    pylab.axhline(min_cov, **threshold_style)
    pylab.xlabel("contig length")
    pylab.ylabel("contig coverage")
    pylab.colorbar(label="GC")
    pylab.grid(True)
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str label: label of the histogram (for the legend); the mean GC
        and the number of reads are appended
    :param str title: if None, a title reporting the mean GC is used

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.loc[:, 'GC_content'], bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; never let it abort the plot.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        pass
def plot_volcano_differences(self, mode="all"):
    """Volcano plot comparing the gene lists of two experiments.

    The genes listed under *mode* in ``self.r1`` and ``self.r2`` are split
    into those common to both lists and those specific to one list. Common
    genes are joined by a black segment between their two positions.
    A gene specific to one list is joined by a red segment to its position
    in the full table of the other experiment.

    :param str mode: key of the ``gene_lists`` of both experiments
    :return: tuple of four dataframes (genes only in experiment 1, only in
        experiment 2, common genes with experiment 1 values, common genes
        with experiment 2 values)
    """
    A = self.r1.df.loc[self.r1.gene_lists[mode]]
    B = self.r2.df.loc[self.r2.gene_lists[mode]]

    # recent pandas versions reject a set as an indexer: use lists
    ids_a = set(A.index)
    ids_b = set(B.index)
    AB = list(ids_a & ids_b)
    Aonly = A.loc[list(ids_a - ids_b)]
    Bonly = B.loc[list(ids_b - ids_a)]
    Acommon = A.loc[AB]
    Bcommon = B.loc[AB]

    pylab.clf()
    pylab.plot(Acommon.log2FoldChange, -np.log10(Acommon.padj), marker="o",
               alpha=0.5, color="r", lw=0, label="Common in experiment 1",
               pickradius=4, picker=True)
    pylab.plot(Bcommon.log2FoldChange, -np.log10(Bcommon.padj), marker="o",
               alpha=0.5, color="orange", lw=0,
               label="Common in experiment 2", pickradius=4, picker=True)

    # join the two positions of each common gene
    for x in AB:
        a_l = A.loc[x].log2FoldChange
        a_p = -np.log10(A.loc[x].padj)
        b_l = B.loc[x].log2FoldChange
        b_p = -np.log10(B.loc[x].padj)
        pylab.plot([a_l, b_l], [a_p, b_p], 'k', alpha=0.5)

    pylab.plot(Bonly.log2FoldChange, -np.log10(Bonly.padj), marker="*",
               alpha=0.5, color="blue", lw=0, label="In experiment 2 only",
               pickradius=4, picker=True)
    pylab.plot(Aonly.log2FoldChange, -np.log10(Aonly.padj), marker="*",
               alpha=0.5, color="cyan", lw=0, label="In experiment 1 only",
               pickradius=4, picker=True)

    # genes specific to experiment 2: link to their values in experiment 1
    for name, x in Bonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r1.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r1.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="--", color='r')

    # genes specific to experiment 1: link to their values in experiment 2
    for name, x in Aonly.iterrows():
        x1 = x.log2FoldChange
        y1 = -np.log10(x.padj)
        x2 = self.r2.df.loc[name].log2FoldChange
        y2 = -np.log10(self.r2.df.loc[name].padj)
        pylab.plot([x1, x2], [y1, y2], ls="-", color='r')

    pylab.axhline(1.33, alpha=0.5, ls="--", color="r")
    pylab.xlabel("log2 fold Change")
    pylab.ylabel("log10 adjusted p-values")
    pylab.legend()
    pylab.grid(True)

    return Aonly, Bonly, Acommon, Bcommon
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title=""):
    """Overlaid histograms of the A, C, G and T SNRs of all reads.

    If the ``snr_A`` column holds no value at all, a placeholder image is
    shown instead.

    :param int bins: binning for the histograms
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    has_snr = len(self._df['snr_A'].dropna()) != 0
    if not has_snr:
        # old pacbio format has no SNR stored: nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    for base in "ACGT":
        pylab.hist(self._df.loc[:, 'snr_' + base], alpha=alpha, label=base,
                   bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_feature_most_present(self):
    """Horizontal bars of the share of reads taken by the top feature.

    For each column of ``self.counts_raw``, the row with the highest count
    is identified and its count is expressed as a percentage of the column
    total. The identifier of that row is written on the bar.
    """
    # column totals are the same for every iteration: compute them once
    totals = self.counts_raw.sum()

    rows = []
    # Series.iteritems was removed from pandas; items() is the equivalent
    for sample, gene_id in self.counts_raw.idxmax().items():
        most_exp_gene_count = self.counts_raw.loc[gene_id, sample]
        total_sample_count = totals.loc[sample]
        rows.append({
            "label": sample,
            "gene_id": gene_id,
            "count": most_exp_gene_count,
            "total_sample_count": total_sample_count,
            "most_exp_percent": most_exp_gene_count / total_sample_count * 100,
        })

    df = pd.DataFrame(rows).set_index("label")
    df = pd.concat([self.design_df, df], axis=1)

    pylab.clf()
    bars = pylab.barh(
        df.index,
        df.most_exp_percent,
        color=df.group_color,
        zorder=10,
        lw=1,
        ec="k",
        height=0.9,
    )

    # write the identifier of the top feature on each bar
    for idx in range(len(bars)):
        pylab.text(
            2,
            idx,
            df.gene_id.iloc[idx],
            ha="center",
            va="center",
            rotation=0,
            zorder=20,
        )

    self._format_plot(
        xlabel="Percent of total reads",
    )
    pylab.tight_layout()
def boxplot_mapq_concordance(self):
    """Boxplots of the concordance for each mapq value from 1 to 60.

    Only available when ``self.method`` is "bwa".
    """
    # method can only be bwa for now
    assert self.method == "bwa"

    table = pd.DataFrame(self._get_data(),
                         columns=["mapq", "length", "concordance"])

    # one box per mapq value
    boxes = []
    for mapq in range(1, 61):
        boxes.append(table[table.mapq == mapq]['concordance'])

    pylab.clf()
    pylab.boxplot(boxes)
    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()
    ticks = [10, 20, 30, 40, 50, 60]
    pylab.xticks(ticks, ticks)
def boxplot_mapq_concordance(self, method):
    """Boxplots of the concordance for each mapq value from 1 to 60.

    :param str method: must be "bwa" for now; also given to
        ``self._get_data``
    """
    assert method == "bwa"

    records = self._get_data(method)
    frame = pd.DataFrame(records, columns=["mapq", "length", "concordance"])
    per_mapq = [frame[frame.mapq == value]['concordance']
                for value in range(1, 61)]

    pylab.clf()
    pylab.boxplot(per_mapq)
    pylab.xlabel("mapq")
    pylab.ylabel("concordance")
    pylab.grid()

    # label one box out of ten only
    tick_positions = [10, 20, 30, 40, 50, 60]
    pylab.xticks(tick_positions, tick_positions)
def plot_genesets_hist(self, bins=20):
    """Histogram of the sizes of the gene sets in ``self.gene_sets``.

    :param bins: forwarded to :func:`pylab.hist`
    """
    n_sets = len(self.gene_sets.keys())
    sizes = [len(members) for _, members in self.gene_sets.items()]

    pylab.clf()
    pylab.hist(sizes, bins=bins, lw=1, ec="k")
    pylab.title("{} gene sets".format(n_sets))
    pylab.xlabel("Gene set sizes")
    pylab.grid(True)

    # the x-axis starts at zero; the upper limit is left untouched
    upper = pylab.xlim()[1]
    pylab.xlim([0, upper])
def plot(self, clf=True):
    """Plot ``shustring_length`` along the sequence.

    Dashed grey horizontal lines are drawn every 1000 units, and the
    maximum length is printed.

    :param bool clf: clear the current figure first
    """
    if clf:
        pylab.clf()

    lengths = self.df_shustring.shustring_length
    longest = lengths.max()
    print(longest)

    # horizontal guides at 0, 1000, 2000, ... covering the maximum
    n_guides = int(longest / 1000) + 1
    for level in range(n_guides):
        pylab.axhline(level * 1000, ls='--', color='grey')

    pylab.plot(self.df_shustring.shustring_length)
    pylab.xlabel('position (bp)')
    pylab.ylabel('Length of repeats')
    pylab.ylim(bottom=0)
def hist_len(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None):
    """Histogram of the read lengths.

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str label: prefix of the legend label; the mean length and
        ``self._N`` are appended
    :param str title: if None, a title reporting the mean length is used

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_len()

    """
    if self._df is None:
        self._get_df()

    average = np.mean(self._df.loc[:, 'read_length'])
    if title is None:
        title = "Read length \n Mean length : %.2f" % (average)

    if hold is False:
        pylab.clf()

    legend_text = "%s, mean : %.0f, N : %d" % (label, average, self._N)
    pylab.hist(self._df.loc[:, 'read_length'], bins=bins, alpha=alpha,
               label=legend_text)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True, ylabel="#",
        label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()
    # possible numbers of passes. This must be defined whatever *bins* is:
    # it used to be set only when bins was None, so that a user-provided
    # bins raised a NameError further down.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha, label=label,
               log=logy, width=1)
    # with only a few distinct values, force integer ticks from 0 to 5
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[60, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """2D histogram of the GC content versus the read length.

    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: not used, the x label is always "Read length"
    :param str ylabel: not used, the y label is always "GC %"
    :param str cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    if self._df is None:
        self._get_df()

    average_length = np.mean(self._df.loc[:, 'read_length'])
    average_gc = np.mean(self._df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    # rows with a missing value are discarded
    pairs = self._df.loc[:, ['read_length', 'GC_content']].dropna()
    biokit.viz.hist2d.Hist2D(pairs).plot(bins=bins, contour=False,
                                         norm='log', Nlevels=6, cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                (average_length, average_gc), fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False, fontsize=12,
        grid=True, title="Repeat length", xlabel="Repeat length", ylabel="#",
        logy=True):
    """Histogram of the repeat lengths (``self.list_len_repeats``).

    :param bins: forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the axis labels
    :param bool grid: add a grid (only when exactly ``True``)
    :param str title: title of the figure
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param bool logy: switch the y-axis to log scale
    """
    if hold is False:
        pylab.clf()

    repeat_lengths = self.list_len_repeats
    pylab.hist(repeat_lengths, alpha=alpha, bins=bins)
    pylab.title(title)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)

    if grid is True:
        pylab.grid(True)
    if logy:
        pylab.semilogy()
def hist_transcript(self, hide_unmapped=True):
    """Bar plot of the number of entries per reference name.

    Only the rows with a positive ``reference_length`` are counted. The
    query applied to ``self.df`` is printed.

    :param bool hide_unmapped: also discard the rows whose
        ``reference_name`` is -1 (only when exactly ``True``)
    :return: the counts per reference name (possibly empty, in which case
        nothing is plotted)
    """
    pylab.clf()
    if hide_unmapped is True:
        query = "reference_length>0 and reference_name!=-1"
    else:
        query = "reference_length>0"
    print(query)

    ts = self.df.query(query).groupby("reference_name").count().reference_length
    if len(ts) == 0:
        print("nothing to plot")
        return ts

    ts.plot(kind="bar", color="r")
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; never let it abort the plot.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        pass
    return ts
def diagnostics(self, bins=60, clear=True):
    """Three stacked diagnostic panels.

    From top to bottom: histogram of ``self.aprob``, trace of ``self.vec``,
    and normalised histogram of ``self.vec`` overlaid with the target
    (``self.Xtarget``, ``self.Ytarget``) rescaled to the same height.

    :param bins: number of bins of both histograms
    :param bool clear: clear the current figure first
    """
    if clear:
        pylab.clf()

    pylab.subplot(3, 1, 1)
    pylab.hist(self.aprob, bins=bins)
    pylab.title("Acceptation")

    pylab.subplot(3, 1, 2)
    pylab.plot(self.vec)
    pylab.title("proposition")

    pylab.subplot(3, 1, 3)
    heights = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")[0]
    peak = max(heights)
    # this normalisation is an approximation/hack: the target is rescaled
    # so that its maximum matches the highest bar
    rescaling = max(self.Ytarget) / peak
    pylab.plot(self.Xtarget, self.Ytarget / rescaling, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def pie_plot(self, filename=None, hold=False):
    """Pie chart of the number of entries per status.

    .. plot::
        :include-source:

        from sequana import BUSCO, sequana_data
        b = BUSCO(sequana_data("test_busco_full_table.tsv"))
        b.pie_plot()

    :param str filename: if set, the figure is saved under that name
    :param bool hold: if False (default), the current figure is cleared first
    """
    if hold is False:
        pylab.clf()

    per_status = self.df.groupby('Status').count()['# Busco id']
    per_status.plot(kind="pie")
    # no label on the y-axis
    pylab.ylabel("")

    if filename:
        pylab.savefig(filename)
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str label: label of the histogram (for the legend); the mean GC
        and the number of reads are appended
    :param str title: if None, a title reporting the mean GC is used

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.loc[:, 'GC_content'], bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])
    try:
        pylab.tight_layout()
    except Exception:
        # best effort only: a failing layout must not abort the plot, but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        pass
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None,
        logy=False, ec="k", hist_kwargs={}):
    """Read length histogram (delegated to ``HistCumSum``).

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histogram
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: given to ``HistCumSum``
    :param bool grid: given to ``HistCumSum``
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param str label: prefix of the legend label; the mean length and the
        number of reads are appended
    :param str title: if None, a title reporting the mean length is used
    :param bool logy: passed as ``log`` to the plot
    :param str ec: edge colour of the bars
    :param dict hist_kwargs: extra keywords for the plot

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    mean_length = np.mean(self.df.loc[:, 'read_length'])
    figure_title = title
    if figure_title is None:
        figure_title = "Read length \n Mean length : %.2f" % (mean_length)

    if hold is False:
        pylab.clf()

    cumulative_hist = HistCumSum(self.df.loc[:, 'read_length'],
                                 fontsize=fontsize, grid=grid)
    cumulative_hist.title = figure_title
    cumulative_hist.xlabel = xlabel
    cumulative_hist.ylabel = ylabel

    plot_options = dict(
        bins=bins,
        alpha=alpha,
        edgecolor=ec,
        label="%s, mean : %.0f, N : %d" % (label, mean_length, len(self)),
        log=logy,
    )
    cumulative_hist.plot(**plot_options, **hist_kwargs)

    # both axes start at zero
    pylab.gca().set_ylim(bottom=0)
    pylab.gca().set_xlim(left=0)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True, ylabel="#",
        label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: label of the x-axis
    :param str ylabel: label of the y-axis
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title: title of the figure

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()
    # k was only assigned when bins was None, hence a NameError below as
    # soon as the caller provided bins; it is now always defined.
    k = range(1, max_nb_pass + 1)

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha, label=label,
               log=logy, width=1)
    # few distinct numbers of passes: use integer ticks from 0 to 5
    if len(k) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot(self, bins=80, rwidth=0.8, **kwargs):
    """Histogram of ``self.data`` on a cleared figure.

    Labels, title and grid come from the ``xlabel``, ``ylabel``, ``title``,
    ``grid`` and ``fontsize`` attributes of the instance.

    :param bins: forwarded to :func:`pylab.hist`
    :param float rwidth: forwarded to :func:`pylab.hist`
    :param kwargs: any other keyword is forwarded to :func:`pylab.hist`
    """
    pylab.clf()
    pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)
    pylab.xlabel(self.xlabel, fontsize=self.fontsize)
    pylab.ylabel(self.ylabel, fontsize=self.fontsize)

    # NOTE: the cumulative curve on a twin axis, formerly kept here as a
    # disabled block inside a string literal, is not drawn.

    pylab.grid(self.grid)
    pylab.title(self.title)
    try:
        pylab.tight_layout()
    except Exception:
        # the layout adjustment is cosmetic; never let it abort the plot.
        # Unlike a bare except, KeyboardInterrupt/SystemExit still propagate.
        pass
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """GC content versus read length as a 2D histogram.

    :param bool hold: if False (default), the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid (only when exactly ``True``)
    :param str xlabel: not used, the x label is always "Read length"
    :param str ylabel: not used, the y label is always "GC %"
    :param str cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    mean_length = np.mean(self.df.loc[:, 'read_length'])
    mean_gc = np.mean(self.df.loc[:, 'GC_content'])
    figure_title = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                    % (mean_length, mean_gc))

    if hold is False:
        pylab.clf()

    # incomplete rows cannot be binned
    complete_rows = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    histogram = biokit.viz.hist2d.Hist2D(complete_rows)
    histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(figure_title, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
        textcolor="red", **kargs):
    """A simple non-interactive plot of taxons

    :param str kind: "pie" or "barh"; anything else is logged as an error
    :param str cmap: colormap of the pie chart
    :param threshold: taxons below that percentage are pooled into
        an "others" category; must be strictly between 0 and 100
    :param float radius: radius of the pie chart
    :param str textcolor: colour of the texts of the pie chart
    :param kargs: any other keyword is forwarded to the pandas ``plot`` call
    :return: None if no taxon were found and a dataframe otherwise

    A Krona Javascript output is also available in :meth:`kraken_to_krona`

    .. plot::
        :include-source:

        from sequana import KrakenResults, sequana_data
        test_file = sequana_data("test_kraken.out", "testing")
        k = KrakenResults(test_file)
        df = k.plot(kind='pie')

    .. seealso:: to generate the data see :class:`KrakenPipeline`
        or the standalone application **sequana_taxonomy**.
    """
    if len(self._df) == 0:
        return

    if self._data_created == False:
        self.kraken_to_krona()

    if kind not in ['barh', 'pie']:
        logger.error('kind parameter: Only barh and pie are supported')
        return

    # This may have already been called but maybe not. This is not time
    # consuming, so we call it again here
    if len(self.taxons.index) == 0:
        return None

    # The removed ``.ix`` indexer is replaced by ``.loc`` throughout.
    # NOTE(review): this assumes label-based access was intended, i.e. that
    # the taxon identifiers are the index labels -- confirm.
    df = self.get_taxonomy_biokit(list(self.taxons.index))
    # extra row (label -1) for the unclassified reads; assumes the taxonomy
    # table has 8 columns
    df.loc[-1] = ["Unclassified"] * 8
    data = self.taxons.copy()
    data.loc[-1] = self.unclassified

    # counts -> percentages
    data = data / data.sum() * 100
    assert threshold > 0 and threshold < 100
    others = data[data < threshold].sum()
    data = data[data > threshold]
    names = df.loc[data.index]['name']

    data.index = names.values
    data.loc['others'] = others
    try:
        data.sort_values(inplace=True)
    except AttributeError:
        # old pandas versions have no sort_values
        data.sort(inplace=True)

    # text may be long so, let us increase the figsize a little bit
    pylab.figure(figsize=(10, 8))
    pylab.clf()
    if kind == "pie":
        ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                       radius=radius, **kargs)
        pylab.ylabel(" ")
        for text in ax.texts:
            # large, x-small, small, None, x-large, medium, xx-small,
            # smaller, xx-large, larger
            text.set_size("small")
            text.set_color(textcolor)
        for wedge in ax.patches:
            wedge.set_linewidth(1)
            wedge.set_edgecolor("k")
        self.ax = ax
    elif kind == "barh":
        ax = data.plot(kind=kind, **kargs)
        pylab.xlabel(" percentage ")

    return data