def imshow_qualities(self):
    """Show the mean quality per tile as an image.

    The entries of ``self.qualities`` are grouped by the tile returned by
    :meth:`_get_tile_info`. For every tile (in sorted order) the column-wise
    mean of its entries is stored as one row of ``self.data_imqual``, which
    is then drawn with biokit's ``Imshow``.

    ::

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.imshow_qualities()
        from pylab import tight_layout; tight_layout()

    """
    tile_info = self._get_tile_info()

    # gather the quality entries that share the same tile
    per_tile = defaultdict(list)
    for tile, quality in zip(tile_info['tiles'], self.qualities):
        per_tile[tile].append(quality)

    rows = []
    for tile in sorted(per_tile):
        rows.append(pd.DataFrame(per_tile[tile]).mean().values)
    self.data_imqual = rows

    from biokit.viz import Imshow
    image = Imshow(self.data_imqual)
    image.plot(xticks_on=False, yticks_on=False, origin='lower')
    pylab.title("Quality per tile", fontsize=self.fontsize)
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("tile number")
def check(self, bins=60):
    """Overlay the target curve on the normalised histogram of ``self.vec``.

    :param bins: binning forwarded to :func:`pylab.hist`.
    """
    heights, _edges, _patches = pylab.hist(self.vec, bins, density=True,
                                           lw=0.5, ec="k")
    tallest = max(heights)

    # this normalisation is an approximation/hack: the target curve is
    # rescaled so that its maximum equals the tallest histogram bar
    scaling = max(self.Ytarget) / tallest
    pylab.plot(self.Xtarget, self.Ytarget / scaling, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def hist_contig_length(self, bins=30, fontsize=16):
    """Plot the histogram of the ``length`` column of ``self.df``.

    The current figure is cleared before plotting.

    :param bins: binning forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the x and y labels.
    """
    pylab.clf()
    pylab.hist(self.df.length, bins=bins, ec="k", lw=1)
    pylab.grid()

    label_style = {"fontsize": fontsize}
    pylab.xlabel("Contig length", **label_style)
    pylab.ylabel("#", **label_style)

    n_contigs = len(self.df)
    pylab.title("Distribution {} contigs".format(n_contigs))
def imshow_qualities(self):
    """Show the mean quality per tile as an image.

    The entries of ``self.qualities`` are grouped by the tile returned by
    :meth:`_get_tile_info`. For every tile (in sorted order) the column-wise
    mean of its entries is stored as one row of ``self.data_imqual``, which
    is then drawn with ``sequana.viz.Imshow``.

    ::

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.imshow_qualities()
        from pylab import tight_layout; tight_layout()

    """
    tile_info = self._get_tile_info()

    # gather the quality entries that share the same tile
    grouped = defaultdict(list)
    for tile, quality in zip(tile_info['tiles'], self.qualities):
        grouped[tile].append(quality)

    means = []
    for tile in sorted(grouped):
        means.append(pd.DataFrame(grouped[tile]).mean().values)
    self.data_imqual = means

    from sequana.viz import Imshow
    image = Imshow(self.data_imqual)
    image.plot(xticks_on=False, yticks_on=False, origin='lower')
    pylab.title("Quality per tile", fontsize=self.fontsize)
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("tile number")
def plot_specific_alignment(self, bamfile, query_name, motif, clf=True,
        show_figure=True, authorized_flags=[0, 16],
        windows=[10, 50, 100, 150, 200, 250, 500, 1000], local_threshold=5):
    """Count a motif along one alignment using several sliding windows.

    The first alignment of *bamfile* whose query name equals *query_name*
    and whose flag is in *authorized_flags* is used. For each window size,
    the motif is counted in every window starting at each position of the
    query sequence.

    :param bamfile: file given to ``BAM``.
    :param str query_name: name of the alignment to look for.
    :param str motif: motif counted with :meth:`str.count`.
    :param bool clf: clear the current figure before plotting.
    :param bool show_figure: plot the counts, one curve per window size.
    :param authorized_flags: accepted alignment flags (only read, never
        modified here).
    :param windows: window sizes (only read, never modified here).
    :param local_threshold: a position is counted when its motif count is
        strictly greater than this value.
    :return: list with, for each window size, the number of positions
        above the threshold minus the window size. The list is empty (and
        a message is printed) when no alignment is found.
    """
    # we may have several entries. let us pick up the first
    found = next(
        (aln for aln in BAM(bamfile)
         if aln.query_name == query_name and aln.flag in authorized_flags),
        None)

    sizes = []
    if not found:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    # Detection
    seq = found.query_sequence
    if clf:
        pylab.clf()

    for window in windows:
        counts = [seq[start:start + window].count(motif)
                  for start in range(len(seq))]
        if show_figure:
            pylab.plot(counts, label=window)
        n_above = sum(1 for count in counts if count > local_threshold)
        sizes.append(n_above - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
    return sizes
def plot_alignment(self, bamfile, motif, window=200,
        global_th=10, title=None, legend=True, legend_fontsize=11,
        valid_rnames=[], valid_flags=[]):
    """Plot sliding-window motif counts for the alignments of a BAM file.

    :param bamfile: file given to ``BAM``.
    :param str motif: motif counted with :meth:`str.count`.
    :param int window: size of the sliding window.
    :param global_th: not used by this method.
    :param title: optional figure title.
    :param bool legend: show a legend when fewer than 15 alignments are
        plotted.
    :param int legend_fontsize: font size of the legend.
    :param valid_rnames: if not empty, keep only alignments whose
        ``rname`` is in this list.
    :param valid_flags: if not empty, keep only alignments whose ``flag``
        is in this list.
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    n_plotted = 0
    for aln in bam:
        rejected = ((valid_rnames and aln.rname not in valid_rnames)
                    or (valid_flags and aln.flag not in valid_flags))
        if rejected:
            continue

        seq = aln.query_sequence
        if not seq:
            continue

        n_plotted += 1
        counts = [seq[start:start + window].count(motif)
                  for start in range(len(seq))]
        first = aln.reference_start
        pylab.plot(range(first, first + len(seq)), counts,
                   label=aln.query_name)

    print("Showing {} entries after filtering".format(n_plotted))

    # 20% head room above window / len(motif)
    upper = int(1.2 * window / len(motif))
    pylab.ylim([0, upper])
    if legend and n_plotted < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
    """Bar plot of the number of ORF and CDS found in each frame.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None.

    :param float alpha: transparency of the bars.
    :param bins: not used by this method.
    :param str xlabel: label of the x axis.
    :param str ylabel: label of the y axis.
    :param float bar_width: width of each bar; the ORF and CDS bars are
        shifted by half of it on each side of the frame value.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    frames = [-3, -2, -1, 1, 2, 3]

    def count_non_null(frame, column):
        # number of non-missing values of *column* for the given frame
        in_frame = self._ORF_pos[self._ORF_pos["frame"] == frame]
        return in_frame[column].dropna().shape[0]

    # number of ORF and CDS found by frame
    nb_res_ORF = []
    nb_res_CDS = []
    for frame in frames:
        nb_res_ORF.append(count_non_null(frame, "len_ORF"))
        nb_res_CDS.append(count_non_null(frame, "len_CDS"))

    half_width = bar_width / 2
    pylab.bar(np.array(frames) - half_width, nb_res_ORF, bar_width,
              alpha=alpha, label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(np.array(frames) + half_width, nb_res_CDS, bar_width,
              alpha=alpha, label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def plot(self, color_line='r', bgcolor='grey', color='yellow', lw=4,
        hold=False, ax=None):
    """Plot the mean quality per position with a +/- one std band.

    Three coloured background bands are drawn first (0-20 red, 20-30
    orange, 30-41 green), then the band ``mean +/- std`` of ``self.df``
    and finally the mean itself.

    :param color_line: colour of the mean curve.
    :param bgcolor: not used by this method.
    :param color: colour of the ``mean +/- std`` band.
    :param lw: line width of the mean curve.
    :param hold: not used by this method.
    :param ax: optional axes made current with :func:`pylab.sca`.
    """
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)

    for low, high, band_color in ((0, 20, 'red'),
                                  (20, 30, 'orange'),
                                  (30, 41, 'green')):
        pylab.fill_between([0, xmax], [low, low], [high, high],
                           color=band_color, alpha=0.3)

    # Previously X was only bound when self.X was None, so a user-provided
    # self.X ended in an UnboundLocalError. Use it when available.
    if self.X is None:
        X = range(1, self.xmax + 1)
    else:
        X = self.X

    mean = self.df.mean()
    std = self.df.std()
    pylab.fill_between(X, mean + std, mean - std, color=color,
                       interpolate=False)
    pylab.plot(X, mean, color=color_line, lw=lw)

    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
        ylabel="#"):
    """Histograms of the ORF and CDS lengths on a linear scale.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None. Missing lengths are dropped before plotting.

    :param float alpha: transparency of the histograms.
    :param bins: binning forwarded to :func:`pylab.hist`.
    :param str xlabel: label of the x axis.
    :param str ylabel: label of the y axis.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    orf_lengths = self._ORF_pos["len_ORF"].dropna()
    cds_lengths = self._ORF_pos["len_CDS"].dropna()

    # plot for all ORF and CDS
    for lengths, tag in ((orf_lengths, "ORF"), (cds_lengths, "CDS")):
        pylab.hist(lengths, alpha=alpha,
                   label=tag + ", N = " + str(lengths.shape[0]), bins=bins)

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    filter_info = (self._type_filter, self._threshold)
    pylab.title("Length of ORF and CDS (after filter %s > %d)" % filter_info)
def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the sequence lengths of the consensus isoforms.

    A second y axis shows the total number of reads minus the cumulated
    bin counts (labelled "CDF").

    :param str mode: "all" uses the low and high quality sequences, "lq"
        only ``self.lq_sequence``; any other value only
        ``self.hq_sequence``.
    :param bins: binning forwarded to :func:`pylab.hist`.
    :param float rwidth: forwarded to :func:`pylab.hist`.
    :param str align: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param kwargs: any other argument accepted by :func:`pylab.hist`.
    """
    pylab.clf()

    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
    if mode == "all":
        lengths = lq_lengths + hq_lengths
    elif mode == "lq":
        lengths = lq_lengths
    else:
        lengths = hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axes = pylab.gca()
    main_axes.set_ylim(bottom=0)
    main_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # number of reads left after each bin, drawn on a twin axis
    twin_axes = pylab.gca().twinx()
    half_bin = (edges[1] - edges[0]) / 2
    remaining = len(lengths) - pylab.cumsum(counts)
    twin_axes.plot(edges[0:-1] - half_bin, remaining, "k")
    twin_axes.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title="", clip_upper_SNR=30):
    """Plot histogram of the ACGT SNRs for all reads

    :param int bins: binning for the histogram. Note that the range starts
        at 0 and ends at clip_upper_SNR
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param str title:
    :param clip_upper_SNR: SNR values above this limit are set to the
        limit before being histogrammed

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    if len(self._df['snr_A'].dropna()) == 0:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    # largest SNR over the four letters, capped by clip_upper_SNR
    maxSNR = 0
    for letter in "ACGT":
        letter_max = self._df.loc[:, "snr_{}".format(letter)].max()
        if letter_max > maxSNR:
            maxSNR = letter_max
    if maxSNR > clip_upper_SNR:
        maxSNR = clip_upper_SNR

    bins = pylab.linspace(0, maxSNR, bins)

    # Series.clip_upper was removed from pandas (1.0); clip(upper=...) is
    # the equivalent call.
    for letter in "ACGT":
        values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
        pylab.hist(values, alpha=alpha, label=letter, bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def check(self, bins=60):
    """Compare the histogram of ``self.vec`` with the target curve.

    :param bins: binning forwarded to :func:`pylab.hist`.
    """
    bar_heights, _, _ = pylab.hist(self.vec, bins, density=True, lw=0.5,
                                   ec="k")
    highest_bar = max(bar_heights)

    # this normalisation is an approximation/hack: the target is divided
    # by the ratio between its own maximum and the highest bar
    ratio = max(self.Ytarget) / highest_bar
    pylab.plot(self.Xtarget, self.Ytarget / ratio, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def plotter(filename, key):
    """Plot the histogram stored under *key* and save it to *filename*.

    ``histograms`` and ``count`` are not defined here: they are looked up
    in the enclosing scope.

    :param filename: output file given to :func:`pylab.savefig`.
    :param str key: key in ``histograms``; spaces are replaced by
        underscores in the title.
    """
    name = key.replace(" ", "_")
    pylab.ioff()
    histograms[key].plot(logy=False, lw=2, marker="o")
    suffix = "(%s)" % count
    pylab.title(name + suffix)
    pylab.grid(True)
    pylab.savefig(filename)
    # need to close the figure otherwise warnings
    pylab.close()
def hist_plot_contig_length(self, bins=40, fontsize=16):
    """Plot the distribution of ``self.fasta.lengths``.

    :param bins: binning forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the x and y labels.
    """
    n_sequences = len(self.fasta.sequences)
    pylab.hist(self.fasta.lengths, bins=bins, ec="k", lw=1)
    pylab.grid()

    label_style = {"fontsize": fontsize}
    pylab.xlabel("Contig length", **label_style)
    pylab.ylabel("#", **label_style)
    pylab.title("Distribution {} contigs".format(n_sequences))
def plotter(filename, key):
    """Draw ``histograms[key]`` and write the figure to *filename*.

    Both ``histograms`` and ``count`` come from the enclosing scope.

    :param filename: output file given to :func:`pylab.savefig`.
    :param str key: key in ``histograms``; used (with spaces replaced by
        underscores) in the title.
    """
    title_prefix = key.replace(" ", "_")
    pylab.ioff()
    histograms[key].plot(logy=False, lw=2, marker="o")
    count_tag = "(%s)" % count
    pylab.title(title_prefix + count_tag)
    pylab.grid(True)
    pylab.savefig(filename)
    # need to close the figure otherwise warnings
    pylab.close()
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title including the mean GC is built

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    gc_content = self.df.loc[:, 'GC_content']
    mean_GC = np.mean(gc_content)

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    legend_label = (label + ", mean : " + str(round(mean_GC, 2))
                    + ", N : " + str(len(self)))
    pylab.hist(gc_content, bins=bins, alpha=alpha, label=legend_label)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])

    # tight_layout is best effort only. Catch Exception rather than using a
    # bare except so that KeyboardInterrupt/SystemExit still propagate.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_genesets_hist(self, bins=20):
    """Histogram of the sizes of the gene sets in ``self.gene_sets``.

    The figure is cleared first and the x axis is forced to start at 0.

    :param bins: binning forwarded to :func:`pylab.hist`.
    """
    n_sets = len(self.gene_sets.keys())
    pylab.clf()

    set_sizes = [len(genes) for genes in self.gene_sets.values()]
    pylab.hist(set_sizes, bins=bins, lw=1, ec="k")
    pylab.title("{} gene sets".format(n_sets))
    pylab.xlabel("Gene set sizes")
    pylab.grid(True)

    # keep the current upper limit but start the axis at zero
    _, upper = pylab.xlim()
    pylab.xlim([0, upper])
def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="SNR", ylabel="#", title=""):
    """Plot one histogram per letter (A, C, G, T) of the reads SNR

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_snr()

    """
    if self._df is None:
        self._get_df()

    # old pacbio format has no SNR stored
    has_snr = len(self._df['snr_A'].dropna()) != 0
    if not has_snr:
        # nothing to plot
        from sequana import sequana_data
        pylab.clf()
        pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
        pylab.gca().axis('off')
        return

    if hold is False:
        pylab.clf()

    for letter in "ACGT":
        pylab.hist(self._df.loc[:, 'snr_' + letter], alpha=alpha,
                   label=letter, bins=bins)

    pylab.legend()
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True,
        ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: binning forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)

    # The tick adjustment used to rely on a range that was only created
    # when bins was None, so passing bins raised an UnboundLocalError.
    # The length of range(1, max_nb_pass + 1) is max_nb_pass, hence the
    # direct comparison.
    if max_nb_pass < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_len(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Read Length", ylabel="#", label="", title=None):
    """Plot the histogram of the read lengths

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title including the mean length is built

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_len()

    """
    if self._df is None:
        self._get_df()
    mean_len = np.mean(self._df.loc[:, 'read_length'])

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    # histogram of the read lengths
    if hold is False:
        pylab.clf()
    legend_label = "%s, mean : %.0f, N : %d" % (label, mean_len, self._N)
    pylab.hist(self._df.loc[:, 'read_length'], bins=bins, alpha=alpha,
               label=legend_label)

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,
        alpha=1, output_filename=None):
    """Accept or reject each simulated read length and plot the outcome.

    Each read length of ``self.bam_simul.df`` is taken in turn as a
    candidate. It is accepted when a uniform random number in [0, 1) is
    below ``min(alpha, target(candidate) / target(current))``; an accepted
    candidate becomes the current value. One boolean per candidate is
    stored in ``self.tokeep``.

    :param bins: binning of the histograms.
    :param xmin: start of the x range used to draw the target curve.
    :param xmax: end (excluded) of that range.
    :param step: step of that range.
    :param int burn: number of leading accepted values left out of the
        histogram.
    :param alpha: upper bound of the acceptance probability.
    :param output_filename: if given, ``self.bam_simul.filter_bool`` is
        called with this name and ``self.tokeep``.
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    # `normed` has been removed from numpy.histogram and matplotlib's hist;
    # `density` is the replacement keyword.
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    lengths = self.bam_simul.df.read_length.values
    self.tokeep = []
    vec = []
    x = self.bam.df.read_length.mean()
    for i in range(self.bam_simul.df.shape[0]):
        can = lengths[i]
        # acceptance probability
        aprob = min([
            alpha,
            self.target_distribution(can) / self.target_distribution(x)
        ])
        u = pylab.uniform(0, 1)
        if u < aprob:
            x = can
            vec.append(x)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results:
    # theoretical curve
    x = pylab.arange(xmin, xmax, step)
    y = self.target_distribution(x)
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(vec)
    pylab.subplot(212)

    pylab.hist(vec[burn:], bins=bins, density=True)
    pylab.plot(x, y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[60, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot a 2D histogram of GC content versus read length

    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid or not
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    :param cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    if self._df is None:
        self._get_df()
    mean_len = np.mean(self._df.loc[:, 'read_length'])
    mean_GC = np.mean(self._df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    # rows with a missing value are left out of the 2D histogram
    pairs = self._df.loc[:, ['read_length', 'GC_content']].dropna()
    histogram = biokit.viz.hist2d.Hist2D(pairs)
    histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                   cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    means = (mean_len, mean_GC)
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                % means, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def hist_ZMW_subreads(self, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True,
        ylabel="#", label="", title="Number of ZMW passes"):
    """Bar plot of the number of reads per ZMW (number of passes)

    ``self._get_ZMW_passes()`` is called first when ``self._nb_pass`` is
    still None.

    :param float alpha: transparency of the bars
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the bars (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_ZMW_subreads()

    """
    if self._nb_pass is None:
        self._get_ZMW_passes()

    highest = max(self._nb_pass.keys())
    passes = range(1, highest + 1)
    heights = []
    for n_pass in passes:
        heights.append(self._nb_pass[n_pass])

    # bar plot of the number of passes
    if hold is False:
        pylab.clf()
    pylab.bar(passes, heights, alpha=alpha, label=label, log=logy)
    if len(passes) < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
    """Count the mapped entries per group of reference names.

    Rows whose ``reference_name`` is -1 or '-1' are ignored. The group of
    a row is the first *shift* characters of its reference name.

    :param title: title of the bar plot (used only if *plot* is True).
    :param int shift: number of leading characters of the reference name
        that define the group.
    :param bool plot: draw a bar plot of the counts.
    :param mapq_min: only rows with ``mapq`` strictly above this value are
        counted.
    :return: tuple made of the counts per group (an empty Series when no
        row has a reference) and ``self.df``.
    """
    with_reference = self.df.query("reference_name not in [-1, '-1']").copy()
    if not len(with_reference):
        return pd.Series(), self.df

    with_reference['group'] = with_reference.reference_name.apply(
        lambda name: name[0:shift])

    # @mapq_min in the query refers to the argument of this method
    selected = with_reference.query("mapq>@mapq_min")
    mapped = selected.groupby("group").count()["mapq"]
    mapped.name = None

    if plot:
        mapped.plot(kind="bar")
        pylab.title(title)
        pylab.tight_layout()
    return mapped, self.df
def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
    """Return the number of mapped entries for each reference group.

    Entries whose ``reference_name`` is -1 or '-1' are discarded. The
    group is the reference name truncated to its first *shift* characters.

    :param title: title of the optional bar plot.
    :param int shift: length of the reference name prefix used as group.
    :param bool plot: if True, draw the counts as a bar plot.
    :param mapq_min: entries are counted only when their ``mapq`` is
        strictly greater than this value.
    :return: ``(counts, self.df)`` where counts is a Series indexed by
        group, or an empty Series when nothing is left after discarding.
    """
    kept = self.df.query("reference_name not in [-1, '-1']").copy()
    if not len(kept):
        return pd.Series(), self.df

    def prefix(reference_name):
        return reference_name[0:shift]

    kept['group'] = kept.reference_name.apply(prefix)

    # @mapq_min in the query refers to the argument of this method
    per_group = kept.query("mapq>@mapq_min").groupby("group").count()
    mapped = per_group["mapq"]
    mapped.name = None

    if plot:
        mapped.plot(kind="bar")
        pylab.title(title)
        pylab.tight_layout()
    return mapped, self.df
def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length",
        ylabel="#"):
    """Plot the ORF and CDS length histograms (linear scale).

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None. Missing lengths are dropped before plotting.

    :param float alpha: transparency of the histograms.
    :param bins: binning forwarded to :func:`pylab.hist`.
    :param str xlabel: label of the x axis.
    :param str ylabel: label of the y axis.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    len_orf = self._ORF_pos["len_ORF"].dropna()
    len_cds = self._ORF_pos["len_CDS"].dropna()
    n_orf = len_orf.shape[0]
    n_cds = len_cds.shape[0]

    # plot for all ORF and CDS
    pylab.hist(len_orf, alpha=alpha, bins=bins,
               label="ORF, N = " + str(n_orf))
    pylab.hist(len_cds, alpha=alpha, bins=bins,
               label="CDS, N = " + str(n_cds))

    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend()
    title = "Length of ORF and CDS (after filter %s > %d)" % (
        self._type_filter, self._threshold)
    pylab.title(title)
def boxplot_quality(self, color_line='r', bgcolor='grey', color='yellow',
        lw=4, hold=False, ax=None):
    """Plot the mean quality per position with a +/- one std band.

    The columns "0" to "41" of ``self.df`` are read as the counts of each
    quality score for every position (row label); the mean and standard
    deviation are derived from those counts and from the number of reads
    ``self.metadata['ReadNum']``. ``self.xmax`` is set to 150.

    :param color_line: colour of the mean curve.
    :param bgcolor: not used by this method.
    :param color: colour of the ``mean +/- std`` band.
    :param lw: line width of the mean curve.
    :param hold: not used by this method.
    :param ax: optional axes made current with :func:`pylab.sca`.
    """
    # not sure why we have phred score from 0 to 41
    quality = self.df[[str(x) for x in range(42)]]
    N = self.metadata['ReadNum']

    self.xmax = 150
    xmax = self.xmax + 1
    if ax:
        pylab.sca(ax)  # pragma no cover
    pylab.fill_between([0, xmax], [0, 0], [20, 20], color='red', alpha=0.3)
    pylab.fill_between([0, xmax], [20, 20], [30, 30], color='orange',
                       alpha=0.3)
    pylab.fill_between([0, xmax], [30, 30], [41, 41], color='green',
                       alpha=0.3)

    X = []
    Q = []
    S = []
    # the range is tied to self.xmax instead of repeating the literal 151
    for pos in range(1, self.xmax + 1):
        row = quality.loc[pos]
        # NOTE(review): the mean weights each count by (score + 1) whereas
        # the std below uses the raw scores 0..41 -- confirm which is meant
        mean_quality = sum((int(k) + 1) * v for k, v in row.items()) / N
        proba = row / N
        std = pylab.sqrt(sum((x - mean_quality) ** 2 * y
                             for x, y in zip(range(42), proba)))
        X.append(pos)
        Q.append(mean_quality)
        S.append(std)

    # the debugging print() calls on the list lengths have been removed

    Q = np.array(Q)
    X = np.array(X)
    S = np.array(S)
    pylab.fill_between(X, Q + S, Q - S, color=color, interpolate=False)

    pylab.plot(X, Q, color=color_line, lw=lw)
    pylab.ylim([0, 41])
    pylab.xlim([0, self.xmax + 1])
    pylab.title("Quality scores across all bases")
    pylab.xlabel("Position in read (bp)")
    pylab.ylabel("Quality")
    pylab.grid(axis='x')
def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
        fontsize=12, grid=True, title="Repeat length",
        xlabel="Repeat length", ylabel="#", logy=True):
    """Plot the histogram of ``self.list_len_repeats``

    :param bins: binning for the histogram
    :param float alpha: transparency of the histogram
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the x and y labels
    :param bool grid: add a grid or not
    :param str title:
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use a log scale on the y axis
    """
    # check that user has set a threshold
    if hold is False:
        pylab.clf()

    pylab.hist(self.list_len_repeats, bins=bins, alpha=alpha)
    pylab.title(title)

    label_style = {"fontsize": fontsize}
    pylab.xlabel(xlabel, **label_style)
    pylab.ylabel(ylabel, **label_style)

    if grid is True:
        pylab.grid(True)
    if logy:
        pylab.semilogy()
def hist_polymerase_per_barcode(self, bins=10, fontsize=12):
    """Histogram of the number of polymerase reads per barcode

    The title reports the sum of the "Polymerase Reads" column of
    ``self.df_barcoded``.

    :param bins: binning for the histogram
    :param int fontsize: font size of the x and y labels
    """
    polymerase_reads = self.df_barcoded["Polymerase Reads"]
    PR = polymerase_reads.sum()
    polymerase_reads.hist(bins=bins, ec="k", rwidth=0.8)
    pylab.title("Total Polymerase count: {}".format(PR))
    pylab.xlabel("Number of Polymerase Reads", fontsize=fontsize)
    pylab.ylabel("Number of Barcoded Samples", fontsize=fontsize)

    # tight_layout is best effort only. Catch Exception rather than using a
    # bare except so that KeyboardInterrupt/SystemExit still propagate.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def hist_GC(self, bins=50, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#"):
    """Plot the histogram of the GC content

    :param bins: binning for the histogram
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    """
    if self._df is None:
        self._get_df()
    mean_GC = np.mean(self._df.loc[:, 'GC_content'])

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(self._df.loc[:, 'GC_content'], bins=bins)

    label_style = {"fontsize": fontsize}
    pylab.xlabel(xlabel, **label_style)
    pylab.ylabel(ylabel, **label_style)
    pylab.title("GC %% \n Mean GC : %.2f" % (mean_GC), **label_style)
    if grid is True:
        pylab.grid(True)
def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
    """Histogram of the sequence lengths of the consensus isoforms.

    A second y axis shows, at the middle of each bin, the total number of
    reads minus the cumulated bin counts (labelled "CDF").

    :param str mode: "all" uses the low and high quality sequences, "lq"
        only ``self.lq_sequence``; any other value only
        ``self.hq_sequence``.
    :param bins: binning forwarded to :func:`pylab.hist`.
    :param float rwidth: forwarded to :func:`pylab.hist`.
    :param str align: forwarded to :func:`pylab.hist`.
    :param int fontsize: font size of the axis labels.
    :param edgecolor: edge colour of the bars.
    :param kwargs: any other argument accepted by :func:`pylab.hist`.
    """
    pylab.clf()

    lq_lengths = [len(read['sequence']) for read in self.lq_sequence]
    hq_lengths = [len(read['sequence']) for read in self.hq_sequence]
    if mode == "all":
        lengths = lq_lengths + hq_lengths
    elif mode == "lq":
        lengths = lq_lengths
    else:
        lengths = hq_lengths

    counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                  align=align, ec=edgecolor, **kwargs)
    main_axes = pylab.gca()
    main_axes.set_ylim(bottom=0)
    main_axes.set_xlim(left=0)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("Number of reads", fontsize=fontsize)
    pylab.grid()

    # number of reads left after each bin, drawn on a twin axis
    twin_axes = pylab.gca().twinx()
    half_bin = (edges[1] - edges[0]) / 2
    remaining = len(lengths) - pylab.cumsum(counts)
    twin_axes.plot(edges[1:] - half_bin, remaining, "k")
    twin_axes.set_ylim(bottom=0)
    pylab.ylabel("CDF", fontsize=fontsize)
    pylab.title("Read length of Consensus isoforms reads")
def histogram_gc_content(self):
    """Plot the histogram of ``self.gc_list`` with unit bins from 0 to 99

    .. plot::
        :include-source:

        from sequana import sequana_data
        from sequana import FastQC
        filename = sequana_data("test.fastq", "testing")
        qc = FastQC(filename)
        qc.histogram_gc_content()

    """
    unit_bins = range(0, 100)
    pylab.hist(self.gc_list, bins=unit_bins)
    pylab.grid()
    pylab.title("GC content distribution (per sequence)")
    pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
    pylab.xlim([0, 100])
def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
    """Plot side by side the number of ORF and CDS found in each frame.

    ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
    still None.

    :param float alpha: transparency of the bars.
    :param bins: not used by this method.
    :param str xlabel: label of the x axis.
    :param str ylabel: label of the y axis.
    :param float bar_width: width of each bar; the two series are shifted
        by half of it around the frame value.
    """
    if self._ORF_pos is None:
        self._find_ORF_CDS()

    # number of ORF and CDS found by frame
    frames = [-3, -2, -1, 1, 2, 3]
    nb_res_ORF = []
    nb_res_CDS = []
    for frame in frames:
        for column, counts in (("len_ORF", nb_res_ORF),
                               ("len_CDS", nb_res_CDS)):
            selection = self._ORF_pos[self._ORF_pos["frame"] == frame]
            counts.append(selection[column].dropna().shape[0])

    offset = bar_width / 2
    pylab.bar(np.array(frames) - offset, nb_res_ORF, bar_width,
              alpha=alpha, label="ORF N = %d" % sum(nb_res_ORF))
    pylab.bar(np.array(frames) + offset, nb_res_CDS, bar_width,
              alpha=alpha, label="CDS N = %d" % sum(nb_res_CDS))
    pylab.xlabel(xlabel)
    pylab.ylabel(ylabel)
    pylab.legend(loc=1)
    pylab.title("Number of ORF and CDS by frame")
def plot_GC_read_len(self, alpha=0.07, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#"):
    """Plot a 2D histogram of GC content versus read length

    :param alpha: not used by this method
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the x and y labels
    :param grid: not used by this method
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    """
    if self._df is None:
        self._get_df()
    mean_len = np.mean(self._df.loc[:, 'read_length'])
    mean_GC = np.mean(self._df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()
    data = self._df.loc[:, ['read_length', 'GC_content']]
    h = hist2d.Hist2D(data)
    # the keyword was misspelled `nnorm`; `norm` is the name used by the
    # other GC/length plots of this file
    h.plot(bins=[40, 40], contour=False, norm='log', Nlevels=6)

    # honour the fontsize argument (its default equals the former
    # hard-coded value of 12)
    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    # the literal percent sign must be doubled: "% v" is parsed as a
    # conversion specifier and raised a ValueError
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                % (mean_len, mean_GC))
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title including the mean GC is built

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    gc_values = self.df.loc[:, 'GC_content']
    mean_GC = np.mean(gc_values)

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    legend_label = (label + ", mean : " + str(round(mean_GC, 2))
                    + ", N : " + str(len(self)))
    pylab.hist(gc_values, bins=bins, alpha=alpha, label=legend_label)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])

    # tight_layout is best effort only. Catch Exception rather than using a
    # bare except so that KeyboardInterrupt/SystemExit still propagate.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_alignment(self, motif, window=200, global_th=10, title=None,
        legend=True, legend_fontsize=11):
    """Plot sliding-window motif counts for the selected alignments.

    The selection comes from ``self._get_aligments``; every alignment of
    ``self.bamfile`` whose query name is in that selection and that has a
    sequence is plotted.

    :param str motif: motif counted with :meth:`str.count`.
    :param int window: size of the sliding window.
    :param global_th: forwarded to ``self._get_aligments``.
    :param title: optional figure title.
    :param bool legend: show a legend when fewer than 15 alignments are
        plotted.
    :param int legend_fontsize: font size of the legend.
    :return: the dataframe returned by ``self._get_aligments``.
    """
    df = self._get_aligments(motif=motif, window=window,
                             global_th=global_th)
    print("Found {} hits".format(len(df)))

    bam = BAM(self.bamfile)
    pylab.clf()
    n_plotted = 0
    for aln in bam:
        if aln.query_name not in df.query_name.values:
            continue
        seq = aln.query_sequence
        if not seq:
            continue

        n_plotted += 1
        counts = [seq[start:start + window].count(motif)
                  for start in range(len(seq))]
        first = aln.reference_start
        pylab.plot(range(first, first + len(seq)), counts,
                   label=aln.query_name)

    # 20% head room above window / len(motif)
    upper = int(1.2 * window / len(motif))
    pylab.ylim([0, upper])
    if legend and n_plotted < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)

    return df
def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", logy=True,
        ylabel="#", label="", title="Number of ZMW passes"):
    """Plot histogram of number of reads per ZMW (number of passes)

    :param bins: binning forwarded to :func:`pylab.hist`
    :param float alpha: transparency of the histograms
    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    :param bool logy: use log scale on the y axis (default to True)
    :param str label: label of the histogram (for the legend)
    :param str title:

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_nb_passes()

    """
    max_nb_pass = self.df.nb_passes.max()

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
               label=label, log=logy, width=1)

    # The tick adjustment used to rely on a range that was only created
    # when bins was None, so passing bins raised an UnboundLocalError.
    # The length of range(1, max_nb_pass + 1) is max_nb_pass, hence the
    # direct comparison.
    if max_nb_pass < 5:
        pylab.xticks(range(6), range(6))

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_scatter_contig_length_nread_cov(self, fontsize=16, vmin=0,
        vmax=50, min_nreads=20, min_length=50000):
    """Scatter plot of contig reads versus contig length.

    Points are coloured by the ``covStat`` column. A straight line fitted
    by least squares on the contigs with more than *min_nreads* reads and
    a length above *min_length* is added; both axes use a log scale.

    :param int fontsize: font size of the x and y labels.
    :param vmin: lower limit of the colour scale.
    :param vmax: upper limit of the colour scale.
    :param min_nreads: contigs need strictly more reads than this value to
        be used in the fit.
    :param min_length: contigs need to be strictly longer than this value
        to be used in the fit.
    """
    if self._df is None:
        _ = self.get_df()
    pylab.clf()
    df = self._df

    m1 = df.length.min()
    M1 = df.length.max()

    # least square, restricted to the contigs passing both thresholds
    # (@min_nreads and @min_length refer to the arguments of this method)
    selection = df.query("nread>@min_nreads and length>@min_length")
    A = np.vstack([selection['length'], np.ones(len(selection))]).T
    # Series.as_matrix() was removed from pandas (1.0); .values returns the
    # same numpy array. The debugging print() calls have been removed.
    m, c = np.linalg.lstsq(A, selection['nread'].values)[0]
    x = np.array([m1, M1])

    pylab.scatter(df['length'], df['nread'], c=df['covStat'],
                  vmin=vmin, vmax=vmax)
    pylab.colorbar()
    pylab.xlabel("Contig length", fontsize=fontsize)
    pylab.ylabel("Contig reads", fontsize=fontsize)
    pylab.title("coverage function of contig length and reads used")
    pylab.grid()
    pylab.plot(x, m * x + c, "o-r")
    pylab.loglog()
    pylab.tight_layout()
def hist_ZMW_subreads(self, hold=False, fontsize=12,
        grid=True, xlabel="Number of ZMW passes", ylabel="#"):
    """Plot histogram of number of reads per ZMW

    ``self._get_ZMW_passes()`` is called first when ``self._nb_pass`` is
    still None. The y axis uses a log scale.

    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: font size of the labels and title
    :param bool grid: add a grid or not
    :param str xlabel:
    :param str ylabel:
    """
    if self._nb_pass is None:
        self._get_ZMW_passes()

    max_nb_pass = max(self._nb_pass.keys())
    passes = range(1, max_nb_pass + 1)
    weights = []
    for n_pass in passes:
        weights.append(self._nb_pass[n_pass])

    # histogram nb passes
    if hold is False:
        pylab.clf()
    pylab.hist(passes, weights=weights, bins=max_nb_pass)

    label_style = {"fontsize": fontsize}
    pylab.xlabel(xlabel, **label_style)
    pylab.ylabel(ylabel, **label_style)
    pylab.yscale('log')
    pylab.title("Number of ZMW passes", **label_style)
    if grid is True:
        pylab.grid(True)
def plot_count_per_sample(self, fontsize=12, sample_list=None):
    """Bar plot of the total count (in millions) of each sample.

    The samples are the columns of ``self.df`` whose name starts with
    "norm", with "norm." removed from the name. The first three bars are
    red and the next three blue.

    :param int fontsize: font size of the labels and title
    :param sample_list: not used by this method

    .. plot::
        :include-source:

        from sequana.rnadiff import RNADiffResults
        from sequana import sequana_data

        r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
        r.plot_count_per_sample()

    """
    sample_names = [column.replace("norm.", "")
                    for column in self.df.columns
                    if column.startswith("norm")]
    n_samples = len(sample_names)
    totals = self.df[sample_names].sum()

    pylab.clf()
    bar_colors = ['r'] * 3 + ['b'] * 3
    pylab.bar(range(n_samples), (totals / 1000000).values,
              color=bar_colors, alpha=1)
    pylab.xlabel("Samples", fontsize=fontsize)
    pylab.ylabel("Total read count (millions)", fontsize=fontsize)
    pylab.grid(True)
    pylab.title("Total read count per sample", fontsize=fontsize)
def barplot(self, filename="lane{}_status.png", lanes=None):
    """Save, for each lane, a bar plot of the read counts.

    One blue bar is drawn per entry that is not named 'Undetermined' and
    one extra bar for the undetermined reads. That bar is green when the
    undetermined reads are below 20% of the others, orange below 50% and
    red otherwise (or when the others sum to zero).

    :param str filename: output name; ``{}`` is replaced by the lane.
    :param lanes: lanes to process; defaults to all the lanes found in
        the data returned by ``self.get_data_reads()``.
    """
    df = self.get_data_reads()
    if lanes is None:
        lanes = df.lane.unique()

    # NOTE: the loop variable must stay named `lane`, the queries below
    # refer to it through @lane
    for lane in lanes:
        pylab.clf()
        query = "lane==@lane and name!='Undetermined'"
        counts = df.query(query)['count']
        total = counts.sum()
        L = len(counts)

        query = "lane==@lane and name=='Undetermined'"
        under = df.query(query)['count'].sum()
        if total > 0:
            pylab.bar(range(L), counts, color="b", label="reads")

        if total == 0:
            color = "red"
        else:
            percent = 100 * under / total
            if percent < 20:
                color = "green"
            elif percent < 50:
                color = "orange"
            else:
                color = "red"

        pylab.bar(range(L, L + 1), under, color=color, label="undetermined")
        pylab.xticks([])
        pylab.ylabel("Number of reads")

        # the legend is best effort only. Catch Exception rather than using
        # a bare except so that KeyboardInterrupt/SystemExit still propagate.
        try:
            pylab.legend(loc="lower left")
        except Exception:
            pass
        pylab.title("Lane {}".format(lane))
        pylab.savefig(filename.format(lane), dpi=200)
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,
        alpha=1, output_filename=None):
    """Accept or reject each simulated read length and plot the outcome.

    Each read length of ``self.bam_simul.df`` is taken in turn as a
    candidate. It is accepted when a uniform random number in [0, 1) is
    below ``min(alpha, target(candidate) / target(current))``; an accepted
    candidate becomes the current value. One boolean per candidate is
    stored in ``self.tokeep``.

    :param bins: binning of the histograms.
    :param xmin: start of the x range used to draw the target curve.
    :param xmax: end (excluded) of that range.
    :param step: step of that range.
    :param int burn: number of leading accepted values left out of the
        histogram.
    :param alpha: upper bound of the acceptance probability.
    :param output_filename: if given, ``self.bam_simul.filter_bool`` is
        called with this name and ``self.tokeep``.
    """
    # compute histogram of the input reads once for all to be used
    # in the target_distribution method
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins,
                                  density=True)

    candidates = self.bam_simul.df.read_length.values
    self.tokeep = []
    accepted_values = []
    current = self.bam.df.read_length.mean()
    for index in range(self.bam_simul.df.shape[0]):
        candidate = candidates[index]
        # acceptance probability
        ratio = (self.target_distribution(candidate)
                 / self.target_distribution(current))
        aprob = min([alpha, ratio])
        draw = pylab.uniform(0, 1)
        if draw < aprob:
            current = candidate
            accepted_values.append(current)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # plotting the results:
    # theoretical curve
    grid_x = pylab.arange(xmin, xmax, step)
    target_y = self.target_distribution(grid_x)
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(accepted_values)
    pylab.subplot(212)

    pylab.hist(accepted_values[burn:], bins=bins, density=1)
    pylab.plot(grid_x, target_y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot(self, bins=80, rwidth=0.8, **kwargs):
    """Plot the histogram of ``self.data``.

    Labels, grid and title come from the ``xlabel``, ``ylabel``,
    ``fontsize``, ``grid`` and ``title`` attributes of the instance.

    :param bins: binning forwarded to :func:`pylab.hist`.
    :param float rwidth: forwarded to :func:`pylab.hist`.
    :param kwargs: any other argument accepted by :func:`pylab.hist`.
    """
    pylab.clf()
    # the returned counts/edges were only needed by a disabled twin-axis
    # block (kept as a string literal), which has been removed
    pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)
    pylab.xlabel(self.xlabel, fontsize=self.fontsize)
    pylab.ylabel(self.ylabel, fontsize=self.fontsize)

    pylab.grid(self.grid)
    pylab.title(self.title)

    # tight_layout is best effort only. Catch Exception rather than using a
    # bare except so that KeyboardInterrupt/SystemExit still propagate.
    try:
        pylab.tight_layout()
    except Exception:
        pass
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
        grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot a 2D histogram of GC content versus read length

    :param bool hold: if False, the current figure is cleared first
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid or not
    :param str xlabel: not used by this method
    :param str ylabel: not used by this method
    :param cmap: colormap given to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    mean_len = np.mean(self.df.loc[:, 'read_length'])
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    # rows with a missing value are left out of the 2D histogram
    pairs = self.df.loc[:, ['read_length', 'GC_content']].dropna()
    histogram = biokit.viz.hist2d.Hist2D(pairs)
    histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                   cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    means = (mean_len, mean_GC)
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                % means, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def diagnostics(self, bins=60, clear=True):
    """Draw three stacked diagnostic panels.

    From top to bottom: histogram of ``self.aprob``, curve of ``self.vec``
    and normalised histogram of ``self.vec`` with the rescaled target
    curve on top.

    :param bins: binning of the two histograms.
    :param bool clear: clear the current figure first.
    """
    if clear:
        pylab.clf()

    pylab.subplot(3, 1, 1)
    pylab.hist(self.aprob, bins=bins)
    pylab.title("Acceptation")

    pylab.subplot(3, 1, 2)
    pylab.plot(self.vec)
    pylab.title("proposition")

    pylab.subplot(3, 1, 3)
    heights, _edges, _patches = pylab.hist(self.vec, bins, density=True,
                                           lw=0.5, ec="k")
    tallest = max(heights)

    # this normalisation is an approximation/hack: the target curve is
    # rescaled so that its maximum equals the tallest histogram bar
    scaling = max(self.Ytarget) / tallest
    pylab.plot(self.Xtarget, self.Ytarget / scaling, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")
def hist_median_ccs(self, bins=1000, **kwargs):
    """Group subreads by ZMW and plot the median read length of each group

    :param bins: binning for the histogram
    :param kwargs: any other argument accepted by the ``hist`` method of
        the dataframe of medians
    :return: the ``read_length``/``ZMW`` columns grouped by ``ZMW``
    """
    columns = self.df[['read_length', 'ZMW']]
    grouped = columns.groupby('ZMW')
    medians = grouped.median()
    medians.hist(bins=bins, **kwargs)
    pylab.title("CCS median read length")
    return grouped