def _merge_row(self, df, start, stop):
    """Collapse the rows labelled ``start`` to ``stop`` into one summary dict.

    :param df: dataframe with at least the columns ``chr``, ``cov``, ``rm``
        and ``zscore``.
    :param start: index label of the first row of the window.
    :param stop: index label of the last row of the window (included, since
        the selection is done with ``.loc``).
    :return: dictionary with the chromosome name read at ``start``, the
        window boundaries (``end`` is ``stop + 1``), its size, and the
        mean/max statistics of the window.
    """
    chrom = df["chr"][start]

    window = df.loc[start:stop]
    coverage = window["cov"]
    zscores = window["zscore"]

    mean_zscore = np.mean(zscores)
    # "max_zscore" is the most extreme value on the side given by the sign
    # of the mean: the maximum for a non-negative mean, the minimum otherwise
    if mean_zscore >= 0:
        extreme_zscore = zscores.max()
    else:
        extreme_zscore = zscores.min()

    return {
        "chr": chrom,
        "start": start,
        "end": stop + 1,
        "size": stop - start + 1,
        "mean_cov": np.mean(coverage),
        "mean_rm": np.mean(window["rm"]),
        "mean_zscore": mean_zscore,
        "max_zscore": extreme_zscore,
        "max_cov": np.max(coverage),
    }
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[60, 60],
                     grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length as a 2D histogram

    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid or not
    :param str xlabel: accepted but not used; the x label is always
        "Read length"
    :param str ylabel: accepted but not used; the y label is always "GC %"
    :param cmap: colormap forwarded to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # the dataframe is built on first use
    if self._df is None:
        self._get_df()
    frame = self._df

    # the means are computed on the raw columns, before NaN rows are dropped
    title = "GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" % (
        np.mean(frame.loc[:, 'read_length']),
        np.mean(frame.loc[:, 'GC_content']))

    if hold is False:
        pylab.clf()

    points = frame.loc[:, ['read_length', 'GC_content']].dropna()
    biokit.viz.hist2d.Hist2D(points).plot(
        bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def hist_concordance(self, bins=100, fontsize=16):
    """Plot the histogram of the concordance values

    The concordance is read from ``self._concordance``; if that attribute
    does not exist yet, ``self._set_concordance()`` is called first to
    compute it.

    Notes from the original author on the metric:

    formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param int bins: binning for the histogram
    :param int fontsize: fontsize of the x label
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # only a missing attribute means "not computed yet"; anything else
        # (including KeyboardInterrupt) must propagate
        self._set_concordance()
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    # mean as a solid red line, median as a dashed red line
    mu = np.mean(concordance)
    median = np.median(concordance)
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def hist_concordance(self, method, bins=100, fontsize=16):
    """Plot the histogram of the concordance values

    The concordance is read from ``self._concordance``; if that attribute
    does not exist yet, ``self._set_concordance(method)`` is called first to
    compute it.

    Notes from the original author on the metric:

    formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

    For BWA and BLASR, the get_cigar_stats are different !!!
    BWA for instance has no X stored while Pacbio forbids the use of the M
    (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

    Subread Accuracy: The post-mapping accuracy of the basecalls.
    Formula: [1 - (errors/subread length)], where errors = number of
    deletions + insertions + substitutions.

    :param method: forwarded to ``_set_concordance``; only used when the
        concordance has not been computed yet.
    :param int bins: binning for the histogram
    :param int fontsize: fontsize of the x label
    """
    try:
        concordance = self._concordance
    except AttributeError:
        # only a missing attribute means "not computed yet"; anything else
        # (including KeyboardInterrupt) must propagate
        self._set_concordance(method)
        concordance = self._concordance

    pylab.hist(concordance, bins=bins)
    pylab.grid()

    # mean as a solid red line, median as a dashed red line
    mu = np.mean(concordance)
    median = np.median(concordance)
    pylab.axvline(mu, color='r', alpha=0.5)
    pylab.axvline(median, color='r', alpha=0.5, ls="--")
    pylab.xlabel("concordance", fontsize=fontsize)
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
                     grid=True, xlabel="Read Length", ylabel="#", label="",
                     title=None, logy=False, ec="k", hist_kwargs={}):
    """Plot histogram Read length

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean length is built
    :param bool logy: forwarded as the ``log`` argument of the plot
    :param ec: edge color of the bars
    :param dict hist_kwargs: extra keyword arguments forwarded to the plot

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    lengths = self.df.loc[:, 'read_length']
    mean_len = np.mean(lengths)

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    if hold is False:
        pylab.clf()

    legend = "%s, mean : %.0f, N : %d" % (label, mean_len, len(self))

    hist = HistCumSum(lengths, fontsize=fontsize, grid=grid)
    hist.title = title
    hist.xlabel = xlabel
    hist.ylabel = ylabel
    hist.plot(bins=bins, alpha=alpha, edgecolor=ec, label=legend,
              log=logy, **hist_kwargs)

    # both axes start at the origin
    axes = pylab.gca()
    axes.set_ylim(bottom=0)
    axes.set_xlim(left=0)
def summary(self):
    """Return a dictionary summarising the reads.

    The dictionary holds a copy of ``self.stats``, the mean of the
    ``GC_content`` column and 100-bin histograms of the ``GC_content`` and
    ``read_length`` columns of ``self._df``. Each histogram is stored as
    plain lists: "Y" for the counts and "X" for the bin edges.
    """
    gc_values = self._df.loc[:, 'GC_content']
    gc_counts, gc_edges = np.histogram(gc_values, bins=100)
    len_counts, len_edges = np.histogram(self._df['read_length'], 100)

    return {
        "name": "sequana_summary_pacbio_qc",
        "read_stats": self.stats.copy(),
        "mean_gc": float(np.mean(gc_values)),
        "hist_gc": {"Y": gc_counts.tolist(), "X": gc_edges.tolist()},
        "hist_read_length": {"Y": len_counts.tolist(),
                             "X": len_edges.tolist()},
    }
def plot_GC_read_len(self, alpha=0.07, hold=False, fontsize=12, grid=True,
                     xlabel="GC %", ylabel="#"):
    """Plot GC content versus read length as a 2D histogram

    :param float alpha: not used; kept for backward compatibility
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: for x and y labels and title
    :param bool grid: add a grid or not
    :param str xlabel: accepted but not used; the x label is always
        "Read length"
    :param str ylabel: accepted but not used; the y label is always "GC %"
    """
    # the dataframe is built on first use
    if self._df is None:
        self._get_df()

    mean_len = np.mean(self._df.loc[:, 'read_length'])
    mean_GC = np.mean(self._df.loc[:, 'GC_content'])

    if hold is False:
        pylab.clf()

    data = self._df.loc[:, ['read_length', 'GC_content']]
    h = hist2d.Hist2D(data)
    # "norm" (not "nnorm") is the keyword used by the other callers of
    # Hist2D.plot for the log color scale
    h.plot(bins=[40, 40], contour=False, norm='log', Nlevels=6)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    # the literal percent sign must be escaped (%%), otherwise the
    # %-formatting below raises ValueError
    pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
                % (mean_len, mean_GC), fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def summary(self):
    """Build the summary dictionary of the reads.

    Contents: a copy of ``self.stats`` ("read_stats"), the mean GC content
    ("mean_gc") and two 100-bin histograms computed from ``self._df``
    ("hist_gc" and "hist_read_length"), each stored as ``{"Y": counts,
    "X": bin edges}`` with plain lists.
    """

    def _histogram(values):
        # numpy arrays are converted to plain lists
        counts, edges = np.histogram(values, 100)
        return {"Y": counts.tolist(), "X": edges.tolist()}

    gc_column = self._df.loc[:, 'GC_content']

    report = {"name": "sequana_summary_pacbio_qc"}
    report["read_stats"] = self.stats.copy()
    report["mean_gc"] = float(np.mean(gc_column))
    report["hist_gc"] = _histogram(gc_column)
    report["hist_read_length"] = _histogram(self._df['read_length'])
    return report
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
            grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean GC is built

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(self.df.loc[:, 'GC_content'], bins=bins, alpha=alpha,
               label=label + ", mean : " + str(round(mean_GC, 2))
               + ", N : " + str(len(self)))
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])

    # tight_layout is best effort: a failure must not break the plot, but
    # KeyboardInterrupt/SystemExit must still propagate
    try:
        pylab.tight_layout()
    except Exception:
        pass
def hist_len(self, bins=50, alpha=0.5, hold=False, fontsize=12,
             grid=True, xlabel="Read Length", ylabel="#", label="",
             title=None):
    """Plot histogram Read length

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend); the mean
        length and ``self._N`` are appended to it
    :param str title: if None, a title reporting the mean length is built

    .. plot::
        :include-source:

        from sequana.pacbio import BAMPacbio
        from sequana import sequana_data
        b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
        b.hist_len()

    """
    # the dataframe is built on first use
    if self._df is None:
        self._get_df()

    lengths = self._df.loc[:, 'read_length']
    mean_len = np.mean(lengths)

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    # histogram of the read lengths
    if hold is False:
        pylab.clf()
    legend = "%s, mean : %.0f, N : %d" % (label, mean_len, self._N)
    pylab.hist(lengths, bins=bins, alpha=alpha, label=legend)

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                     grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
    """Plot GC content versus read length as a 2D histogram

    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: for x and y labels and title
    :param bins: a integer or tuple of 2 integers to specify
        the binning of the x and y 2D histogram.
    :param bool grid: add a grid or not
    :param str xlabel: accepted but not used; the x label is always
        "Read length"
    :param str ylabel: accepted but not used; the y label is always "GC %"
    :param cmap: colormap forwarded to the 2D histogram

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.plot_GC_read_len(bins=[10, 10])

    """
    # the means are computed on the raw columns, before NaN rows are dropped
    mean_len = np.mean(self.df.loc[:, 'read_length'])
    mean_GC = np.mean(self.df.loc[:, 'GC_content'])
    title = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f"
             % (mean_len, mean_GC))

    if hold is False:
        pylab.clf()

    columns = ['read_length', 'GC_content']
    points = self.df.loc[:, columns].dropna()
    histogram = biokit.viz.hist2d.Hist2D(points)
    histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                   cmap=cmap)

    pylab.xlabel("Read length", fontsize=fontsize)
    pylab.ylabel("GC %", fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    pylab.ylim([0, 100])
    if grid is True:
        pylab.grid(True)
def hist_GC(self, bins=50, hold=False, fontsize=12,
            grid=True, xlabel="GC %", ylabel="#"):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    """
    # the dataframe is built on first use
    if self._df is None:
        self._get_df()

    gc_values = self._df.loc[:, 'GC_content']
    title = "GC %% \n Mean GC : %.2f" % (np.mean(gc_values))

    # histogram GC percent
    if hold is False:
        pylab.clf()
    pylab.hist(gc_values, bins=bins)

    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
            grid=True, xlabel="GC %", ylabel="#", label="", title=None):
    """Plot histogram GC content

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize: fontsize of the x and y labels and title.
    :param bool grid: add grid or not
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean GC is built

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_GC()

    """
    gc_values = self.df.loc[:, 'GC_content']
    mean_GC = np.mean(gc_values)

    # set title if needed
    if title is None:
        title = "GC %% \n Mean GC : %.2f" % (mean_GC)

    # histogram GC percent
    if hold is False:
        pylab.clf()
    legend = (label + ", mean : " + str(round(mean_GC, 2))
              + ", N : " + str(len(self)))
    pylab.hist(gc_values, bins=bins, alpha=alpha, label=legend)
    pylab.xlabel(xlabel, fontsize=fontsize)
    pylab.ylabel(ylabel, fontsize=fontsize)
    pylab.title(title, fontsize=fontsize)
    if grid is True:
        pylab.grid(True)
    pylab.xlim([0, 100])

    # tight_layout is best effort: a failure must not break the plot, but
    # KeyboardInterrupt/SystemExit must still propagate
    try:
        pylab.tight_layout()
    except Exception:
        pass
def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
                     grid=True, xlabel="Read Length", ylabel="#", label="",
                     title=None, logy=False, ec="k", hist_kwargs={}):
    """Plot histogram Read length

    :param int bins: binning for the histogram
    :param float alpha: transparency of the histograms
    :param bool hold: if False (default), the current figure is cleared
        before plotting.
    :param int fontsize:
    :param bool grid:
    :param str xlabel:
    :param str ylabel:
    :param str label: label of the histogram (for the legend)
    :param str title: if None, a title reporting the mean length is built
    :param bool logy: forwarded as the ``log`` argument of the plot
    :param ec: edge color of the bars
    :param dict hist_kwargs: extra keyword arguments forwarded to the plot

    .. plot::
        :include-source:

        from sequana.pacbio import PacbioSubreads
        from sequana import sequana_data
        b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
        b.hist_read_length()

    """
    read_lengths = self.df.loc[:, 'read_length']
    mean_len = np.mean(read_lengths)

    # set title if not provided
    if title is None:
        title = "Read length \n Mean length : %.2f" % (mean_len)

    if hold is False:
        pylab.clf()

    hist = HistCumSum(read_lengths, fontsize=fontsize, grid=grid)
    hist.title, hist.xlabel, hist.ylabel = title, xlabel, ylabel

    plot_options = dict(
        bins=bins,
        alpha=alpha,
        edgecolor=ec,
        label="%s, mean : %.0f, N : %d" % (label, mean_len, len(self)),
        log=logy,
    )
    hist.plot(**plot_options, **hist_kwargs)

    # both axes start at the origin
    pylab.gca().set_ylim(bottom=0)
    pylab.gca().set_xlim(left=0)
def get_stats(self):
    """Return the main statistics as a one-row DataFrame.

    The columns are, in order: ``n_reads``, ``A``, ``C``, ``G``, ``T``,
    ``N``, ``total bases`` (all cast to int), then ``GC content``,
    ``average read length`` and ``mean quality``.
    """
    # FIXME the information should all be computed in _get_info
    # !!! sequences is limited to 500,000 if max_sample set to 500,000
    # full stats must be computed in run_info() method
    # so do not use .sequences here
    stats = self.stats.copy()
    stats.update({
        'GC content': self.gc_content,
        "n_reads": self.N,
        'total bases': self.stats['total_bp'],
        'mean quality': np.mean(self.mean_qualities),
        'average read length': self.stats['mean_length'],
        # the two entries below are computed but not part of the returned
        # columns
        'min read length': self.minimum,
        'max read length': self.maximum,
    })

    int_columns = ['n_reads', 'A', 'C', 'G', 'T', 'N', 'total bases']
    float_columns = ['GC content', 'average read length', 'mean quality']

    # use DataFrame instead of Series to mix types (int/float)
    ts = pd.DataFrame([stats])
    ts[int_columns] = ts[int_columns].astype(int)
    return ts[int_columns + float_columns]
def _get_mean_quality(self):
    """Return the arithmetic mean of ``self.quality`` (via ``numpy.mean``)."""
    average = np.mean(self.quality)
    return average
def _get_info(self):
    """Populates the data structures for plotting.

    Will be called on request. The file ``self.filename`` is read once and
    the following attributes are set:

    * ``lengths``: array of ``self.N`` read lengths,
    * ``gc_list`` and ``gc_content``: GC percentage of each read and their
      mean,
    * ``qualities``, ``mean_qualities`` and ``sequences``: per-read data,
      stored for the first ``self.max_sample`` reads only,
    * ``minimum`` and ``maximum``: shortest and longest read length,
    * ``stats``: dictionary with the A/C/G/T/N counts, ``mean_length``,
      ``total_bp`` and ``mean_quality`` (computed over all reads).
    """
    stats = {"A": 0, "C": 0, "G": 0, "T": 0, "N": 0}
    # these three entries are never filled in here (the per-read data is
    # stored on self instead); they are kept so the keys still exist
    stats["qualities"] = []
    stats["mean_qualities"] = []
    stats["mean_length"] = 0
    stats["sequences"] = []

    # FIXME this self.N takes time in the constructor
    # do we need it ?
    # NOTE(review): np.empty is not initialised, so self.N is assumed to be
    # exactly the number of records in the file -- confirm against caller
    self.lengths = np.empty(self.N)
    self.gc_list = []
    total_length = 0
    C = defaultdict(int)  # quality character -> number of occurrences
    if self.verbose:
        pb = Progress(self.N)

    sequences = []
    mean_qualities = []
    qualities = []

    # could use multiprocessing
    # FastxFile has shown some errors while handling gzip files
    # created with zlib (e.g. from atropos). This is now replaced
    # by the Atropos FastqReader for now.
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            # NOTE(review): a zero-length read makes the two divisions by
            # N below raise ZeroDivisionError
            N = len(record.sequence)
            self.lengths[i] = N

            # we cannot store all qualities and sequences reads, so
            # just max_sample are stored:
            if i < self.max_sample:
                # quality value = ASCII code minus 33
                quality = [ord(x) - 33 for x in record.qualities]
                mean_qualities.append(sum(quality) / N)
                qualities.append(quality)
                sequences.append(record.sequence)

            # store count of all qualities
            for k in record.qualities:
                C[k] += 1

            GG = record.sequence.count('G')
            CC = record.sequence.count('C')
            self.gc_list.append((GG + CC) / float(N) * 100)

            # not using a counter, or loop speed up the code
            stats["A"] += record.sequence.count("A")
            stats["C"] += CC
            stats["G"] += GG
            stats["T"] += record.sequence.count("T")
            stats["N"] += record.sequence.count("N")

            total_length += N

            if self.verbose:
                pb.animate(i + 1)

    # other data
    self.qualities = qualities
    self.mean_qualities = mean_qualities
    self.minimum = int(self.lengths.min())
    self.maximum = int(self.lengths.max())
    self.sequences = sequences
    self.gc_content = np.mean(self.gc_list)

    stats['mean_length'] = total_length / float(self.N)
    # only A, C, G, T and N are counted in the total
    stats['total_bp'] = (stats['A'] + stats['C'] + stats['G'] + stats["T"]
                         + stats['N'])
    stats['mean_quality'] = (
        sum((ord(k) - 33) * v for k, v in C.items()) / stats['total_bp'])

    self.stats = stats