예제 #1
0
 def _merge_row(self, df, start, stop):
     """Collapse the rows labelled ``start`` to ``stop`` into one record.

     Both bounds are row labels and both are included in the selection
     (``.loc`` slicing).

     :param df: dataframe providing the ``chr``, ``cov``, ``rm`` and
         ``zscore`` columns.
     :param start: label of the first row of the selection.
     :param stop: label of the last row of the selection (inclusive).
     :return: dictionary holding the chromosome name found at ``start``,
         the ``start`` and ``end`` (``stop + 1``) positions, the ``size``
         of the selection, the mean of the ``cov``, ``rm`` and ``zscore``
         columns, the maximum of ``cov`` and the most extreme z-score.
     """
     coverage = df["cov"].loc[start:stop]
     zscores = df["zscore"].loc[start:stop]
     mean_zscore = np.mean(zscores)
     # Report the extreme value lying on the same side as the mean z-score.
     extreme_zscore = zscores.max() if mean_zscore >= 0 else zscores.min()
     return {
         "chr": df["chr"][start],
         "start": start,
         "end": stop + 1,
         "size": stop - start + 1,
         "mean_cov": np.mean(coverage),
         "mean_rm": np.mean(df["rm"].loc[start:stop]),
         "mean_zscore": mean_zscore,
         "max_zscore": extreme_zscore,
         "max_cov": np.max(coverage),
     }
예제 #2
0
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[60, 60],
                         grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Draw a 2D histogram of GC content against read length.

        The title of the figure reports the mean read length and the mean
        GC content.

        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: font size of the axis labels and of the title.
        :param bins: an integer or a pair of integers setting the binning
            of the x and y axes of the 2D histogram.
        :param bool grid: when True, a grid is added to the plot.
        :param str xlabel: currently not used; the x label is always
            "Read length".
        :param str ylabel: currently not used; the y label is always
            "GC %".
        :param str cmap: colormap handed over to the 2D histogram.

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        # The dataframe is built on first use.
        if self._df is None:
            self._get_df()
        frame = self._df
        mean_len = np.mean(frame.loc[:, 'read_length'])
        mean_GC = np.mean(frame.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()

        # Rows with a missing length or GC value are left out of the plot.
        pairs = frame.loc[:, ['read_length', 'GC_content']].dropna()
        histogram = biokit.viz.hist2d.Hist2D(pairs)
        histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                       cmap=cmap)

        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        heading = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                   (mean_len, mean_GC))
        pylab.title(heading, fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
예제 #3
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_concordance(self,  bins=100, fontsize=16):
        """Plot the histogram of the concordance values.

        The mean is shown as a solid red vertical line and the median as a
        dashed red vertical line. The concordance is computed with
        ``_set_concordance`` the first time it is needed.

            formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while Pacbio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: The post-mapping accuracy of the basecalls.
        Formula: [1 - (errors/subread length)], where errors = number of deletions +
        insertions + substitutions.

        :param int bins: binning of the histogram.
        :param int fontsize: font size of the x label.
        """
        # Only a missing attribute means "not computed yet"; any other error
        # (including KeyboardInterrupt) must not be swallowed here.
        try:
            concordance = self._concordance
        except AttributeError:
            self._set_concordance()
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
예제 #4
0
    def hist_concordance(self, method, bins=100, fontsize=16):
        """Plot the histogram of the concordance values.

        The mean is shown as a solid red vertical line and the median as a
        dashed red vertical line.

            formula : 1 - (in + del + mismatch / (in + del + mismatch + match) )

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while Pacbio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: The post-mapping accuracy of the basecalls.
        Formula: [1 - (errors/subread length)], where errors = number of deletions +
        insertions + substitutions.

        :param method: forwarded to ``_set_concordance`` when the
            concordance has not been computed yet; ignored otherwise.
        :param int bins: binning of the histogram.
        :param int fontsize: font size of the x label.
        """
        # Only a missing attribute means "not computed yet"; any other error
        # (including KeyboardInterrupt) must not be swallowed here.
        try:
            concordance = self._concordance
        except AttributeError:
            self._set_concordance(method)
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
예제 #5
0
파일: pacbio.py 프로젝트: wenliangz/sequana
    def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
                         grid=True, xlabel="Read Length", ylabel="#",
                         label="", title=None, logy=False, ec="k",
                         hist_kwargs={}):
        """Plot the histogram of the read lengths.

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histogram
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: handed over to the histogram object
        :param bool grid: handed over to the histogram object
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: prefix of the legend entry, which also reports
            the mean length and the number of reads
        :param str title: title of the figure; by default, a title
            reporting the mean read length
        :param bool logy: passed as the ``log`` argument of the histogram
            plot
        :param ec: passed as the ``edgecolor`` argument of the histogram
            plot
        :param dict hist_kwargs: extra keyword arguments forwarded to the
            histogram plot

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_read_length()

        """
        mean_len = np.mean(self.df.loc[:, 'read_length'])

        # default title
        if title is None:
            title = "Read length  \n Mean length : %.2f" % (mean_len)

        if hold is False:
            pylab.clf()

        cumsum_hist = HistCumSum(self.df.loc[:, 'read_length'],
                                 fontsize=fontsize, grid=grid)
        cumsum_hist.title = title
        cumsum_hist.xlabel = xlabel
        cumsum_hist.ylabel = ylabel

        legend = "%s, mean : %.0f, N : %d" % (label, mean_len, len(self))
        cumsum_hist.plot(bins=bins, alpha=alpha, edgecolor=ec, label=legend,
                         log=logy, **hist_kwargs)

        # both axes start at zero
        axes = pylab.gca()
        axes.set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
예제 #6
0
파일: pacbio.py 프로젝트: sequana/sequana
 def summary(self):
     """Return a dictionary summarising the reads.

     The dictionary contains a copy of ``self.stats``, the mean GC
     content and two 100-bin histograms (GC content and read length),
     each stored as the counts (``Y``) and the bin edges (``X``).
     """
     gc_values = self._df.loc[:,'GC_content']
     gc_counts, gc_edges = np.histogram(gc_values, bins=100)
     len_counts, len_edges = np.histogram(self._df['read_length'], 100)
     return {
         "name": "sequana_summary_pacbio_qc",
         "read_stats": self.stats.copy(),
         "mean_gc": float(np.mean(gc_values)),
         'hist_gc': {"Y": gc_counts.tolist(), "X": gc_edges.tolist()},
         'hist_read_length': {"Y": len_counts.tolist(),
                              "X": len_edges.tolist()},
     }
예제 #7
0
    def plot_GC_read_len(self, alpha=0.07, hold=False, fontsize=12,
                grid=True,xlabel="GC %",ylabel="#"):
        """Plot GC content versus read length as a 2D histogram.

        The title of the figure reports the mean read length and the mean
        GC content.

        :param float alpha: not used (kept for backward compatibility).
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: font size of the x and y labels.
        :param bool grid: not used by this method.
        :param str xlabel: not used; the x label is always "Read length".
        :param str ylabel: not used; the y label is always "GC %".
        """
        # The dataframe is built on first use.
        if self._df is None:
            self._get_df()
        mean_len = np.mean(self._df.loc[:, 'read_length'])
        mean_GC = np.mean(self._df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()
        data = self._df.loc[:, ['read_length', 'GC_content']]
        h = hist2d.Hist2D(data)
        # 'norm' (not 'nnorm') is the keyword used by the other
        # Hist2D.plot calls for the logarithmic colour scale.
        h.plot(bins=[40, 40], contour=False, norm='log', Nlevels=6)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        # A literal percent sign must be escaped as "%%" in a %-formatted
        # string; a lone "% v" raises ValueError (unsupported format
        # character).
        pylab.title("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                    (mean_len, mean_GC))
예제 #8
0
 def summary(self):
     """Build the summary dictionary of the reads.

     :return: dictionary with the entries ``name``, ``read_stats`` (a
         copy of ``self.stats``), ``mean_gc`` and the ``hist_gc`` and
         ``hist_read_length`` histograms. Each histogram has 100 bins and
         is stored as counts (``Y``) and bin edges (``X``).
     """
     report = {"name": "sequana_summary_pacbio_qc",
               "read_stats": self.stats.copy()}
     gc_content = self._df.loc[:, 'GC_content']
     report["mean_gc"] = float(np.mean(gc_content))

     # Same binning for both histograms; the lists are plain Python types.
     histograms = (('hist_gc', gc_content),
                   ('hist_read_length', self._df['read_length']))
     for key, values in histograms:
         counts, edges = np.histogram(values, bins=100)
         report[key] = {"Y": counts.tolist(), "X": edges.tolist()}
     return report
예제 #9
0
파일: pacbio.py 프로젝트: wenliangz/sequana
    def hist_GC(self,
                bins=50,
                alpha=0.5,
                hold=False,
                fontsize=12,
                grid=True,
                xlabel="GC %",
                ylabel="#",
                label="",
                title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: label of the histogram (for the legend); the
            mean GC and the number of reads are appended to it.
        :param str title: title of the figure; by default, a title
            reporting the mean GC content.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC = np.mean(self.df.loc[:, 'GC_content'])

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.loc[:, 'GC_content'],
                   bins=bins,
                   alpha=alpha,
                   label=label + ", mean : " + str(round(mean_GC, 2)) +
                   ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        # Layout adjustment is best effort: a failure must not prevent the
        # plot, but KeyboardInterrupt/SystemExit must still propagate, hence
        # no bare ``except``.
        try:
            pylab.tight_layout()
        except Exception:
            pass
예제 #10
0
    def hist_len(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                 grid=True, xlabel="Read Length", ylabel="#", label="",
                 title=None):
        """Plot the histogram of the read lengths.

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histogram
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: font size of the x and y labels and title
        :param bool grid: when True, a grid is added to the plot
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: prefix of the legend entry, which also reports
            the mean length and the number of reads
        :param str title: title of the figure; by default, a title
            reporting the mean read length

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_len()

        """
        # The dataframe is built on first use.
        if self._df is None:
            self._get_df()
        lengths = self._df.loc[:, 'read_length']
        mean_len = np.mean(lengths)

        # default title
        if title is None:
            title = "Read length  \n Mean length : %.2f" % (mean_len)

        # histogram of the read lengths
        if hold is False:
            pylab.clf()
        legend = "%s, mean : %.0f, N : %d" % (label, mean_len, self._N)
        pylab.hist(lengths, bins=bins, alpha=alpha, label=legend)

        # decorations
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
예제 #11
0
파일: pacbio.py 프로젝트: sequana/sequana
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Draw a 2D histogram of GC content against read length.

        The title of the figure reports the mean read length and the mean
        GC content.

        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: for x and y labels and title
        :param bins: an integer or a pair of integers to specify
            the binning of the x and y axes of the 2D histogram.
        :param bool grid: when True, a grid is added to the plot.
        :param str xlabel: currently not used; the x label is always
            "Read length".
        :param str ylabel: currently not used; the y label is always
            "GC %".
        :param str cmap: colormap handed over to the 2D histogram.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        mean_len = np.mean(self.df.loc[:, 'read_length'])
        mean_GC = np.mean(self.df.loc[:, 'GC_content'])

        if hold is False:
            pylab.clf()

        # Rows with a missing length or GC value are left out of the plot.
        columns = ['read_length', 'GC_content']
        pairs = self.df.loc[:, columns].dropna()
        histogram = biokit.viz.hist2d.Hist2D(pairs)
        histogram.plot(bins=bins, contour=False, norm='log', Nlevels=6,
                       cmap=cmap)

        heading = ("GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" %
                   (mean_len, mean_GC))
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("GC %", fontsize=fontsize)
        pylab.title(heading, fontsize=fontsize)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
예제 #12
0
    def hist_GC(self, bins=50, hold=False, fontsize=12,
                grid=True,xlabel="GC %",ylabel="#"):
        """Plot the histogram of the GC content.

        The title of the figure reports the mean GC content.

        :param int bins: binning for the histogram
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: font size of the x and y labels and title
        :param bool grid: when True, a grid is added to the plot
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        """
        # The dataframe is built on first use.
        if self._df is None:
            self._get_df()
        gc_values = self._df.loc[:, 'GC_content']
        mean_GC = np.mean(gc_values)

        if hold is False:
            pylab.clf()
        pylab.hist(gc_values, bins=bins)

        heading = "GC %%  \n Mean GC : %.2f" % (mean_GC)
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(heading, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
예제 #13
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: label of the histogram (for the legend); the
            mean GC and the number of reads are appended to it.
        :param str title: title of the figure; by default, a title
            reporting the mean GC content.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC = np.mean(self.df.loc[:, 'GC_content'])

        # set title if needed
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" % (mean_GC)

        # histogram GC percent
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.loc[:, 'GC_content'], bins=bins,
            alpha=alpha, label=label + ", mean : " + str(round(mean_GC, 2))
            + ", N : " + str(len(self)))
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        # Layout adjustment is best effort: a failure must not prevent the
        # plot, but KeyboardInterrupt/SystemExit must still propagate, hence
        # no bare ``except``.
        try:
            pylab.tight_layout()
        except Exception:
            pass
예제 #14
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_read_length(self, bins=80, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="Read Length", ylabel="#", label="",
                title=None, logy=False,  ec="k", hist_kwargs={}):
        """Plot the histogram of the read lengths.

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histogram
        :param bool hold: when False, the current figure is cleared before
            plotting.
        :param int fontsize: handed over to the histogram object
        :param bool grid: handed over to the histogram object
        :param str xlabel: label of the x axis
        :param str ylabel: label of the y axis
        :param str label: prefix of the legend entry, which also reports
            the mean length and the number of reads
        :param str title: title of the figure; by default, a title
            reporting the mean read length
        :param bool logy: passed as the ``log`` argument of the histogram
            plot
        :param ec: passed as the ``edgecolor`` argument of the histogram
            plot
        :param dict hist_kwargs: extra keyword arguments forwarded to the
            histogram plot

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_read_length()

        """
        mean_len = np.mean(self.df.loc[:, 'read_length'])

        # Without an explicit title, report the mean length instead.
        if title is None:
            title = "Read length  \n Mean length : %.2f" % (mean_len)

        if hold is False:
            pylab.clf()

        plotter = HistCumSum(self.df.loc[:, 'read_length'],
                             fontsize=fontsize, grid=grid)
        plotter.title = title
        plotter.xlabel = xlabel
        plotter.ylabel = ylabel

        plot_options = dict(bins=bins, alpha=alpha, edgecolor=ec,
            label="%s, mean : %.0f, N : %d" % (label, mean_len, len(self)),
            log=logy)
        plotter.plot(**plot_options, **hist_kwargs)

        # Both axes start at zero.
        pylab.gca().set_ylim(bottom=0)
        pylab.gca().set_xlim(left=0)
예제 #15
0
파일: fastq.py 프로젝트: wenliangz/sequana
    def get_stats(self):
        """Return the main statistics as a one-row dataframe.

        The row gathers the number of reads, the A, C, G, T and N counts,
        the total number of bases (all cast to integers), followed by the
        GC content, the average read length and the mean quality.
        """
        # FIXME the information should all be computed in _get_info

        # !!! sequences is limited to 500,000 if max_sample set to 500,000
        # full stats must be computed in run_info() method
        # so do not use .sequences here
        row = self.stats.copy()
        row.update({
            'GC content': self.gc_content,
            "n_reads": self.N,
            'total bases': self.stats['total_bp'],
            'mean quality': np.mean(self.mean_qualities),
            'average read length': self.stats['mean_length'],
            'min read length': self.minimum,
            'max read length': self.maximum,
        })

        # A DataFrame (rather than a Series) allows integer and float
        # columns to coexist.
        frame = pd.DataFrame([row])
        integer_columns = ['n_reads', 'A', 'C', 'G', 'T', 'N','total bases' ]
        float_columns = ['GC content', 'average read length', 'mean quality']
        frame[integer_columns] = frame[integer_columns].astype(int)
        return frame[integer_columns + float_columns]
예제 #16
0
파일: phred.py 프로젝트: sequana/sequana
 def _get_mean_quality(self):
     """Return the arithmetic mean of ``self.quality``."""
     scores = self.quality
     return np.mean(scores)
예제 #17
0
 def _get_mean_quality(self):
     """Compute the average of the values stored in ``self.quality``."""
     average = np.mean(self.quality)
     return average
예제 #18
0
파일: fastq.py 프로젝트: wenliangz/sequana
    def _get_info(self):
        """Populates the data structures for plotting.

        Will be called on request.

        Reads every record of ``self.filename`` once and sets the following
        attributes: ``lengths``, ``gc_list``, ``qualities``,
        ``mean_qualities``, ``minimum``, ``maximum``, ``sequences``,
        ``gc_content`` and ``stats``. Qualities and sequences are kept for
        the first ``self.max_sample`` records only; the base counts, lengths,
        GC percentages and quality-character counts cover all records.
        """

        stats = {"A":0, "C":0, "G":0, "T":0, "N":0}
        # NOTE(review): the three lists below are never filled in this
        # method; they end up in self.stats as empty lists.
        stats["qualities"] = []
        stats["mean_qualities"] = []
        stats["mean_length"] = 0
        stats["sequences"] = []

        # NOTE(review): these two locals are never read afterwards; the
        # minimum/maximum attributes are derived from self.lengths below.
        minimum = 1e6
        maximum = 0
        # FIXME this self.N takes time in the constructor
        # do we need it ?
        # np.empty does not initialise its content: entries are filled by
        # record index in the loop below.
        # NOTE(review): if the reader yields fewer than self.N records, the
        # remaining entries stay uninitialised and feed min()/max() — confirm.
        self.lengths = np.empty(self.N)
        self.gc_list = []
        total_length = 0
        # number of occurrences of each quality character over all records
        C = defaultdict(int)
        if self.verbose:
            pb = Progress(self.N)

        sequences = []
        mean_qualities = []
        qualities = []
        # could use multiprocessing
        # FastxFile has shown some errors while handling gzip files
        # created with zlib (e.g. from atropos). This is now replaced
        # by the Atropos FastqReader for now.
        #fastq = pysam.FastxFile(self.filename)

        with FastqReader(self.filename) as f:
            for i, record in enumerate(f):
                N = len(record.sequence)
                self.lengths[i] = N

                # we cannot store the qualities and sequences of all reads,
                # so only the first max_sample ones are stored:
                if i < self.max_sample:
                    # quality characters are converted with an offset of 33
                    quality = [ord(x) -33 for x in record.qualities]
                    mean_qualities.append(sum(quality) / N)
                    qualities.append(quality)
                    sequences.append(record.sequence)

                # store count of all qualities
                for k in record.qualities:
                    C[k] += 1

                GG = record.sequence.count('G') 
                CC = record.sequence.count('C')
                # GC percentage of this record
                # NOTE(review): a zero-length record (N == 0) would raise
                # ZeroDivisionError here and in the mean quality above.
                self.gc_list.append((GG+CC)/float(N)*100)

                # not using a counter, or loop speed up the code
                stats["A"] += record.sequence.count("A")
                stats["C"] += CC
                stats["G"] += GG
                stats["T"] += record.sequence.count("T")
                stats["N"] += record.sequence.count("N")

                total_length += len(record.sequence)

                if self.verbose:
                    pb.animate(i+1)

        # other data
        self.qualities = qualities
        self.mean_qualities = mean_qualities
        self.minimum = int(self.lengths.min())
        self.maximum = int(self.lengths.max())
        self.sequences = sequences
        # mean of the per-record GC percentages
        self.gc_content = np.mean(self.gc_list)
        stats['mean_length'] = total_length / float(self.N)
        # total_bp only counts the upper-case A, C, G, T and N characters
        stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
        # sum of (quality value * occurrences) over all quality characters,
        # divided by total_bp
        # NOTE(review): the numerator covers every quality character while
        # total_bp ignores bases other than A/C/G/T/N — confirm this is
        # intended.
        stats['mean_quality'] = sum([(ord(k) -33)*v for k,v in C.items()]) / stats['total_bp']

        self.stats = stats