예제 #1
0
파일: isoseq.py 프로젝트: sequana/sequana
    def hist_read_length_consensus_isoform(self, mode="all", bins=80, rwidth=0.8,
        align="left", fontsize=16, edgecolor="k", **kwargs):
        """Histogram of consensus isoform read lengths with a cumulative overlay.

        :param str mode: "all" pools the low- and high-quality reads, "lq"
            keeps the low-quality reads only; any other value selects the
            high-quality reads only.
        :param bins: forwarded to :func:`pylab.hist`
        :param float rwidth: forwarded to :func:`pylab.hist`
        :param str align: forwarded to :func:`pylab.hist`
        :param int fontsize: font size of the axis labels
        :param edgecolor: edge colour of the bars
        :param kwargs: any other keyword accepted by :func:`pylab.hist`

        A second y-axis (labelled "CDF") shows the total number of reads
        minus the cumulative sum of the bin counts.
        """
        pylab.clf()

        # Lengths are always computed for both read sets, whatever the mode
        lq_lengths = [len(record['sequence']) for record in self.lq_sequence]
        hq_lengths = [len(record['sequence']) for record in self.hq_sequence]

        lengths = hq_lengths
        if mode == "all":
            lengths = lq_lengths + hq_lengths
        elif mode == "lq":
            lengths = lq_lengths

        counts, edges, _ = pylab.hist(lengths, bins=bins, rwidth=rwidth,
                                      align=align, ec=edgecolor, **kwargs)
        main_ax = pylab.gca()
        main_ax.set_ylim(bottom=0)
        main_ax.set_xlim(left=0)
        pylab.xlabel("Read length", fontsize=fontsize)
        pylab.ylabel("Number of reads", fontsize=fontsize)
        pylab.grid()

        # Overlay on a twin axis: x positions are the left bin edges moved
        # back by half a bin width
        ax_twin = main_ax.twinx()
        half_bin = (edges[1] - edges[0]) / 2
        remaining = len(lengths) - pylab.cumsum(counts)
        ax_twin.plot(edges[0:-1] - half_bin, remaining, "k")
        ax_twin.set_ylim(bottom=0)
        pylab.ylabel("CDF", fontsize=fontsize)

        pylab.title("Read length of Consensus isoforms reads")
예제 #2
0
파일: fastq.py 프로젝트: sequana/sequana
    def imshow_qualities(self):
        """Show the mean quality per tile and per read position as an image.

        Quality records are grouped by tile, averaged column-wise within each
        tile, and the resulting rows (sorted by tile) are stored in
        :attr:`data_imqual` before being displayed.

        ::

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.imshow_qualities()
            from pylab import tight_layout; tight_layout()

        """
        tile_info = self._get_tile_info()

        # tile -> list of quality records belonging to that tile
        per_tile = defaultdict(list)
        for tile, quality in zip(tile_info['tiles'], self.qualities):
            per_tile[tile].append(quality)

        rows = []
        for tile in sorted(per_tile):
            rows.append(pd.DataFrame(per_tile[tile]).mean().values)
        self.data_imqual = rows

        from biokit.viz import Imshow
        image = Imshow(self.data_imqual)
        image.plot(xticks_on=False, yticks_on=False, origin='lower')
        pylab.title("Quality per tile", fontsize=self.fontsize)
        pylab.xlabel("Position in read (bp)")
        pylab.ylabel("tile number")
예제 #3
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_gc_content(self, fontsize=16, ec="k", bins=100):
        """Plot the GC content histogram together with a fitted normal density.

        :param int fontsize: font size of the x-axis label
        :param ec: edge colour of the bars (black contour by default)
        :param bins: a scalar is used as the number of evenly spaced bin
            edges between 0 and 100; an array-like is taken as the explicit
            bin edges (it is copied, never modified)

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam'))
            b.plot_gc_content()

        """
        data = self.get_gc_content()

        # Dispatch explicitly on scalar vs array-like instead of relying on
        # a bare except around np.linspace
        if np.ndim(bins) == 0:
            X = np.linspace(0, 100, bins)
        else:
            # np.array copies its input, so lists are accepted as well
            X = np.array(bins)

        pylab.hist(data, X, density=True, ec=ec)
        pylab.grid(True)
        mu = pylab.mean(data)
        sigma = pylab.std(data)

        # Evaluate the normal density on a fixed 100-point grid spanning the
        # histogram range
        X = pylab.linspace(X.min(), X.max(), 100)

        from sequana.misc import normpdf

        pylab.plot(X, normpdf(X, mu, sigma), lw=2, color="r", ls="--")
        pylab.xlabel("GC content", fontsize=fontsize)
예제 #4
0
파일: fastq.py 프로젝트: sequana/sequana
    def histogram_sequence_lengths(self, logy=True):
        """Bar plot of the number of sequences per sequence length.

        :param bool logy: if True, the bar heights are the log10 of the
            counts (drawn on a linear axis); otherwise the raw counts.
        :raises ValueError: if there is no sequence

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        """
        data = [len(x) for x in self.sequences]
        if not data:
            raise ValueError("No sequences found: cannot build the histogram")
        longest = max(data)

        # One unit-width bin [k, k+1) per length. The edges must reach
        # longest + 1: numpy closes the last bin on the right, so stopping at
        # ``longest`` would merge the two largest lengths into one bar.
        bary, barx = np.histogram(data, bins=range(longest + 2))

        # get rid of zeros to avoid warnings (log10 of zero)
        nonzero = bary != 0
        bx = barx[:-1][nonzero]
        by = bary[nonzero]
        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1, longest + 1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        # Only claim a log scale when the log transform was applied
        ylabel = "Count (log scale)" if logy else "Count"
        pylab.ylabel(ylabel, fontsize=self.fontsize)
예제 #5
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Bar plot of the number of reads carrying each flag of the BAM

        :param logy: the y-axis is logarithmic only when this is exactly True
        :param int fontsize: font size of the axis labels
        :param filename: if set, the figure is also saved to this file
        :return: the object returned by the pandas ``plot`` call

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        flag_counts = self.get_flags_as_df().sum()
        pylab.clf()

        plot_options = {"kind": 'bar', "grid": True}
        if logy is True:
            plot_options["logy"] = logy
        barplot = flag_counts.plot(**plot_options)

        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
예제 #6
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_concordance(self,  bins=100, fontsize=16):
        """Histogram of the concordance with its mean and median marked.

        formula : 1 - (in + del + mismatch) / (in + del + mismatch + match)

        For BWA and BLASR, the get_cigar_stats are different !!!
        BWA for instance has no X stored while Pacbio forbids the use of the M
        (CMATCH) tag. Instead, it uses X (CDIFF) and = (CEQUAL) characters.

        Subread Accuracy: The post-mapping accuracy of the basecalls.
        Formula: [1 - (errors/subread length)], where errors = number of deletions +
        insertions + substitutions.

        :param bins: forwarded to :func:`pylab.hist`
        :param int fontsize: font size of the x-axis label

        The mean is drawn as a solid red vertical line and the median as a
        dashed one.
        """
        # The concordance is computed lazily on first use. Only a missing
        # attribute triggers the computation; other errors propagate.
        try:
            concordance = self._concordance
        except AttributeError:
            self._set_concordance()
            concordance = self._concordance

        pylab.hist(concordance, bins=bins)
        pylab.grid()
        mu = np.mean(concordance)
        median = np.median(concordance)
        pylab.axvline(mu, color='r', alpha=0.5)
        pylab.axvline(median, color='r', alpha=0.5, ls="--")
        pylab.xlabel("concordance", fontsize=fontsize)
예제 #7
0
파일: isoseq.py 프로젝트: sequana/sequana
    def hist_average_quality(self, fontsize=16, bins=None):
        """Stacked bar plot of the average quality value (QV) per isoform.

        :param int fontsize: font size of the legend
        :param bins: bin edges given to :func:`numpy.histogram`; defaults to
            ``range(0, 94)``

        The QV of a read is the mean of ``ord(char) - 33`` over the characters
        of its decoded quality string. High-quality (HQ) counts are drawn
        first and low-quality (LQ) counts stacked on top. A twin axis shows
        the total count minus the cumulative count.
        """

        def _average_qv(read):
            # Offset of 33 converts each quality character into a score
            return pylab.mean([ord(char) - 33 for char in read['quality'].decode()])

        hq_qv = [_average_qv(read) for read in self.hq_sequence]
        lq_qv = [_average_qv(read) for read in self.lq_sequence]

        if bins is None:
            bins = range(0, 94)
        hq_counts, _ = np.histogram(hq_qv, bins=bins)
        lq_counts, edges = np.histogram(lq_qv, bins=bins)

        # Bars are positioned on the right edge of each bin
        right_edges = edges[1:]
        pylab.bar(right_edges, hq_counts, width=1, label="HQ")
        pylab.bar(right_edges, lq_counts, bottom=hq_counts, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        stacked = hq_counts + lq_counts
        total = np.sum(stacked)
        ax.plot(edges, [total] + list(total - np.cumsum(stacked)), "k")
예제 #8
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold: if False (default) the current figure is cleared
            first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid when True
        :param str xlabel:
        :param str ylabel:
        :param str title:
        :param clip_upper_SNR: upper bound of the x range; larger SNR values
            are clipped to this bound before being histogrammed

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot: show a placeholder image instead
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        letters = "ACGT"

        # Largest SNR over the four channels, capped at clip_upper_SNR
        maxSNR = 0
        for letter in letters:
            m = self._df.loc[:, "snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m
        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper was removed from pandas; clip(upper=...) is the
        # supported equivalent
        for letter in letters:
            values = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
            pylab.hist(values, alpha=alpha, label=letter, bins=bins)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
예제 #9
0
파일: phred.py 프로젝트: sequana/sequana
 def plot(self, fontsize=16):
     """Plot the quality as a function of the base position.

     :param int fontsize: font size of the axis labels

     The y range is widened by one unit on each side, with a lower bound
     never below zero.
     """
     legend_text = "offset: %s" % self.offset
     pylab.plot(self.quality, label=legend_text)
     pylab.xlabel('base position', fontsize=fontsize)
     pylab.ylabel('Quality per base', fontsize=fontsize)
     pylab.grid(True)
     # Setting ylim switches autoscale off, so when this method is called
     # several times autoscale must be re-enabled before reading the limits
     pylab.autoscale()
     low, high = pylab.ylim()
     pylab.ylim(max(0, low - 1), high + 1)
예제 #10
0
파일: pacbio.py 프로젝트: sequana/sequana
 def boxplot_mapq_concordance(self):
     """Box plots of the concordance for each mapping quality from 1 to 60.

     Only available when ``self.method`` is "bwa" (checked with an assert).
     """
     # method can only be bwa for now
     assert self.method == "bwa"
     records = self._get_data()
     frame = pd.DataFrame(records, columns=["mapq", "length", "concordance"])
     pylab.clf()

     # One concordance series per mapq value 1..60
     per_mapq = []
     for quality in range(1, 61):
         subset = frame[frame.mapq == quality]
         per_mapq.append(subset['concordance'])
     pylab.boxplot(per_mapq)

     pylab.xlabel("mapq")
     pylab.ylabel("concordance")
     pylab.grid()
     ticks = list(range(10, 61, 10))
     pylab.xticks(ticks, ticks)
예제 #11
0
파일: isoseq.py 프로젝트: sequana/sequana
 def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
     """Overlay the length histograms of mapped and unmapped isoforms.

     :param bins: bin edges forwarded to :func:`pylab.hist`. By default,
         100-wide bins starting at 0 and covering the largest
         ``reference_length`` are used.

     Rows whose ``reference_name`` equals -1 are treated as unmapped. The
     number of rows in each group is reported in the legend.
     """
     df = self.df
     if bins is None:
         # The maximum is a scalar, so it is converted with int() (len()
         # would fail). The extra 100 ensures the last edge is at or beyond
         # the maximum so that the longest isoform falls inside a bin.
         bins = range(0, int(df.reference_length.max()) + 100, 100)
     mapped = df[df.reference_name != -1]
     unmapped = df[df.reference_name == -1]
     pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
         label="mapped {}".format(len(mapped)), density=False)
     # NOTE(review): the unmapped histogram reads the ``reference`` column
     # whereas the mapped one reads ``reference_length`` -- confirm that
     # this column exists and holds lengths.
     pylab.hist(unmapped.reference, bins=bins, alpha=0.5,
         label="unmapped {}".format(len(unmapped)), density=False)
     pylab.xlabel("Isoform length")
     pylab.legend()
예제 #12
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_indel_dist(self, fontsize=16):
        """Plot indel count (+ ratio)

        :param int fontsize: font size of the axis labels
        :Return: list of insertions, deletions and ratio insertion/deletion
            counts, indexed by indel length (index 0 is length 0)
        :raises ValueError: if no insertion or no deletion is found

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_indel_dist()

        What you see on this figure is the presence of 10 insertions of length
        1, 1 insertion of length 2 and 3 deletions of length 1


        # Note that in samtools, several insertions or deletions in a single
        alignment are ignored and only the first one seems to be reported. For
        instance 10M1I10M1I stored only 1 insertion in its report; Same comment
        for deletions.

        .. todo:: speed up and handle long reads cases more efficiently by
            storing INDELS as histograms rather than lists
        """
        # Indels are computed lazily; only a missing attribute triggers it
        try:
            self.insertions
        except AttributeError:
            self._set_indels()

        if len(self.insertions) ==0 or len(self.deletions) == 0:
            raise ValueError("No deletions or insertions found")

        # Count each length once instead of calling list.count() for every
        # length (which rescans the whole list each time)
        deletion_counts = Counter(self.deletions)
        insertion_counts = Counter(self.insertions)

        N = max(max(deletion_counts), max(insertion_counts)) + 1
        # A Counter returns 0 for lengths that were never observed
        D = [deletion_counts[i] for i in range(N)]
        I = [insertion_counts[i] for i in range(N)]
        # Ratio is set to 0 where there is no deletion of that length
        R = [i/d if d!=0 else 0 for i,d in zip(I, D)]
        fig, ax = pylab.subplots()
        ax.plot(range(N), I, marker="x", label="Insertions")
        ax.plot(range(N), D, marker="x", label="Deletions")
        ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
        ax.set_yscale("symlog")
        pylab.ylim([1, pylab.ylim()[1]])
        pylab.legend()
        pylab.grid()
        from matplotlib.ticker import MaxNLocator
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        pylab.xlabel("Indel length", fontsize=fontsize)
        pylab.ylabel("Indel count", fontsize=fontsize)
        return I, D, R
예제 #13
0
파일: sequence.py 프로젝트: sequana/sequana
    def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length", ylabel="#"):
        """Overlay the length histograms of ORFs and CDSs (linear scale).

        :param float alpha: transparency of the histograms
        :param bins: forwarded to :func:`pylab.hist`
        :param str xlabel:
        :param str ylabel:

        The number of non-missing lengths of each kind is shown in the legend.
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        orf_lengths = self._ORF_pos["len_ORF"].dropna()
        cds_lengths = self._ORF_pos["len_CDS"].dropna()
        n_ORF = orf_lengths.shape[0]
        n_CDS = cds_lengths.shape[0]

        # plot for all ORF and CDS
        pylab.hist(orf_lengths, alpha=alpha, bins=bins,
                   label="ORF, N = " + str(n_ORF))
        pylab.hist(cds_lengths, alpha=alpha, bins=bins,
                   label="CDS, N = " + str(n_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        filter_info = (self._type_filter, self._threshold)
        pylab.title("Length of ORF and CDS (after filter %s > %d)" % filter_info)
예제 #14
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_coverage(self):
        """Plot the coverage values in the order they are stored.

        Please use :class:`GenomeCov` for more sophisticated
        tools to plot the genome coverage

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_coverage()

        """
        # The coverage is computed lazily; only a missing attribute triggers
        # the computation, other errors propagate.
        try:
            self.coverage
        except AttributeError:
            self._set_coverage()
        pylab.plot(self.coverage)
        # The coverage values are on the y-axis; x is the index in the array
        pylab.xlabel("Position")
        pylab.ylabel("Coverage")
예제 #15
0
파일: bamtools.py 프로젝트: sequana/sequana
    def hist_coverage(self, bins=100):
        """Histogram of the coverage values.

        :param bins: forwarded to :func:`pylab.hist`

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.hist_coverage()
        """
        # The coverage is computed lazily; only a missing attribute triggers
        # the computation, other errors propagate.
        try:
            self.coverage
        except AttributeError:
            self._set_coverage()
        pylab.hist(self.coverage, bins=bins)
        pylab.xlabel("Coverage")
        pylab.ylabel("Number of mapped bases")
        pylab.grid()
예제 #16
0
파일: sequence.py 프로젝트: sequana/sequana
    def hist_length_repeats(self, bins=20, alpha=0.5, hold=False,
            fontsize=12, grid=True, title="Repeat length",
            xlabel="Repeat length", ylabel="#", logy=True):
        """Histogram of the repeat lengths stored in ``list_len_repeats``

        :param bins: forwarded to :func:`pylab.hist`
        :param float alpha: transparency of the histogram
        :param bool hold: if False (default) the current figure is cleared
            first
        :param int fontsize: font size of the axis labels
        :param bool grid: add a grid when True
        :param str title:
        :param str xlabel:
        :param str ylabel:
        :param bool logy: switch the y-axis to a log scale
        """
        clear_first = hold is False
        if clear_first:
            pylab.clf()

        pylab.hist(self.list_len_repeats, alpha=alpha, bins=bins)

        pylab.title(title)
        label_style = {"fontsize": fontsize}
        pylab.xlabel(xlabel, **label_style)
        pylab.ylabel(ylabel, **label_style)

        if grid is True:
            pylab.grid(True)
        if logy:
            pylab.semilogy()
예제 #17
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_read_length(self):
        """Plot the number of occurrences of each aligned read length

        The legend reports the smallest and largest read lengths.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("test.bam"))
            b.plot_read_length()

        """
        lengths, occurrences = self._get_read_length()
        legend_text = "min length:{}; max length:{}".format(
            min(lengths), max(lengths))
        pylab.plot(lengths, occurrences, label=legend_text)
        pylab.grid()
        pylab.xlabel("Read length", fontsize=16)
        pylab.legend()
예제 #18
0
파일: fastq.py 프로젝트: sequana/sequana
    def histogram_gc_content(self):
        """Plot histogram of GC content (one value per sequence, in percent)

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_gc_content()

        """
        # Edges 0, 1, ..., 100: the final edge must be 100, otherwise values
        # above 99 % fall outside the histogram range and are not counted.
        pylab.hist(self.gc_list, bins=range(0, 101))
        pylab.grid()
        pylab.title("GC content distribution (per sequence)")
        pylab.xlabel(r"Mean GC content (%)", fontsize=self.fontsize)
        pylab.xlim([0,100])
예제 #19
0
파일: sequence.py 프로젝트: sequana/sequana
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        """Side-by-side bars of the number of ORFs and CDSs in each frame.

        :param float alpha: transparency of the bars
        :param bins: not used by this method
        :param str xlabel:
        :param str ylabel:
        :param float bar_width: width of each bar; ORF and CDS bars are
            shifted by half this width on either side of the frame value

        Frames are -3, -2, -1, 1, 2 and 3. The totals are given in the legend.
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]

        def _count_by_frame(column):
            # number of non-missing entries of ``column`` found in each frame
            return [
                self._ORF_pos[self._ORF_pos["frame"] == fr][column].dropna().shape[0]
                for fr in frames
            ]

        nb_res_ORF = _count_by_frame("len_ORF")
        nb_res_CDS = _count_by_frame("len_CDS")

        positions = np.array(frames)
        half_width = bar_width / 2
        pylab.bar(positions - half_width, nb_res_ORF, bar_width, alpha=alpha,
                  label="ORF N = %d" % sum(nb_res_ORF))
        pylab.bar(positions + half_width, nb_res_CDS, bar_width, alpha=alpha,
                  label="CDS N = %d" % sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
예제 #20
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_GC(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="GC %", ylabel="#", label="",title=None):
        """Plot histogram GC content

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: if False (default) the current figure is cleared
            first
        :param int fontsize: fontsize of the x and y labels and title.
        :param bool grid: add grid or not
        :param str xlabel:
        :param str ylabel:
        :param str label: label of the histogram (for the legend); the mean
            GC and ``len(self)`` are appended to it
        :param str title: defaults to a title that reports the mean GC

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_GC()

        """
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        # default title reports the mean GC
        if title is None:
            title = "GC %%  \n Mean GC : %.2f" %(mean_GC)

        if hold is False:
            pylab.clf()

        # histogram GC percent
        legend_text = label + ", mean : " + str(round(mean_GC, 2))
        legend_text = legend_text + ", N : " + str(len(self))
        gc_values = self.df.loc[:,'GC_content']
        pylab.hist(gc_values, bins=bins, alpha=alpha, label=legend_text)

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
        pylab.xlim([0, 100])
        # tight_layout is best effort: any failure is ignored
        try:
            pylab.tight_layout()
        except:
            pass
예제 #21
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_bar_mapq(self, fontsize=16, filename=None, ):
        """Plots bar plots of the MAPQ (quality) of alignments

        :param int fontsize: font size of the axis labels
        :param filename: if set, the figure is also saved to this file

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        """
        df = self.get_mapq_as_df()
        max_mapq = int(df.max().values[0])
        # One unit-width bin per MAPQ value. The edges must reach
        # max_mapq + 1: the last histogram bin is closed on the right, so
        # stopping at max_mapq would merge the two highest MAPQ values.
        df.plot(kind='hist', bins=range(0, max_mapq + 2), legend=False,
            grid=True, logy=True)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
예제 #22
0
파일: fastq.py 프로젝트: sequana/sequana
    def plot_acgt_content(self, stacked=False):
        """Plot the content returned by :meth:`get_actg_content` per position

        :param stacked: when exactly True, stacked bars are drawn; otherwise
            lines are drawn and a grid is added

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.plot_acgt_content()
        """
        content = self.get_actg_content()
        if stacked is not True:
            content.plot()
            pylab.grid(True)
        else:
            content.plot.bar(stacked=True)
        label_style = {"fontsize": self.fontsize}
        pylab.xlabel("position (bp)", **label_style)
        pylab.ylabel("percent", **label_style)
예제 #23
0
파일: pacbio.py 프로젝트: sequana/sequana
    def hist_nb_passes(self, bins=None, alpha=0.5, hold=False, fontsize=12,
                          grid=True, xlabel="Number of ZMW passes", logy=True,
                          ylabel="#", label="", title="Number of ZMW passes"):
        """Plot histogram of number of reads per ZMW (number of passes)

        :param bins: forwarded to :func:`pylab.hist` (None keeps the
            matplotlib default binning)
        :param float alpha: transparency of the histograms
        :param bool hold: if False (default) the current figure is cleared
            first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid when True
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_nb_passes()
        """
        max_nb_pass = self.df.nb_passes.max()

        # histogram nb passes
        if hold is False:
            pylab.clf()
        pylab.hist(self.df.nb_passes, bins=bins, alpha=alpha,
                   label=label, log=logy, width=1)

        # With fewer than 5 possible pass counts, force the ticks to 0..5.
        # The test uses the maximum directly so it is also defined when the
        # caller provides ``bins`` (this previously raised a NameError).
        if max_nb_pass < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
예제 #24
0
파일: isoseq.py 프로젝트: sequana/sequana
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Grouped bar plot of the ``sirv`` data, one column per label.

        :param normalise: if set, each column is divided by ``N / max(N)``
        :param ncol: columns in the legend
        :param N: one value per column used for the normalisation; defaults
            to the length of each entry of ``rawdata``
        :return: the (possibly normalised) dataframe that was plotted
        """
        if N is None:
            N = [len(entry) for entry in self.rawdata]
        N = np.array(N)

        table = pd.DataFrame(self.sirv).T
        if normalise:
            scaling = N / max(N)
            table = table / scaling
        table.columns = self.labels

        table.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return table
예제 #25
0
파일: pacbio.py 프로젝트: sequana/sequana
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        """Select simulated reads with an accept/reject loop and plot the result.

        :param bins: binning of the read-length histograms
        :param xmin: start of the grid on which the target curve is drawn
        :param xmax: end (excluded) of that grid
        :param step: spacing of that grid
        :param burn: number of leading accepted values left out of the
            histogram of accepted lengths
        :param alpha: upper bound applied to the acceptance probability
        :param output_filename: if not None, ``self.bam_simul.filter_bool`` is
            called with this name and the keep/reject flags

        Sets ``self.bins``, ``self.X``, ``self.Y`` and ``self.tokeep`` (one
        boolean per simulated read).
        """
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        # current state: starts at the mean read length of the input reads
        x = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            # candidate: length of the i-th simulated read
            can = lengths[i]
            # NOTE(review): presumably target_distribution(x) is never zero
            # here, otherwise this ratio divides by zero -- confirm
            aprob = min([alpha,self.target_distribution(can)/self.target_distribution(x)])
            # aprob is the acceptance probability, capped at alpha
            u = pylab.uniform(0,1)
            if u < aprob:
                # accepted: the candidate becomes the current state
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # plotting the results:
        # theoretical curve (x is reused here as the plotting grid)
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        # top panel: trace of the accepted lengths
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        # bottom panel: histogram of accepted lengths and target curve
        pylab.subplot(212)

        pylab.hist(vec[burn:], bins=bins, density=1)
        pylab.plot(x,y,'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF','Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
예제 #26
0
파일: pacbio.py 프로젝트: sequana/sequana
    def plot(self, bins=80, rwidth=0.8, **kwargs):
        """Histogram of ``self.data`` using the labels, grid and title
        stored on the instance.

        :param bins: forwarded to :func:`pylab.hist`
        :param float rwidth: forwarded to :func:`pylab.hist`
        :param kwargs: any other keyword accepted by :func:`pylab.hist`
        """
        pylab.clf()
        pylab.hist(self.data, bins=bins, rwidth=rwidth, **kwargs)

        label_style = {"fontsize": self.fontsize}
        pylab.xlabel(self.xlabel, **label_style)
        pylab.ylabel(self.ylabel, **label_style)

        # A twin-axis cumulative overlay (labelled "CDF") used to be sketched
        # here inside a string literal; it was never executed.

        pylab.grid(self.grid)
        pylab.title(self.title)
        # tight_layout is best effort: any failure is ignored
        try:
            pylab.tight_layout()
        except:
            pass
예제 #27
0
파일: pacbio.py 프로젝트: sequana/sequana
    def plot_GC_read_len(self, hold=False, fontsize=12, bins=[200, 60],
                grid=True, xlabel="GC %", ylabel="#", cmap="BrBG"):
        """Plot GC content versus read length

        :param bool hold: if False (default) the current figure is cleared
            first
        :param int fontsize: for x and y labels and title
        :param bins: a integer or tuple of 2 integers to specify
            the binning of the x and y 2D histogram.
        :param bool grid: add a grid when True
        :param str xlabel: not used; the x label is always "Read length"
        :param str ylabel: not used; the y label is always "GC %"
        :param cmap: colormap forwarded to the 2D histogram plot

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.plot_GC_read_len(bins=[10, 10])

        """
        # Means are taken over the full columns (before rows with missing
        # values are dropped for the 2D histogram)
        mean_len =  np.mean(self.df.loc[:,'read_length'])
        mean_GC =  np.mean(self.df.loc[:,'GC_content'])

        if hold is False:
            pylab.clf()

        pairs = self.df.loc[:,['read_length','GC_content']].dropna()
        hist2d = biokit.viz.hist2d.Hist2D(pairs)
        hist2d.plot(bins=bins, contour=False, norm='log', Nlevels=6, cmap=cmap)

        label_style = {"fontsize": fontsize}
        pylab.xlabel("Read length", **label_style)
        pylab.ylabel("GC %", **label_style)
        title_text = "GC %% vs length \n Mean length : %.2f , Mean GC : %.2f" % (
            mean_len, mean_GC)
        pylab.title(title_text, **label_style)
        pylab.ylim([0, 100])
        if grid is True:
            pylab.grid(True)
예제 #28
0
파일: rnadiff.py 프로젝트: sequana/sequana
 def _format_plot(self, title="", xlabel="", ylabel="", rotation=0):
     """Set the title, the axis labels and the x tick rotation of the
     current plot (x tick labels are right-aligned)."""
     pylab.title(title)
     tick_style = {"rotation": rotation, "ha": "right"}
     pylab.xticks(**tick_style)
     for set_label, text in ((pylab.xlabel, xlabel), (pylab.ylabel, ylabel)):
         set_label(text)
예제 #29
0
 def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
     """Histogram of the ``mapq`` column of ``self.df``.

     :param logy: switch the y-axis to a log scale
     :param xmin: lower x limit
     :param xmax: upper x limit
     :param int fontsize: font size of the x-axis label
     """
     mapq_values = self.df.mapq
     mapq_values.hist()
     if logy:
         pylab.semilogy()
     x_range = [xmin, xmax]
     pylab.xlim(x_range)
     pylab.xlabel("Mapping quality", fontsize=fontsize)
예제 #30
0
 def hist_passes(self, maxp=50, fontsize=16):
     """Histogram of the number of passes, capped at ``maxp``.

     :param int maxp: values above it are clipped to it; also used as the
         number of bins and as the upper x limit
     :param int fontsize: font size of the axis labels
     """
     clipped = self.df.nb_passes.copy().clip(upper=maxp)
     clipped.hist(bins=maxp)
     pylab.xlim([0, maxp])
     label_style = {"fontsize": fontsize}
     pylab.ylabel("# count", **label_style)
     pylab.xlabel("Passes (max {})".format(maxp), **label_style)
예제 #31
0
파일: kraken.py 프로젝트: ranjit58/sequana
    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
                textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :param str kind: "pie" or "barh"; any other value logs an error and
            returns None
        :param cmap: colormap of the pie chart
        :param threshold: percentage (strictly between 0 and 100) below which
            taxons are pooled into a single "others" entry
        :param float radius: radius of the pie chart
        :param textcolor: colour of the texts of the pie chart
        :param kargs: forwarded to the pandas ``plot`` call
        :return: None if no taxon were found and, otherwise, the plotted
            percentages indexed by taxon name

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return
        if self._data_created == False:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return
        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here

        if len(self.taxons.index) == 0:
            return None

        # ``.ix`` was removed from pandas; ``.loc`` is the label-based
        # replacement. Label -1 is the row added for unclassified reads.
        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.loc[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        # convert counts into percentages
        data = data/data.sum()*100
        assert threshold > 0 and threshold < 100
        others = data[data<threshold].sum()
        # ``>=`` so that a taxon exactly at the threshold is kept rather
        # than being left out of both groups
        data = data[data>=threshold]
        names = df.loc[data.index]['name']

        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # very old pandas versions only provide Series.sort
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10,8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind,  **kargs)
            pylab.xlabel(" percentage ")

        return data
예제 #32
0
파일: isoseq.py 프로젝트: sequana/sequana
 def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
     """Histogram of the ``mapq`` column of ``self.df``.

     :param logy: switch the y-axis to a log scale
     :param xmin: lower x limit
     :param xmax: upper x limit
     :param int fontsize: font size of the x-axis label
     """
     mapq_values = self.df.mapq
     mapq_values.hist()
     if logy:
         pylab.semilogy()
     x_range = [xmin, xmax]
     pylab.xlim(x_range)
     pylab.xlabel("Mapping quality", fontsize=fontsize)
예제 #33
0
    def plot_coverage(self, filename=None, fontsize=16,
            rm_lw=1, rm_color="#0099cc", rm_label="Running median",
            th_lw=1, th_color="r", th_ls="--", main_color="k", main_lw=1,
            main_kwargs=None, sample=True, set_ylimits=True):
        """ Plot coverage as a function of base position.

        :param filename: if set, the figure is saved to this file
        :param fontsize: font size of the axis labels
        :param rm_lw: line width of the running median (0 hides it)
        :param rm_color: line color of the running median
        :param rm_label: label for the running median
        :param th_lw: line width of the thresholds (0 hides them)
        :param th_color: line color of the thresholds
        :param th_ls: line style of the thresholds
        :param main_color: line color of the coverage
        :param main_lw: line width of the coverage
        :param main_kwargs: optional dictionary of extra keyword arguments
            for the coverage line (None means no extra argument)
        :param sample: if there are more than 1 000 000 points, we
            use an integer step to skip data points. We can still plot
            all points at your own risk by setting this option to False

        :param set_ylimits: we want to focus on the "normal" coverage ignoring
            unsual excess. To do so, we set the yaxis range between 0 and a
            maximum value. This maximum value is set to the minimum between the
            6 times the mean coverage and 1.5 the maximum of the high coverage
            threshold curve. If you want to let the ylimits free, set this
            argument to False

        .. note:: if there are more than 1,000,000 points, only one point
            every ``int(N / 1,000,000)`` is shown. For instance for
            5,000,000 points, every 5th point is plotted.

        In addition to the coverage, the running median and coverage confidence
        corresponding to the lower and upper  zscore thresholds are shown.

        .. note:: uses the thresholds attribute.
        """
        # None instead of a shared mutable default dictionary
        if main_kwargs is None:
            main_kwargs = {}

        # z = (X/rm - \mu ) / sigma
        high_zcov = (self.thresholds.high * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]
        low_zcov = (self.thresholds.low * self.best_gaussian["sigma"] +
                self.best_gaussian["mu"]) * self.df["rm"]

        pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')
        pylab.xlim(0,self.df["pos"].iloc[-1])
        axes = []
        labels = []

        # 1,000,000 points is a lot for matplotlib. Let us restrict ourself to 1
        # million points for now.
        if len(self.df) > 1000000 and sample is True:
            NN = int(len(self.df)/1000000)
        else:
            NN = 1

        # the main coverage plot
        p1, = pylab.plot(self.df["cov"][::NN], color=main_color, label="Coverage",
                linewidth=main_lw, **main_kwargs)
        axes.append(p1)
        labels.append("Coverage")

        # The running median plot
        if rm_lw > 0:
            p2, = pylab.plot(self.df["rm"][::NN],
                    color=rm_color,
                    linewidth=rm_lw,
                    label=rm_label)
            axes.append(p2)
            labels.append(rm_label)

        # The threshold curves: both are drawn but only the first one is
        # registered in the legend
        if th_lw > 0:
            p3, = pylab.plot(high_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="Thresholds")
            pylab.plot(low_zcov[::NN], linewidth=th_lw, color=th_color, ls=th_ls,
                label="_nolegend_")
            axes.append(p3)
            labels.append("Thresholds")

        pylab.legend(axes, labels, loc="best")
        pylab.xlabel("Position", fontsize=fontsize)
        pylab.ylabel("Per-base coverage", fontsize=fontsize)
        pylab.grid(True)

        # sometimes there are large coverage value that squeeze the plot.
        # Let us restrict it
        if set_ylimits is True:
            pylab.ylim([0, min([
                high_zcov.max() * 1.5,
                self.df["cov"].mean()*6])])
        else:
            pylab.ylim([0, pylab.ylim()[1]])

        # tight_layout is best effort; ordinary errors are ignored but
        # KeyboardInterrupt/SystemExit are no longer swallowed
        try:
            pylab.tight_layout()
        except Exception:
            pass

        if filename:
            pylab.savefig(filename)
예제 #34
0
    def plot_common_major_counts(self, mode, labels=None,
            switch_up_down_cond2=False, add_venn=True, xmax=None,
            title="", fontsize=12, sortby="log2FoldChange"):
        """Plot the percentage of common features as a function of the rank.

        The features of each analysis are ranked by *sortby*. For each rank
        *i*, the *i* first features of the two analyses are compared and the
        percentage of shared features is plotted. The *sortby* values of the
        first analysis are drawn on a twin y-axis.

        :param mode: key of the ``gene_lists`` to compare: down, up or all.
            With "down", features are ranked in ascending order of *sortby*;
            otherwise in descending order.
        :param labels: the two labels passed to the Venn diagram (defaults
            to ``['r1', 'r2']``)
        :param switch_up_down_cond2: not used by this method
        :param add_venn: if True, add an inset Venn diagram (for the modes
            down, up and all)
        :param xmax: upper limit of the x-axis. Defaults to the size of the
            largest of the two ranked lists
        :param title: title of the plot
        :param fontsize: font size of the axis labels and of the title
        :param sortby: name of the column used to rank the features

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_common_major_counts("down")
        """
        if labels is None:
            labels = ['r1', 'r2']

        # "down" features have negative values: rank them in ascending order.
        # Any other mode is ranked with the largest values first.
        ascending = mode in ["down"]

        # .loc expects a list-like indexer (sets are rejected by recent pandas)
        gl1 = list(set(self.r1.gene_lists[mode]))
        gl2 = list(set(self.r2.gene_lists[mode]))
        A = self.r1.df.loc[gl1].sort_values(by=sortby, ascending=ascending)
        # each analysis is ranked using its own gene list
        B = self.r2.df.loc[gl2].sort_values(by=sortby, ascending=ascending)

        largest = max(len(A), len(B))
        smallest = min(len(A), len(B))

        # percentage of features shared by the top-i of both rankings
        N = []
        for i in range(1, largest):
            top_a = set(A.index[:i])
            top_b = set(B.index[:i])
            N.append(len(top_a & top_b) / i * 100)

        max_common = len(set(A.index).intersection(set(B.index)))
        pylab.clf()
        # nothing to normalise by if both lists are empty
        if largest > 0:
            pylab.axhline(max_common / largest * 100, color="r", ls='--',
                label="min set intersection")
        pylab.axvline(smallest, ls="--", color="k", label="rank of minor set")

        pylab.plot(N)
        pylab.xlabel('rank', fontsize=fontsize)
        pylab.ylabel('% common features', fontsize=fontsize)
        pylab.grid(True)
        pylab.ylim([0,100])
        if xmax:
            pylab.xlim([0, xmax])
        else:
            pylab.xlim([0, largest])
        pylab.title(title, fontsize=fontsize)
        ax = pylab.gca()
        ax2 = ax.twinx()
        ax2.plot(A[sortby].values, "orange", label=sortby)
        ax2.set_ylabel(sortby)
        pylab.legend(loc="lower left")
        ax.legend(loc="lower right")

        if add_venn:
            f = pylab.gcf()
            ax = f.add_axes([0.5,0.5,0.35,0.35], facecolor="grey")
            if mode=="down":
                self.plot_venn_down(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="up":
                self.plot_venn_up(ax=ax, title=None, labels=labels,
                    mode="two_only")
            elif mode=="all":
                self.plot_venn_all(ax=ax, title=None, labels=labels,
                    mode="two_only")
예제 #35
0
파일: trf.py 프로젝트: sequana/sequana
 def hist_period_size(self, bins=50):
     """Plot the histogram of the ``period_size`` column.

     :param bins: number of bins of the histogram
     """
     period_sizes = self.df.period_size
     period_sizes.hist(bins=bins)
     pylab.xlabel("repeat length")
예제 #36
0
파일: trf.py 프로젝트: sequana/sequana
 def hist_repet_by_sequence(self):
     """Plot the histogram of the number of rows found per sequence name."""
     groups = self.df.groupby("sequence_name").groups
     # one entry per sequence name: how many rows belong to it
     counts = [len(members) for members in groups.values()]
     pylab.hist(counts)
     pylab.xlabel("# repetitions per sequence")
예제 #37
0
파일: kraken.py 프로젝트: sequana/sequana
    def plot(self, kind="pie", cmap="copper", threshold=1, radius=0.9,
                textcolor="red", **kargs):
        """A simple non-interactive plot of taxons

        :param kind: either 'pie' or 'barh'
        :param cmap: colormap used for the pie chart
        :param threshold: taxons whose percentage is below this value (in
            percent, strictly between 0 and 100) are summed up into a
            single 'others' entry
        :param radius: radius of the pie chart
        :param textcolor: color of the texts drawn on the pie chart
        :param kargs: any other argument is passed to the pandas plot call
        :return: None if no taxon were found (or if *kind* is not
            supported) and the plotted data (percentages) otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if self._data_created == False:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        # .ix was removed from pandas; .loc is the label-based equivalent
        df = self.get_taxonomy_biokit(list(self.taxons.index))
        df.loc[-1] = ["Unclassified"] * 8
        data = self.taxons.copy()
        data.loc[-1] = self.unclassified

        data = data/data.sum()*100
        assert threshold > 0 and threshold < 100
        others = data[data<threshold].sum()
        # >= so that a taxon exactly at the threshold is not silently lost
        data = data[data>=threshold]
        names = df.loc[data.index]['name']

        data.index = names.values
        data.loc['others'] = others
        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # old pandas versions have no sort_values method
            data.sort(inplace=True)

        # text may be long so, let us increase the figsize a little bit
        pylab.figure(figsize=(10,8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind, cmap=cmap, autopct='%1.1f%%',
                radius=radius, **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind,  **kargs)
            pylab.xlabel(" percentage ")

        return data
예제 #38
0
    def plot(self,
             kind="pie",
             cmap="tab20c",
             threshold=1,
             radius=0.9,
             textcolor="red",
             **kargs):
        """A simple non-interactive plot of taxons

        :param kind: either 'pie' or 'barh'
        :param cmap: colormap used for the pie chart
        :param threshold: taxons whose percentage is below this value (in
            percent, strictly between 0 and 100) are summed up into a
            single 'others' entry
        :param radius: radius of the pie chart
        :param textcolor: color of the texts drawn on the pie chart
        :param kargs: any other argument is passed to the pandas plot call
        :return: None if no taxon were found (or if *kind* is not
            supported) and the plotted data (percentages) otherwise

        A Krona Javascript output is also available in :meth:`kraken_to_krona`

        .. plot::
            :include-source:

            from sequana import KrakenResults, sequana_data
            test_file = sequana_data("test_kraken.out", "testing")
            k = KrakenResults(test_file)
            df = k.plot(kind='pie')

        .. seealso:: to generate the data see :class:`KrakenPipeline`
            or the standalone application **sequana_taxonomy**.


        .. todo:: For a future release, we could use this kind of plot 
            https://stackoverflow.com/questions/57720935/how-to-use-correct-cmap-colors-in-nested-pie-chart-in-matplotlib
        """
        if len(self._df) == 0:
            return

        # This may have already been called but maybe not. This is not time
        # consuming, so we call it again here
        if self._data_created == False:
            self.kraken_to_krona()

        if kind not in ['barh', 'pie']:
            logger.error('kind parameter: Only barh and pie are supported')
            return

        if len(self.taxons.index) == 0:
            return None

        df = self.get_taxonomy_db(list(self.taxons.index))

        # we add the unclassified only if needed
        if self.unclassified > 0:
            df.loc[-1] = ["Unclassified"] * 8

        data = self.taxons.copy()

        # we add the unclassified only if needed
        if self.unclassified > 0:
            data.loc[-1] = self.unclassified

        data = data / data.sum() * 100
        assert threshold > 0 and threshold < 100

        # everything below the threshold (1) is gathered together and
        # summarised into 'others'
        others = data[data < threshold].sum()

        data = data[data >= threshold]
        names = df.loc[data.index]['name']

        data.index = names.values

        if others > 0:
            data.loc['others'] = others

        try:
            data.sort_values(inplace=True)
        except AttributeError:
            # old pandas versions have no sort_values method; any other
            # error is a real problem and must not be hidden
            data.sort(inplace=True)

        pylab.figure(figsize=(10, 8))
        pylab.clf()
        if kind == "pie":
            ax = data.plot(kind=kind,
                           cmap=cmap,
                           autopct='%1.1f%%',
                           radius=radius,
                           **kargs)
            pylab.ylabel(" ")
            for text in ax.texts:
                #  large, x-small, small, None, x-large, medium, xx-small,
                #  smaller, xx-large, larger
                text.set_size("small")
                text.set_color(textcolor)
            for wedge in ax.patches:
                wedge.set_linewidth(1)
                wedge.set_edgecolor("k")
            self.ax = ax
        elif kind == "barh":
            ax = data.plot(kind=kind, **kargs)
            pylab.xlabel(" percentage ")

        return data
예제 #39
0
파일: pacbio.py 프로젝트: sequana/sequana
 def hist_passes(self, maxp=50, fontsize=16):
     """Plot the histogram of the ``nb_passes`` column.

     :param maxp: values above *maxp* are clipped to *maxp*; it is also the
         number of bins and the upper limit of the x-axis
     :param fontsize: font size of the axis labels
     """
     # Series.clip_upper was removed from pandas; clip() returns a new
     # Series so the original column is left untouched
     passes = self.df.nb_passes.clip(upper=maxp)
     passes.hist(bins=maxp)
     pylab.xlim([0, maxp])
     pylab.ylabel("# count", fontsize=fontsize)
     pylab.xlabel("Passes (max {})".format(maxp), fontsize=fontsize)
예제 #40
0
    def plot_volcano(self, labels=None):
        """Volcano plot of log2 fold change versus log10 of adjusted p-value

        The features listed in ``gene_lists["all"]`` of both analyses are
        drawn on the same axes. Points can be picked: the title is then
        updated with the name of the picked feature and its rounded
        coordinates in the picked analysis and, if the feature is also
        present there, in the other analysis.

        :param labels: legend labels of the two analyses (defaults to
            ``["cond1", "cond2"]``)

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana.compare import RNADiffCompare

            c = RNADiffCompare(
                sequana_data("rnadiff/rnadiff_onecond_1"),
                sequana_data("rnadiff/rnadiff_onecond_2"))
            c.plot_volcano()
        """
        if labels is None:
            labels = ["cond1", "cond2"]

        A = self.r1.df.loc[self.r1.gene_lists["all"]]
        B = self.r2.df.loc[self.r2.gene_lists["all"]]

        pylab.clf()
        # keep a handle on the first line so that picked points can be
        # attributed to the right analysis whatever the labels are
        line_a, = pylab.plot(A.log2FoldChange, -np.log10(A.padj), marker="o",
            alpha=0.5, color="r", lw=0, label=labels[0], pickradius=4,
            picker=True)
        pylab.plot(B.log2FoldChange, -np.log10(B.padj), marker="x",
            alpha=0.5, color="k", lw=0, label=labels[1], pickradius=4,
            picker=True)

        pylab.grid(True)
        pylab.xlabel("fold change")
        pylab.ylabel("log10 adjusted p-value")
        pylab.legend(loc="lower right")
        ax = pylab.gca()

        def coordinates(frame, gene_name):
            # rounded (x, y) position of a feature in the volcano plot
            row = frame.loc[gene_name]
            return round(row.log2FoldChange, 1), round(-np.log10(row.padj), 1)

        def onpick(event):
            self.event = event
            if event.artist is line_a:
                picked, other = A, B
            else:
                picked, other = B, A

            gene_name = picked.index[event.ind[0]]
            x1, y1 = coordinates(picked, gene_name)
            try:
                x2, y2 = coordinates(other, gene_name)
            except (KeyError, TypeError):
                # feature absent from (or ambiguous in) the other analysis
                x2, y2 = None, None

            if x2 is None:
                ax.title.set_text("{} at pos [{},{}]".format(
                    gene_name,x1,y1))
            else:
                ax.title.set_text("{} at pos [{},{}] and [{},{}]".format(
                        gene_name,x1,y1,x2,y2))
            pylab.draw()

        fig = pylab.gcf()
        fig.canvas.mpl_connect('pick_event', onpick)
예제 #41
0
                                             sequence_reference)
print("reference sequence loaded")

# Time the alignment of increasingly long prefixes of the reference sequence
x = range(100, 10000, 300)
res_time = []
for i in x:
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended clock for measuring elapsed time
    time1 = time.perf_counter()
    alignment_best = sequence_reference_2x(sequence_reference[0:i])
    time2 = time.perf_counter()
    print("Align %d pb in %f, score = %d" %
          (i, time2 - time1, alignment_best.optimal_alignment_score))
    res_time.append(time2 - time1)

# One row per tested length; measured seconds are converted to minutes
df_res_time = pd.DataFrame([x, res_time])
df_res_time = df_res_time.transpose()
df_res_time.columns = ["len_seq", "time_minutes"]
df_res_time["time_minutes"] = df_res_time["time_minutes"] / float(60)
df_res_time.to_csv("2017_03_20_SW_time.csv")

pylab.plot(df_res_time["len_seq"], df_res_time["time_minutes"], "b-")
pylab.xlabel("Length of aligned sequence")
pylab.ylabel("Time (minutes)")
pylab.title("Time for Smith waterman alignment with scikit 0.5.1")
pylab.show()

# Linear extrapolation from the longest measured alignment. The column is
# already in minutes, so no further division by 60 is needed here.
estim = df_res_time["time_minutes"].iloc[-1] * len_genome / float(
    df_res_time["len_seq"].iloc[-1])  # in minutes

print("Time estimation (linear) for contig of %d = %f minutes (%f hours)" %
      (len_genome, estim, estim / 60))
# Draw one precision-recall curve per analysis
for i, analysis in enumerate(list_analysis):
    res = compute_table_performance(analysis, df_results)
    print("%s" % analysis)
    # res holds [TP, FP, FN, TN]: TP and FP are lists of scores whereas FN
    # and TN are counts, replaced here by as many zero scores
    TP, FP = res[0], res[1]
    FN = [0] * res[2]
    TN = [0] * res[3]
    # positives (TP then FN) first, negatives (FP then TN) last
    y_true = np.array([1] * (len(TP) + len(FN)) + [0] * (len(FP) + len(TN)))
    y_scores = np.array(TP + FN + FP + TN)
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    pylab.plot(recall, precision, color=colors[i], label=analysis)

pylab.xlabel('Recall')
pylab.ylabel('Precision')
pylab.ylim([0.0, 1.05])
pylab.xlim([0.0, 1.05])
pylab.title('Precision-Recall')

# the legend is placed outside the axes, on the right-hand side
lgd = pylab.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

if file_fig == "show":
    pylab.show()
else:
    # the legend is passed as an extra artist for the tight bounding box
    pylab.savefig(file_fig, bbox_extra_artists=(lgd, ), bbox_inches='tight')
예제 #43
0
파일: rnadiff.py 프로젝트: sequana/sequana
    def plot_pvalue_hist(self, bins=60, fontsize=16, rotation=0):
        """Plot the histogram of the raw p-values (missing values ignored).

        :param bins: number of bins of the histogram
        :param fontsize: not used by this method
        :param rotation: not used by this method
        """
        pvalues = self.df.pvalue.dropna()
        pylab.hist(pvalues, bins=bins, ec="k")
        pylab.xlabel("raw p-value")
        pylab.ylabel("Occurences")
예제 #44
0
    def plot_go_terms(self,
                      ontologies,
                      max_features=50,
                      log=False,
                      fontsize=8,
                      minimum_genes=0,
                      pvalue=0.05,
                      cmap="summer_r",
                      sort_by="fold_enrichment",
                      show_pvalues=False,
                      include_negative_enrichment=False,
                      fdr_threshold=0.05,
                      compute_levels=True,
                      progress=True):
        """Scatter plot of the fold enrichment of the selected GO terms.

        Each GO term is drawn as a circle whose size depends on its number of
        genes (``number_in_list``) and whose color depends on its FDR.

        :param ontologies: passed to :meth:`get_data` to retrieve the terms
        :param max_features: maximum number of GO terms shown
        :param log: if True, plot the log2 of the fold enrichment
        :param fontsize: font size of the y-tick labels
        :param minimum_genes: keep only terms with more than this number of
            genes
        :param pvalue: keep only terms with a pValue less than or equal to
            this value
        :param cmap: colormap used for the FDR
        :param sort_by: one of 'pValue', 'fold_enrichment', 'fdr'
        :param show_pvalues: if True, add the -log10 of the p-values on a
            twin x-axis
        :param include_negative_enrichment: passed to :meth:`get_data`; if
            True, the x-axis limits are symmetric around zero
        :param fdr_threshold: passed to :meth:`get_data` and used as upper
            bound of the color scale
        :param compute_levels: if True, call :meth:`get_graph` to annotate
            each term with its level
        :param progress: passed to :meth:`get_graph`
        :return: the dataframe of GO terms. If no term is left to plot, the
            dataframe is returned and nothing is drawn. Otherwise the
            attributes ``subdf`` and ``df`` are also set.
        """
        assert sort_by in ['pValue', 'fold_enrichment', 'fdr']

        # FIXME: pvalue and fold_enrichment not sorted in same order
        pylab.clf()

        df = self.get_data(
            ontologies,
            include_negative_enrichment=include_negative_enrichment,
            fdr=fdr_threshold)

        if len(df) == 0:
            return df

        df = df.query("pValue<=@pvalue")
        logger.info("Filtering out pvalue>{}. Kept {} GO terms".format(
            pvalue, len(df)))
        df = df.reset_index(drop=True)

        # Select a subset of the data to keep the best max_features in terms of
        # pValue
        subdf = df.query("number_in_list>@minimum_genes").copy()
        logger.info(
            "Filtering out GO terms with less than {} genes: Kept {} GO terms".
            format(minimum_genes, len(subdf)))

        logger.info("Filtering out the 3 parent terms")
        subdf = subdf.query("id not in @self.ontologies")

        # The filters may have removed everything; the size computation
        # below divides by the number of terms, so stop here.
        if len(subdf) == 0:
            logger.warning("No GO terms left after filtering; nothing to plot")
            return df

        # Keeping only a part of the data, sorting by pValue
        if sort_by == "pValue":
            subdf = subdf.sort_values(by="pValue",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="pValue", ascending=False)
        elif sort_by == "fold_enrichment":
            subdf = subdf.sort_values(by="abs_log2_fold_enrichment",
                                      ascending=True).iloc[-max_features:]
            df = df.sort_values(by="abs_log2_fold_enrichment", ascending=False)
        elif sort_by == "fdr":
            subdf = subdf.sort_values(by="fdr",
                                      ascending=False).iloc[-max_features:]
            df = df.sort_values(by="fdr", ascending=False)

        subdf = subdf.reset_index(drop=True)

        # We get all levels for each go id.
        # They are stored by MF, CC or BP
        if compute_levels:
            paths = self.get_graph(list(subdf['id'].values), progress=progress)
            keys = list(paths.keys())
            # merge the per-ontology dictionaries into a single one
            goid_levels = paths[keys[0]]
            if len(keys) > 1:
                for k in keys[1:]:
                    goid_levels.update(paths[k])
            levels = [goid_levels[ID] for ID in subdf['id'].values]
            subdf["level"] = levels
        else:
            subdf['level'] = ""
        N = len(subdf)

        # marker sizes are proportional to the number of genes, with a floor
        size_factor = 12000 / len(subdf)
        max_size = subdf.number_in_list.max()
        min_size = subdf.number_in_list.min()
        sizes = [
            max(max_size * 0.2, x) for x in size_factor *
            subdf.number_in_list.values / subdf.number_in_list.max()
        ]

        # smallest, intermediate and largest marker sizes for the legend
        m1 = min(sizes)
        m3 = max(sizes)
        m2 = m1 + (m3 - m1) / 2

        if log:
            pylab.scatter(pylab.log2(subdf.fold_enrichment),
                          range(len(subdf)),
                          c=subdf.fdr,
                          s=sizes,
                          cmap=cmap,
                          alpha=0.8,
                          ec="k",
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        else:
            pylab.scatter(subdf.fold_enrichment,
                          range(len(subdf)),
                          c=subdf.fdr,
                          cmap=cmap,
                          s=sizes,
                          ec="k",
                          alpha=.8,
                          vmin=0,
                          vmax=fdr_threshold,
                          zorder=10)
        pylab.grid(zorder=-10)
        ax2 = pylab.colorbar(shrink=0.5)
        ax2.ax.set_ylabel('FDR')

        # long labels are truncated to 50 characters
        labels = [
            x if len(x) < 50 else x[0:47] + "..." for x in list(subdf.label)
        ]
        ticks = [
            "{} ({}) {}".format(ID, level, "; " + label.title())
            for level, ID, label in zip(subdf['level'], subdf.id, labels)
        ]

        pylab.yticks(range(N), ticks, fontsize=fontsize, ha='left')

        # labels are left-aligned: shift them away from the axis
        yax = pylab.gca().get_yaxis()
        yax.set_tick_params(pad=60 * fontsize * 0.6)

        fc_max = subdf.fold_enrichment.max(skipna=True)
        fc_min = subdf.fold_enrichment.min(skipna=True)
        # go into log2 space
        fc_max = pylab.log2(fc_max)
        fc_min = pylab.log2(fc_min)
        abs_max = max(fc_max, abs(fc_min), 1)

        if log:
            fc_max = abs_max * 1.5
        else:
            fc_max = 2**abs_max * 1.2

        pylab.axvline(0, color="k", lw=2)
        if log:
            pylab.xlabel("Fold Enrichment (log2)")
        else:
            pylab.xlabel("Fold Enrichment")
        if include_negative_enrichment:
            pylab.xlim([-fc_max, fc_max])
        else:
            pylab.xlim([0, fc_max])
        pylab.tight_layout()

        # The pvalue:
        if show_pvalues:
            ax = pylab.gca().twiny()
            ax.set_xlim([0, max(-pylab.log10(subdf.pValue)) * 1.2])
            ax.set_xlabel("p-values (log10)", fontsize=12)
            ax.plot(-pylab.log10(subdf.pValue),
                    range(len(subdf)),
                    label="pvalue",
                    lw=2,
                    color="k")
            ax.axvline(1.33, lw=1, ls="--", color="grey", label="pvalue=0.05")
            pylab.tight_layout()
            pylab.legend(loc="lower right")

        # empty scatters used only as handles of the size legend
        s1 = pylab.scatter([], [], s=m1, marker='o', color='#555555', ec="k")
        s2 = pylab.scatter([], [], s=m2, marker='o', color='#555555', ec="k")
        s3 = pylab.scatter([], [], s=m3, marker='o', color='#555555', ec="k")

        # legend spacing depends on the number of terms shown
        if len(subdf) < 10:
            labelspacing = 1.5 * 4
            borderpad = 4
            handletextpad = 2
        elif len(subdf) < 20:
            labelspacing = 1.5 * 2
            borderpad = 1
            handletextpad = 2
        else:
            labelspacing = 1.5
            borderpad = 2
            handletextpad = 2

        if len(subdf) >= 3:
            leg = pylab.legend(
                (s1, s2, s3),
                (str(int(min_size)),
                 str(int(min_size +
                         (max_size - min_size) / 2)), str(int(max_size))),
                scatterpoints=1,
                loc='lower right',
                ncol=1,
                frameon=True,
                title="gene-set size",
                labelspacing=labelspacing,
                borderpad=borderpad,
                handletextpad=handletextpad,
                fontsize=8)
        else:
            leg = pylab.legend((s1, ), (str(int(min_size)), ),
                               scatterpoints=1,
                               loc='lower right',
                               ncol=1,
                               frameon=True,
                               title="gene-set size",
                               labelspacing=labelspacing,
                               borderpad=borderpad,
                               handletextpad=handletextpad,
                               fontsize=8)

        frame = leg.get_frame()
        frame.set_facecolor('#b4aeae')
        frame.set_edgecolor('black')
        frame.set_alpha(1)

        self.subdf = subdf
        self.df = df
        return df
예제 #45
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title="",
                 clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at the largest SNR found (at most *clip_upper_SNR*)
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: if True, add a grid
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param str title: title of the plot
        :param clip_upper_SNR: SNR values are clipped to the upper bound of
            the histogram range, which never exceeds this value

        If the ``snr_A`` column contains no value, an image stating that
        there is no data is shown instead.

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored: nothing to plot
        if len(self._df['snr_A'].dropna()) == 0:
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        bases = "ACGT"
        columns = ["snr_{}".format(base) for base in bases]

        # largest SNR over the four bases, bounded by clip_upper_SNR
        maxSNR = 0
        for column in columns:
            column_max = self._df.loc[:, column].max()
            if column_max > maxSNR:
                maxSNR = column_max
        maxSNR = clip_upper_SNR if maxSNR > clip_upper_SNR else maxSNR

        edges = pylab.linspace(0, maxSNR, bins)

        # one histogram per base sharing the same bin edges
        for base, column in zip(bases, columns):
            clipped = self._df.loc[:, column].clip(upper=maxSNR)
            pylab.hist(clipped, alpha=alpha, label=base, bins=edges)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
예제 #46
0
    def plot(self,
             bins=100,
             cmap="hot_r",
             fontsize=10,
             Nlevels=4,
             xlabel=None,
             ylabel=None,
             norm=None,
             range=None,
             normed=False,
             colorbar=True,
             contour=True,
             grid=True,
             **kargs):
        """Plot the 2D histogram of the first two columns of the dataframe

        :param int bins: binning for the 2D histogram (either a single value
            or a list of 2 binning values).
        :param cmap: a valid colormap (defaults to hot_r)
        :param fontsize: fontsize for the labels
        :param int Nlevels: must be more than 2
        :param str xlabel: set the xlabel (overwrites content of the dataframe)
        :param str ylabel: set the ylabel (overwrites content of the dataframe)
        :param norm: set to 'log' to use a logarithmic color scale
        :param normed: normalise the data
        :param range: as in pylab.hist2d : a 2x2 shape [[-3,3],[-4,4]]
        :param colorbar: add a colorbar (defaults to True)
        :param contour: show some contours (default to True)
        :param bool grid: Show underlying grid (defaults to True)
        :param kargs: not used by this method
        :return: the output of ``pylab.hist2d``

        If xlabel or ylabel is not provided, the names of the first and
        second columns of the dataframe are used.

        """
        X = self.df[self.df.columns[0]].values
        Y = self.df[self.df.columns[1]].values
        if len(X) > 10000:
            logger.info("Computing 2D histogram. Please wait")

        pylab.clf()
        # the same options (including the range) apply whatever the norm is
        hist_kwargs = dict(bins=bins, cmap=cmap, density=normed, range=range)
        if norm == 'log':
            from matplotlib import colors
            hist_kwargs["norm"] = colors.LogNorm()
        res = pylab.hist2d(X, Y, **hist_kwargs)

        if colorbar is True:
            pylab.colorbar()

        if contour:
            # bins is either a pair of values or a single value used twice
            try:
                bins1 = bins[0]
                bins2 = bins[1]
            except (TypeError, IndexError):
                bins1 = bins
                bins2 = bins

            # res is (counts, xedges, yedges, image)
            X, Y = pylab.meshgrid(res[1][0:bins1], res[2][0:bins2])
            counts = res[0]
            if counts.max() < 10 and norm == 'log':
                pylab.contour(X, Y, counts.transpose())
            else:
                # log-spaced levels up to the largest count; the two lowest
                # levels are not drawn
                levels = [
                    round(x) for x in pylab.logspace(
                        0, pylab.log10(counts.max()), Nlevels)
                ]
                pylab.contour(X, Y, counts.transpose(), levels[2:])

        if ylabel is None:
            ylabel = self.df.columns[1]
        if xlabel is None:
            xlabel = self.df.columns[0]

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)

        if grid is True:
            pylab.grid(True)

        return res
	return genome_not_covered


################################ PLOT ##############################################################################################

if do_plot:

	cmap = pylab.cm.get_cmap(colormap)
	# shuffle colors :  in case 2 adjacent contigs have the same color, user can plot again to see better
	shuffle_col = list(np.linspace(0,1,res_best.shape[0]))
	shuffle(shuffle_col)
	colors = [cmap(i) for i in shuffle_col]
	
	pylab.plot(res_best["qLength"], res_best["score_norm"],"bo",alpha=0.5)
	pylab.xlabel("Length of contig")
	pylab.ylabel("Score blasr (normalised by length)")
	pylab.title(title_plot)
	if save_plot:
		pylab.savefig(file_plot.replace(".png","_scores.png"))
	else:
		pylab.show()

	fig, axarr = pylab.subplots(2,figsize=figsize, sharex=True)
	fig.suptitle("Coverage by contigs (blasr)\n%s" % title_plot, fontsize=10)
	# plot coverage found by blasr, with score
	ax = axarr[0]
	list_contigs = plot_contigs(res_best, ax, mode="score")
	genome_not_covered = areas_not_covered(list_contigs, len_genome)
	# add grey on not covered areas
	for area in genome_not_covered: