예제 #1
0
    def plot_idr_vs_peaks(self, filename=None, savefig=False):
        """Plot the IDR cut-off versus the cumulative number of peaks.

        The peaks whose ``idr`` is below ``self.threshold`` are drawn in red;
        the other peaks are drawn in black and shifted by the number of peaks
        found below the threshold so that the two curves are contiguous.
        A horizontal dashed line is drawn at 0.05 and a vertical one at
        ``self.N_significant_peaks``.

        :param filename: name of the output image (used only if *savefig*
            is True).
        :param savefig: if True, the current figure is saved in *filename*.
        """
        # NOTE (from the original author): global_idr is actually -log10(idr)
        pylab.clf()
        X1 = pylab.linspace(0, self.threshold, 100)
        X2 = pylab.linspace(self.threshold, 1, 100)

        # Split the peaks on each side of the threshold. The second selection
        # must be the exact complement (>=) of the first one.
        df1 = self.df.query("idr<@self.threshold")
        df2 = self.df.query("idr>=@self.threshold")

        pylab.plot([sum(df1['idr'] < x) for x in X1], X1, '-', color='r', lw=2)

        # the second curve starts where the first one stops
        shift = len(df1)
        pylab.plot([shift + sum(df2['idr'] < x) for x in X2],
                   X2,
                   "-",
                   color='k',
                   lw=2)
        pylab.xlabel('Number of significant peaks')
        pylab.ylabel('IDR')
        pylab.axhline(0.05, color='b', ls='--')
        pylab.axvline(self.N_significant_peaks, color='b', ls='--')
        if savefig:
            pylab.savefig(filename)
예제 #2
0
    def plot_bar_mapq(self, fontsize=16, filename=None):
        """Plots bar plots of the MAPQ (quality) of alignments

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        :param fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved in this file.
        """
        df = self.get_mapq_as_df()
        # one bin per integer value between 0 and the largest MAPQ observed
        df.plot(kind='hist',
                bins=range(0,
                           df.max().values[0] + 1),
                legend=False,
                grid=True,
                logy=True)
        pylab.xlabel("MAPQ", fontsize=fontsize)
        pylab.ylabel("Count", fontsize=fontsize)
        try:
            # This may raise issue on MAC platforms
            pylab.tight_layout()
        except Exception:
            # Layout adjustment is cosmetic only: keep going, but do not
            # swallow KeyboardInterrupt/SystemExit as a bare except would.
            pass
        if filename:
            pylab.savefig(filename)
예제 #3
0
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Bar chart of the column sums of the flags table of the BAM

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        :param logy: the log scale is requested on the Y axis only when this
            is strictly True.
        :param fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved in this file.
        :return: the object returned by the pandas plotting call.

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        totals = self.get_flags_as_df().sum()
        pylab.clf()

        # the logy keyword is forwarded only when explicitly set to True
        plot_options = {"kind": "bar", "grid": True}
        if logy is True:
            plot_options["logy"] = logy
        barplot = totals.plot(**plot_options)

        for set_label, text in ((pylab.xlabel, "flags"),
                                (pylab.ylabel, "count")):
            set_label(text, fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
예제 #4
0
    def plot_gc_vs_coverage(self, filename=None, bins=None, Nlevels=6,
                            fontsize=20, norm="log", ymin=0, ymax=100,
                            contour=True, **kwargs):
        """Plot a 2D histogram of the GC content versus the coverage.

        :param filename: if set, the figure is saved in this file.
        :param bins: bins given to the 2D histogram. If None, they are
            derived from the coverage and GC ranges (see code below).
        :param Nlevels: number of contour levels. None or 0 disables the
            contour.
        :param fontsize: font size forwarded to the plotting call.
        :param norm: normalisation forwarded to the plotting call.
        :param ymin: lower limit of the Y axis.
        :param ymax: upper limit of the Y axis.
        :param contour: whether contour lines are requested.
        :param kwargs: any other keyword forwarded to ``Hist2D.plot``.
        """
        if Nlevels is None or Nlevels == 0:
            contour = False

        data = self.df[['cov', 'gc']].copy()
        # the Y axis is labelled in percent
        data['gc'] *= 100
        data = data.dropna()
        if bins is None:
            bins = [100, min(int(data['gc'].max() - data['gc'].min() + 1),
                             max(5, self.bed.gc_window_size - 4))]
            # between 10 bins and the maximum coverage, capped at 100
            bins[0] = max(10, min(bins[0], self.df['cov'].max()))

        from biokit import Hist2D
        h2 = Hist2D(data)

        try:
            h2.plot(bins=bins, xlabel="Per-base coverage",
                    ylabel=r'GC content (%)',
                    Nlevels=Nlevels, contour=contour, norm=norm,
                    fontsize=fontsize, **kwargs)
        except Exception:
            # Best effort: retry the same plot without the contour lines.
            # Only regular errors are intercepted (not KeyboardInterrupt).
            h2.plot(bins=bins, xlabel="Per-base coverage",
                    ylabel=r'GC content (%)',
                    Nlevels=Nlevels, contour=False, norm=norm,
                    fontsize=fontsize, **kwargs)
        pylab.ylim([ymin, ymax])
        try:
            pylab.tight_layout()
        except Exception:
            # layout adjustment is cosmetic only
            pass
        if filename:
            pylab.savefig(filename)
예제 #5
0
파일: assembly.py 프로젝트: sequana/sequana
    def scatter_plot(self, filename=None, hold=False):
        """Scatter plot of the score versus length of each ortholog

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()

        One series is drawn per status, each with its own colour and marker.

        :param filename: if set, the figure is saved in this file.
        :param hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()
        colors = ["green", "orange", "red", "blue"]
        markers = ['o', 's', 'x', 'o']
        statuses = ["Complete", "Fragmented", "Missing", "Duplicated"]
        # NOTE(review): "Missing" entries presumably have no Length/Score
        # values, in which case that series shows no point -- to be confirmed.
        for this, color, marker in zip(statuses, colors, markers):
            # select the rows of the current status (not always "Complete")
            mask = self.df.Status == this
            if sum(mask) > 0:
                # draw every status on the same axes; without ax= a
                # DataFrame plot would open a new figure for each status
                self.df[mask].plot(x="Length", y="Score", kind="scatter",
                                   color=color,
                                   ax=pylab.gca(),
                                   marker=marker, label=this)

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
예제 #6
0
    def venn(self, compa_list, direction="all", prefix=""):
        """Plot venn diagram(s) comparing lists of dr genes and save a PDF.

        The figure is saved as
        ``<self.out_dir>/vennDiagrams/<prefix>vennDiagrams_<direction>.pdf``.

        :param compa_list: list of comparison names (keys of
            ``self.dr_gene_lists``). It can also be a list of lists of
            comparisons, e.g. ``[["WT", "KO1"], ["WT", "KO2"]]``, in which
            case one diagram is drawn per inner list, stacked vertically.
        :param direction: which dr genes are considered (up/down/all).
        :param prefix: string added as prefix to the output file name.
        """
        from sequana.viz.venn import plot_venn

        # If compa_list is a list of lists of comparison
        if all(isinstance(item, list) for item in compa_list):
            # Keep the historical 6-row layout, but grow the grid (and the
            # figure height accordingly) when more than 6 groups are given
            # instead of failing with an IndexError.
            nrows = max(6, len(compa_list))
            _, axes = pylab.subplots(nrows, 1, figsize=(6, 20 * nrows / 6))

            for ax, comparisons in zip(axes.flat, compa_list):
                plot_venn(
                    [self.dr_gene_lists[x][direction] for x in comparisons],
                    list(comparisons),
                    ax=ax,
                )
        # If compa is only a list of comparisons
        else:
            plot_venn(
                [self.dr_gene_lists[x][direction] for x in compa_list],
                list(compa_list),
            )
        out_dir = os.path.join(self.out_dir, "vennDiagrams")
        os.makedirs(out_dir, exist_ok=True)
        outfile = os.path.join(out_dir, f"{prefix}vennDiagrams_{direction}.pdf")

        pylab.savefig(outfile, bbox_inches="tight")
예제 #7
0
    def scatter_plot(self, filename=None, hold=False):
        """Draw the score against the length of the orthologs, per status

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()


        Missing entries are not shown since there is no information about
        the contig.

        :param filename: if set, the figure is saved in this file.
        :param hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()

        colors = ["green", "orange", "red", "blue"]
        markers = ['o', 's', 'x', 'o']
        statuses = ["Complete", "Fragmented", "Duplicated"]
        # the extra colour/marker is simply ignored by zip
        for status, color, marker in zip(statuses, colors, markers):
            selected = self.df.Status == status
            if not sum(selected) > 0:
                continue
            subset = self.df[selected]
            subset.plot(x="Length", y="Score", kind="scatter",
                        color=color, ax=pylab.gca(),
                        marker=marker, label=status)

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
예제 #8
0
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
    local_th=5, global_th=10):
    """Scan the reads of a BAM file for windows enriched in a motif.

    For each read, the motif is counted in the window starting at each
    position. A read is a hit if more than *global_th* positions have more
    than *local_th* instances of the motif in their window.

    :param bamfile: input file given to :class:`BAM`.
    :param motif: the motif to search for.
    :param window: length of the window in which the motif is counted.
    :param savefig: if True, save one PNG per hit in the current directory.
    :param local_th: a position counts if its window holds more than this
        number of motifs.
    :param global_th: a read is a hit if more than this number of positions
        count.
    :return: a tuple *(alns, found, Ss)* of three lists with one item per
        read that has a sequence: the alignment, whether it is a hit and
        the number of positions above *local_th*.
    """
    b1 = BAM(bamfile)

    # FIND motif and create pictures
    found = []
    Ss = []
    alns = []
    for a in b1:
        if a.query_sequence is None:
            continue
        seq = a.query_sequence
        # number of motifs in the window starting at each position
        X1 = [seq[i:i+window].count(motif) for i in range(len(seq))]
        S = sum(x > local_th for x in X1)
        Ss.append(S)
        alns.append(a)
        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            pylab.clf()
            pylab.plot(range(off+a.reference_start, off+a.reference_start+len(seq)),X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)

    return alns, found, Ss
예제 #9
0
    def plot_stacked_hist(self,
                          output_filename=None,
                          dpi=200,
                          kind="barh",
                          fontsize=10,
                          edgecolor="k",
                          lw=1,
                          width=1,
                          ytick_fontsize=10):
        """Stacked bar plot of the transposed table returned by ``get_df``.

        :param output_filename: if set, the figure is saved in this file.
        :param dpi: resolution used when saving the figure.
        :param kind: kind of pandas plot (horizontal bars by default).
        :param fontsize: font size of the axis labels.
        :param edgecolor: colour of the bar edges.
        :param lw: line width of the bar edges.
        :param width: width of the bars.
        :param ytick_fontsize: font size of the Y tick labels.
        """
        df = self.get_df()
        df.T.plot(kind=kind,
                  stacked=True,
                  edgecolor=edgecolor,
                  lw=lw,
                  width=width)
        pylab.xlabel("Percentage (%)", fontsize=fontsize)
        pylab.ylabel("Sample index/name", fontsize=fontsize)
        pylab.yticks(fontsize=ytick_fontsize)
        pylab.legend(title="kingdom")
        pylab.xlim([0, 100])

        if output_filename:
            pylab.savefig(output_filename, dpi=dpi)
예제 #10
0
 def plot_go_terms_up(filename, ontologies, case_name, df):
     """Store the 'up' results of *case_name* and save the current figure.

     A copy of *df* is kept in ``self._temp_df`` and the number of '+' and
     '-' entries of its ``plus_minus`` column is recorded. The current
     figure is then saved twice (in the output directory of ``config`` and
     in *filename*) and closed. *ontologies* is not used here.
     """
     self._temp_df[case_name] = df.copy()
     for sign, counters in (('+', self._plus), ('-', self._minus)):
         counters[case_name] = sum(df.plus_minus == sign)
     for target in (f"{config.output_dir}/Panther_up_{case_name}.png",
                    filename):
         pylab.savefig(target)
     pylab.close()
예제 #11
0
    def plot_hist_normalized_coverage(self, filename=None, binwidth=0.1,
            max_z=4):
        """ Barplot of the normalized coverage with gaussian fitting

        :param filename: if set, the figure is saved in this file.
        :param binwidth: bin width given to ``self._set_bins``.
        :param max_z: upper limit of the X axis (also given as ``Xmax`` to
            the mixture plot).
        """
        pylab.clf()
        # if there are a NaN -> can't set up binning
        d = self.df["scale"][self.range[0]:self.range[1]].dropna()
        # remove outlier -> plot crash if range between min and max is too high
        d = d[np.abs(d - d.mean()) <= (4 * d.std())]
        bins = self._set_bins(d, binwidth)
        self.mixture_fitting.data = d
        try:
            self.mixture_fitting.plot(self.gaussians_params, bins=bins, Xmin=0,
                                      Xmax=max_z)
        except ZeroDivisionError:
            # deliberately ignored: the figure is kept without the fit
            pass
        pylab.grid(True)
        pylab.xlim([0,max_z])
        pylab.xlabel("Normalised per-base coverage")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout adjustment is cosmetic only; regular errors are ignored
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        if filename:
            pylab.savefig(filename)
예제 #12
0
 def plot_ranks(self, filename=None, savefig=False):
     """Plot the peak ranks of replicate 2 against those of replicate 1.

     Peaks with a score above 540 are drawn in black and the others in red;
     a dashed diagonal is added.

     :param filename: name of the output image (used only if *savefig*
         is True).
     :param savefig: if True, the current figure is saved in *filename*.
     """
     # According to the original note, the *score* column contains the
     # scaled IDR value, min(int(-125 * log2(IDR)), 1000): an IDR of 0 gives
     # a score of 1000, an IDR of 0.05 gives 540 and an IDR of 1.0 gives 0.
     significant = self.df.query('score>540')
     others = self.df.query('score<=540')

     pylab.clf()
     series = ((significant, 'ko', '<0.05 IDR'),
               (others, 'ro', '>=0.05 IDR'))
     for subset, fmt, label in series:
         pylab.plot(subset.rep1_rank, subset.rep2_rank, fmt,
                    alpha=0.5, label=label)
     pylab.xlabel("Peak rank - replicate 1")
     pylab.ylabel("Peak rank - replicate 2")

     # diagonal spanning the total number of peaks
     n_peaks = len(self.df)
     diagonal = [0, n_peaks]
     pylab.plot(diagonal, diagonal, color='blue', alpha=0.5, ls='--')
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
예제 #13
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_bar_flags(self, logy=True, fontsize=16, filename=None):
        """Show the summed flags of the BAM as a bar chart

        .. plot::
            :include-source:

            from sequana import BAM, sequana_data
            b = BAM(sequana_data('test.bam', "testing"))
            b.plot_bar_flags()

        :param logy: a log scale on the Y axis is requested only when this
            is strictly True.
        :param fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved in this file.
        :return: the object returned by the pandas plotting call.

        .. seealso:: :class:`SAMFlags` for meaning of each flag
        """
        flags = self.get_flags_as_df()
        summed = flags.sum()
        pylab.clf()

        # logy is passed on only when it is exactly True
        extra = {"logy": logy} if logy is True else {}
        barplot = summed.plot(kind='bar', grid=True, **extra)

        pylab.xlabel("flags", fontsize=fontsize)
        pylab.ylabel("count", fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
        return barplot
예제 #14
0
    def barplot_summary(self,
                        filename=None,
                        color=["green", "red"],
                        alpha=0.8):
        """Stacked bars of determined/undetermined reads for each lane.

        :param filename: if set, the figure is saved in this file (200 dpi).
        :param color: colours of the determined and undetermined bars.
        :param alpha: transparency of the bars.
        :return: the table that was plotted (lanes as index, with the
            "Determined" and "Undetermined" columns), expressed in millions
            when the smallest column total exceeds one million.
        """
        reads = self.get_data_reads()

        undetermined = reads.query("name=='Undetermined'")

        # everything else is summed per lane under a single name
        determined = reads.query("name!='Undetermined'")
        determined = determined.groupby("lane").sum().reset_index()
        determined["name"] = "Determined"

        df = pd.concat([undetermined, determined])
        df = df.pivot(index="lane", columns="name", values="count")
        df = df[["Determined", "Undetermined"]]

        # switch to millions of reads for large counts
        in_millions = df.sum().min() > 1e6
        if in_millions:
            df /= 1e6
        df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
        if in_millions:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        pylab.legend()

        if filename:
            pylab.savefig(filename, dpi=200)
        return df
예제 #15
0
 def plotter(filename, key):
     """Plot the histogram stored under *key* and save it in *filename*.

     The title is the key (spaces replaced by underscores) followed by the
     value of ``count`` in parentheses.
     """
     name = key.replace(" ", "_")
     pylab.ioff()
     histogram = histograms[key]
     histogram.plot(logy=False, lw=2, marker="o")
     suffix = "(%s)" % count
     pylab.title(name + suffix)
     pylab.grid(True)
     pylab.savefig(filename)
     # the figure must be closed, otherwise warnings are raised
     pylab.close()
예제 #16
0
파일: cutadapt.py 프로젝트: sequana/sequana
 def plotter(filename, key):
     """Save in *filename* the plot of the histogram registered as *key*.

     The figure title is made of the key, where spaces are replaced by
     underscores, and of the value of ``count`` in parentheses.
     """
     name = key.replace(" ", "_")
     pylab.ioff()
     plot_options = {"logy": False, "lw": 2, "marker": "o"}
     histograms[key].plot(**plot_options)
     title = "".join([name, "(%s)" % count])
     pylab.title(title)
     pylab.grid(True)
     pylab.savefig(filename)
     # close the figure to avoid warnings about opened figures
     pylab.close()
예제 #17
0
    def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
        """Horizontal bars of the number of reads per sample.

        The undetermined reads are drawn in red on the same axes as the
        samples. Counts are expressed in millions when the total number of
        sample reads exceeds one million.

        :param alpha: transparency of the bars.
        :param width: width of the bars.
        :param filename: if set, the figure is saved in this file (200 dpi).
        """
        df = self.get_data_reads()

        # this is ugly but will do the job for now
        under = df.query("name=='Undetermined'")
        others = df.query("name!='Undetermined'")

        under = under.groupby("name").sum().reset_index()
        others = others.groupby("name").sum().reset_index()

        under = under[["name", "count"]].set_index("name")
        others = others[["name", "count"]].set_index("name")

        all_data = others.sort_index(ascending=False)
        all_data.columns = ["samples"]

        # appended at the end
        all_data.loc['undetermined'] = 0

        # revert back
        all_data = all_data.loc[::-1]

        # just for legend
        under.columns = ['undetermined']
        if all_data.sum().min() > 1e6:
            all_data /= 1e6
            under /= 1e6
            M = True
        else:
            M = False

        all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

        under.plot(kind="barh",
                   alpha=alpha,
                   color="red",
                   ax=pylab.gca(),
                   zorder=1,
                   width=width,
                   ec='k')
        pylab.ylim([-0.5, len(all_data) + 0.5])
        # tick labels are set explicitly only for a reasonable number of rows
        if len(all_data) < 100:
            pylab.yticks(range(len(all_data)), all_data.index)

        pylab.legend()
        pylab.grid(True, zorder=-1)
        if M:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        try:
            pylab.tight_layout()
        except Exception:
            # Layout adjustment is cosmetic only; regular errors are ignored
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        if filename:
            pylab.savefig(filename, dpi=200)
예제 #18
0
 def plot_go_terms_down(filename, ontologies, case_name, df):
     """Plot the 'down' GO terms of *case_name* and save the figure.

     The *df* argument is ignored: it is replaced by the table returned by
     ``self.pe.plot_go_terms``. A copy of that table is kept in
     ``self._temp_df`` and the number of '+' and '-' entries of its
     ``plus_minus`` column is recorded. The current figure is then saved
     twice (in the output directory of ``config`` and in *filename*) and
     closed.
     """
     params = self.enrichment_params
     df = self.pe.plot_go_terms(
         "down",
         ontologies=ontologies,
         compute_levels=params['plot_compute_levels'],
         log=params['plot_logx'],
     )
     self._temp_df[case_name] = df.copy()
     for sign, counters in (('+', self._plus), ('-', self._minus)):
         counters[case_name] = sum(df.plus_minus == sign)
     for target in (f"{config.output_dir}/Panther_down_{case_name}.png",
                    filename):
         pylab.savefig(target)
     pylab.close()
예제 #19
0
    def run(self, dbname="multiple", output_prefix="kraken_final"):
        """Analyse the data with each database in turn and merge the results

        Nothing is returned. The following files are written in
        ``self.output_directory``:

        - <output_prefix>.out  (concatenation of all kraken outputs)
        - <output_prefix>.html
        - kraken.png  (pie plot)
        - kraken.json and kraken.csv

        :param dbname: name given to the JSON and CSV exports.
        :param output_prefix: prefix of the merged output and HTML files.

        .. note:: the databases are run in the order provided in the constructor.
        """
        # filled by _run_one_analysis; everything is merged at the end
        self._list_kraken_output = []
        self._list_kraken_input = []

        n_databases = len(self.databases)
        for iteration in range(n_databases):
            self._run_one_analysis(iteration)
            # Stop as soon as the latest input file is empty (presumably the
            # reads left unclassified by this database -- to be confirmed).
            if os.stat(self._list_kraken_input[-1]).st_size == 0:
                break

        prefix = self.output_directory + os.sep

        # concatenate all kraken output files
        file_output_final = prefix + "%s.out" % output_prefix
        with open(file_output_final, 'w') as outfile:
            for fname in self._list_kraken_output:
                with open(fname) as infile:
                    outfile.writelines(infile)

        # create html report
        logger.info("Analysing results")
        result = KrakenResults(file_output_final)

        # TODO: this looks similar to the code in KrakenPipeline. could be factorised
        result.to_js(prefix + "%s.html" % output_prefix)
        result.plot(kind="pie")
        pylab.savefig(prefix + "kraken.png")
        result.kraken_to_json(prefix + "kraken.json", dbname)
        result.kraken_to_csv(prefix + "kraken.csv", dbname)

        # keep a copy of the last input file before any clean up
        if self.unclassified_output:
            import shutil
            shutil.copy2(self._list_kraken_input[-1], self.unclassified_output)

        # remove kraken intermediate files (including unclassified files)
        if not self.keep_temp_files:
            for temp_files in (self._list_kraken_output,
                               self._list_kraken_input):
                for f_temp in temp_files:
                    os.remove(f_temp)
예제 #20
0
    def plot_hist_zscore(self, fontsize=16, filename=None, max_z=6,
            binwidth=0.5, **hist_kargs):
        """ Barplot of the zscore values

        :param fontsize: font size of the X label.
        :param filename: if set, the figure is saved in this file.
        :param max_z: currently not used by this method.
        :param binwidth: bin width given to ``self._set_bins``.
        :param hist_kargs: any other keyword forwarded to the pandas
            ``hist`` call.
        """
        pylab.clf()
        # select the z-scores once and use them for binning and plotting
        zscores = self.df["zscore"][self.range[0]:self.range[1]]
        bins = self._set_bins(zscores, binwidth)
        zscores.hist(grid=True, bins=bins, **hist_kargs)
        pylab.xlabel("Z-Score", fontsize=fontsize)
        try:
            pylab.tight_layout()
        except Exception:
            # Layout adjustment is cosmetic only; regular errors are ignored
            # but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass
        if filename:
            pylab.savefig(filename)
예제 #21
0
파일: kraken.py 프로젝트: sequana/sequana
    def run(self, dbname="multiple", output_prefix="kraken_final"):
        """Run the analysis on every database and merge the results

        Nothing is returned. The following files are written in
        ``self.output_directory``:

        - <output_prefix>.out  (concatenation of all kraken outputs)
        - <output_prefix>.html
        - kraken.png  (pie plot)
        - kraken.json and kraken.csv

        :param dbname: name given to the JSON and CSV exports.
        :param output_prefix: prefix of the merged output and HTML files.

        .. note:: the databases are run in the order provided in the constructor.
        """
        # filled by _run_one_analysis; everything is merged at the end
        self._list_kraken_output = []
        self._list_kraken_input = []

        # one analysis per database
        n_databases = len(self.databases)
        for iteration in range(n_databases):
            self._run_one_analysis(iteration)

        prefix = self.output_directory + os.sep

        # merge all the kraken outputs into a single file
        file_output_final = prefix + "%s.out" % output_prefix
        with open(file_output_final, 'w') as merged:
            for fname in self._list_kraken_output:
                with open(fname) as partial:
                    merged.writelines(partial)

        # create html report
        logger.info("Analysing results")
        result = KrakenResults(file_output_final)

        # TODO: this looks similar to the code in KrakenPipeline. could be factorised
        result.to_js(prefix + "%s.html" % output_prefix)
        result.plot(kind="pie")
        pylab.savefig(prefix + "kraken.png")
        result.kraken_to_json(prefix + "kraken.json", dbname)
        result.kraken_to_csv(prefix + "kraken.csv", dbname)

        # remove kraken intermediate files (including unclassified files)
        if self.keep_temp_files:
            return
        for temp_files in (self._list_kraken_output, self._list_kraken_input):
            for f_temp in temp_files:
                os.remove(f_temp)
예제 #22
0
파일: bamtools.py 프로젝트: sequana/sequana
    def plot_bar_mapq(self, fontsize=16, filename=None, ):
        """Histogram (log scale on Y) of the MAPQ (quality) of alignments

            .. plot::
                :include-source:

                from sequana import BAM, sequana_data
                b = BAM(sequana_data('test.bam', "testing"))
                b.plot_bar_mapq()

        :param fontsize: font size of the axis labels.
        :param filename: if set, the figure is saved in this file.
        """
        mapq = self.get_mapq_as_df()
        # one bin per integer value between 0 and the largest MAPQ observed
        upper = mapq.max().values[0] + 1
        mapq.plot(kind='hist',
                  bins=range(0, upper),
                  legend=False,
                  grid=True,
                  logy=True)
        for set_label, text in ((pylab.xlabel, "MAPQ"),
                                (pylab.ylabel, "Count")):
            set_label(text, fontsize=fontsize)
        pylab.tight_layout()
        if filename:
            pylab.savefig(filename)
예제 #23
0
    def pie_plot(self, filename=None, hold=False):
        """Pie chart of the number of entries per status

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.pie_plot()

        :param filename: if set, the figure is saved in this file.
        :param hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()
        # number of Busco identifiers found for each status
        per_status = self.df.groupby('Status').count()
        per_status['# Busco id'].plot(kind="pie")
        # no label on the Y axis
        pylab.ylabel("")
        if filename:
            pylab.savefig(filename)
예제 #24
0
파일: assembly.py 프로젝트: sequana/sequana
    def pie_plot(self, filename=None, hold=False):
        """Show the number of entries of each status as a pie chart

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.pie_plot()

        :param filename: if set, the figure is saved in this file.
        :param hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()
        # count the Busco identifiers of each status
        counts = self.df.groupby('Status').count()
        busco_ids = counts['# Busco id']
        busco_ids.plot(kind="pie")
        # the Y label is emptied
        pylab.ylabel("")
        if filename:
            pylab.savefig(filename)
예제 #25
0
    def plot_hist_coverage(self, logx=True, logy=True, fontsize=16, N=20,
        fignum=1, hold=False, alpha=0.5, filename=None, **kw_hist):
        """Plot the histogram of the coverage.

        :param logx: if True, log-spaced bins and a log scale are used on
            the X axis.
        :param logy: if True, a log scale is used on the Y axis.
        :param fontsize: font size of the axis labels.
        :param N: number of bins (or of bin edges when *logx* is True).
        :param fignum: figure number used when *hold* is False.
        :param hold: if False, the figure *fignum* is selected and cleared.
        :param alpha: transparency of the bars.
        :param filename: if set, the figure is saved in this file.
        :param kw_hist: any other keyword forwarded to ``pylab.hist``.
        """
        if hold is False:
            pylab.figure(fignum)
            pylab.clf()
        ax = pylab.gca()
        ax.set_facecolor('#eeeeee')

        data = self.df['cov'].dropna().values

        maxcov = data.max()
        if logx is True and logy is True:
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=bins, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.semilogx()
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is False and logy is True:
            pylab.hist(data, bins=N, log=True, label=self.chrom_name,
                alpha=alpha, **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count (log scale)", fontsize=fontsize)
        elif logx is True and logy is False:
            # use the log-spaced bins (as in the log/log case) so that the
            # bars have a constant width on the log X axis
            bins = pylab.logspace(0, pylab.log10(maxcov), N)
            pylab.hist(data, bins=bins, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage (log scale)", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
            pylab.semilogx()
        else:
            pylab.hist(data, bins=N, label=self.chrom_name, alpha=alpha,
                **kw_hist)
            pylab.xlabel("Coverage", fontsize=fontsize)
            pylab.ylabel("Count", fontsize=fontsize)
        pylab.grid(True)
        if filename:
            pylab.savefig(filename)
예제 #26
0
파일: rnadiff.py 프로젝트: sequana/sequana
    def run_enrichment_kegg(self,
                            organism,
                            annot_col="Name",
                            out_dir="enrichment"):  # pragma: no cover
        """Run the KEGG pathway enrichment for each comparison.

        For each comparison and each direction (up, down, all), a scatter
        plot is saved as PDF and PNG in ``<out_dir>/figures``. The merged
        results are stored in ``self.enrichment_kegg`` and exported to
        ``<out_dir>/enrichment_kegg.xlsx``.

        :param organism: organism given to :class:`KeggPathwayEnrichment`.
        :param annot_col: annotation column given to ``get_gene_lists``.
        :param out_dir: output directory (created if needed).
        """
        out_dir = Path(out_dir) / "figures"
        out_dir.mkdir(exist_ok=True, parents=True)

        gene_lists_dict = self.get_gene_lists(annot_col=annot_col, dropna=True)
        enrichment = {}

        for compa in self.comparisons:
            gene_lists = gene_lists_dict[compa]
            ke = KeggPathwayEnrichment(gene_lists, organism, progress=False)
            ke.compute_enrichment()

            for direction in ["up", "down", "all"]:
                enrichment[(compa, direction)] = ke._get_final_df(
                    ke.enrichment[direction].results, nmax=10000)
                fig = pylab.figure()
                ke.scatterplot(direction)
                pylab.tight_layout()
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.pdf")
                pylab.savefig(out_dir / f"kegg_{compa}_{direction}.png")
                # Release the figures once saved: three figures are created
                # per comparison and would otherwise stay open.
                pylab.close()  # the figure that was just saved
                pylab.close(fig)  # no-op if it was the same figure

            logger.info(f"KEGG enrichment for {compa} DONE.")

        df = pd.concat(enrichment).sort_index()
        df.index.rename(["comparison", "direction", "index"], inplace=True)

        self.enrichment_kegg = df

        # Export results (should be moved to enrichment.py at some point I think)
        with pd.ExcelWriter(out_dir.parent / "enrichment_kegg.xlsx") as writer:
            df = self.enrichment_kegg.copy()
            df.reset_index(inplace=True)
            # sheet_name is given by keyword, as positional use is deprecated
            df.to_excel(writer, sheet_name="kegg", index=False)
            ws = writer.sheets["kegg"]
            try:
                # NOTE(review): autofilter presumably depends on the Excel
                # engine in use, hence the best-effort call -- to be confirmed
                ws.autofilter(0, 0, df.shape[0], df.shape[1] - 1)
            except Exception:
                logger.warning("Fixme")
예제 #27
0
    def find_motif(self, motif, window=200, figure=False, savefig=False):
        """Count the windows enriched in *motif* in each read of the BAM file.

        For each read with a sequence, the motif is counted in the window
        starting at each position; the number of positions reaching
        ``self.local_threshold`` is the *hit* value of the read. The number
        of reads with more than 5 hits is printed.

        :param motif: the motif to search for.
        :param window: length of the window in which the motif is counted.
        :param figure: if True, the counts of the reads whose hit value
            reaches ``self.global_threshold`` are plotted.
        :param savefig: if True (and *figure* is True), one PNG is saved
            for each of those reads.
        :return: a dataframe with the columns query_name, hit, length,
            start and end (one row per read with a sequence).
        """
        columns = ("query_name", "hit", "length", "start", "end")
        records = {column: [] for column in columns}

        for aln in BAM(self.bamfile):
            seq = aln.query_sequence
            if seq is None:
                continue

            # number of motifs in the window starting at each position
            counts = [seq[pos:pos + window].count(motif)
                      for pos in range(len(seq))]
            hits = sum([value >= self.local_threshold for value in counts])

            records['query_name'].append(aln.query_name)
            records['start'].append(aln.reference_start)
            records['end'].append(aln.reference_end)
            records['length'].append(aln.rlen)
            records['hit'].append(hits)

            if not (hits >= self.global_threshold):
                continue
            offset = aln.query_alignment_start
            if not figure:
                continue
            first = offset + aln.reference_start
            pylab.plot(range(first, first + len(seq)), counts)
            if savefig:
                read_name = aln.query_name.replace("/", "_")
                pylab.savefig("{}_{}_{}.png".format(
                    aln.reference_name, hits, read_name))

        df = pd.DataFrame(records)
        n_hits = len(df.query("hit>5"))
        print(n_hits)
        return df
예제 #28
0
 def plot_rank_vs_idr_score(self, filename=None, savefig=False):
     """Plot the local IDR of the peaks sorted by rank, for each replicate.

     Two panels are drawn (one per replicate), each with a vertical dashed
     line at the total number of peaks minus ``self.N_significant_peaks``.

     :param filename: name of the output image (used only if *savefig*
         is True).
     :param savefig: if True, the current figure is saved in *filename*.
     """
     _, axes = pylab.subplots(2, 1)
     df = self.df
     panels = (
         (axes[0], 'rep1_rank', 'o', "log10 IDR for replicate 1"),
         (axes[1], 'rep2_rank', 'ro', "log10 IDR for replicate 2"),
     )
     for ax, rank_column, fmt, ylabel in panels:
         # peaks sorted by decreasing rank in the current replicate
         ordered = df.sort_values(by=rank_column, ascending=False)
         ax.plot(range(len(df)), ordered['local_idr'], fmt)
         ax.set_ylabel(ylabel)
         ax.axvline(len(self.df) - self.N_significant_peaks,
                    color='b',
                    ls='--')
     if savefig:
         pylab.savefig(filename)
예제 #29
0
 def plot_scores(self, filename=None, savefig=False):
     """Plot the log10 signal of replicate 2 against that of replicate 1.

     Peaks with a score above 540 are drawn in black and labelled
     '<0.05 IDR'; all the other peaks (score of 540 or less) are drawn in
     red and labelled '>=0.05 IDR'. A dashed diagonal is added.

     :param filename: name of the output image (used only if *savefig*
         is True).
     :param savefig: if True, the current figure is saved in *filename*.
     """
     from pylab import log10
     pylab.clf()
     # The two selections must be complementary: with a strict '<' the
     # peaks whose score is exactly 540 would not be drawn at all.
     significant = self.df.query('score>540')
     others = self.df.query('score<=540')
     pylab.plot(log10(significant['rep1_signal']),
                log10(significant['rep2_signal']),
                'ko',
                alpha=0.5,
                label='<0.05 IDR')
     pylab.plot(log10(others['rep1_signal']),
                log10(others['rep2_signal']),
                'ro',
                alpha=0.5,
                label='>=0.05 IDR')
     # diagonal from the origin up to the current upper Y limit
     N = pylab.ylim()[1]
     pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
     pylab.xlabel("Rep1 log10 score")
     pylab.ylabel("Rep2 log10 score")
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
예제 #30
0
    def barplot(self, filename="lane{}_status.png", lanes=None):
        """Save one bar plot of the number of reads for each lane.

        For a given lane, one blue bar is drawn per determined entry,
        followed by one bar for the undetermined reads. The colour of the
        latter depends on the ratio undetermined/determined: green below
        20%, orange below 50% and red otherwise (or when there are no
        determined reads).

        :param filename: pattern of the output files, formatted with the
            lane.
        :param lanes: the lanes to plot. If None, all the lanes found in
            the data are used.
        """
        df = self.get_data_reads()
        if lanes is None:
            lanes = df.lane.unique()

        for lane in lanes:
            pylab.clf()
            query = "lane==@lane and name!='Undetermined'"
            counts = df.query(query)['count']
            total = counts.sum()
            L = len(counts)

            query = "lane==@lane and name=='Undetermined'"
            under = df.query(query)['count'].sum()
            if total > 0:
                pylab.bar(range(L), counts, color="b", label="reads")

            if total == 0:
                color = "red"
            else:
                percent = 100 * under / total
                if percent < 20:
                    color = "green"
                elif percent < 50:
                    color = "orange"
                else:
                    color = "red"

            # the undetermined bar is placed after the determined ones
            pylab.bar(range(L, L + 1),
                      under,
                      color=color,
                      label="undetermined")
            pylab.xticks([])
            pylab.ylabel("Number of reads")
            try:
                pylab.legend(loc="lower left")
            except Exception:
                # The legend is optional; regular errors are ignored but
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
            pylab.title("Lane {}".format(lane))
            pylab.savefig(filename.format(lane), dpi=200)
예제 #31
0
파일: kraken.py 프로젝트: brwnj/sequana
    def run(self,
            output_filename_classified=None,
            output_filename_unclassified=None,
            only_classified_output=False):
        """Run Kraken, then export the results and build the Krona page

        The files kraken.out, kraken.png, kraken.json, kraken.csv and
        kraken.html are written in ``self.output_directory``.

        :param output_filename_classified: forwarded to the Kraken analysis.
        :param output_filename_unclassified: forwarded to the Kraken
            analysis.
        :param only_classified_output: forwarded to the Kraken analysis.

        .. todo:: reuse the KrakenResults code to simplify this method.

        """
        prefix = self.output_directory + os.sep

        # Run Kraken (KrakenAnalysis)
        kraken_results = prefix + "kraken.out"
        self.ka.run(output_filename=kraken_results,
                    output_filename_unclassified=output_filename_unclassified,
                    output_filename_classified=output_filename_classified,
                    only_classified_output=only_classified_output)

        # Load the kraken output and save the pie chart, JSON and CSV
        self.kr = KrakenResults(kraken_results)
        self.kr.plot(kind="pie")
        pylab.savefig(prefix + "kraken.png")
        self.kr.kraken_to_json(prefix + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(prefix + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        kraken_html = prefix + "kraken.html"
        summary = prefix + "kraken.out.summary"
        status = self.kr.kraken_to_krona(output_filename=summary)
        if status is True:
            command = "ktImportText %s -o %s" % (summary, kraken_html)
        else:
            # no summary available: an empty HTML file is created instead
            command = "touch {}".format(kraken_html)
        shell(command)
예제 #32
0
파일: kraken.py 프로젝트: sequana/sequana
    def run(self, output_filename_classified=None,
                output_filename_unclassified=None,
                only_classified_output=False):
        """Run the analysis using Kraken and create the Krona output

        The three arguments are passed through as-is to ``self.ka.run``,
        which writes the raw Kraken output to ``kraken.out`` in
        ``self.output_directory``. That file is then parsed into ``self.kr``
        and the following files are written in the same directory:
        ``kraken.png``, ``kraken.json``, ``kraken.csv``,
        ``kraken.out.summary`` and ``kraken.html``.

        :param output_filename_classified: passed to ``self.ka.run``.
        :param output_filename_unclassified: passed to ``self.ka.run``.
        :param only_classified_output: passed to ``self.ka.run``.

        .. todo:: reuse the KrakenResults code to simplify this method.

        """
        from shlex import quote

        outdir = self.output_directory + os.sep
        kraken_results = outdir + "kraken.out"
        krona_summary = outdir + "kraken.out.summary"
        kraken_html = outdir + "kraken.html"

        # Run Kraken (KrakenAnalysis)
        self.ka.run(
            output_filename=kraken_results,
            output_filename_unclassified=output_filename_unclassified,
            output_filename_classified=output_filename_classified,
            only_classified_output=only_classified_output
        )

        # Translate kraken output to a format understood by Krona and save png
        # image (the object returned by plot() itself is not used).
        self.kr = KrakenResults(kraken_results)
        self.kr.plot(kind="pie")
        pylab.savefig(outdir + "kraken.png")

        self.kr.kraken_to_json(outdir + "kraken.json", self.dbname)
        self.kr.kraken_to_csv(outdir + "kraken.csv", self.dbname)

        # Transform to Krona HTML
        from snakemake import shell
        status = self.kr.kraken_to_krona(output_filename=krona_summary)
        if status is True:
            # Quote both paths: an output directory with spaces or shell
            # metacharacters would otherwise corrupt the command.
            command = "ktImportText %s -o %s" % (quote(krona_summary),
                                                 quote(kraken_html))
        else:
            # Nothing to convert; leave an empty HTML file in place instead.
            command = "touch {}".format(quote(kraken_html))
        shell(command)
예제 #33
0
    def find_motif_bam(self, filename, motif, window=200, figure=False, savefig=False,
            local_threshold=None, global_threshold=None):
        """Scan the reads of a BAM file for a motif and tabulate the scores.

        Each alignment that carries a query sequence is passed to
        :meth:`find_motif_from_sequence`; the second value it returns is
        recorded as the read's ``hit``.

        :param filename: BAM file, opened with ``sequana.BAM``.
        :param motif: motif forwarded to :meth:`find_motif_from_sequence`.
        :param window: forwarded to :meth:`find_motif_from_sequence`.
        :param figure: if True, plot the per-position values of every read
            whose hit reaches the global threshold.
        :param savefig: if True (and ``figure`` is True), save the figure to
            ``<reference_name>_<hit>_<query_name>.png`` after each such read.
        :param local_threshold: forwarded to
            :meth:`find_motif_from_sequence`.
        :param global_threshold: minimal hit required for a read to be
            plotted. Defaults to ``self.global_threshold`` when None.
        :return: a pandas DataFrame with one row per scanned read and the
            columns ``query_name``, ``hit``, ``length``, ``start``, ``end``.
        """
        from sequana import BAM

        # The argument used to be ignored in favour of the instance
        # attribute; honour it when provided and fall back otherwise.
        if global_threshold is None:
            global_threshold = self.global_threshold

        b1 = BAM(filename)
        df = {
            "query_name": [],
            "hit": [],
            "length": [],
            "start": [],
            "end": []
        }

        for a in b1:
            # Alignments stored without a sequence cannot be scanned.
            if a.query_sequence is None:
                continue
            seq = a.query_sequence

            X1, S = self.find_motif_from_sequence(seq, motif, window=window,
                local_threshold=local_threshold)

            df['query_name'].append(a.query_name)
            df['start'].append(a.reference_start)
            df['end'].append(a.reference_end)
            df['length'].append(a.rlen)
            df['hit'].append(S)

            if figure and S >= global_threshold:
                # The figure is not cleared between reads, so the curves of
                # successive reads accumulate on the same axes.
                first = a.query_alignment_start + a.reference_start
                pylab.plot(range(first, first + len(seq)), X1)
                if savefig:
                    pylab.savefig("{}_{}_{}.png".format(a.reference_name, S, a.query_name.replace("/", "_")))

        df = pd.DataFrame(df)
        # Report on stdout how many reads have a hit above 5; this cutoff is
        # hard-coded and independent of the thresholds above.
        L = len(df.query("hit>5"))
        print(L)
        return df
예제 #34
0
 def plot_scatter_up(filename):
     """Call ``ke.scatterplot('up')`` and save the current figure.

     ``ke`` is a free variable taken from the enclosing scope.
     NOTE(review): 'up' presumably selects up-regulated entries -- confirm.

     :param filename: output path handed to ``pylab.savefig``.
     """
     ke.scatterplot('up')
     pylab.savefig(filename)
예제 #35
0
 def plot_barplot_up(filename):
     """Call ``ke.barplot('up')`` and save the current figure.

     ``ke`` is a free variable taken from the enclosing scope.
     NOTE(review): 'up' presumably selects up-regulated entries -- confirm.

     :param filename: output path handed to ``pylab.savefig``.
     """
     ke.barplot('up')
     pylab.savefig(filename)
예제 #36
0
 def plot_scatter_down(filename):
     """Call ``ke.scatterplot('down')`` and save the current figure.

     ``ke`` is a free variable taken from the enclosing scope.
     NOTE(review): 'down' presumably selects down-regulated entries -- confirm.

     :param filename: output path handed to ``pylab.savefig``.
     """
     ke.scatterplot('down')
     pylab.savefig(filename)
예제 #37
0
 def plot_barplot_down(filename):
     """Call ``ke.barplot('down')`` and save the current figure.

     ``ke`` is a free variable taken from the enclosing scope.
     NOTE(review): 'down' presumably selects down-regulated entries -- confirm.

     :param filename: output path handed to ``pylab.savefig``.
     """
     ke.barplot('down')
     pylab.savefig(filename)