Exemplo n.º 1
0
 def plot_ranks(self, filename=None, savefig=False):
     """Plot the peak ranks of replicate 1 against those of replicate 2.

     Rows of ``self.df`` are split on the ``score`` column: rows with a
     score above 540 are drawn in black (``<0.05 IDR``), all others in
     red (``>=0.05 IDR``). A dashed blue diagonal from 0 to the number of
     rows is added as a reference.

     :param filename: target handed to ``pylab.savefig``; only used when
         *savefig* is true.
     :param bool savefig: save the current figure into *filename*.
     """
     # The score column holds the scaled IDR value,
     # min(int(-125 * log2(IDR)), 1000): an IDR of 0 gives 1000, an IDR of
     # 0.05 gives int(-125 * log2(0.05)) = 540 and an IDR of 1.0 gives 0,
     # hence the 540 cut-off below.
     strong = self.df.query('score>540')
     weak = self.df.query('score<=540')

     pylab.clf()
     groups = (
         (strong, 'ko', '<0.05 IDR'),
         (weak, 'ro', '>=0.05 IDR'),
     )
     for subset, fmt, legend_label in groups:
         pylab.plot(subset.rep1_rank,
                    subset.rep2_rank,
                    fmt,
                    alpha=0.5,
                    label=legend_label)

     pylab.xlabel("Peak rank - replicate 1")
     pylab.ylabel("Peak rank - replicate 2")

     # reference diagonal spanning the total number of peaks
     n_peaks = len(self.df)
     diagonal = [0, n_peaks]
     pylab.plot(diagonal, diagonal, color='blue', alpha=0.5, ls='--')

     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
Exemplo n.º 2
0
    def plot_unknown_barcodes(self, N=20):
        """Horizontal bar plot of the most frequent unknown barcodes.

        A DataFrame is built from ``self.data['UnknownBarcodes']`` with one
        column per lane (``Lane`` entry) filled from the ``Barcodes`` entry.
        The *N* rows with the largest total across lanes are plotted, the
        largest one at the top.

        :param int N: maximum number of barcodes to show.
        :return: the DataFrame that was plotted (columns renamed
            ``Lane <lane>``, rows in plotting order).
        """
        from matplotlib import rcParams

        ub = self.data['UnknownBarcodes']
        df = pd.DataFrame({x['Lane']: x['Barcodes'] for x in ub})

        # if data is made of undetermined only, the dataframe is just made of
        # N lanes with one entry (unknown): add a zero-filled 'known' row
        if "unknown" in df.index and len(df) == 1:
            df.loc['known'] = [0 for i in df.columns]

        S = df.sum(axis=1).sort_values(ascending=False).index[0:N]
        data = df.loc[S][::-1]
        data.columns = ["Lane {}".format(x) for x in data.columns]

        # Draw the grid below the bars for this figure only. The previous
        # global setting is restored afterwards instead of being forced to
        # False, so that other figures are not affected.
        previous_axisbelow = rcParams['axes.axisbelow']
        rcParams['axes.axisbelow'] = True
        try:
            pylab.figure(figsize=(10, 8))
            ax = pylab.gca()
            data.plot(kind="barh", width=1, ec="k", ax=ax)
        finally:
            rcParams['axes.axisbelow'] = previous_axisbelow

        pylab.xlabel("Number of reads", fontsize=12)
        pylab.ylabel("")
        pylab.grid(True)
        # Use the real lane names: renumbering the lanes 1..n mislabels the
        # bars whenever the lane identifiers are not exactly 1..n.
        pylab.legend(list(data.columns), loc="lower right")
        try:
            pylab.tight_layout()
        except Exception as err:
            # tight_layout is cosmetic; report the problem and carry on
            print(err)
        return data
Exemplo n.º 3
0
    def hist_ORF_CDS_linearscale(self,
                                 alpha=0.5,
                                 bins=40,
                                 xlabel="Length",
                                 ylabel="#"):
        """Overlay the histograms of ORF and CDS lengths (linear scale).

        ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
        still None. Missing values of the ``len_ORF`` and ``len_CDS``
        columns are dropped, and the number of remaining values is
        reported in each legend entry.

        :param float alpha: transparency of the histograms.
        :param bins: binning passed to ``pylab.hist``.
        :param str xlabel: label of the x-axis.
        :param str ylabel: label of the y-axis.
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        orf_lengths = self._ORF_pos["len_ORF"].dropna()
        cds_lengths = self._ORF_pos["len_CDS"].dropna()

        # one histogram per category, same binning for both
        for lengths, prefix in ((orf_lengths, "ORF, N = "),
                                (cds_lengths, "CDS, N = ")):
            pylab.hist(lengths,
                       alpha=alpha,
                       label=prefix + str(lengths.shape[0]),
                       bins=bins)

        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        filter_info = (self._type_filter, self._threshold)
        pylab.title("Length of ORF and CDS (after filter %s > %d)" % filter_info)
Exemplo n.º 4
0
    def plot_dispersion(self):
        """Plot three dispersion columns of ``self.dds_stats`` against ``baseMean``.

        ``dispGeneEst`` (black, "Estimate"), ``dispersion`` (blue, "final")
        and ``dispFit`` (red, "Fit") are drawn as small dots on log-log
        axes; titles and axis labels are delegated to
        ``self._format_plot``.
        """
        # (column, matplotlib format, legend label), in drawing order
        series = (
            ("dispGeneEst", "ok", "Estimate"),
            ("dispersion", "ob", "final"),
            ("dispFit", "or", "Fit"),
        )
        for column, fmt, legend_label in series:
            pylab.plot(self.dds_stats.baseMean,
                       getattr(self.dds_stats, column),
                       fmt,
                       label=legend_label,
                       ms=1)
        pylab.legend()

        axes = pylab.gca()
        axes.set_yscale("log")
        axes.set_xscale("log")

        self._format_plot(title="Dispersion estimation",
                          xlabel="Mean of normalized counts",
                          ylabel="Dispersion")
Exemplo n.º 5
0
    def plot_alignment(self, bamfile, motif, window=200,
            global_th=10,title=None,legend=True, legend_fontsize=11,
            valid_rnames=[],
            valid_flags=[]):
        """Plot the motif density along each alignment of a BAM file.

        For every alignment that passes the optional filters and has a
        query sequence, the number of occurrences of *motif* in the
        *window*-long slice starting at each position of the read is
        plotted against the reference coordinates
        (``reference_start`` onwards).

        :param bamfile: input handed to ``BAM``.
        :param str motif: motif counted with ``str.count``.
        :param int window: length of the sliding window.
        :param global_th: not used by this method.
        :param str title: optional title of the figure.
        :param bool legend: show the legend (only if fewer than 15 reads
            were plotted).
        :param int legend_fontsize: font size of the legend.
        :param list valid_rnames: if not empty, keep only alignments whose
            ``rname`` is in this list.
        :param list valid_flags: if not empty, keep only alignments whose
            ``flag`` is in this list.
        """
        bam = BAM(bamfile)
        print("Found {} hits".format(len(bam)))
        pylab.clf()

        count = 0
        for aln in bam:
            # empty filter lists mean "accept everything"
            if valid_rnames and aln.rname not in valid_rnames:
                continue
            if valid_flags and aln.flag not in valid_flags:
                continue

            seq = aln.query_sequence
            if not seq:
                continue

            count += 1
            density = [seq[start:start + window].count(motif)
                       for start in range(len(seq))]
            positions = range(aln.reference_start,
                              aln.reference_start + len(seq))
            pylab.plot(positions, density, label=aln.query_name)

        print("Showing {} entries after filtering".format(count))

        # y-axis limit: 1.2 times the window length divided by the motif length
        max_theo = int(1.2*window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count<15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)
Exemplo n.º 6
0
    def plot_specific_alignment(self, bamfile, query_name, motif,clf=True,
            show_figure=True, authorized_flags=[0,16],
            windows=[10, 50, 100, 150,200, 250,500, 1000], local_threshold=5):
        """Count a motif along one read using several sliding windows.

        The first alignment of *bamfile* named *query_name* whose flag is
        in *authorized_flags* is selected. For each window length, the
        motif is counted in the slice starting at every position of the
        read.

        :param bamfile: input handed to ``BAM``.
        :param str query_name: name of the read to look for.
        :param str motif: motif counted with ``str.count``.
        :param bool clf: clear the current figure first.
        :param bool show_figure: plot one curve per window.
        :param list authorized_flags: accepted alignment flags.
        :param list windows: window lengths to try.
        :param int local_threshold: a position is counted when its motif
            count is strictly above this value.
        :return: one value per window: the number of positions above
            *local_threshold* minus the window length. The list is empty
            when the read is not found.
        """
        # several entries may share the same name: keep the first match only
        candidates = (aln for aln in BAM(bamfile)
                      if aln.query_name == query_name
                      and aln.flag in authorized_flags)
        found = next(candidates, None)

        sizes = []
        if not found:
            print("{} Not found in {} file".format(query_name, bamfile))
            return sizes

        seq = found.query_sequence
        if clf:
            pylab.clf()

        for window in windows:
            density = [seq[start:start + window].count(motif)
                       for start in range(len(seq))]
            if show_figure:
                pylab.plot(density, label=window)
            n_above = sum(value > local_threshold for value in density)
            sizes.append(n_above - window)

        if show_figure:
            pylab.legend()
            pylab.ylabel("# {} in a given sliding window".format(motif))
            pylab.title(query_name)

        return sizes
Exemplo n.º 7
0
    def plot_stacked_hist(self,
                          output_filename=None,
                          dpi=200,
                          kind="barh",
                          fontsize=10,
                          edgecolor="k",
                          lw=1,
                          width=1,
                          ytick_fontsize=10):
        """Stacked bar plot of the transposed ``self.get_df()`` table.

        The x-axis is labelled as a percentage and limited to [0, 100];
        the legend is titled "kingdom".

        :param str output_filename: if set, the figure is saved there.
        :param int dpi: resolution used when saving.
        :param str kind: pandas plot kind (bar orientation).
        :param int fontsize: font size of the axis labels.
        :param str edgecolor: edge colour of the bars.
        :param lw: line width of the bar edges.
        :param width: width of the bars.
        :param int ytick_fontsize: font size of the y tick labels.
        """
        df = self.get_df()
        df.T.plot(kind=kind,
                  stacked=True,
                  edgecolor=edgecolor,
                  lw=lw,
                  width=width)
        pylab.xlabel("Percentage (%)", fontsize=fontsize)
        pylab.ylabel("Sample index/name", fontsize=fontsize)
        pylab.yticks(fontsize=ytick_fontsize)
        pylab.legend(title="kingdom")
        pylab.xlim([0, 100])

        if output_filename:
            pylab.savefig(output_filename, dpi=dpi)
Exemplo n.º 8
0
    def scatter_plot(self, filename=None, hold=False):
        """Scatter plot of the score versus length of each ortholog

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()

        Each status (Complete, Fragmented, Missing, Duplicated) is drawn
        with its own colour and marker on the same axes. Rows without a
        Length or Score value cannot be placed on the plot and are
        ignored; a status with no plottable row is skipped.

        :param str filename: if set, the figure is saved there.
        :param bool hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()

        # (status, colour, marker)
        styles = (
            ("Complete", "green", "o"),
            ("Fragmented", "orange", "s"),
            ("Missing", "red", "x"),
            ("Duplicated", "blue", "o"),
        )
        # all categories share the current axes; without an explicit ax,
        # pandas opens a new figure for each call
        ax = pylab.gca()
        for status, color, marker in styles:
            # select the rows of THIS status (not always "Complete")
            subset = self.df[self.df.Status == status]
            subset = subset.dropna(subset=["Length", "Score"])
            if subset.empty:
                continue
            subset.plot(x="Length", y="Score", kind="scatter",
                        color=color, marker=marker, label=status, ax=ax)

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
Exemplo n.º 9
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """Stacked histogram of the average quality of HQ and LQ reads.

        The quality of a read is the mean of the character codes of its
        decoded ``quality`` entry, each shifted by 33. A black curve on a
        twin axis shows the number of reads remaining above each bin.

        :param int fontsize: font size of the legend.
        :param bins: binning for ``np.histogram``; defaults to
            ``range(0, 94)``.
        """
        def mean_qv(read):
            return pylab.mean([ord(X) - 33 for X in read['quality'].decode()])

        hq_qv = [mean_qv(read) for read in self.hq_sequence]
        lq_qv = [mean_qv(read) for read in self.lq_sequence]

        if bins is None:
            bins = range(0, 94)

        hq_counts, edges = np.histogram(hq_qv, bins=bins)
        lq_counts, edges = np.histogram(lq_qv, bins=bins)

        # LQ bars are stacked on top of the HQ bars
        pylab.bar(edges[1:], hq_counts, width=1, label="HQ")
        pylab.bar(edges[1:], lq_counts, bottom=hq_counts, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        # survival-like curve: total count minus the cumulated counts
        twin = pylab.twinx()
        combined = hq_counts + lq_counts
        total = np.sum(combined)
        remaining = [total] + list(total - np.cumsum(combined))
        twin.plot(edges, remaining, "k")
Exemplo n.º 10
0
    def plot_polymerase_per_barcode(self, fontsize=12, unbarcoded=True):
        """Number Of Polymerase Reads Per Barcode

        The "Polymerase Reads" column of ``self.df_barcoded`` is plotted in
        decreasing order against the barcode rank, together with its mean
        (red line). The title reports the sum of the column.

        :param int fontsize: font size of the axis labels.
        :param bool unbarcoded: if True, also draw a dashed black line at
            the first "Polymerase Reads" value of ``self.df_not_barcoded``;
            the line is silently omitted if that value cannot be read.
        """
        PR = self.df_barcoded["Polymerase Reads"].sum()
        data = self.df_barcoded['Polymerase Reads'].sort_values(
            ascending=False).values
        pylab.plot(range(1, len(data) + 1), data, label="barcodes")
        pylab.axhline(data.mean(), color="r", label="average")

        if unbarcoded is True:
            # best effort: the unbarcoded information may be missing.
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not swallowed.
            try:
                unbar = self.df_not_barcoded['Polymerase Reads'].iloc[0]
                pylab.axhline(unbar, color="k", ls="--", label="not barcoded")
            except Exception:
                pass

        pylab.xlabel("Barcode Rank Order", fontsize=fontsize)
        pylab.ylabel("Counts of Reads", fontsize=fontsize)
        pylab.title("Total Polymerase count: {}".format(PR))
        pylab.legend()
        pylab.ylim(ymin=0)
        try:
            pylab.tight_layout()
        except Exception:
            # tight_layout is purely cosmetic; ignore layout failures
            pass
Exemplo n.º 11
0
    def barplot_count_ORF_CDS_by_frame(self,
                                       alpha=0.5,
                                       bins=40,
                                       xlabel="Frame",
                                       ylabel="#",
                                       bar_width=0.35):
        """Side-by-side bar plot of the number of ORF and CDS per frame.

        ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
        still None. For each frame in -3..-1 and 1..3, the non-missing
        ``len_ORF`` and ``len_CDS`` values are counted.

        :param float alpha: transparency of the bars.
        :param bins: not used by this method.
        :param str xlabel: label of the x-axis.
        :param str ylabel: label of the y-axis.
        :param float bar_width: width of each bar; ORF and CDS bars are
            shifted by half of it on each side of the frame value.
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]

        def count_by_frame(column):
            # number of non-missing values of *column* for each frame
            return [
                self._ORF_pos[self._ORF_pos["frame"] == fr][column]
                .dropna().shape[0]
                for fr in frames
            ]

        orf_counts = count_by_frame("len_ORF")
        cds_counts = count_by_frame("len_CDS")

        half_width = bar_width / 2
        pylab.bar(np.array(frames) - half_width,
                  orf_counts,
                  bar_width,
                  alpha=alpha,
                  label="ORF N = %d" % sum(orf_counts))
        pylab.bar(np.array(frames) + half_width,
                  cds_counts,
                  bar_width,
                  alpha=alpha,
                  label="CDS N = %d" % sum(cds_counts))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Exemplo n.º 12
0
    def plot_specific_alignment(self,
                                query_name,
                                motif,
                                clf=True,
                                windows=[10, 50, 100, 200, 500, 1000]):
        """Plot the motif count along one read for several window lengths.

        The whole of ``self.bamfile`` is scanned and the last alignment
        named *query_name* is used. For each window, the motif count per
        position is plotted and the window is printed together with the
        number of positions whose count exceeds ``window / 6``, divided
        by 3. "Not found" is printed if no alignment matches.

        :param str query_name: name of the read to look for.
        :param str motif: motif counted with ``str.count``.
        :param bool clf: clear the current figure first.
        :param list windows: window lengths to try.
        """
        found = None
        for aln in BAM(self.bamfile):
            if aln.query_name == query_name:
                # no break: a later entry with the same name wins
                found = aln

        if not found:
            print("Not found")
            return

        seq = found.query_sequence
        if clf:
            pylab.clf()

        for window in windows:
            density = [seq[start:start + window].count(motif)
                       for start in range(len(seq))]
            pylab.plot(density, label=window)
            n_above = sum(value > window / 6 for value in density)
            print(window, n_above / 3.)

        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)
Exemplo n.º 13
0
    def scatter_plot(self, filename=None, hold=False):
        """Scatter plot of the score versus length of each ortholog

        .. plot::
            :include-source:

            from sequana import BUSCO, sequana_data
            b = BUSCO(sequana_data("test_busco_full_table.tsv"))
            b.scatter_plot()

        Missing orthologs are not shown since there is no information
        about the contig.

        :param str filename: if set, the figure is saved there.
        :param bool hold: if False, the current figure is cleared first.
        """
        if hold is False:
            pylab.clf()

        # (status, colour, marker) for the categories that are drawn
        styles = (
            ("Complete", "green", "o"),
            ("Fragmented", "orange", "s"),
            ("Duplicated", "red", "x"),
        )
        for status, color, marker in styles:
            mask = self.df.Status == status
            if not mask.any():
                continue
            selection = self.df[mask]
            selection.plot(x="Length",
                           y="Score",
                           kind="scatter",
                           color=color,
                           ax=pylab.gca(),
                           marker=marker,
                           label=status)

        pylab.legend()
        pylab.grid()
        if filename:
            pylab.savefig(filename)
Exemplo n.º 14
0
    def hist_length_repeats(self,
                            bins=None,
                            alpha=0.5,
                            hold=False,
                            fontsize=12,
                            grid=True,
                            label="Repeat length",
                            xlabel="Repeat length",
                            ylabel="#"):
        """Plots histogram of the repeat lengths

        The list of lengths is computed with
        ``self._get_list_len_repeats()`` if it is not available yet.

        :param bins: binning for ``pylab.hist``. By default, unit bins
            from ``max(0, self.threshold - 1)`` up to the longest repeat
            plus one.
        :param float alpha: transparency of the histogram.
        :param bool hold: if False, the current figure is cleared first.
        :param int fontsize: font size of the axis labels.
        :param bool grid: add a grid.
        :param str label: legend label of the histogram.
        :param str xlabel: label of the x-axis.
        :param str ylabel: label of the y-axis.
        """
        if self._list_len_repeats is None:
            self._get_list_len_repeats()

        if bins is None:
            first_edge = max(0, self.threshold - 1)
            last_edge = max(self._list_len_repeats) + 2
            bins = range(first_edge, last_edge)

        if hold is False:
            pylab.clf()

        pylab.hist(self._list_len_repeats, alpha=alpha, label=label, bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemplo n.º 15
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """Stacked histogram of the mean quality of HQ and LQ reads.

        For each read of ``self.hq_sequence`` and ``self.lq_sequence`` the
        mean of the character codes of the decoded ``quality`` entry,
        each shifted by 33, is computed. A twin axis carries a black curve
        giving the number of reads remaining above each bin.

        :param int fontsize: font size of the legend.
        :param bins: binning for ``np.histogram``; defaults to
            ``range(0, 94)``.
        """
        average_qualities = []
        for reads in (self.hq_sequence, self.lq_sequence):
            average_qualities.append(
                [pylab.mean([ord(X) - 33 for X in read['quality'].decode()])
                 for read in reads])
        hq_qv, lq_qv = average_qualities

        if bins is None:
            bins = range(0, 94)

        Y_hq, X = np.histogram(hq_qv, bins=bins)
        Y_lq, X = np.histogram(lq_qv, bins=bins)

        right_edges = X[1:]
        pylab.bar(right_edges, Y_hq, width=1, label="HQ")
        # LQ counts are stacked above the HQ ones
        pylab.bar(right_edges, Y_lq, bottom=Y_hq, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        Y_all = Y_hq + Y_lq
        N = np.sum(Y_all)
        # starts at N, then decreases by the cumulated counts
        ax.plot(X, [N] + list(N - np.cumsum(Y_all)), "k")
Exemplo n.º 16
0
    def barplot_summary(self,
                        filename=None,
                        color=["green", "red"],
                        alpha=0.8):
        """Stacked bar plot of determined vs undetermined reads per lane.

        Rows of ``self.get_data_reads()`` named "Undetermined" are kept as
        they are; all other rows are summed per lane and renamed
        "Determined". Counts are shown in millions when the smallest
        column total exceeds 1e6.

        :param str filename: if set, the figure is saved there (200 dpi).
        :param list color: colours of the Determined and Undetermined bars.
        :param float alpha: transparency of the bars.
        :return: the pivoted DataFrame that was plotted (lanes as index,
            "Determined" and "Undetermined" as columns).
        """
        reads = self.get_data_reads()
        undetermined = reads.query("name=='Undetermined'")

        determined = reads.query("name!='Undetermined'")
        determined = determined.groupby("lane").sum().reset_index()
        determined["name"] = "Determined"

        df = pd.concat([undetermined, determined])
        df = df.pivot(index="lane", columns="name", values="count")
        df = df[["Determined", "Undetermined"]]

        # switch to millions of reads for large data sets
        in_millions = df.sum().min() > 1e6
        if in_millions:
            df /= 1e6
        df.plot.barh(stacked=True, color=color, alpha=alpha, ec='k')
        if in_millions:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        pylab.legend()

        if filename:
            pylab.savefig(filename, dpi=200)
        return df
Exemplo n.º 17
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True, xlabel="SNR", ylabel="#",title="", clip_upper_SNR=30):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram. Note that the range starts
            at 0 and ends at clip_upper_SNR
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param str title: title of the figure
        :param clip_upper_SNR: SNR values above this limit are clipped to it
            before being histogrammed

        .. plot::
            :include-source:

            from sequana.pacbio import PacbioSubreads
            from sequana import sequana_data
            b = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored
        if len(self._df['snr_A'].dropna()) == 0:
            # nothing to plot: show a placeholder image instead
            from sequana import sequana_data
            pylab.clf()
            pylab.imshow(pylab.imread(sequana_data("no_data.jpg")))
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        # largest SNR over the four letters, capped at clip_upper_SNR
        maxSNR = 0
        for letter in "ACGT":
            m = self._df.loc[:,"snr_{}".format(letter)].max()
            if m > maxSNR:
                maxSNR = m

        if maxSNR > clip_upper_SNR:
            maxSNR = clip_upper_SNR

        bins = pylab.linspace(0, maxSNR, bins)

        # Series.clip_upper has been removed from pandas; clip(upper=...)
        # is the equivalent call.
        for letter in "ACGT":
            snr = self._df.loc[:, "snr_{}".format(letter)].clip(upper=maxSNR)
            pylab.hist(snr, alpha=alpha, label=letter, bins=bins)
        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title,fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemplo n.º 18
0
    def plot(self, chr_name, x1=None, x2=None, Y=20):
        """Draw the deletions and duplications found on one chromosome.

        Rows of ``self.df`` whose ``name`` equals *chr_name* are drawn as
        horizontal segments from ``start`` to ``end``: deletions in red at
        y=-1, every other type in blue at y=*Y* (labelled "duplication").

        :param chr_name: value of the ``name`` column to select.
        :param x1: not used by this method.
        :param x2: not used by this method.
        :param Y: y position of the non-deletion segments.
        """
        df = self.df.query("name == @chr_name")

        # Only the first segment of each kind carries a legend label;
        # labelling every segment repeats the same legend entry once per row.
        already_labelled = set()
        for _, item in df.iterrows():
            if item['type'] == "deletion":
                level, fmt, kind = -1, "r-", "deletion"
            else:
                level, fmt, kind = Y, "b-", "duplication"

            if kind in already_labelled:
                label = "_nolegend_"
            else:
                label = kind
                already_labelled.add(kind)
            pylab.plot([item.start, item.end], [level, level], fmt, label=label)
        pylab.legend()
Exemplo n.º 19
0
    def barplot_per_sample(self, alpha=0.5, width=0.8, filename=None):
        """Horizontal bar plot of the number of reads per sample.

        Counts from ``self.get_data_reads()`` are summed per sample name.
        The "Undetermined" entry is drawn in red on the same axes as the
        samples. Counts are shown in millions when the total over the
        samples exceeds 1e6.

        :param float alpha: transparency of the bars.
        :param float width: width of the bars.
        :param str filename: if set, the figure is saved there (200 dpi).
        """
        df = self.get_data_reads()

        # this is ugly but will do the job for now
        under = df.query("name=='Undetermined'")
        others = df.query("name!='Undetermined'")

        under = under.groupby("name").sum().reset_index()
        others = others.groupby("name").sum().reset_index()

        under = under[["name", "count"]].set_index("name")
        others = others[["name", "count"]].set_index("name")

        all_data = others.sort_index(ascending=False)
        all_data.columns = ["samples"]

        # appended at the end (placeholder row, the real value is drawn
        # from the 'under' table below)
        all_data.loc['undetermined'] = 0

        # revert back
        all_data = all_data.loc[::-1]

        # just for legend
        under.columns = ['undetermined']
        if all_data.sum().min() > 1e6:
            all_data /= 1e6
            under /= 1e6
            M = True
        else:
            M = False

        all_data.plot(kind="barh", alpha=alpha, zorder=1, width=width, ec='k')

        under.plot(kind="barh",
                   alpha=alpha,
                   color="red",
                   ax=pylab.gca(),
                   zorder=1,
                   width=width,
                   ec='k')
        pylab.ylim([-0.5, len(all_data) + 0.5])
        if len(all_data) < 100:
            pylab.yticks(range(len(all_data)), all_data.index)

        pylab.legend()
        pylab.grid(True, zorder=-1)
        if M:
            pylab.xlabel("Number of reads (M)")
        else:
            pylab.xlabel("Number of reads")
        try:
            pylab.tight_layout()
        except Exception:
            # tight_layout is purely cosmetic; ignore layout failures but
            # let KeyboardInterrupt / SystemExit propagate
            pass
        if filename:
            pylab.savefig(filename, dpi=200)
Exemplo n.º 20
0
    def plot_volcano_differences(self, mode="all"):
        """Volcano plot comparing the gene lists of two experiments.

        The genes of ``self.r1`` and ``self.r2`` selected by
        ``gene_lists[mode]`` are split into genes common to both
        experiments and genes found in only one of them. Common genes are
        linked by a black segment; genes found in one experiment only are
        linked (in red) to their position in the other experiment.

        :param str mode: key of the ``gene_lists`` dictionaries.
        :return: four DataFrames: genes only in experiment 1, genes only
            in experiment 2, common genes with the values of experiment 1,
            common genes with the values of experiment 2.
        """
        A = self.r1.df.loc[self.r1.gene_lists[mode]]
        B = self.r2.df.loc[self.r2.gene_lists[mode]]

        index_A = set(A.index)
        index_B = set(B.index)
        # .loc does not accept a set as indexer in recent pandas versions:
        # convert every selection to a list first
        AB = list(index_A.intersection(index_B))
        Aonly = A.loc[list(index_A.difference(index_B))]
        Bonly = B.loc[list(index_B.difference(index_A))]
        Acommon = A.loc[AB]
        Bcommon = B.loc[AB]

        pylab.clf()
        pylab.plot(Acommon.log2FoldChange, -np.log10(Acommon.padj), marker="o",
            alpha=0.5, color="r", lw=0, label="Common in experiment 1", pickradius=4,
            picker=True)
        pylab.plot(Bcommon.log2FoldChange, -np.log10(Bcommon.padj), marker="o",
            alpha=0.5, color="orange", lw=0, label="Common in experiment 2", pickradius=4,
            picker=True)

        # link the two positions of each common gene
        for x in AB:
            a_l = A.loc[x].log2FoldChange
            a_p = -np.log10(A.loc[x].padj)
            b_l = B.loc[x].log2FoldChange
            b_p = -np.log10(B.loc[x].padj)
            pylab.plot([a_l, b_l], [a_p, b_p], 'k', alpha=0.5)

        pylab.plot(Bonly.log2FoldChange, -np.log10(Bonly.padj), marker="*",
            alpha=0.5, color="blue", lw=0, label="In experiment 2 only", pickradius=4,
            picker=True)
        pylab.plot(Aonly.log2FoldChange, -np.log10(Aonly.padj), marker="*",
            alpha=0.5, color="cyan", lw=0, label="In experiment 1 only", pickradius=4,
            picker=True)

        # link genes selected in one experiment only to their values in
        # the full table of the other experiment
        for name, x in Bonly.iterrows():
            x1 = x.log2FoldChange
            y1 = -np.log10(x.padj)
            x2 = self.r1.df.loc[name].log2FoldChange
            y2 = -np.log10(self.r1.df.loc[name].padj)
            pylab.plot( [x1,x2], [y1,y2], ls="--", color='r')
        for name, x in Aonly.iterrows():
            x1 = x.log2FoldChange
            y1 = -np.log10(x.padj)
            x2 = self.r2.df.loc[name].log2FoldChange
            y2 = -np.log10(self.r2.df.loc[name].padj)
            pylab.plot( [x1,x2], [y1,y2], ls="-", color='r')

        # horizontal reference line at y = 1.33
        pylab.axhline(1.33, alpha=0.5, ls="--", color="r")

        pylab.xlabel("log2 fold Change")
        pylab.ylabel("log10 adjusted p-values")
        pylab.legend()
        pylab.grid(True)

        return Aonly, Bonly, Acommon, Bcommon
Exemplo n.º 21
0
    def hist_snr(self,
                 bins=50,
                 alpha=0.5,
                 hold=False,
                 fontsize=12,
                 grid=True,
                 xlabel="SNR",
                 ylabel="#",
                 title=""):
        """Plot histogram of the ACGT SNRs for all reads

        :param int bins: binning for the histogram
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the labels and title
        :param bool grid: add a grid
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        :param str title: title of the figure

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_snr()

        """
        if self._df is None:
            self._get_df()

        # old pacbio format has no SNR stored: show a placeholder image
        has_snr = len(self._df['snr_A'].dropna()) != 0
        if not has_snr:
            from sequana import sequana_data
            pylab.clf()
            placeholder = pylab.imread(sequana_data("no_data.jpg"))
            pylab.imshow(placeholder)
            pylab.gca().axis('off')
            return

        if hold is False:
            pylab.clf()

        # one histogram per nucleotide, in A, C, G, T order
        for letter in "ACGT":
            column = 'snr_' + letter
            pylab.hist(self._df.loc[:, column],
                       alpha=alpha,
                       label=letter,
                       bins=bins)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemplo n.º 22
0
 def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
     """Histograms of ``reference_length`` for mapped and unmapped entries.

     Entries of ``self.df`` are considered unmapped when their
     ``reference_name`` equals -1. The number of entries of each group is
     reported in the legend.

     :param bins: binning for ``pylab.hist``. By default, bins of width
         100 from 0 up to the largest ``reference_length``.
     """
     df = self.df
     if bins is None:
         # max() returns a scalar: convert it with int(); calling len()
         # on it raises a TypeError
         bins = range(0, int(df.reference_length.max()), 100)
     mapped = df[df.reference_name != -1]
     unmapped = df[df.reference_name == -1]
     pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
         label="mapped {}".format(len(mapped)), density=False)
     # same column as the mapped entries and as the default binning
     # NOTE(review): the original read ``unmapped.reference``; confirm that
     # no dedicated column of that name was intended
     pylab.hist(unmapped.reference_length, bins=bins, alpha=0.5,
         label="unmapped {}".format(len(unmapped)), density=False)
     pylab.xlabel("Isoform length")
     pylab.legend()
Exemplo n.º 23
0
 def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
     """Normalised histograms of ``read_length`` for mapped and unmapped reads.

     Entries of ``self.df`` are considered unmapped when their
     ``reference_name`` equals -1. The number of entries of each group is
     reported in the legend.

     :param bins: binning for ``pylab.hist``. By default, bins of width
         100 from 0 up to the largest ``read_length``.
     """
     df = self.df
     if bins is None:
         bins = range(0, df.read_length.max(), 100)
     mapped = df[df.reference_name != -1]
     unmapped = df[df.reference_name == -1]
     # 'normed' has been removed from matplotlib; 'density' replaces it
     pylab.hist(mapped.read_length, bins=bins, alpha=0.5,
         label="mapped {}".format(len(mapped)), density=True)
     pylab.hist(unmapped.read_length, bins=bins, alpha=0.5,
         label="unmapped {}".format(len(unmapped)), density=True)
     pylab.xlabel("Isoform length")
     pylab.legend()
Exemplo n.º 24
0
    def run(self,
            bins=50,
            xmin=0,
            xmax=30000,
            step=1000,
            burn=1000,
            alpha=1,
            output_filename=None):
        """Select simulated reads with an accept/reject loop and plot the result.

        The normalised histogram of the read lengths of ``self.bam`` is
        stored in ``self.X`` / ``self.Y`` (presumably for
        ``self.target_distribution``; verify against that method). Each
        read length of ``self.bam_simul`` is then accepted with
        probability ``min(alpha, target(candidate) / target(current))``.
        The accept/reject decisions are stored in ``self.tokeep``.

        :param bins: binning of the histograms.
        :param xmin: start of the grid used to draw the target curve.
        :param xmax: end of the grid used to draw the target curve.
        :param step: step of the grid used to draw the target curve.
        :param int burn: number of initial accepted values left out of the
            final histogram.
        :param alpha: upper bound of the acceptance probability.
        :param output_filename: if not None, passed with ``self.tokeep`` to
            ``self.bam_simul.filter_bool``.
        """
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        # 'normed' has been removed from numpy; 'density' replaces it
        self.Y, self.X = np.histogram(self.bam.df.read_length,
                                      bins=bins,
                                      density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        # the chain starts at the mean read length of the input data
        current = self.bam.df.read_length.mean()
        for i in range(self.bam_simul.df.shape[0]):
            can = lengths[i]
            # acceptance probability
            aprob = min([
                alpha,
                self.target_distribution(can) /
                self.target_distribution(current)
            ])
            u = pylab.uniform(0, 1)
            if u < aprob:
                current = can
                vec.append(current)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # plotting the results:
        # theoretical curve
        grid = pylab.arange(xmin, xmax, step)
        target = self.target_distribution(grid)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        # 'normed' has been removed from matplotlib; 'density' replaces it
        pylab.hist(vec[burn:], bins=bins, density=True)
        pylab.plot(grid, target, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Exemplo n.º 25
0
    def plot_indel_dist(self, fontsize=16):
        """Plot indel count (+ ratio)

        :param int fontsize: font size of the axis labels
        :Return: list of insertions, deletions and ratio insertion/deletion for
            different length starting at 0
        :raises ValueError: if no insertion or no deletion was found

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("measles.fa.sorted.bam"))
            b.plot_indel_dist()

        What you see on this figure is the presence of 10 insertions of length
        1, 1 insertion of length 2 and 3 deletions of length 1


        # Note that in samtools, several insertions or deletions in a single
        alignment are ignored and only the first one seems to be reported. For
        instance 10M1I10M1I stored only 1 insertion in its report; Same comment
        for deletions.

        .. todo:: speed up and handle long reads cases more efficiently by
            storing INDELS as histograms rather than lists
        """
        # the indel lists are computed lazily on first use
        try:
            self.insertions
        except AttributeError:
            self._set_indels()

        if len(self.insertions) ==0 or len(self.deletions) == 0:
            raise ValueError("No deletions or insertions found")

        # Count each length once; a Counter returns 0 for absent lengths,
        # which avoids scanning the full lists for every length.
        deletion_counts = Counter(self.deletions)
        insertion_counts = Counter(self.insertions)

        N = max(max(deletion_counts), max(insertion_counts)) + 1
        D = [deletion_counts[i] for i in range(N)]
        I = [insertion_counts[i] for i in range(N)]
        # ratio insertions/deletions, set to 0 where there is no deletion
        R = [i/d if d!=0 else 0 for i,d in zip(I, D)]
        fig, ax = pylab.subplots()
        ax.plot(range(N), I, marker="x", label="Insertions")
        ax.plot(range(N), D, marker="x", label="Deletions")
        ax.plot(range(N), R, "--r", label="Ratio insertions/deletions")
        ax.set_yscale("symlog")
        pylab.ylim([1, pylab.ylim()[1]])
        pylab.legend()
        pylab.grid()
        from matplotlib.ticker import MaxNLocator
        ax.xaxis.set_major_locator(MaxNLocator(integer=True))
        pylab.xlabel("Indel length", fontsize=fontsize)
        pylab.ylabel("Indel count", fontsize=fontsize)
        return I, D, R
Exemplo n.º 26
0
 def bar_plot_contigs_length(self):
     """Bar plot of the contig lengths next to the reference lengths.

     The sorted lengths of the sequences of ``FastA(self.reference)`` are
     drawn first, spread over the x-range of the contigs, followed by the
     sorted lengths of ``self.fasta`` with one unit-wide bar per contig.
     """
     reference = FastA(self.reference)
     n_reference = len(reference.sequences)
     n_contigs = len(self.fasta)

     pylab.clf()

     # one reference bar every ceil(N / Nref) contigs
     spacing = int(pylab.ceil(n_contigs / n_reference))
     reference_positions = range(0, n_contigs, spacing)
     pylab.bar(reference_positions,
               sorted(reference.lengths),
               width=n_reference / 1.1,
               label="Plasmodium chromosomes")

     contig_positions = range(0, n_contigs)
     pylab.bar(contig_positions,
               sorted(self.fasta.lengths),
               width=1,
               label="canu {} contigs".format(n_contigs))
     pylab.legend()
Exemplo n.º 27
0
    def hist_ORF_CDS_linearscale(self, alpha=0.5, bins=40, xlabel="Length", ylabel="#"):
        """Overlay the histograms of ORF and CDS lengths (linear scale).

        ``self._find_ORF_CDS()`` is called first when ``self._ORF_pos`` is
        still None. Missing lengths are dropped and the number of
        remaining values is shown in each legend entry.

        :param float alpha: transparency of the histograms.
        :param bins: binning passed to ``pylab.hist``.
        :param str xlabel: label of the x-axis.
        :param str ylabel: label of the y-axis.
        """
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        orf = self._ORF_pos["len_ORF"].dropna()
        cds = self._ORF_pos["len_CDS"].dropna()
        orf_label = "ORF, N = " + str(orf.shape[0])
        cds_label = "CDS, N = " + str(cds.shape[0])

        # plot for all ORF and CDS
        pylab.hist(orf, alpha=alpha, label=orf_label, bins=bins)
        pylab.hist(cds, alpha=alpha, label=cds_label, bins=bins)

        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend()
        title = "Length of ORF and CDS (after filter %s > %d)" % (
            self._type_filter, self._threshold)
        pylab.title(title)
Exemplo n.º 28
0
    def hist_snr(self, bins=50, alpha=0.5, hold=False, fontsize=12,
                grid=True,xlabel="SNR",ylabel="#"):
        """Plot histogram of the ACGT SNRs for all reads

        :param bins: binning for the histograms
        :param float alpha: transparency of the histograms
        :param bool hold: if False, the current figure is cleared first
        :param int fontsize: font size of the axis labels
        :param bool grid: add a grid
        :param str xlabel: label of the x-axis
        :param str ylabel: label of the y-axis
        """
        if self._df is None:
            self._get_df()

        if hold is False:
            pylab.clf()

        # one histogram per nucleotide, in A, C, G, T order
        for nucleotide in ("A", "C", "G", "T"):
            values = self._df.loc[:, 'snr_' + nucleotide]
            pylab.hist(values, alpha=alpha, label=nucleotide, bins=bins)

        pylab.legend()
        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
Exemplo n.º 29
0
    def plot_read_length(self):
        """Plot the occurrences of aligned read lengths.

        The curve is drawn from the two sequences returned by
        ``self._get_read_length()``; the legend reports the smallest and
        largest x value.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("test.bam"))
            b.plot_read_length()

        """
        lengths, occurrences = self._get_read_length()
        shortest = min(lengths)
        longest = max(lengths)
        legend_text = "min length:{}; max length:{}".format(shortest, longest)
        pylab.plot(lengths, occurrences, label=legend_text)
        pylab.grid()
        pylab.xlabel("Read length", fontsize=16)
        pylab.legend()
Exemplo n.º 30
0
    def plot_read_length(self):
        """Plot the occurrences of aligned read lengths.

        The curve is drawn from the two sequences returned by
        ``self._get_read_length()``; the legend reports the smallest and
        largest x value.

        .. plot::
            :include-source:

            from sequana import sequana_data, BAM
            b = BAM(sequana_data("test.bam"))
            b.plot_read_length()

        """
        lengths, occurrences = self._get_read_length()
        shortest = min(lengths)
        longest = max(lengths)
        legend_text = "min length:{}; max length:{}".format(shortest, longest)
        pylab.plot(lengths, occurrences, label=legend_text)
        pylab.grid()
        pylab.xlabel("Read length", fontsize=16)
        pylab.legend()
Exemplo n.º 31
0
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        """Side-by-side bar plot of the number of ORF and CDS per frame.

        :param alpha: transparency of the bars.
        :param bins: not used by this method.
        :param xlabel: label of the x-axis.
        :param ylabel: label of the y-axis.
        :param bar_width: width of each bar; the ORF and CDS bars are shifted
            by half of it on either side of the frame value.
        """
        # The ORF/CDS table is computed on demand.
        if self._ORF_pos is None:
            self._find_ORF_CDS()

        frames = [-3, -2, -1, 1, 2, 3]
        table = self._ORF_pos
        # One sub-table per frame, then count the non-missing lengths in each.
        per_frame = [table[table["frame"] == frame] for frame in frames]
        nb_res_ORF = [sub["len_ORF"].dropna().shape[0] for sub in per_frame]
        nb_res_CDS = [sub["len_CDS"].dropna().shape[0] for sub in per_frame]

        centers = np.array(frames)
        half_width = bar_width / 2
        pylab.bar(centers - half_width,
                  nb_res_ORF,
                  bar_width,
                  alpha=alpha,
                  label="ORF N = %d" % sum(nb_res_ORF))
        pylab.bar(centers + half_width,
                  nb_res_CDS,
                  bar_width,
                  alpha=alpha,
                  label="CDS N = %d" % sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
Exemplo n.º 32
0
    def hist_average_quality(self, fontsize=16):
        """Stacked bar plot of the mean quality value of HQ and LQ isoforms.

        The mean quality of each read is binned with integer edges 0..93;
        LQ counts are stacked on top of HQ counts.

        :param fontsize: font size of the legend.
        """
        # NOTE(review): ``iso`` is not defined in this method nor passed as
        # an argument; it must exist in the enclosing scope -- confirm.

        def average_qv(read):
            # Mean of the per-character qualities of one read.
            return mean([phred.ascii_to_quality(X)
                         for X in read['quality'].decode()])

        hq_qv = [average_qv(read) for read in iso.hq_sequence]
        lq_qv = [average_qv(read) for read in iso.lq_sequence]

        bin_edges = range(0, 94)
        hq_counts, edges = numpy.histogram(hq_qv, bins=bin_edges)
        lq_counts, edges = numpy.histogram(lq_qv, bins=bin_edges)

        # Bars are positioned on the right edge of each bin.
        right_edges = edges[1:]
        pylab.bar(right_edges, hq_counts, width=1, label="HQ")
        pylab.bar(right_edges, lq_counts, bottom=hq_counts, width=1, label="LQ")
        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)
Exemplo n.º 33
0
    def plot_alignment(self,
                       motif,
                       window=200,
                       global_th=10,
                       title=None,
                       legend=True,
                       legend_fontsize=11):
        """Plot the motif count along each alignment that matches the motif.

        For every alignment of ``self.bamfile`` whose query name is listed in
        the dataframe returned by ``self._get_aligments``, one curve is
        drawn: at each position of the read sequence, the number of
        occurrences of ``motif`` in the ``window`` characters starting there.
        The x-axis starts at the alignment reference start.

        :param motif: motif to count in the read sequences.
        :param window: length of the sliding window.
        :param global_th: forwarded to ``self._get_aligments``.
        :param title: optional figure title.
        :param legend: if True, show a legend, but only when fewer than
            15 curves were drawn.
        :param legend_fontsize: font size of the legend.
        :return: the dataframe returned by ``self._get_aligments``.
        """
        df = self._get_aligments(motif=motif,
                                 window=window,
                                 global_th=global_th)
        print("Found {} hits".format(len(df)))

        # Build the lookup once: testing membership against the raw array
        # would rescan every hit name for each alignment of the BAM file.
        hit_names = set(df.query_name.values)

        bam = BAM(self.bamfile)
        pylab.clf()
        count = 0
        for aln in bam:
            if aln.query_name not in hit_names:
                continue
            seq = aln.query_sequence
            if not seq:
                # No sequence available for this alignment: nothing to plot.
                continue
            count += 1
            X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
            pylab.plot(range(aln.reference_start,
                             aln.reference_start + len(seq)),
                       X1,
                       label=aln.query_name)

        # Upper y limit: 20% above the number of motifs fitting in a window.
        max_theo = int(1.2 * window / len(motif))
        pylab.ylim([0, max_theo])
        if legend and count < 15:
            pylab.legend(fontsize=legend_fontsize)
        if title:
            pylab.title(title, fontsize=16)

        return df
Exemplo n.º 34
0
 def plot_scores(self, filename=None, savefig=False):
     """Scatter plot of the log10 signal of replicate 1 versus replicate 2.

     Peaks are split on the ``score`` column at 540, the value used in this
     file as the scaled equivalent of an IDR of 0.05: peaks above it are
     drawn in black ('<0.05 IDR'), all others in red ('>=0.05 IDR'). A
     dashed diagonal is added for reference.

     :param filename: output file, used only when ``savefig`` is True.
     :param savefig: if True, save the figure into ``filename``.
     """
     # scores
     from pylab import log10
     pylab.clf()
     # Query each subset once. The second subset uses ``<=`` so that peaks
     # with a score of exactly 540 are not silently dropped from the plot
     # (they belong to the '>=0.05 IDR' group).
     passed = self.df.query('score>540')
     failed = self.df.query('score<=540')
     pylab.plot(log10(passed['rep1_signal']),
                log10(passed['rep2_signal']),
                'ko',
                alpha=0.5,
                label='<0.05 IDR')
     pylab.plot(log10(failed['rep1_signal']),
                log10(failed['rep2_signal']),
                'ro',
                alpha=0.5,
                label='>=0.05 IDR')
     # Diagonal from the origin up to the current upper y limit.
     N = pylab.ylim()[1]
     pylab.plot([0, N], [0, N], color='blue', alpha=0.5, ls='--')
     pylab.xlabel("Rep1 log10 score")
     pylab.ylabel("Rep2 log10 score")
     pylab.legend(loc='lower right')
     if savefig:
         pylab.savefig(filename)
Exemplo n.º 35
0
    def barplot(self, filename="lane{}_status.png", lanes=None):
        """Save one bar plot per lane with determined and undetermined reads.

        For each lane, the read counts of every entry whose name is not
        'Undetermined' are drawn in blue, followed by a single bar with the
        undetermined reads. That last bar is green, orange or red when the
        undetermined reads represent less than 20%, less than 50%, or more
        of the determined reads (red as well when there are no determined
        reads).

        :param filename: output file pattern; ``{}`` is replaced by the lane.
        :param lanes: lanes to plot; defaults to all lanes found in the data
            returned by ``self.get_data_reads()``.
        """
        df = self.get_data_reads()
        if lanes is None:
            lanes = df.lane.unique()

        for lane in lanes:
            pylab.clf()
            # NOTE: the queries refer to the loop variable through ``@lane``.
            query = "lane==@lane and name!='Undetermined'"
            counts = df.query(query)['count']
            total = counts.sum()
            L = len(counts)

            query = "lane==@lane and name=='Undetermined'"
            under = df.query(query)['count'].sum()
            if total > 0:
                pylab.bar(range(L), counts, color="b", label="reads")

            if total == 0:
                color = "red"
            else:
                percent_under = 100 * under / total
                if percent_under < 20:
                    color = "green"
                elif percent_under < 50:
                    color = "orange"
                else:
                    color = "red"

            pylab.bar(range(L, L + 1),
                      under,
                      color=color,
                      label="undetermined")
            pylab.xticks([])
            pylab.ylabel("Number of reads")
            try:
                pylab.legend(loc="lower left")
            except Exception:
                # The legend is best effort: a failure here must not prevent
                # the figure from being saved. Unlike a bare ``except``, this
                # lets KeyboardInterrupt and SystemExit propagate.
                pass
            pylab.title("Lane {}".format(lane))
            pylab.savefig(filename.format(lane), dpi=200)
Exemplo n.º 36
0
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Grouped bar plot of ``self.sirv``, one column per label.

        :param normalise: if True, divide the data by ``N / max(N)``.
        :param ncol: columns in the legend
        :param N: one value per data set; defaults to the length of each
            item of ``self.rawdata``.
        :return: the dataframe that was plotted.
        """
        if N is None:
            N = [len(x) for x in self.rawdata]
        N = np.array(N)

        dd = pd.DataFrame(self.sirv).T
        if normalise:
            # Scale relative to the largest value of N.
            scale = N / max(N)
            dd = dd / scale
        dd.columns = self.labels

        dd.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return dd
Exemplo n.º 37
0
    def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
        """Select simulated reads with an accept/reject loop and plot the result.

        Each read length of ``self.bam_simul`` is taken in turn as a
        candidate and accepted when a uniform draw in [0, 1) is below
        ``min(alpha, target(candidate) / target(current))``. The decisions
        are stored in ``self.tokeep``.

        :param bins: ``bins`` argument of the histograms.
        :param xmin: start of the grid used to draw the target curve.
        :param xmax: end (excluded) of that grid.
        :param step: spacing of that grid.
        :param burn: number of leading accepted values left out of the
            histogram.
        :param alpha: upper bound of the acceptance probability.
        :param output_filename: if given, passed with ``self.tokeep`` to
            ``self.bam_simul.filter_bool``.
        """
        # Histogram of the input reads, computed once and stored on the
        # instance together with the number of bins.
        self.bins = bins
        self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

        candidates = self.bam_simul.df.read_length.values
        self.tokeep = []
        chain = []
        # The chain starts from the mean read length of the input reads.
        current = self.bam.df.read_length.mean()
        for candidate in candidates:
            ratio = self.target_distribution(candidate) / self.target_distribution(current)
            acceptance = min([alpha, ratio])
            draw = pylab.uniform(0, 1)
            if draw < acceptance:
                current = candidate
                chain.append(current)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        # Top panel: accepted values; bottom panel: their histogram and the
        # target curve.
        grid = pylab.arange(xmin, xmax, step)
        target = self.target_distribution(grid)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(chain)
        pylab.subplot(212)

        pylab.hist(chain[burn:], bins=bins, density=1)
        pylab.plot(grid, target, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF','Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Exemplo n.º 38
0
    def plot_bar_grouped(self, normalise=False, ncol=2, N=None):
        """Grouped bar plot of ``self.sirv``, one column per label.

        :param normalise: if True, divide the data by ``N / max(N)``.
        :param ncol: columns in the legend
        :param N: one value per data set; defaults to the length of each
            item of ``self.rawdata``.
        :return: the dataframe that was plotted.
        """
        if N is None:
            N = [len(x) for x in self.rawdata]
        N = np.array(N)

        dd = pd.DataFrame(self.sirv).T
        if normalise:
            # Scale relative to the largest value of N.
            scale = N / max(N)
            dd = dd / scale
        dd.columns = self.labels

        dd.plot(kind="bar")
        pylab.xlabel("")
        pylab.legend(self.labels, ncol=ncol)
        pylab.tight_layout()
        return dd
Exemplo n.º 39
0
    def _do_legend(self, figure, color_dict, bbox_to_anchor):
        """Add a "Sample groups" legend built from ``color_dict`` to ``figure``.

        :param figure: object whose ``add_artist`` method receives the legend.
        :param color_dict: mapping of label to color; nothing is drawn when
            it is empty or None.
        :param bbox_to_anchor: forwarded as is to ``pylab.legend``.
        """
        if color_dict:
            # One colored patch per (label, color) pair.
            patches = [
                mpatches.Patch(color=c, label=l)
                for l, c in color_dict.items()
            ]
            legend = pylab.legend(
                loc="upper center",
                handles=patches,
                bbox_to_anchor=bbox_to_anchor,
                frameon=True,
                title="Sample groups",
            )

            # Register the legend explicitly as an artist of the figure.
            figure.add_artist(legend)