示例#1
0
文件: isoseq.py 项目: sequana/sequana
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
示例#2
0
    def hist_average_quality(self, fontsize=16, bins=None):
        """

        bins is from 0 to 94 
        """

        hq_qv = [pylab.mean([ord(X)-33 for X in read['quality'].decode()]) 
                for read in self.hq_sequence]
        lq_qv = [pylab.mean([ord(X) -33 for X in read['quality'].decode()]) 
            for read in self.lq_sequence]

        if bins is None:
            bins = range(0,94)
        Y1, X = np.histogram(hq_qv, bins=bins)
        Y2, X = np.histogram(lq_qv, bins=bins)
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlim([0.5, 93.5])

        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)

        ax = pylab.twinx()
        N = np.sum(Y1+Y2)
        ax.plot(X, [N] + list(N-np.cumsum(Y1+Y2)), "k")
示例#3
0
    def barplot_count_ORF_CDS_by_frame(self,
                                       alpha=0.5,
                                       bins=40,
                                       xlabel="Frame",
                                       ylabel="#",
                                       bar_width=0.35):
        if self._ORF_pos is None:
            self._find_ORF_CDS()
        # number of ORF and CDS found by frame
        frames = [-3, -2, -1, 1, 2, 3]
        nb_res_ORF = []
        nb_res_CDS = []
        for fr in frames:
            nb_res_ORF.append(self._ORF_pos[self._ORF_pos["frame"] == fr]
                              ["len_ORF"].dropna().shape[0])
            nb_res_CDS.append(self._ORF_pos[self._ORF_pos["frame"] == fr]
                              ["len_CDS"].dropna().shape[0])

        pylab.bar(np.array(frames) - (bar_width / 2),
                  nb_res_ORF,
                  bar_width,
                  alpha=alpha,
                  label="ORF N = %d" % sum(nb_res_ORF))
        pylab.bar(np.array(frames) + (bar_width / 2),
                  nb_res_CDS,
                  bar_width,
                  alpha=alpha,
                  label="CDS N = %d" % sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
示例#4
0
    def plot_percentage_null_read_counts(self):
        """


        Bars represent the percentage of null counts in each samples. 
        The dashed horizontal line represents the percentage of 
        feature counts being equal to zero across all samples.

        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()


        """
        N = len(self.sample_names)

        data = (self.df[self.sample_names]==0).sum() 
        data = data / len(self.df) * 100

        all_null = (self.df[self.sample_names].sum(axis=1) == 0).sum()

        pylab.clf()
        pylab.bar(range(N), data)
        pylab.axhline(all_null / len(self.df) * 100, lw=2, ls="--", color="k")
        pylab.xticks(range(N), self.sample_names)
        pylab.xlabel("Sample")
示例#5
0
    def histogram_sequence_lengths(self, logy=True):
        """Histogram sequence lengths

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        """
        data = [len(x) for x in self.sequences]
        bary, barx = np.histogram(data, bins=range(max(data)+1))

        # get rid of zeros to avoid warnings
        bx = [x for x,y in zip(barx, bary) if y!=0]
        by = [y for x,y in zip(barx, bary) if y!=0]
        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1,max(data)+1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        pylab.ylabel("Count (log scale)", fontsize=self.fontsize)
示例#6
0
    def plot_count_per_sample(self, fontsize=12, sample_list=None):
        """"Number of mapped reads per sample. Each color for each replicate

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()
        """
        sample_names = self.sample_names
        N = len(sample_names)
        dd = self.df[sample_names].sum()
        pylab.clf()

        colors = []
        for sample in self.sample_names:
            colors.append(self.colors[self.get_cond_from_sample(sample)])

        pylab.bar(range(N), (dd/1000000).values, 
            color=colors, alpha=1, 
            zorder=10, lw=1, ec="k", width=0.9)
        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("Total read count (millions)", fontsize=fontsize)
        pylab.grid(True, zorder=0)
        pylab.title("Total read count per sample", fontsize=fontsize)
        pylab.xticks(range(N), self.sample_names)
示例#7
0
    def hist_ZMW_subreads(self,
                          alpha=0.5,
                          hold=False,
                          fontsize=12,
                          grid=True,
                          xlabel="Number of ZMW passes",
                          logy=True,
                          ylabel="#",
                          label="",
                          title="Number of ZMW passes"):
        """Plot histogram of number of reads per ZMW (number of passes)

        :param float alpha: transparency of the histograms
        :param bool hold:
        :param int fontsize:
        :param bool grid:
        :param str xlabel:
        :param str ylabel:
        :param bool logy: use log scale on the y axis (default to True)
        :param str label: label of the histogram (for the legend)
        :param str title:

        .. plot::
            :include-source:

            from sequana.pacbio import BAMPacbio
            from sequana import sequana_data
            b = BAMPacbio(sequana_data("test_pacbio_subreads.bam"))
            b.hist_ZMW_subreads()
        """
        if self._nb_pass is None:
            self._get_ZMW_passes()

        max_nb_pass = max(self._nb_pass.keys())
        k = range(1, max_nb_pass + 1)
        val = [self._nb_pass[i] for i in k]

        # histogram nb passes
        if hold is False:
            pylab.clf()
        pylab.bar(k, val, alpha=alpha, label=label, log=logy)
        if len(k) < 5:
            pylab.xticks(range(6), range(6))

        pylab.xlabel(xlabel, fontsize=fontsize)
        pylab.ylabel(ylabel, fontsize=fontsize)
        pylab.title(title, fontsize=fontsize)
        if grid is True:
            pylab.grid(True)
示例#8
0
 def bar_plot_contigs_length(self):
     # show length of N contigs as compare to length of the reference
     fref = FastA(self.reference)
     Nref = len(fref.sequences)
     N = len(self.fasta)
     pylab.clf()
     pylab.bar(range(0, N, int(pylab.ceil(N / Nref))),
               sorted(fref.lengths),
               width=Nref / 1.1,
               label="Plasmodium chromosomes")
     pylab.bar(range(0, N),
               sorted(self.fasta.lengths),
               width=1,
               label="canu {} contigs".format(N))
     pylab.legend()
示例#9
0
    def hist_average_quality(self, fontsize=16):

        hq_qv = [
            mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in iso.hq_sequence
        ]
        lq_qv = [
            mean([phred.ascii_to_quality(X) for X in read['quality'].decode()])
            for read in iso.lq_sequence
        ]

        Y1, X = numpy.histogram(hq_qv, bins=range(0, 94))
        Y2, X = numpy.histogram(lq_qv, bins=range(0, 94))
        pylab.bar(X[1:], Y1, width=1, label="HQ")
        pylab.bar(X[1:], Y2, bottom=Y1, width=1, label="LQ")
        pylab.xlabel("Isoform average QV")
        pylab.ylabel("# Isoform")
        pylab.legend(fontsize=fontsize)
示例#10
0
    def barplot_count_ORF_CDS_by_frame(self, alpha=0.5, bins=40,
        xlabel="Frame", ylabel="#", bar_width=0.35):
        if self._ORF_pos is None:
                self._find_ORF_CDS()
        # number of ORF and CDS found by frame
        frames = [-3, -2, -1, 1, 2, 3]
        nb_res_ORF = []
        nb_res_CDS = []
        for fr in frames:
            nb_res_ORF.append(self._ORF_pos[self._ORF_pos["frame"] == fr]["len_ORF"].dropna().shape[0])
            nb_res_CDS.append(self._ORF_pos[self._ORF_pos["frame"] == fr]["len_CDS"].dropna().shape[0])

        pylab.bar(np.array(frames)-(bar_width/2), nb_res_ORF, bar_width, alpha=alpha, label="ORF N = %d" %sum(nb_res_ORF))
        pylab.bar(np.array(frames)+(bar_width/2), nb_res_CDS, bar_width, alpha=alpha, label="CDS N = %d" %sum(nb_res_CDS))
        pylab.xlabel(xlabel)
        pylab.ylabel(ylabel)
        pylab.legend(loc=1)
        pylab.title("Number of ORF and CDS by frame")
示例#11
0
    def barplot(self, filename="lane{}_status.png", lanes=None):
        df = self.get_data_reads()
        if lanes is None:
            lanes = df.lane.unique()

        for lane in lanes:
            pylab.clf()
            query = "lane==@lane and name!='Undetermined'"
            counts = df.query(query)['count']
            total = counts.sum()
            L = len(counts)

            query = "lane==@lane and name=='Undetermined'"
            under = df.query(query)['count'].sum()
            if total > 0:
                pylab.bar(range(L), counts, color="b", label="reads")

            if total == 0:
                color = "red"
            else:
                if 100 * under / total < 20:
                    color = "green"
                elif 100 * under / total < 50:
                    color = "orange"
                else:
                    color = "red"

            pylab.bar(range(L, L + 1),
                      under,
                      color=color,
                      label="undetermined")
            pylab.xticks([])
            pylab.ylabel("Number of reads")
            try:
                pylab.legend(loc="lower left")
            except:
                pass
            pylab.title("Lane {}".format(lane))
            pylab.savefig(filename.format(lane), dpi=200)
示例#12
0
    def plot_count_per_sample(self, fontsize=12, sample_list=None):
        """"Number of mapped reads per sample. Each color for each replicate

        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()
        """
        sample_names = [x for x in self.df.columns if x.startswith("norm")] 
        sample_names = [x.replace("norm.", "") for x in sample_names]
        N = len(sample_names)
        dd = self.df[sample_names].sum()
        pylab.clf()
        pylab.bar(range(N), (dd/1000000).values, color=['r']*3+['b']*3, alpha=1)
        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("Total read count (millions)", fontsize=fontsize)
        pylab.grid(True)
        pylab.title("Total read count per sample", fontsize=fontsize)
示例#13
0
    def plot_percentage_null_read_counts(self):
        """

        Bars represent the percentage of null counts in each samples. 
        The dashed horizontal line represents the percentage of 
        feature counts being equal to zero across all samples.

        .. plot::
            :include-source:
    
            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()


        """
        N = len(self.sample_names)

        data = (self.df[self.sample_names]==0).sum() 
        data = data / len(self.df) * 100

        all_null = (self.df[self.sample_names].sum(axis=1) == 0).sum()

        colors = []
        for sample in self.sample_names:
            colors.append(self.colors[self.get_cond_from_sample(sample)])

        pylab.clf()
        pylab.bar(range(N), data, 
            color=colors, alpha=1, 
            zorder=10, lw=1, ec="k", width=0.9)
        pylab.axhline(all_null / len(self.df) * 100, lw=2, ls="--", color="k",
            zorder=20)
        pylab.xticks(range(N), self.sample_names)
        pylab.xlabel("Sample")
        pylab.ylabel("Proportion of null counts (%)")
        pylab.grid(True, zorder=0)
示例#14
0
    def plot_count_per_sample(self, fontsize=12, rotation=45):
        """Number of mapped and annotated reads (i.e. counts) per sample. Each color
        for each replicate

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_count_per_sample()

        """
        pylab.clf()
        df = self.counts_raw.sum().rename("total_counts")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(
            df.index,
            df.total_counts / 1000000,
            color=df.group_color,
            lw=1,
            zorder=10,
            ec="k",
            width=0.9,
        )

        pylab.xlabel("Samples", fontsize=fontsize)
        pylab.ylabel("reads (M)", fontsize=fontsize)
        pylab.grid(True, zorder=0)
        pylab.title("Total read count per sample", fontsize=fontsize)
        pylab.xticks(rotation=rotation, ha="right")
        # pylab.xticks(range(N), self.sample_names)
        try:
            pylab.tight_layout()
        except:
            pass
示例#15
0
    def plot_percentage_null_read_counts(self):
        """Bars represent the percentage of null counts in each samples.  The dashed
        horizontal line represents the percentage of feature counts being equal
        to zero across all samples

        .. plot::
            :include-source:

            from sequana.rnadiff import RNADiffResults
            from sequana import sequana_data

            r = RNADiffResults(sequana_data("rnadiff/rnadiff_onecond_1"))
            r.plot_percentage_null_read_counts()

        """
        pylab.clf()
        # how many null counts ?
        df = (self.counts_raw == 0).sum() / self.counts_raw.shape[0] * 100
        df = df.rename("percent_null")
        df = pd.concat([self.design_df, df], axis=1)

        pylab.bar(df.index,
                  df.percent_null,
                  color=df.group_color,
                  ec="k",
                  lw=1,
                  zorder=10)

        all_null = (self.counts_raw
                    == 0).all(axis=1).sum() / self.counts_raw.shape[0]

        pylab.axhline(all_null, ls="--", color="black", alpha=0.5)

        pylab.xticks(rotation=45, ha="right")
        pylab.ylabel("Proportion of null counts (%)")
        pylab.grid(True, zorder=0)
        pylab.tight_layout()