Exemplo n.º 1
0
 def summary(self):
     """Return a dictionary summarising the read statistics.

     The returned dictionary contains:

     * ``name``: the fixed identifier ``sequana_summary_pacbio_qc``
     * ``read_stats``: a copy of ``self.stats``
     * ``mean_gc``: mean of the ``GC_content`` column, as a Python float
     * ``hist_gc`` / ``hist_read_length``: 100-bin histograms of the
       ``GC_content`` and ``read_length`` columns, stored as
       ``{"Y": counts, "X": bin_edges}`` with plain Python lists
     """
     def _histogram_as_lists(values):
         # tolist() converts the numpy arrays into plain Python lists
         counts, edges = np.histogram(values, bins=100)
         return {"Y": counts.tolist(), "X": edges.tolist()}

     report = {
         "name": "sequana_summary_pacbio_qc",
         "read_stats": self.stats.copy(),
     }

     gc_content = self._df.loc[:, 'GC_content']
     report["mean_gc"] = float(np.mean(gc_content))
     report['hist_gc'] = _histogram_as_lists(gc_content)
     report['hist_read_length'] = _histogram_as_lists(self._df['read_length'])
     return report
Exemplo n.º 2
0
 def summary(self):
     """Collect the read statistics into a single dictionary.

     :return: dictionary holding the identifier ``name``, a copy of
         ``self.stats`` under ``read_stats``, the mean of the
         ``GC_content`` column under ``mean_gc`` (Python float), and
         100-bin histograms of the ``GC_content`` and ``read_length``
         columns under ``hist_gc`` and ``hist_read_length``. Each
         histogram is a dict with the counts in ``Y`` and the bin edges
         in ``X``, both as plain lists.
     """
     result = {"name": "sequana_summary_pacbio_qc"}
     result["read_stats"] = self.stats.copy()

     gc_values = self._df.loc[:, 'GC_content']
     result["mean_gc"] = float(np.mean(gc_values))

     histogram_inputs = (
         ('hist_gc', gc_values),
         ('hist_read_length', self._df['read_length']),
     )
     for key, values in histogram_inputs:
         counts, edges = np.histogram(values, bins=100)
         result[key] = {"Y": counts.tolist(), "X": edges.tolist()}
     return result
Exemplo n.º 3
0
    def histogram_sequence_lengths(self, logy=True):
        """Plot a bar chart of the number of sequences per sequence length.

        .. plot::
            :include-source:

            from sequana import sequana_data
            from sequana import FastQC
            filename  = sequana_data("test.fastq", "testing")
            qc = FastQC(filename)
            qc.histogram_sequence_lengths()

        :param bool logy: if True (default), the bar heights are the log10
            of the counts; otherwise the raw counts are plotted.
        :raises ValueError: if ``self.sequences`` is empty (``max`` of an
            empty list).

        """
        data = [len(x) for x in self.sequences]
        longest = max(data)

        # One unit-wide bin per length: [0, 1), [1, 2), ..., [longest, longest+1].
        # The edges must go one past the longest length because np.histogram
        # closes the last bin on the right; with edges stopping at `longest`
        # the reads of length longest-1 and longest end up in the same bar.
        bary, barx = np.histogram(data, bins=range(longest + 2))

        # get rid of zeros to avoid warnings (log10(0)); zip stops at the
        # shorter sequence, so each count is paired with its left bin edge
        nonzero = [(x, y) for x, y in zip(barx, bary) if y != 0]
        bx = [x for x, _ in nonzero]
        by = [y for _, y in nonzero]
        if logy:
            pylab.bar(bx, pylab.log10(by))
        else:
            pylab.bar(bx, by)

        pylab.xlim([1, longest + 1])

        pylab.grid(True)
        pylab.xlabel("position (bp)", fontsize=self.fontsize)
        # Only advertise a log scale when the counts were actually transformed
        ylabel = "Count (log scale)" if logy else "Count"
        pylab.ylabel(ylabel, fontsize=self.fontsize)
Exemplo n.º 4
0
    def run(self,
            bins=50,
            xmin=0,
            xmax=30000,
            step=1000,
            burn=1000,
            alpha=1,
            output_filename=None):
        """Select simulated reads by accept/reject sampling and plot the result.

        Each read length of ``self.bam_simul`` is taken in turn as a
        candidate. It is accepted with probability
        ``min(alpha, target(candidate) / target(current))``, where ``target``
        is ``self.target_distribution`` and ``current`` is the last accepted
        length (initially the mean read length of ``self.bam``). The
        per-read decisions are stored in ``self.tokeep``.

        :param int bins: number of bins of the histogram of the input read
            lengths (stored in ``self.Y`` / ``self.X``) and of the plotted
            histogram of accepted lengths.
        :param xmin: start of the range on which the target curve is drawn.
        :param xmax: end (exclusive) of that range.
        :param step: spacing of the points of the target curve.
        :param int burn: number of initial accepted values left out of the
            plotted histogram.
        :param alpha: upper bound on the acceptance probability.
        :param output_filename: if not None, passed together with
            ``self.tokeep`` to ``self.bam_simul.filter_bool``.
        """
        # compute histogram of the input reads once for all to be used
        # in the target_distribution method
        self.bins = bins
        # `density=True` replaces the `normed` keyword, which has been
        # removed from numpy.histogram and now raises a TypeError.
        self.Y, self.X = np.histogram(self.bam.df.read_length,
                                      bins=bins,
                                      density=True)

        lengths = self.bam_simul.df.read_length.values
        self.tokeep = []
        vec = []
        x = self.bam.df.read_length.mean()
        for can in lengths:
            #acceptance probability
            aprob = min([
                alpha,
                self.target_distribution(can) / self.target_distribution(x)
            ])
            u = pylab.uniform(0, 1)
            if u < aprob:
                x = can
                vec.append(x)
                self.tokeep.append(True)
            else:
                self.tokeep.append(False)

        #plotting the results:
        #theoretical curve
        x = pylab.arange(xmin, xmax, step)
        y = self.target_distribution(x)
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(vec)
        pylab.subplot(212)

        # same keyword change as above: matplotlib's hist no longer accepts
        # `normed`
        pylab.hist(vec[burn:], bins=bins, density=True)
        pylab.plot(x, y, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is not None:
            self.bam_simul.filter_bool(output_filename, self.tokeep)
Exemplo n.º 5
0
    def run(
        self,
        bins=50,
        xmin=0,
        xmax=30000,
        step=1000,
        burn=1000,
        alpha=1,
        output_filename=None,
    ):
        """Accept or reject each simulated read length, then plot the outcome.

        The density histogram of the read lengths of ``self.bam`` is stored
        in ``self.Y`` / ``self.X``. Every read length of ``self.bam_simul``
        is then accepted with probability
        ``min(alpha, target(candidate) / target(current))``, ``target`` being
        ``self.target_distribution`` and ``current`` the last accepted value
        (the mean read length of ``self.bam`` to start with). One boolean per
        read is appended to ``self.tokeep``.

        :param bins: number of histogram bins.
        :param xmin: start of the range of the plotted target curve.
        :param xmax: end (exclusive) of that range.
        :param step: spacing between the points of the target curve.
        :param burn: number of leading accepted values excluded from the
            plotted histogram.
        :param alpha: cap on the acceptance probability.
        :param output_filename: when not None, forwarded with
            ``self.tokeep`` to ``self.bam_simul.filter_bool``.
        """
        # Histogram of the input reads, computed once so that the
        # target_distribution method can use it.
        self.bins = bins
        self.Y, self.X = np.histogram(
            self.bam.df.read_length, bins=bins, density=True
        )

        candidates = self.bam_simul.df.read_length.values
        self.tokeep = []
        accepted_lengths = []
        current = self.bam.df.read_length.mean()
        for candidate in candidates:
            ratio = (
                self.target_distribution(candidate)
                / self.target_distribution(current)
            )
            # acceptance probability, capped by alpha
            acceptance = min(alpha, ratio)
            draw = pylab.uniform(0, 1)
            keep = bool(draw < acceptance)
            if keep:
                current = candidate
                accepted_lengths.append(current)
            self.tokeep.append(keep)

        # Plot the results; the target curve is evaluated on a regular grid.
        grid = pylab.arange(xmin, xmax, step)
        target_curve = self.target_distribution(grid)

        # top panel: sequence of accepted values
        pylab.subplot(211)
        pylab.title('Metropolis-Hastings')
        pylab.plot(accepted_lengths)

        # bottom panel: histogram of accepted values against the target curve
        pylab.subplot(212)
        pylab.hist(accepted_lengths[burn:], bins=bins, density=1)
        pylab.plot(grid, target_curve, 'r-')
        pylab.ylabel('Frequency')
        pylab.xlabel('x')
        pylab.legend(('PDF', 'Samples'))

        if output_filename is None:
            return
        self.bam_simul.filter_bool(output_filename, self.tokeep)