def test_pacbio_random():
    """Exercise ``random_selection`` in its two calling modes.

    The first call requests a fixed number of reads, the second one passes
    an expected coverage together with a reference length. Each call writes
    to its own temporary file.
    """
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # Selection driven by an explicit number of reads.
    with TempFile() as output:
        subreads.random_selection(output.name, nreads=10)

    # Selection driven by expected coverage and reference length.
    with TempFile() as output:
        subreads.random_selection(
            output.name, expected_coverage=10, reference_length=10000
        )
def __init__(self, filename):
    """Remember the input path and open it with ``PacbioSubreads``.

    :param filename: path handed over to ``PacbioSubreads``.
    """
    # Starts empty; presumably filled later by a lazy accessor (not visible
    # from this method alone).
    self._df = None
    self.filename = filename
    self.bam = PacbioSubreads(self.filename)
class PacbioIsoSeqMappedIsoforms(object):
    """Load a SAM/BAM file of HQ isoforms mapped on a reference.

    The input is the SAM/BAM file generated with minimap2 when mapping the
    HQ isoforms on a reference.

    :attr:`df` contains one row for each read found in the SAM file (and
    hq_isoform). We populate the GC content, the mapping flag and the
    reference name (-1 means no mapping, i.e. flag == 4).

    A flag of 4 means unmapped and there is no ambiguity about it. In the
    data file example, the other flags are 0, 16 (SEQ being reverse
    complemented) and 2048 (supplementary segment).

    Example of minimap2 command::

        minimap2 -t 4 -ax splice -uf --secondary=no SIRV-E0.fa hq_isoforms.fasta 1> hq_isoforms.fasta.sam 2> hq_isoforms.fasta.sam.log

    Reads a SAM file for now. BAM should work as well.
    """

    def __init__(self, filename):
        """Open *filename* with ``PacbioSubreads``; :attr:`df` is built lazily.

        :param filename: path to the SAM/BAM file.
        """
        self.filename = filename
        self.bam = PacbioSubreads(self.filename)
        self._df = None

    @property
    def df(self):
        """Per-read dataframe enriched with mapping information.

        Built on first access and cached in ``self._df``. It is a copy of
        ``self.bam.df`` to which the columns ``reference_name`` (-1 when the
        read has no reference), ``flags``, ``mapq``, ``cigar`` and ``qname``
        are added, and from which the ``snr_*`` columns are dropped.
        ``full_length`` and ``non_full_length`` are added when they can be
        parsed from the read names.
        """
        if self._df is not None:
            return self._df

        # !! for isoseq, we should be able to load everything into memory
        self.bam.reset()
        data = list(self.bam.data)
        df = self.bam.df.copy()

        df['reference_name'] = [
            self.bam.data.get_reference_name(a.rname) if a.rname != -1 else -1
            for a in data
        ]
        df['flags'] = [a.flag for a in data]
        df['mapq'] = [a.mapq for a in data]
        df['cigar'] = [a.cigarstring for a in data]
        df['qname'] = [a.qname for a in data]

        # Drop SNR that are not populated in the mapped BAM file.
        df.drop(['snr_A', 'snr_C', 'snr_G', 'snr_T'], axis=1, inplace=True)

        # TODO. input could be based on mapping of CCS, in which case the ZMW
        # is stored and the following does not work. We could check the
        # pattern first: the code expects the second '/'-separated field of
        # the read name to look like fXXpXX (e.g. "f12p3" gives
        # full_length=12 and non_full_length=3).
        try:
            df["full_length"] = df["qname"].apply(
                lambda x: int(x.split('/')[1].split("p")[0].strip("f")))
            df["non_full_length"] = df["qname"].apply(
                lambda x: int(x.split("/")[1].split("p")[1].strip("f")))
        except (AttributeError, IndexError, ValueError):
            # Best effort: read names that do not follow the expected
            # pattern simply leave the column(s) out.
            pass

        self._df = df
        return self._df

    def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
        """Overlay histograms of the lengths of mapped and unmapped reads.

        :param bins: bin edges forwarded to ``pylab.hist``. When None, edges
            are spaced every 100 from 0 up to (and including) the largest
            ``reference_length``.
        """
        df = self.df
        if bins is None:
            # max() is a scalar, so it cannot be passed to len(); it is NaN
            # when no read carries a reference length.
            longest = df.reference_length.max()
            upper = 0 if pd.isna(longest) else int(longest)
            # Always produce at least one bin, and let the last bin cover
            # the longest read.
            bins = range(0, max(upper, 1) + 100, 100)

        mapped = df[df.reference_name != -1]
        unmapped = df[df.reference_name == -1]
        pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
                   label="mapped {}".format(len(mapped)), density=False)
        # NOTE(review): this used to read ``unmapped.reference``, a column
        # that does not exist. ``reference_length`` mirrors the mapped case;
        # if unmapped reads carry no reference length, another length
        # column may be what was intended here -- please confirm.
        pylab.hist(unmapped.reference_length, bins=bins, alpha=0.5,
                   label="unmapped {}".format(len(unmapped)), density=False)
        pylab.xlabel("Isoform length")
        pylab.legend()

    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        :param hide_unmapped: when True, reads whose reference name is -1
            are excluded.
        :return: the per-reference counts (a Series indexed by reference
            name); returned without plotting when empty.
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        selection = self.df.query(query)
        ts = selection.groupby("reference_name").count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        try:
            pylab.tight_layout()
        except Exception:
            # Purely cosmetic: a layout failure must not prevent returning
            # the counts.
            pass
        return ts

    def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
        """Histogram of the mapping qualities.

        :param logy: use a logarithmic Y axis.
        :param xmin: lower X-axis limit.
        :param xmax: upper X-axis limit.
        :param fontsize: font size of the X-axis label.
        """
        self.df.mapq.hist()
        if logy:
            pylab.semilogy()
        pylab.xlim([xmin, xmax])
        pylab.xlabel("Mapping quality", fontsize=fontsize)

    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count mapped reads per group of reference names.

        The group of a read is made of the first *shift* characters of its
        reference name.

        :param title: title of the figure (used only if *plot* is True).
        :param shift: number of leading characters defining a group.
        :param plot: draw a bar plot of the counts.
        :param mapq_min: keep only reads with a mapping quality strictly
            greater than this value.
        :return: a tuple (counts per group, full dataframe). The counts are
            an empty Series when no read is mapped.
        """
        aa = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aa) == 0:
            return pd.Series(), self.df

        aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
        # mapq_min must stay a local name: the query refers to it as
        # @mapq_min.
        mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
def test_pacbio_stride():
    """Exercise ``stride`` with and without the ``random`` option.

    Both calls use a stride of 2 and write to their own temporary file.
    """
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # Default mode.
    with TempFile() as output:
        subreads.stride(output.name, stride=2)

    # Same stride, with random=True.
    with TempFile() as output:
        subreads.stride(output.name, stride=2, random=True)
def test_pacbio():
    """Smoke test of ``PacbioSubreads`` on the example subreads BAM file.

    Checks the number of reads and the mean GC content, then calls the
    filtering, plotting, export and summary methods. The cached dataframe
    (``_df``) is reset before several calls so that each one starts from
    scratch.
    """
    subreads = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))
    assert len(subreads) == 130
    subreads.df
    # assert subreads.nb_pass[1] == 130
    with TempFile() as output:
        subreads.filter_length(output.name, threshold_min=500)
    print(subreads)  # check length

    assert subreads.stats['mean_GC'] > 62.46
    assert subreads.stats['mean_GC'] < 65.47
    subreads.summary()

    fresh = PacbioSubreads(sequana_data("test_pacbio_subreads.bam"))

    # hist_snr from scratch
    fresh._df = None
    fresh.hist_snr()

    # hist_read_length from scratch
    fresh._df = None
    fresh.hist_read_length()

    fresh.hist_nb_passes()
    fresh.get_mean_nb_passes()

    # hist_GC from scratch
    fresh._df = None
    fresh.hist_GC()

    # plot_GC_read_len from scratch
    fresh._df = None
    fresh.plot_GC_read_len()

    # exports from scratch
    fresh._df = None
    with TempFile() as output:
        fresh.to_fasta(output.name, threads=1)
    with TempFile() as output:
        fresh.to_fastq(output.name, threads=1)

    with TempFile() as output:
        fresh.save_summary(output.name)
class PacbioIsoSeqMappedIsoforms(object):
    """Load a SAM/BAM file of HQ isoforms mapped on a reference.

    The input is the SAM/BAM file generated with minimap2 when mapping the
    HQ isoforms on a reference.

    :attr:`df` contains one row for each read found in the SAM file (and
    hq_isoform). We populate the GC content, the mapping flag and the
    reference name (-1 means no mapping, i.e. flag == 4).

    A flag of 4 means unmapped and there is no ambiguity about it. In the
    data file example, the other flags are 0, 16 (SEQ being reverse
    complemented) and 2048 (supplementary segment).

    Example of minimap2 command::

        minimap2 -t 4 -ax splice -uf --secondary=no SIRV-E0.fa hq_isoforms.fasta 1> hq_isoforms.fasta.sam 2> hq_isoforms.fasta.sam.log

    Reads a SAM file for now. BAM should work as well.
    """

    def __init__(self, filename):
        """Open *filename* with ``PacbioSubreads``; :attr:`df` is built lazily.

        :param filename: path to the SAM/BAM file.
        """
        self.filename = filename
        self.bam = PacbioSubreads(self.filename)
        self._df = None

    @property
    def df(self):
        """Per-read dataframe enriched with mapping information.

        Built on first access and cached in ``self._df``. It is a copy of
        ``self.bam.df`` to which the columns ``reference_name`` (-1 when the
        read has no reference), ``flags``, ``mapq``, ``cigar`` and ``qname``
        are added, and from which the ``snr_*`` columns are dropped.
        ``full_length`` and ``non_full_length`` are added when they can be
        parsed from the read names.
        """
        if self._df is not None:
            return self._df

        # !! for isoseq, we should be able to load everything into memory
        self.bam.reset()
        data = list(self.bam.data)
        df = self.bam.df.copy()

        df['reference_name'] = [
            self.bam.data.get_reference_name(a.rname) if a.rname != -1 else -1
            for a in data
        ]
        df['flags'] = [a.flag for a in data]
        df['mapq'] = [a.mapq for a in data]
        df['cigar'] = [a.cigarstring for a in data]
        df['qname'] = [a.qname for a in data]

        # Drop SNR that are not populated in the mapped BAM file.
        df.drop(['snr_A', 'snr_C', 'snr_G', 'snr_T'], axis=1, inplace=True)

        # TODO. input could be based on mapping of CCS, in which case the ZMW
        # is stored and the following does not work. We could check the
        # pattern first: the code expects the second '/'-separated field of
        # the read name to look like fXXpXX (e.g. "f12p3" gives
        # full_length=12 and non_full_length=3).
        try:
            df["full_length"] = df["qname"].apply(
                lambda x: int(x.split('/')[1].split("p")[0].strip("f")))
            df["non_full_length"] = df["qname"].apply(
                lambda x: int(x.split("/")[1].split("p")[1].strip("f")))
        except (AttributeError, IndexError, ValueError):
            # Best effort: read names that do not follow the expected
            # pattern simply leave the column(s) out.
            pass

        self._df = df
        return self._df

    def hist_isoform_length_mapped_vs_unmapped(self, bins=None):
        """Overlay histograms of the lengths of mapped and unmapped reads.

        :param bins: bin edges forwarded to ``pylab.hist``. When None, edges
            are spaced every 100 from 0 up to (and including) the largest
            ``reference_length``.
        """
        df = self.df
        if bins is None:
            # max() is a scalar, so it cannot be passed to len(); it is NaN
            # when no read carries a reference length.
            longest = df.reference_length.max()
            upper = 0 if pd.isna(longest) else int(longest)
            # Always produce at least one bin, and let the last bin cover
            # the longest read.
            bins = range(0, max(upper, 1) + 100, 100)

        mapped = df[df.reference_name != -1]
        unmapped = df[df.reference_name == -1]
        pylab.hist(mapped.reference_length, bins=bins, alpha=0.5,
                   label="mapped {}".format(len(mapped)), density=False)
        # NOTE(review): this used to read ``unmapped.reference``, a column
        # that does not exist. ``reference_length`` mirrors the mapped case;
        # if unmapped reads carry no reference length, another length
        # column may be what was intended here -- please confirm.
        pylab.hist(unmapped.reference_length, bins=bins, alpha=0.5,
                   label="unmapped {}".format(len(unmapped)), density=False)
        pylab.xlabel("Isoform length")
        pylab.legend()

    def hist_transcript(self, hide_unmapped=True):
        """Bar plot of the number of reads per reference name.

        :param hide_unmapped: when True, reads whose reference name is -1
            are excluded.
        :return: the per-reference counts (a Series indexed by reference
            name); returned without plotting when empty.
        """
        pylab.clf()

        if hide_unmapped is True:
            query = "reference_length>0 and reference_name!=-1"
        else:
            query = "reference_length>0"

        print(query)
        selection = self.df.query(query)
        ts = selection.groupby("reference_name").count().reference_length
        if len(ts) == 0:
            print("nothing to plot")
            return ts

        ts.plot(kind="bar", color="r")
        try:
            pylab.tight_layout()
        except Exception:
            # Purely cosmetic: a layout failure must not prevent returning
            # the counts.
            pass
        return ts

    def bar_mapq(self, logy=True, xmin=0, xmax=60, fontsize=12):
        """Histogram of the mapping qualities.

        :param logy: use a logarithmic Y axis.
        :param xmin: lower X-axis limit.
        :param xmax: upper X-axis limit.
        :param fontsize: font size of the X-axis label.
        """
        self.df.mapq.hist()
        if logy:
            pylab.semilogy()
        pylab.xlim([xmin, xmax])
        pylab.xlabel("Mapping quality", fontsize=fontsize)

    def plot_sirv_by_group(self, title, shift=5, plot=False, mapq_min=-1):
        """Count mapped reads per group of reference names.

        The group of a read is made of the first *shift* characters of its
        reference name.

        :param title: title of the figure (used only if *plot* is True).
        :param shift: number of leading characters defining a group.
        :param plot: draw a bar plot of the counts.
        :param mapq_min: keep only reads with a mapping quality strictly
            greater than this value.
        :return: a tuple (counts per group, full dataframe). The counts are
            an empty Series when no read is mapped.
        """
        aa = self.df.query("reference_name not in [-1, '-1']").copy()
        if len(aa) == 0:
            return pd.Series(), self.df

        aa['group'] = aa.reference_name.apply(lambda x: x[0:shift])
        # mapq_min must stay a local name: the query refers to it as
        # @mapq_min.
        mapped = aa.query("mapq>@mapq_min").groupby("group").count()["mapq"]
        mapped.name = None

        if plot:
            mapped.plot(kind="bar")
            pylab.title(title)
            pylab.tight_layout()
        return mapped, self.df
""" read length histograms pacbio data ===================================== QC pacbio example """ ######################################## # First, let us get a data set example. # Note the .bam extension from sequana import sequana_data dataset = sequana_data("test_pacbio_subreads.bam") ############################################# # Create a :class:`sequana.pacbio.BAMPacbio` instance from sequana.pacbio import PacbioSubreads qc = PacbioSubreads(dataset) ######################################### # plot the histogram of read length qc.hist_read_length() ################################################# # plot the histogram of the SNRs for each base qc.hist_snr()