def get_bam_stats(filename):
    """Return the statistics of a BAM file as a transposed DataFrame.

    :param filename: path handed to :class:`sequana.BAM`.
    :return: the items returned by ``BAM.get_stats()`` stored in a
        :class:`pandas.Series`, converted to a frame and transposed so that
        each statistic becomes a column.
    """
    from sequana import BAM
    import pandas as pd

    reader = BAM(filename)
    as_series = pd.Series(reader.get_stats())
    # one column frame -> transpose to get one column per statistic
    return as_series.to_frame().T
class LAA_Assembly():
    """Build a new reference from amplicons mapped onto a known reference.

    Input is a SAM/BAM from the mapping of amplicon onto a known reference.
    Based on the position, we can construct the new reference.
    """

    def __init__(self, filename):
        """Open *filename* with :class:`BAM` and keep the reader in ``self.bam``."""
        self.bam = BAM(filename)

    def build_reference(self):
        """Concatenate the mapped reads into a single sequence.

        The alignment file is rewound and fully loaded (it is assumed to be
        small). Reads are sorted by their ``pos`` attribute; the first one is
        taken as is and each following read is appended after skipping the
        part that overlaps the previous read.

        :return: the assembled sequence. An empty string is returned when
            the alignment file holds no record.
        """
        self.bam.reset()

        # scan BAM file assuming it is small, keeping only data of interest
        reads = [{
            "name": a.query_name,
            "sequence": a.query_sequence,
            "cigar": a.cigarstring,
            "position": a.pos,
            "qstart": a.qstart,
            "qend": a.qend
        } for a in self.bam]

        # sort by starting position (list.sort is stable, so reads sharing a
        # position keep their file order)
        reads.sort(key=lambda read: read["position"])

        # Bound before the loop so that an empty file yields "" instead of
        # an UnboundLocalError on the final return.
        sequence = ""
        previous = None
        for read in reads:
            if previous is None:
                sequence = read["sequence"]
            else:
                L = len(previous["sequence"])
                end_position_pr = previous['position'] - previous['qstart'] + L

                # overlap between previous read and this one
                overlap = end_position_pr - (read['position'] - read['qstart'])

                # diagnostic output kept from the original implementation
                print(overlap)
                print(previous['position'], previous['qstart'], L,
                      end_position_pr)
                print(read['position'], read['qstart'])

                # NOTE(review): with ungapped reads the "+ 1" drops one base
                # more than the computed overlap -- confirm this is intended.
                sequence = sequence + read["sequence"][overlap + 1:]
            previous = read
        return sequence

    def save_fasta(self, filename, sequence=None):
        """Write a sequence in FASTA format under the header ``>test``.

        :param filename: output file, opened in text write mode.
        :param sequence: sequence to save. If None, the result of
            :meth:`build_reference` is used.
        """
        if sequence is None:
            sequence = self.build_reference()

        with open(filename, "w") as fout:
            fout.write(">test\n{}".format(sequence))
def find_motif(bamfile, motif="CAGCAG", window=200, savefig=False,
               local_th=5, global_th=10):
    """Scan the alignments of a BAM file for a repeated motif.

    For each alignment that has a query sequence, the motif is counted in a
    window of *window* bases starting at every position of the sequence.
    The score of the alignment is the number of positions whose count is
    strictly greater than *local_th*. The alignment is a hit when that
    score is strictly greater than *global_th*; the counts of each hit are
    then plotted on a cleared pylab figure.

    :param bamfile: input file handed to :class:`BAM`.
    :param motif: motif to count.
    :param window: length of the sliding window.
    :param savefig: if True, save the figure of each hit in a PNG file
        named after the reference name, the score and the query name.
    :param local_th: per-position threshold on the motif count.
    :param global_th: threshold on the score of an alignment.
    :return: tuple ``(alns, found, Ss)`` of three parallel lists: the
        alignments, whether each one is a hit, and their scores.
        Alignments without a query sequence are skipped.
    """
    b1 = BAM(bamfile)

    # FIND motif and create pictures
    found = []
    Ss = []
    alns = []
    for a in b1:
        seq = a.query_sequence
        if seq is None:
            continue

        X1 = [seq[i:i + window].count(motif) for i in range(len(seq))]
        S = sum(x > local_th for x in X1)
        Ss.append(S)
        # the original appended to an undefined name ("als"), which raised
        # a NameError on the first alignment
        alns.append(a)

        if S > global_th:
            found.append(True)
            off = a.query_alignment_start
            pylab.clf()
            pylab.plot(range(off + a.reference_start,
                             off + a.reference_start + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    a.reference_name, S, a.query_name.replace("/", "_")))
        else:
            found.append(False)

    return alns, found, Ss
def plot_specific_alignment(self, query_name, motif, clf=True,
                            windows=[10, 50, 100, 200, 500, 1000]):
    """Plot the motif counts of one alignment for several window sizes.

    The file ``self.bamfile`` is scanned entirely; if several records carry
    *query_name*, the last one is used. For each window size, one curve is
    plotted and the window size is printed together with a score.

    :param query_name: name of the alignment to look for.
    :param motif: motif counted in each sliding window.
    :param clf: if True, clear the current figure first.
    :param windows: window sizes to plot (one curve each).
    """
    found = None
    for record in BAM(self.bamfile):
        # no early exit: a later record with the same name replaces this one
        if record.query_name == query_name:
            found = record

    if not found:
        print("Not found")
        return

    # Detection
    seq = found.query_sequence
    if clf:
        pylab.clf()
    positions = range(len(seq))
    for window in windows:
        X = [seq[start:start + window].count(motif) for start in positions]
        pylab.plot(X, label=window)
        score = sum([x > window / 6 for x in X])
        print(window, score / 3.)
    pylab.legend()
    pylab.ylabel("# {} in a given sliding window".format(motif))
    pylab.title(query_name)
def plot_specific_alignment(self, bamfile, query_name, motif, clf=True,
                            show_figure=True, authorized_flags=[0, 16],
                            windows=[10, 50, 100, 150, 200, 250, 500, 1000],
                            local_threshold=5):
    """Score (and optionally plot) one alignment for several window sizes.

    The first record of *bamfile* whose name is *query_name* and whose flag
    is in *authorized_flags* is used.

    :param bamfile: input file handed to :class:`BAM`.
    :param query_name: name of the alignment to look for.
    :param motif: motif counted in each sliding window.
    :param clf: if True, clear the current figure first.
    :param show_figure: if True, plot one curve per window size and
        decorate the figure (legend, y label, title).
    :param authorized_flags: flags accepted for the alignment.
    :param windows: window sizes to consider.
    :param local_threshold: a position counts when its motif count is
        strictly greater than this value.
    :return: list with one value per window size (number of counted
        positions minus the window size); empty list if no alignment
        matches.
    """
    # we may have several entries. let us pick up the first
    found = next(
        (aln for aln in BAM(bamfile)
         if aln.query_name == query_name and aln.flag in authorized_flags),
        None)

    sizes = []
    if not found:
        print("{} Not found in {} file".format(query_name, bamfile))
        return sizes

    # Detection
    seq = found.query_sequence
    if clf:
        pylab.clf()
    positions = range(len(seq))
    for window in windows:
        X = [seq[start:start + window].count(motif) for start in positions]
        if show_figure:
            pylab.plot(X, label=window)
        score = sum([x > local_threshold for x in X])
        sizes.append(score - window)

    if show_figure:
        pylab.legend()
        pylab.ylabel("# {} in a given sliding window".format(motif))
        pylab.title(query_name)

    return sizes
def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
    """Store the contigs file together with an optional reference and BAM.

    The docstring of the original implementation gives these commands as
    a way to produce the BAM file::

        minimap2 -x map-pb reference filename -a > temp.sam
        bioconvert sam2bam temp.sam temp.bam

    :param filename: contigs file, forwarded to the parent constructor.
    :param reference: stored as is in ``self.reference``.
    :param bamfile: if set, opened with :class:`BAM` and stored in
        ``self.bam``; otherwise ``self.bam`` is None.
    :param mode: stored as is in ``self.mode``.
    """
    super(Contigs, self).__init__(filename)
    self.mode = mode
    self._df = None
    self.bam = BAM(bamfile) if bamfile else None
    self.reference = reference
def plot_alignment(self, bamfile, motif, window=200, global_th=10, title=None,
                   legend=True, legend_fontsize=11, valid_rnames=[],
                   valid_flags=[]):
    """Plot the motif counts of the alignments found in a BAM file.

    One curve is drawn per alignment that passes the filters and has a
    non-empty query sequence. The x axis starts at the reference start of
    the alignment.

    :param bamfile: input file handed to :class:`BAM`.
    :param motif: motif counted in each sliding window.
    :param window: length of the sliding window.
    :param global_th: not used by this method.
    :param title: optional title of the figure.
    :param legend: if True, add a legend when fewer than 15 curves are
        drawn.
    :param legend_fontsize: font size of the legend.
    :param valid_rnames: if not empty, keep only alignments whose ``rname``
        is in this list.
    :param valid_flags: if not empty, keep only alignments whose flag is in
        this list.
    """
    bam = BAM(bamfile)
    print("Found {} hits".format(len(bam)))
    pylab.clf()

    count = 0
    for aln in bam:
        rejected_rname = valid_rnames and aln.rname not in valid_rnames
        if rejected_rname:
            continue
        rejected_flag = valid_flags and aln.flag not in valid_flags
        if rejected_flag:
            continue

        seq = aln.query_sequence
        if not seq:
            continue

        count += 1
        length = len(seq)
        X1 = [seq[start:start + window].count(motif)
              for start in range(length)]
        first = aln.reference_start
        pylab.plot(range(first, first + length), X1, label=aln.query_name)

    print("Showing {} entries after filtering".format(count))

    # y axis limit: 1.2 times the window length divided by the motif length
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])
    if legend and count < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)
def _computation(self):
    """Open the BAM file and gather the data shown in the report.

    The reader is stored in ``self.bam``.

    :return: dictionary with two keys: ``alignment_count`` (length of the
        BAM reader) and ``flags`` (DataFrame with the columns ``meaning``
        and ``counter``, built from the column sums of
        ``BAM.get_flags_as_df()``).
    """
    self.bam = BAM(self.bam_input)

    results = {}
    results['alignment_count'] = len(self.bam)

    # first, we store the flags
    df = self.bam.get_flags_as_df().sum()
    df = df.to_frame()
    df.columns = ['counter']
    sf = SAMFlags()
    df['meaning'] = sf.get_meaning()
    df = df[['meaning', 'counter']]
    results['flags'] = df

    # The two plotting calls that used to follow the return statement were
    # unreachable and have been removed.
    return results
def bam_get_paired_distance(filename):
    """Return distance between 2 mated-reads

    The record that follows a mapped first-in-pair read (whose mate is
    mapped as well) is taken as its mate. If the file ends right after such
    a read, the scan stops and the distances gathered so far are returned.

    :param filename: input file handed to :class:`BAM`.
    :return: list of tuples where each tuple contains the position start,
        position end of the paired-end reads that were mapped + the mode.
        mode =1 means fragment is reversed. mode = 2 means mate is reversed.
        mode = 3 means none are reversed (both positions are then set
        to -1).

    ::

        distances = bam_get_paired_distance(bamfile)
        hist([x[1]-x[0] for x in distances])

    .. warning:: experimental

    """
    b = BAM(filename)
    distances = []

    for fragment in b:
        if not (fragment.is_unmapped is False
                and fragment.mate_is_unmapped is False
                and fragment.is_read1):
            continue

        # get the mate: a bare next(b) would leak StopIteration out of this
        # function when the read is the last record of the file
        mate = next(b, None)
        if mate is None:
            break

        if fragment.is_reverse:
            position2 = fragment.reference_end
            position1 = mate.reference_start
            mode = 1
        elif mate.is_reverse:
            position1 = fragment.reference_start
            position2 = mate.reference_end
            mode = 2
        else:
            # if both are not reversed, what does that mean.
            # On Hm2, this is the case for 4 pairs out of 1622
            # This seems to be a special case for fragment ends exactly
            # at the end of the reference and mate starts exactly at
            # the beginning with a length less than 100
            print(fragment.reference_start, fragment.reference_end)
            print(mate.reference_start, mate.reference_end)
            position1 = -1
            position2 = -1
            mode = 3

        distances.append((position1, position2, mode))

    return distances
def sniff(filename):
    """Open an alignment file with the reader matching its detected format.

    :param filename: input file; its type is guessed by
        :func:`sequana.sniffer.sniffer`.
    :return: a ``SAM``, ``BAM`` or ``CRAM`` instance built from *filename*.
    :raises ValueError: if the detected type is none of SAM, BAM or CRAM.
    """
    logger.info("Sniffing file {}".format(filename))

    from sequana import BAM, SAM, CRAM
    from sequana.sniffer import sniffer

    # detected type -> (reader class, message logged before opening)
    readers = {
        "SAM": (SAM, "Input data in SAM format"),
        "BAM": (BAM, "Input data in BAM format"),
        "CRAM": (CRAM, "Input data in CRAM format"),
    }

    datatype = sniffer(filename)
    if datatype not in readers:
        raise ValueError(
            "Your input file does not seem to be a valid SAM/BAM/CRAM file")

    reader, message = readers[datatype]
    logger.info(message)
    return reader(filename)
def find_motif(self, motif, window=200, figure=False, savefig=False):
    """Score every alignment of ``self.bamfile`` against a motif.

    For each alignment that has a query sequence, the motif is counted in a
    window of *window* bases starting at every position. The ``hit`` value
    is the number of positions whose count reaches
    ``self.local_threshold``. The number of rows with ``hit`` above 5 is
    printed before returning.

    :param motif: motif to count.
    :param window: length of the sliding window.
    :param figure: if True, plot the counts of the alignments whose hit
        value reaches ``self.global_threshold``.
    :param savefig: if True (and *figure* is True), save the figure after
        each plotted alignment.
    :return: DataFrame with the columns ``query_name``, ``hit``,
        ``length``, ``start`` and ``end``.
    """
    table = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }

    for aln in BAM(self.bamfile):
        seq = aln.query_sequence
        if seq is None:
            continue

        n_bases = len(seq)
        X1 = [seq[start:start + window].count(motif)
              for start in range(n_bases)]
        S = sum([x >= self.local_threshold for x in X1])

        table['query_name'].append(aln.query_name)
        table['start'].append(aln.reference_start)
        table['end'].append(aln.reference_end)
        table['length'].append(aln.rlen)
        table['hit'].append(S)

        if S < self.global_threshold:
            continue

        off = aln.query_alignment_start
        if figure:
            first = off + aln.reference_start
            pylab.plot(range(first, first + n_bases), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    aln.reference_name, S,
                    aln.query_name.replace("/", "_")))

    df = pd.DataFrame(table)
    print(len(df.query("hit>5")))
    return df
def plot_alignment(self, motif, window=200, global_th=10, title=None,
                   legend=True, legend_fontsize=11):
    """Plot the alignments that match the motif.

    The matching alignments are those listed by ``self._get_aligments``;
    one curve of motif counts is drawn for each of them that has a
    non-empty query sequence.

    :param motif: motif counted in each sliding window.
    :param window: length of the sliding window.
    :param global_th: forwarded to ``self._get_aligments``.
    :param title: optional title of the figure.
    :param legend: if True, add a legend when fewer than 15 curves are
        drawn.
    :param legend_fontsize: font size of the legend.
    :return: the DataFrame returned by ``self._get_aligments``.
    """
    df = self._get_aligments(motif=motif, window=window, global_th=global_th)
    print("Found {} hits".format(len(df)))

    bam = BAM(self.bamfile)
    pylab.clf()

    selected = df.query_name.values
    count = 0
    for aln in bam:
        if aln.query_name not in selected:
            continue
        seq = aln.query_sequence
        if not seq:
            continue

        count += 1
        n_bases = len(seq)
        X1 = [seq[start:start + window].count(motif)
              for start in range(n_bases)]
        first = aln.reference_start
        pylab.plot(range(first, first + n_bases), X1, label=aln.query_name)

    # y axis limit: 1.2 times the window length divided by the motif length
    max_theo = int(1.2 * window / len(motif))
    pylab.ylim([0, max_theo])
    if legend and count < 15:
        pylab.legend(fontsize=legend_fontsize)
    if title:
        pylab.title(title, fontsize=16)

    return df
def find_motif_bam(self, filename, motif, window=200, figure=False,
                   savefig=False, local_threshold=None, global_threshold=None):
    """Score every alignment of a BAM file against a motif.

    Each query sequence is scored with ``self.find_motif_from_sequence``.
    The number of rows with ``hit`` above 5 is printed before returning.

    :param filename: input file handed to :class:`sequana.BAM`.
    :param motif: motif to look for.
    :param window: length of the sliding window.
    :param figure: if True, plot the counts of the alignments whose score
        reaches the global threshold.
    :param savefig: if True (and *figure* is True), save the figure after
        each plotted alignment.
    :param local_threshold: forwarded to ``find_motif_from_sequence``.
    :param global_threshold: minimal score for an alignment to be plotted.
        If None, ``self.global_threshold`` is used.
    :return: DataFrame with the columns ``query_name``, ``hit``,
        ``length``, ``start`` and ``end``.
    """
    from sequana import BAM

    # The argument used to be silently ignored in favour of the attribute;
    # it is now honoured when provided.
    if global_threshold is None:
        global_threshold = self.global_threshold

    b1 = BAM(filename)
    df = {
        "query_name": [],
        "hit": [],
        "length": [],
        "start": [],
        "end": []
    }

    for a in b1:
        seq = a.query_sequence
        if seq is None:
            continue

        X1, S = self.find_motif_from_sequence(
            seq, motif, window=window, local_threshold=local_threshold)

        df['query_name'].append(a.query_name)
        df['start'].append(a.reference_start)
        df['end'].append(a.reference_end)
        df['length'].append(a.rlen)
        df['hit'].append(S)

        if S >= global_threshold and figure:
            first = a.query_alignment_start + a.reference_start
            pylab.plot(range(first, first + len(seq)), X1)
            if savefig:
                pylab.savefig("{}_{}_{}.png".format(
                    a.reference_name, S, a.query_name.replace("/", "_")))

    df = pd.DataFrame(df)
    L = len(df.query("hit>5"))
    print(L)
    return df
class Contigs(ContigsBase):
    """Statistics and plots about a set of contigs.

    An optional reference and an optional BAM file (contigs mapped onto the
    reference) can be attached; they are required by some of the methods.
    """

    def __init__(self, filename, reference=None, bamfile=None, mode="canu"):
        """Store the contigs file together with an optional reference and BAM.

        The docstring of the original implementation gives these commands
        as a way to produce the BAM file::

            minimap2 -x map-pb reference filename -a > temp.sam
            bioconvert sam2bam temp.sam temp.bam

        :param filename: contigs file, forwarded to the parent constructor.
        :param reference: stored in ``self.reference``; read as a FastA
            file by :meth:`bar_plot_contigs_length`.
        :param bamfile: if set, opened with :class:`BAM` and stored in
            ``self.bam``; otherwise ``self.bam`` is None.
        :param mode: if "canu", the number of reads and the covStat value
            are parsed from the FastA comments in :meth:`get_df`.
        """
        super(Contigs, self).__init__(filename)
        self.mode = mode
        self._df = None
        if bamfile:
            self.bam = BAM(bamfile)
        else:
            self.bam = None
        self.reference = reference

    def bar_plot_contigs_length(self):
        """Bar plot of the sorted contig lengths and reference lengths."""
        # show length of N contigs as compare to length of the reference
        fref = FastA(self.reference)
        Nref = len(fref.sequences)
        N = len(self.fasta)
        pylab.clf()
        pylab.bar(range(0, N, int(pylab.ceil(N / Nref))),
                  sorted(fref.lengths),
                  width=Nref / 1.1,
                  label="Plasmodium chromosomes")
        pylab.bar(range(0, N),
                  sorted(self.fasta.lengths),
                  width=1,
                  label="canu {} contigs".format(N))
        pylab.legend()

    def hist_plot_contig_length(self, bins=40, fontsize=16):
        """Plot distribution of contig lengths"""
        L = len(self.fasta.sequences)
        pylab.hist(self.fasta.lengths, lw=1, ec="k", bins=bins)
        pylab.grid()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("#", fontsize=fontsize)
        pylab.title("Distribution {} contigs".format(L))

    def get_df(self, window=100):
        """Build the table of contigs (GC, length, reads, covStat).

        A copy of the table is cached in ``self._df``.

        :param window: window forwarded to ``tools._base_content``.
        :return: DataFrame with the columns ``GC``, ``length``, ``name``,
            ``nread`` and ``covStat``, plus ``chromosome`` when a BAM file
            is attached. ``nread`` and ``covStat`` are filled only in
            "canu" mode and are 0 otherwise.
        """
        print("building GC content")
        data = tools._base_content(self.filename, window, "GC")
        names = self.fasta.names
        lengths = self.fasta.lengths
        GC = [np.nanmean(data[name]) for name in names]
        nreads = [0] * len(GC)
        covStats = [0] * len(GC)
        if self.mode == "canu":
            # the comments hold key=value fields such as reads=.. covStat=..
            for i, comment in enumerate(self.fasta.comments):
                fields = comment.split()
                read = [x for x in fields if x.startswith("reads")][0]
                covStat = [x for x in fields if x.startswith("covStat")][0]
                nreads[i] = int(read.split("=")[1])
                covStats[i] = float(covStat.split("=")[1])

        df = pd.DataFrame({
            "GC": list(GC),
            "length": lengths,
            "name": names,
            "nread": nreads,
            "covStat": covStats
        })

        # deal with the bamfile
        if self.bam:
            bam_df = self.bam.get_df()
            # set_index returns a new frame here: modifying the result of
            # query() in place triggers pandas' SettingWithCopyWarning
            bam_df = bam_df.query("flag in [0,16]").set_index("qname")
            chrom_name = bam_df.loc[self.fasta.names]["rname"]
            df["chromosome"] = list(chrom_name)
        self._df = df.copy()
        return df

    def plot_contig_length_vs_nreads(self, fontsize=16):
        """Log-log plot of the number of reads versus the contig length.

        A least-square line fitted on the contigs with more than 10 reads
        and a length above 100000 is overlaid.
        """
        # same as plot_scatter_contig_length_nread_cov
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()
        pylab.loglog(df.length, df.nread, "o")
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig N reads", fontsize=fontsize)
        pylab.grid()

        subset = df.query("nread>10 and length>100000")
        X = subset['length']
        Y = subset['nread']
        A = np.vstack([X, np.ones(len(X))]).T
        # Series.as_matrix() was removed in pandas 1.0; .values is
        # available in old and recent versions alike
        m, c = np.linalg.lstsq(A, Y.values)[0]
        x = np.array([m1, M1])
        pylab.plot(x, m * x + c, "o-r")
        pylab.tight_layout()

    def plot_scatter_contig_length_nread_cov(self,
                                             fontsize=16,
                                             vmin=0,
                                             vmax=50,
                                             min_nreads=20,
                                             min_length=50000):
        """Scatter plot of reads versus contig length coloured by covStat.

        :param fontsize: font size of the axis labels.
        :param vmin: lower bound of the colour scale.
        :param vmax: upper bound of the colour scale.
        :param min_nreads: contigs need strictly more reads than this value
            to enter the least-square fit.
        :param min_length: contigs need to be strictly longer than this
            value to enter the least-square fit.
        """
        if self._df is None:
            _ = self.get_df()
        pylab.clf()
        df = self._df

        m1 = df.length.min()
        M1 = df.length.max()

        # least square on the filtered contigs (the @names refer to the
        # arguments of this method)
        subset = df.query("nread>@min_nreads and length>@min_length")
        X = subset['length']
        Y = subset['nread']
        Z = subset['covStat']
        print(X)
        print(Y)
        print(Z)

        A = np.vstack([X, np.ones(len(X))]).T
        # Series.as_matrix() was removed in pandas 1.0
        m, c = np.linalg.lstsq(A, Y.values)[0]
        x = np.array([m1, M1])

        # the scatter plot itself shows all contigs
        X = df['length']
        Y = df['nread']
        Z = df['covStat']
        pylab.scatter(X, Y, c=Z, vmin=vmin, vmax=vmax)
        pylab.colorbar()
        pylab.xlabel("Contig length", fontsize=fontsize)
        pylab.ylabel("Contig reads", fontsize=fontsize)
        pylab.title("coverage function of contig length and reads used")
        pylab.grid()
        pylab.plot(x, m * x + c, "o-r")
        pylab.loglog()
        pylab.tight_layout()

    def get_contig_per_chromosome(self):
        """Return the contigs mapped on each reference sequence.

        :return: dictionary whose keys are the sorted ``rname`` values of
            the BAM file (flags 0 and 16 only) and whose values are
            DataFrames (``qname``, ``qlen``, ``rstart``, ``rend``) sorted
            by ``rstart``. None is returned (and a message printed) if no
            BAM file is attached.
        """
        if self.bam is None:
            print("no bam file found")
            return
        df = self.bam.get_df()
        df = df.query("flag in [0,16]")
        alldata = {}
        for chrom in sorted(df.rname.unique()):
            data = df.query("rname == @chrom").sort_values(by='rstart')[[
                "qname", "qlen", "rstart", "rend"
            ]]
            alldata[chrom] = data
        return alldata

    def stats(self):
        """Return the N50, L50 and total length of the contigs."""
        from sequana.stats import N50, L50
        length = self.get_df()['length']
        return {
            'N50': N50(length),
            'total_length': sum(length),
            'L50': L50(length)
        }

    def plot_contig_length_vs_GC(self):
        """Plot the GC content of the contigs versus their length."""
        # build the table once instead of once per axis
        df = self.get_df()
        pylab.plot(df["length"], df['GC'], "o")
class BAMQCModule(SequanaBaseModule):
    """Report dedicated to BAM file

    ::

        from sequana import sequana_data
        from sequana.modules_report.bamqc import BAMQCModule
        filename = sequana_data("test.bam")

        r = BAMQCModule(filename)
        r.create_html("test.html")

        # report/bam.html is now available

    .. todo:: right now, the computation is performed in the class. Ideally,
        we would like the computation to happen elsewhere, where a json is
        stored. The json would be the input to this class.
    """

    def __init__(self, bam_input, output_filename=None):
        """Build the sections of the report and create the HTML page.

        :param bam_input: input file handed to :class:`BAM`.
        :param output_filename: forwarded to ``create_html``.
        """
        super().__init__()
        self.bam_input = bam_input
        self.title = "Bam Report"
        self.create_report_content()
        self.create_html(output_filename)

    def create_report_content(self):
        """Fill ``self.sections`` with the flag and image sections."""
        self.sections = list()

        # the flag section must come first: it opens the BAM file
        # (self.bam) that the image section plots from
        self.add_flag_section()
        self.add_images_section()

    def _computation(self):
        """Open the BAM file and gather the data shown in the report.

        The reader is stored in ``self.bam``.

        :return: dictionary with two keys: ``alignment_count`` (length of
            the BAM reader) and ``flags`` (DataFrame with the columns
            ``meaning`` and ``counter``, built from the column sums of
            ``BAM.get_flags_as_df()``).
        """
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        # The two plotting calls that used to follow the return statement
        # were unreachable; the images are created in add_images_section.
        return results

    def add_flag_section(self):
        """Append the section holding the table of flags."""
        data = self._computation()
        df = data['flags']

        datatable = DataTable(df, "flags", index=True)
        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 'tB',
            "paging": "false",
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')
        html = "{} {}".format(html_tab, js)

        self.sections.append({
            "name": "Flags information",
            "anchor": "flags",
            "content": html
        })

    def add_images_section(self):
        """Append the section holding the three embedded plots.

        The plots are the bar plot of the flags (log and linear scale) and
        the bar plot of the MAPQ values. ``self.bam`` must have been set by
        :meth:`_computation` beforehand.
        """
        style = "width:65%"
        import pylab
        pylab.ioff()

        def plotter1(filename):
            self.bam.plot_bar_flags(logy=True, filename=filename)

        html1 = self.create_embedded_png(plotter1, "filename", style=style)

        def plotter2(filename):
            self.bam.plot_bar_flags(logy=False, filename=filename)

        html2 = self.create_embedded_png(plotter2, "filename", style=style)

        def plotter3(filename):
            self.bam.plot_bar_mapq(filename=filename)

        html3 = self.create_embedded_png(plotter3, "filename", style=style)

        self.sections.append({
            "name": "Image",
            "anchor": "table",
            "content": html1 + html2 + html3
        })
"""
BAM module example
====================

Plot histogram of MAPQ values contained in a BAM file

"""
#################################################
# First import the relevant modules
from sequana import BAM, sequana_data

#####################################################
# Get a data set (BAM file) for testing
bam_path = sequana_data('test.bam', "testing")

####################################################
# Use :class:`sequana.bamtools.BAM` class to plot the MAPQ histogram
bam = BAM(bam_path)
bam.plot_bar_mapq()
"""
BAM module example
====================

Plot histogram of MAPQ values contained in a BAM file

"""
#################################################
# first import the relevant modules
from sequana import BAM, sequana_data

#####################################################
# Get a data set (BAM file) for testing
bam_path = sequana_data('test.bam', "testing")

##########################################################################
# Use :class:`sequana.bamtools.BAM` class to plot the MAPQ histogram
bam = BAM(bam_path)
bam.plot_bar_mapq()
class BAMQCModule(SequanaBaseModule):
    """Report dedicated to BAM file

    ::

        from sequana import sequana_data
        from sequana.modules_report.bamqc import BAMQCModule
        filename = sequana_data("test.bam")

        r = BAMQCModule(filename)
        r.create_html("test.html")

        # report/bam.html is now available

    .. todo:: right now, the computation is performed in the class. Ideally,
        we would like the computation to happen elsewhere, where a json is
        stored. The json would be the input to this class.
    """

    def __init__(self, bam_input, output_filename=None):
        """Build the sections of the report and create the HTML page.

        :param bam_input: input file handed to :class:`BAM`.
        :param output_filename: forwarded to ``create_html``.
        """
        super().__init__()
        self.bam_input = bam_input
        self.title = "Bam Report"
        self.create_report_content()
        self.create_html(output_filename)

    def create_report_content(self):
        """Fill ``self.sections`` with the flag and image sections."""
        self.sections = list()

        # the flag section must come first: it opens the BAM file
        # (self.bam) that the image section plots from
        self.add_flag_section()
        self.add_images_section()

    def _computation(self):
        """Open the BAM file and gather the data shown in the report.

        The reader is stored in ``self.bam``.

        :return: dictionary with two keys: ``alignment_count`` (length of
            the BAM reader) and ``flags`` (DataFrame with the columns
            ``meaning`` and ``counter``, built from the column sums of
            ``BAM.get_flags_as_df()``).
        """
        self.bam = BAM(self.bam_input)

        results = {}
        results['alignment_count'] = len(self.bam)

        # first, we store the flags
        df = self.bam.get_flags_as_df().sum()
        df = df.to_frame()
        df.columns = ['counter']
        sf = SAMFlags()
        df['meaning'] = sf.get_meaning()
        df = df[['meaning', 'counter']]
        results['flags'] = df

        # The two plotting calls that used to follow the return statement
        # were unreachable; the images are created in add_images_section.
        return results

    def add_flag_section(self):
        """Append the section holding the table of flags."""
        data = self._computation()
        df = data['flags']

        datatable = DataTable(df, "flags", index=True)
        datatable.datatable.datatable_options = {
            'scrollX': '300px',
            'pageLength': 15,
            'scrollCollapse': 'true',
            'dom': 'tB',
            "paging": "false",
            'buttons': ['copy', 'csv']
        }
        js = datatable.create_javascript_function()
        html_tab = datatable.create_datatable(float_format='%.3g')
        html = "{} {}".format(html_tab, js)

        self.sections.append({
            "name": "Flags information",
            "anchor": "flags",
            "content": html
        })

    def add_images_section(self):
        """Append the section holding the three embedded plots.

        The plots are the bar plot of the flags (log and linear scale) and
        the bar plot of the MAPQ values. ``self.bam`` must have been set by
        :meth:`_computation` beforehand.
        """
        style = "width:65%"
        import pylab
        pylab.ioff()

        def plotter1(filename):
            self.bam.plot_bar_flags(logy=True, filename=filename)

        html1 = self.create_embedded_png(plotter1, "filename", style=style)

        def plotter2(filename):
            self.bam.plot_bar_flags(logy=False, filename=filename)

        html2 = self.create_embedded_png(plotter2, "filename", style=style)

        def plotter3(filename):
            self.bam.plot_bar_mapq(filename=filename)

        html3 = self.create_embedded_png(plotter3, "filename", style=style)

        self.sections.append({
            "name": "Image",
            "anchor": "table",
            "content": html1 + html2 + html3
        })
def summary(**kwargs):
    """Create a HTML report for various type of NGS formats.

    \b
    * bamqc
    * fastq

    This will process all files in the given pattern (in back quotes)
    sequentially and produce one HTML file per input file.

    Other modules all work in the same way. For example, for FastQ files::

        sequana summary one_input.fastq
        sequana summary `ls *fastq`

    The keyword arguments read by this function are ``name`` (sequence of
    input filenames) and ``module`` (one of "bamqc", "fasta", "fastq",
    "bam", "gff", or None to guess the module from the extension of the
    first filename). Any other module value is silently ignored.
    """
    names = kwargs['name']
    module = kwargs['module']

    if module is None:
        # guess the module from the extension of the first input only
        first = names[0]
        if first.endswith(('fastq.gz', '.fastq')):
            module = "fastq"
        elif first.endswith('.bam'):
            module = "bam"
        elif first.endswith(('.gff', 'gff3')):
            module = "gff"
        elif first.endswith(('fasta.gz', '.fasta')):
            module = "fasta"
        else:
            logger.error(
                "please use --module to tell us about the input fimes")
            sys.exit(1)

    if module == "bamqc":
        from sequana.modules_report.bamqc import BAMQCModule
        for name in names:
            print(f"Processing {name}")
            # NOTE(review): every input is rendered to the same
            # "bamqc.html"; with several inputs each report presumably
            # overwrites the previous one -- confirm.
            BAMQCModule(name, "bamqc.html")
    elif module == "fasta":
        # there is no module per se. Here we just call FastA.summary()
        from sequana.fasta import FastA
        for name in names:
            f = FastA(name)
            f.summary()
    elif module == "fastq":
        # there is no module per se. Here we just print FastQC.get_stats()
        from sequana import FastQC
        for filename in names:
            ff = FastQC(filename, max_sample=1e6, verbose=False)
            stats = ff.get_stats()
            print(stats)
    elif module == "bam":
        import pandas as pd
        from sequana import BAM
        for filename in names:
            ff = BAM(filename)
            stats = ff.get_stats()
            df = pd.Series(stats).to_frame().T
            print(df)
    elif module == "gff":
        import pandas as pd
        from sequana import GFF3
        for filename in names:
            ff = GFF3(filename)
            print("#filename: {}".format(filename))
            print("#Number of entries per genetic type:")
            print(ff.df.value_counts('type').to_string())
            print("#Number of duplicated attribute (if any) per attribute:")
            ff.get_duplicated_attributes_per_type()
def bam_to_mapped_unmapped_fastq(filename, output_directory=None, verbose=True):
    """Create mapped and unmapped fastq files from a BAM file

    :context: given a reference, one or two FastQ files are mapped onto the
        reference to generate a BAM file. This BAM file is a compressed
        version of a SAM file, which interpretation should be eased within
        this function.

    :param filename: input BAM file
    :param output_directory: where to save the mapped and unmapped files
    :param verbose: if True, show a progress bar
    :return: dictionary with number of reads for each file (mapped/unmapped
        for R1/R2) as well as the mode (paired or not), the number of
        unpaired reads, and the number of duplicated reads. The unpaired
        reads should be zero (sanity check)

    Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped.
    In the paired-end case, 4 files are created.

    Note that this function is efficient in that it does not create
    intermediate files limiting IO in the process. As compared to standard
    tools such as bedtools bamtofastq, it is 1.5 to 2X slower but it does
    create the mapped AND unmapped reads.

    :Details: Secondary alignment (flag 256) are dropped so as to remove
        any ambiguous alignments. The output dictionary stores "secondary"
        key to keep track of the total number of secondary reads that are
        dropped. If the flag is 256 and the read is unpaired, the key
        *unpaired* is also incremented.

        If the flag is not equal to 256, we first reverse complement reads
        that are tagged as *reverse* in the BAM file. Then, reads that are
        not paired or not "proper pair" (neither flag 4 nor flag 8) are
        ignored.

        If R1 is mapped **or** R2 is mapped then the reads are considered
        mapped. If both R1 and R2 are unmapped, then reads are unmapped.

    .. note:: about chimeric alignment: one is the representative and the
        other is the supplementary. This flag is not used in this function.
        Note also that chimeric alignment have same QNAME and flag 4 and 8

    .. note:: the contamination reported is based on R1 only. It is set to
        0 when no R1 read was written (e.g. empty BAM file).

    .. todo:: comments are missing since there are not stored in the BAM
        file.

    .. note:: the mapped reads may not be synchronized because we include
        also the chimeric alignment (cf samtools documentation). However,
        total reads = unmapped reads + R1 mapped + R2 mapped -
        supplementary reads (those with flag 2048).

    """
    import collections
    from contextlib import ExitStack

    bam = BAM(filename)

    # default prefix of the output files: input filename without extension
    newname, _ = os.path.splitext(filename)

    stats = collections.defaultdict(int)
    stats['R1_unmapped'] = 0
    stats['R1_mapped'] = 0

    # figure out where to save the file
    if output_directory is not None:
        assert isinstance(filename, str)
        from sequana.snaketools import FileFactory
        ff = FileFactory(filename)
        newname = output_directory + os.sep + ff.filenames[0]

    rt1 = "_R1_"
    rt2 = "_R2_"

    # number of records read; stays 0 for an empty BAM file
    n_entries = 0

    # ExitStack closes every output file, even if an exception is raised
    # while scanning the BAM file
    with ExitStack() as stack:
        R1_mapped = stack.enter_context(
            open(newname + "{}.mapped.fastq".format(rt1), "wb"))
        R1_unmapped = stack.enter_context(
            open(newname + "{}.unmapped.fastq".format(rt1), "wb"))
        stats['duplicated'] = 0
        stats['unpaired'] = 0

        # if paired, let open other files
        R2_mapped = R2_unmapped = None
        if bam.is_paired:
            stats['mode'] = "pe"
            stats['R2_unmapped'] = 0
            stats['R2_mapped'] = 0
            R2_mapped = stack.enter_context(
                open(newname + "{}.mapped.fastq".format(rt2), "wb"))
            R2_unmapped = stack.enter_context(
                open(newname + "{}.unmapped.fastq".format(rt2), "wb"))
        else:
            stats['mode'] = "se"

        # loop through the BAM (make sure it is rewinded)
        bam.reset()

        if verbose:
            from easydev import Progress
            pb = Progress(len(bam))

        for n_entries, this in enumerate(bam, start=1):
            if this.flag & 256:
                # A secondary alignment occurs when a given read could
                # align reasonably well to more than one place. One of the
                # possible reported alignments is termed "primary" and the
                # others will be marked as "secondary". They are dropped.
                stats['secondary'] += 1
                if this.is_paired is False:
                    stats['unpaired'] += 1
            else:
                # quick hack
                # NOTE(review): a read that is neither read1 nor read2
                # (single-end) gets the "/2" suffix -- confirm intended.
                suffix = b"/1" if this.is_read1 else b"/2"

                # reads tagged as reverse are reverse complemented (and
                # their quality reversed) to restore the original read
                if this.is_reverse is True:
                    sequence = reverse_complement(this.seq)
                    quality = this.qual[::-1]
                else:
                    sequence = this.seq
                    quality = this.qual

                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                txt += bytes(sequence, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(quality, "utf-8") + b"\n"

                # Here, we must be careful as to keep the pairs. So if R1
                # is mapped but R2 is unmapped (or the inverse), then the
                # pair is mapped
                if this.is_read1:
                    if this.is_unmapped and this.mate_is_unmapped:
                        R1_unmapped.write(txt)
                        stats['R1_unmapped'] += 1
                    else:
                        R1_mapped.write(txt)
                        stats['R1_mapped'] += 1
                elif this.is_read2:
                    if this.is_unmapped and this.mate_is_unmapped:
                        R2_unmapped.write(txt)
                        stats['R2_unmapped'] += 1
                    else:
                        R2_mapped.write(txt)
                        stats['R2_mapped'] += 1
                else:
                    # This should be a single read
                    stats['unpaired'] += 1
                    if this.is_unmapped:
                        R1_unmapped.write(txt)
                        stats['R1_unmapped'] += 1
                    else:
                        R1_mapped.write(txt)
                        stats['R1_mapped'] += 1

                if this.is_duplicate:
                    stats['duplicated'] += 1

            if verbose:
                pb.animate(n_entries)

    logger.info("\nNumber of entries in the BAM: %s" % str(n_entries))

    _x = stats['R1_mapped']
    _y = stats['R1_unmapped']
    # guard against a division by zero when no R1 read was written
    total_r1 = _x + _y
    stats["contamination"] = _x / float(total_r1) * 100 if total_r1 else 0.0

    return stats
def __init__(self, filename):
    """Open *filename* with :class:`BAM` and keep the reader in ``self.bam``.

    :param filename: input file handed to :class:`BAM`.
    """
    self.bam = BAM(filename)
def bam_to_mapped_unmapped_fastq(filename, output_directory=None, verbose=True):
    """Create mapped and unmapped fastq files from a BAM file

    :context: given a reference, one or two FastQ files are mapped onto the
        reference to generate a BAM file. This BAM file is a compressed
        version of a SAM file, which interpretation should be eased within
        this function.

    :param filename: input BAM file
    :param output_directory: where to save the mapped and unmapped files
    :param verbose: if True, show a progress bar and print the number of
        entries found in the BAM file
    :return: dictionary with number of reads for each file (mapped/unmapped
        for R1/R2) as well as the mode (paired or not), the number of
        unpaired reads, and the number of duplicated reads. The unpaired
        reads should be zero (sanity check)

    Given a BAM file, create FASTQ with R1/R2 reads mapped and unmapped.
    In the paired-end case, 4 files are created.

    Note that this function is efficient in that it does not create
    intermediate files limiting IO in the process. As compared to standard
    tools such as bedtools bamtofastq, it is 1.5 to 2X slower but it does
    create the mapped AND unmapped reads.

    :Details: Secondary alignment (flag 256) are dropped so as to remove
        any ambiguous alignments. The output dictionary stores "secondary"
        key to keep track of the total number of secondary reads that are
        dropped. If the flag is 256 and the read is unpaired, the key
        *unpaired* is also incremented.

        If the flag is not equal to 256, we first reverse complement reads
        that are tagged as *reverse* in the BAM file. Then, reads that are
        not paired or not "proper pair" (neither flag 4 nor flag 8) are
        ignored.

        If R1 is mapped **or** R2 is mapped then the reads are considered
        mapped. If both R1 and R2 are unmapped, then reads are unmapped.

    .. note:: about chimeric alignment: one is the representative and the
        other is the supplementary. This flag is not used in this function.
        Note also that chimeric alignment have same QNAME and flag 4 and 8

    .. note:: the contamination reported is based on R1 only. It is set to
        0 when no R1 read was written (e.g. empty BAM file).

    .. todo:: comments are missing since there are not stored in the BAM
        file.

    .. note:: the mapped reads may not be synchronized because we include
        also the chimeric alignment (cf samtools documentation). However,
        total reads = unmapped reads + R1 mapped + R2 mapped -
        supplementary reads (those with flag 2048).

    """
    import collections
    from contextlib import ExitStack

    bam = BAM(filename)

    # default prefix of the output files: input filename without extension
    newname, _ = os.path.splitext(filename)

    stats = collections.defaultdict(int)
    stats['R1_unmapped'] = 0
    stats['R1_mapped'] = 0

    # figure out where to save the file
    if output_directory is not None:
        assert isinstance(filename, str)
        from sequana.snaketools import FileFactory
        ff = FileFactory(filename)
        newname = output_directory + os.sep + ff.filenames[0]

    rt1 = "_R1_"
    rt2 = "_R2_"

    # number of records read; stays 0 for an empty BAM file
    n_entries = 0

    # ExitStack closes every output file, even if an exception is raised
    # while scanning the BAM file
    with ExitStack() as stack:
        R1_mapped = stack.enter_context(
            open(newname + "{}.mapped.fastq".format(rt1), "wb"))
        R1_unmapped = stack.enter_context(
            open(newname + "{}.unmapped.fastq".format(rt1), "wb"))
        stats['duplicated'] = 0
        stats['unpaired'] = 0

        # if paired, let open other files
        R2_mapped = R2_unmapped = None
        if bam.is_paired:
            stats['mode'] = "pe"
            stats['R2_unmapped'] = 0
            stats['R2_mapped'] = 0
            R2_mapped = stack.enter_context(
                open(newname + "{}.mapped.fastq".format(rt2), "wb"))
            R2_unmapped = stack.enter_context(
                open(newname + "{}.unmapped.fastq".format(rt2), "wb"))
        else:
            stats['mode'] = "se"

        # loop through the BAM (make sure it is rewinded)
        bam.reset()

        if verbose:
            from easydev import Progress
            pb = Progress(len(bam))

        for n_entries, this in enumerate(bam, start=1):
            if this.flag & 256:
                # A secondary alignment occurs when a given read could
                # align reasonably well to more than one place. One of the
                # possible reported alignments is termed "primary" and the
                # others will be marked as "secondary". They are dropped.
                stats['secondary'] += 1
                if this.is_paired is False:
                    stats['unpaired'] += 1
            else:
                # quick hack
                # NOTE(review): a read that is neither read1 nor read2
                # (single-end) gets the "/2" suffix -- confirm intended.
                suffix = b"/1" if this.is_read1 else b"/2"

                # reads tagged as reverse are reverse complemented (and
                # their quality reversed) to restore the original read
                if this.is_reverse is True:
                    sequence = reverse_complement(this.seq)
                    quality = this.qual[::-1]
                else:
                    sequence = this.seq
                    quality = this.qual

                txt = b"@" + bytes(this.qname, "utf-8") + suffix + b"\n"
                txt += bytes(sequence, "utf-8") + b"\n"
                txt += b"+\n"
                txt += bytes(quality, "utf-8") + b"\n"

                # Here, we must be careful as to keep the pairs. So if R1
                # is mapped but R2 is unmapped (or the inverse), then the
                # pair is mapped
                if this.is_read1:
                    if this.is_unmapped and this.mate_is_unmapped:
                        R1_unmapped.write(txt)
                        stats['R1_unmapped'] += 1
                    else:
                        R1_mapped.write(txt)
                        stats['R1_mapped'] += 1
                elif this.is_read2:
                    if this.is_unmapped and this.mate_is_unmapped:
                        R2_unmapped.write(txt)
                        stats['R2_unmapped'] += 1
                    else:
                        R2_mapped.write(txt)
                        stats['R2_mapped'] += 1
                else:
                    # This should be a single read
                    stats['unpaired'] += 1
                    if this.is_unmapped:
                        R1_unmapped.write(txt)
                        stats['R1_unmapped'] += 1
                    else:
                        R1_mapped.write(txt)
                        stats['R1_mapped'] += 1

                if this.is_duplicate:
                    stats['duplicated'] += 1

            if verbose:
                pb.animate(n_entries)

    if verbose:
        print("\nNumber of entries in the BAM: %s" % str(n_entries))

    _x = stats['R1_mapped']
    _y = stats['R1_unmapped']
    # guard against a division by zero when no R1 read was written
    total_r1 = _x + _y
    stats["contamination"] = _x / float(total_r1) * 100 if total_r1 else 0.0

    return stats