def filter_bam(input_bam, pore_c_table, output_bam, clean_read_name):
    """Copy the alignments flagged ``pass_filter`` in a pore-c table to a new BAM.

    The alignment index is read from the third ``:``-separated field of each
    query name and looked up in the ``align_idx`` column of the table.

    Parameters
    ----------
    input_bam
        Path of the BAM file to read.
    pore_c_table
        Parquet file providing the ``align_idx`` and ``pass_filter`` columns.
    output_bam
        Path of the BAM file to write.
    clean_read_name
        If true, written alignments keep only the first ``:``-separated
        field of their query name.

    Raises
    ------
    ValueError
        If the number of written alignments differs from the number of
        passing rows in the table.
    """
    from pysam import AlignmentFile

    aligns = pd.read_parquet(
        pore_c_table, engine=PQ_ENGINE, columns=["align_idx", "pass_filter"]
    ).set_index(["align_idx"])
    aligns = aligns[aligns["pass_filter"]]
    expected = len(aligns)
    passing_index = aligns.index

    counter = 0
    # Context managers close both handles (flushing the output BAM) even if
    # a malformed read name raises inside the loop.
    with AlignmentFile(input_bam, "rb") as inbam, \
            AlignmentFile(output_bam, "wb", template=inbam) as outbam:
        for align in inbam.fetch(until_eof=True):
            align_idx = int(align.query_name.rsplit(":")[2])
            if align_idx not in passing_index:
                continue
            if clean_read_name:
                align.query_name = align.query_name.split(":")[0]
            outbam.write(align)
            counter += 1

    if counter != expected:
        raise ValueError(
            f"Number of alignments doesn't match. Expected {expected} got {counter}"
        )
    logger.info(f"Wrote {counter} reads to {output_bam}")
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given cellular
    barcodes

    The kept alignments are written, with the input header, as SAM to
    stdout. ``barcodes`` is an iterable of strings; surrounding whitespace
    is stripped from each entry. Nothing is written when the detected
    annotations do not include "cellular".
    """
    from pysam import AlignmentFile

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations; the builtin
    # next() works on Python 2 and 3, unlike the Python-2-only .next() method
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)
    has_cellular = "cellular" in annotations

    for count, aln in enumerate(track, start=1):
        if not count % 100000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)

        if has_cellular:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
class BAMWriter:
    """Write alignments, given as sequences of SAM fields, to a BAM file."""

    def __init__(self, output, indexed_sequence_list, index_options):
        self.writer = AlignmentFile(
            output, 'wb',
            header=self.build_header(indexed_sequence_list, index_options))

    def __del__(self):
        # Close the underlying file when the wrapper is garbage collected.
        self.close()

    def close(self):
        """Close and discard the underlying writer; no-op if already closed."""
        if not hasattr(self, 'writer'):
            return
        self.writer.close()
        del self.writer

    def build_header(self, indexed_sequence_list, index_options):
        """Return the header dict: the given 'SQ' entries plus one minimap2 'PG' record."""
        program_record = {
            'ID': 'minimap2',
            'PN': 'minimap2',
            'CL': index_options,
            'DS': 'minimap2 invoked by poreplex',
        }
        return {'SQ': indexed_sequence_list, 'PG': [program_record]}

    def write(self, fields):
        """Join ``fields`` with tabs, parse the result as a SAM line and write it."""
        sam_line = '\t'.join(str(field) for field in fields)
        self.writer.write(
            AlignedSegment.fromstring(sam_line, self.writer.header))
def clean_sam(sam_fn, clean_sam_fn, orig_fq_fn, NB_MM):
    """Split a SAM file into kept alignments and discarded reads.

    Alignments for which ``delete_alignment(alignment, NB_MM)`` is true are
    dropped; all others are written to ``clean_sam_fn`` (BAM). Reads whose
    alignments were all dropped are fetched with
    ``alignments_to_seqs(names, orig_fq_fn)`` and written as FASTQ to
    ``<prefix>.cl.fq``. A summary is appended to ``<prefix>.SamCleaner.log``.
    ``<prefix>`` is ``sam_fn`` without its last seven characters.
    """
    reads_kept = set()
    reads_deleted = set()

    # Both alignment files are closed as soon as the filtering pass is done,
    # also when delete_alignment() raises.
    with AlignmentFile(sam_fn, 'r') as sam, \
            AlignmentFile(clean_sam_fn, 'wb', template=sam) as cleaned:
        for alignment in sam:
            if delete_alignment(alignment, NB_MM):
                reads_deleted.add(alignment.query_name)
            else:
                cleaned.write(alignment)
                reads_kept.add(alignment.query_name)

    # only add alignment to fastq if read involved hasn't been kept on a
    # cross-mapping loci
    keep_as_failed_list = [
        read for read in reads_deleted if read not in reads_kept
    ]
    seqs_failed = alignments_to_seqs(keep_as_failed_list, orig_fq_fn)

    prefix = sam_fn[:-7]

    ## Print discarded alignments in fastq
    fastq_output_fn = prefix + ".cl.fq"
    SeqIO.write(seqs_failed, fastq_output_fn, 'fastq')

    ## Print in log file
    # NOTE: the figures below count distinct read names, not alignments.
    cpt_treated = len(reads_kept) + len(reads_deleted)
    with open("{}.SamCleaner.log".format(prefix), 'a') as log_clean:
        log_clean.write('Number of alignments treated : {}\nNumber of alignment kept : {}\nNumber of alignments deleted : {}\n'.format(cpt_treated, len(reads_kept), len(reads_deleted)))
def umappedq2zero(bamdir):
    """
    Set the MAPQ of every unmapped alignment to zero in all BAM files of a
    directory.

    Each ``*.bam`` file directly inside ``bamdir`` is rewritten in place: all
    reads are copied (unmapped ones with MAPQ 0) to a temporary file next to
    the original, which then replaces the original. Exits with status 1 if
    ``bamdir`` does not exist.
    """
    if not os.path.exists(bamdir):
        sys.stderr.write("Sorry, but the specified directory does not exist.")
        sys.exit(1)

    bampaths = [
        os.path.join(bamdir, name)
        for name in os.listdir(bamdir)
        if name.endswith(".bam")
    ]

    for bam in bampaths:
        # The temporary file lives beside the original so that the rename
        # stays on one filesystem and cannot clash with an unrelated
        # "temp.bam" in the working directory.
        temp_path = bam + ".tmp"
        # Template is specified to maintain the same header information.
        with AlignmentFile(bam, "rb") as inbam, \
                AlignmentFile(temp_path, "wb", template=inbam) as outbam:
            for read in inbam.fetch(until_eof=True):
                if read.is_unmapped:
                    read.mapping_quality = 0
                outbam.write(read)  # Don't omit any reads!

        # Both handles are closed (output flushed) before the original is
        # overwritten with the new file with MAPQs set to zero.
        os.rename(temp_path, bam)
def bamtag(sam, umi_only):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags

    Read names are parsed with a ``...:CELL_<CB>:UMI_<MB>`` pattern (or
    ``...:UMI_<MB>`` when ``umi_only`` is true). The UMI is appended as an
    ``XR`` tag and, unless ``umi_only``, the cell barcode as an ``XC`` tag.
    The result is written as SAM with header to stdout. The input is opened
    as SAM when ``sam`` ends with ".sam" and as BAM otherwise.
    '''
    from pysam import AlignmentFile

    if umi_only:
        parser_re = re.compile('.*:UMI_(?P<MB>.*)')
    else:
        parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')

    start_time = time.time()

    sam_mode = 'r' if sam.endswith(".sam") else 'rb'
    sam_file = AlignmentFile(sam, mode=sam_mode)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # Pre-set so the summary below also works for an input without reads.
    count = 0
    # Counting from 1 makes ``count`` the number of alignments processed,
    # which is what the progress and summary messages report.
    for count, aln in enumerate(track, start=1):
        if not count % 100000:
            logger.info("Processed %d alignments.", count)

        match = parser_re.match(aln.qname)

        if not umi_only:
            aln.tags += [('XC', match.group('CB'))]

        aln.tags += [('XR', match.group('MB'))]
        out_file.write(aln)

    total_time = time.time() - start_time
    # Guard against a zero elapsed time on coarse clocks.
    alns_per_min = int(60. * count / total_time) if total_time else 0
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, alns_per_min))
def subset_bamfile(sam, barcodes):
    """
    Subset a SAM/BAM file, keeping only alignments from given cellular
    barcodes

    The kept alignments are written, with the input header, as SAM to
    stdout. ``barcodes`` is an iterable of strings; surrounding whitespace
    is stripped from each entry. Nothing is written when the detected
    annotations do not include "cellular".
    """
    from pysam import AlignmentFile

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations; the builtin
    # next() works on Python 2 and 3, unlike the Python-2-only .next() method
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)
    barcodes = set(barcode.strip() for barcode in barcodes)
    has_cellular = "cellular" in annotations

    for count, aln in enumerate(track, start=1):
        if not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)

        if has_cellular:
            cb = match.group('CB')
            if cb in barcodes:
                out_file.write(aln)
def main(
        sam: str,
        output: str,
        reference2taxid: str
) -> None:
    """Write row with taxid and classification status for each alignment.

    For every alignment in ``sam`` one tab-separated row
    ``<U|C> <query name> <taxid> 0|<query length>`` is written to ``output``
    ('U' and taxid 0 for unmapped reads, 'C' and the taxid looked up by
    reference name otherwise). Every alignment is also echoed as SAM to
    stdout.

    :param sam: path of the SAM file to read
    :param output: path of the TSV file to write
    :param reference2taxid: headerless two-column TSV mapping reference
        accession to taxid
    """
    # The input handle and the TSV are closed on exit, also on errors such
    # as a reference name missing from the mapping table.
    with AlignmentFile(sam, "r") as aln_infile:
        aln_outfile = AlignmentFile('-', "w", template=aln_infile)

        ref2taxid_df = pd.read_csv(
            reference2taxid,
            sep='\t',
            names=['acc', 'taxid'],
            index_col=0)

        with open(output, 'w+') as output_tsv:
            for aln in aln_infile.fetch(until_eof=True):
                mapped = 'U' if aln.is_unmapped else 'C'
                queryid = aln.query_name
                querylen = aln.query_length
                taxid = 0
                if not aln.is_unmapped:
                    taxid = ref2taxid_df.at[aln.reference_name, 'taxid']

                output_tsv.write(
                    '{mapped}\t{queryid}\t{taxid}\t0|{querylen}\n'.format(
                        mapped=mapped,
                        queryid=queryid,
                        taxid=taxid,
                        querylen=querylen
                    )
                )

                aln_outfile.write(aln)
def main(args):
    """Run the per-alignment checks on ``args.bam`` and write alignments to ``args.output``.

    Reads ``args.bam`` (BAM) and writes to ``args.output`` (BAM, same
    header). When ``args.log`` is set, a ``MappingStats`` object is created
    from it, passed to the check helpers and written at the end.
    """
    if args.log:
        log = MappingStats(args.log)
        #if args.gff3:
        #    log.add_gff3(args.gff3)
    else:
        log = None
    infile = AlignmentFile(args.bam, "rb")
    outfile = AlignmentFile(args.output, "wb", template=infile)
    for alignment in infile:
        # Tag name -> value lookup for this alignment.
        tags = {x[0]: x[1] for x in alignment.tags}
        # checking and counting special cases
        c1 = check_multi_location(alignment, tags, log)
        c2 = check_clipping(alignment, log)
        if log:
            log.count(alignment)
        check_barcode_is_off(alignment, tags, log)
        c3 = check_is_mapped(alignment, log)
        # NOTE(review): the two comments below say a true check result means
        # "failed", yet the branch taken on a true result records the read as
        # passing and writes it -- confirm the return convention of the
        # check_* helpers.
        # NOTE(review): ``log`` is None when args.log is not set, in which
        # case ``log.passing`` would raise AttributeError -- confirm whether
        # running without a log is supported.
        if c1 or c2 or c3:
            # if any of these checks fail (return true), the read will not be counted in mpileup
            # if they all pass count it as passing
            log.passing(alignment)
            # writing the filtered sam file
            outfile.write(alignment)
    # NOTE(review): neither alignment file is closed explicitly here.
    if args.log:
        log.write()
def remove_low_cellcount_reads(inbam, outbam, mincount, log):
    """
    This function takes a bam file with barcodes in the RG tag as input and
    outputs a bam file containing only barcodes that exceed the minimum
    number of aligments for a given barcode.

    The input is read twice: once to count alignments per read group
    (proper-pair first mates and unpaired reads only) and once to copy the
    alignments of read groups with at least ``mincount`` counted alignments.
    The output header keeps only those read groups. A three-row summary
    (below / above threshold, total) is written to ``log``.
    """
    # first parse the file to determine the per barcode
    # alignment counts
    with AlignmentFile(inbam, 'rb') as treatment:
        barcodecounts = {bc['ID']: 0 for bc in treatment.header['RG']}
        for aln in treatment.fetch(until_eof=True):
            if (aln.is_proper_pair and aln.is_read1) or not aln.is_paired:
                barcodecounts[aln.get_tag('RG')] += 1

    # log summary
    log_content = {
        'below_minbarcodecounts': 0,
        'above_minbarcodecounts': 0,
        'total': 0,
    }
    for bc_count in barcodecounts.values():
        log_content['total'] += bc_count
        if bc_count >= mincount:
            log_content['above_minbarcodecounts'] += bc_count
        else:
            log_content['below_minbarcodecounts'] += bc_count

    # Second pass; the context managers close both files even if a read
    # carries no RG tag or an RG that is missing from the header.
    with AlignmentFile(inbam, 'rb') as treatment:
        # make new header with the valid barcodes (to_dict() already returns
        # a fresh dict, so no extra copy is needed)
        header = treatment.header.to_dict()
        header['RG'] = [
            rg for rg in header['RG'] if barcodecounts[rg['ID']] >= mincount
        ]

        with AlignmentFile(outbam, 'wb', header=header) as bam_writer:
            for aln in treatment.fetch(until_eof=True):
                if barcodecounts[aln.get_tag('RG')] >= mincount:
                    bam_writer.write(aln)

    # write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
def deduplicate_reads(bamin, bamout, report, by_rg=True):
    """Deduplicate the reads of a bam file.

    A read is dropped when its (reference id, start, strand, template
    length) equals that of the previously retained read of the same group.
    If the reads carry an RG tag and by_rg=True, groups are defined by that
    tag; otherwise all reads form a single group.

    Parameters
    ----------
    bamin : str
        Sorted, indexed bamfile containing barcoded reads.
    bamout : str
        Output path to a bamfile that contains the deduplicated reads.
    report : str
        Output path of a tab-separated file with the total, retained and
        removed read counts.
    by_rg : boolean
        If True, the reads will be split by group tag.
    """
    log_counts = {'total': 0, 'retained': 0, 'removed': 0}

    # Signature of the last retained read, per group.
    last_barcode = {}

    # Both files are closed on exit so the output BAM is always finalised.
    with AlignmentFile(bamin, 'rb') as bamfile, \
            AlignmentFile(bamout, 'wb', template=bamfile) as output:
        for aln in bamfile.fetch():
            val = (aln.reference_id, aln.reference_start,
                   aln.is_reverse, aln.tlen)

            if by_rg and aln.has_tag('RG'):
                rg = aln.get_tag('RG')
            else:
                rg = 'dummy'

            log_counts['total'] += 1

            # if previous hash matches the current hash skip the read
            if rg in last_barcode and last_barcode[rg] == val:
                log_counts['removed'] += 1
                continue

            # The first read of a group lands here as well, so it is counted
            # as retained (it is written) rather than as removed.
            output.write(aln)
            last_barcode[rg] = val
            log_counts['retained'] += 1
            if (log_counts['retained'] % 1000000) == 0:
                print("Processed {}/{} total/removed reads".format(
                    log_counts['total'], log_counts['removed']))

    # write log file
    with open(report, 'w') as f:
        f.write('\tcounts\n')
        for icnt in log_counts:
            f.write('{}\t{}\n'.format(icnt, log_counts[icnt]))
def remove_chroms(bamin, bamout, rmchroms):
    """ Removes chromosomes from bam-file.

    A chromosome is removed when any entry of ``rmchroms`` occurs as a
    substring of its name. For example, rmchroms=['chrM', '_random']
    would remove 'chrM' as well as all random chromsomes.
    E.g. chr1_KI270706v1_random.

    Unmapped reads are dropped, as are reads whose own or mate reference is
    not among the remaining chromosomes.

    Parameters
    ----------
    bamin : str
        Input bam file.
    bamout : str
        Output bam file.
    rmchroms : list(str)
        List of chromosome names or name fragments to be removed.

    Returns
    -------
    None
    """
    # Both files are closed on exit, also when writing fails midway.
    with AlignmentFile(bamin, 'rb') as treatment:
        # to_dict() returns a fresh dict, so it can be modified directly.
        header = treatment.header.to_dict()
        header['SQ'] = [
            seq for seq in header['SQ']
            if not any(pattern in seq['SN'] for pattern in rmchroms)
        ]

        # Reference name -> index in the new header.
        tidmap = {seq['SN']: i for i, seq in enumerate(header['SQ'])}

        with AlignmentFile(bamout, 'wb', header=header) as bam_writer:
            # write new bam files containing only valid chromosomes
            for aln in treatment.fetch(until_eof=True):
                if aln.is_unmapped:
                    continue
                # Resolve both names against the old header before any id
                # is overwritten.
                refname = aln.reference_name
                nextname = aln.next_reference_name
                # NOTE(review): a read without a mate reference has
                # next_reference_name None and is therefore dropped here --
                # confirm this is intended for single-end data.
                if refname not in tidmap or nextname not in tidmap:
                    continue

                aln.reference_id = tidmap[refname]
                aln.next_reference_id = tidmap[nextname]
                bam_writer.write(aln)
def filter_reads(a, barcodes, out_bam_filename):
    """Write the reads of ``a`` whose CB tag is in ``barcodes`` to a BAM file.

    Parameters
    ----------
    a
        Iterable of aligned reads. When it exposes a ``header`` attribute
        (as an open alignment file does) that header is used for the output.
    barcodes
        Container of accepted CB tag values.
    out_bam_filename
        Path of the BAM file to write.
    """
    # A BAM writer needs a header; opening in 'wb' mode without one is
    # rejected by pysam. Fall back to the bare call only when the source
    # offers no header.
    header = getattr(a, 'header', None)
    if header is not None:
        outstream = AlignmentFile(out_bam_filename, 'wb', header=header)
    else:
        outstream = AlignmentFile(out_bam_filename, 'wb')

    count = 0
    try:
        for count, read in enumerate(a, start=1):
            if read.has_tag('CB') and read.get_tag('CB') in barcodes:
                outstream.write(read)
            # Report after the read is handled so the number is exact.
            if (count % 100000) == 0:
                print(f'Processed {count} reads')
    finally:
        # Always finalise the output, even if iteration fails.
        outstream.close()
    print(f'Processed {count} reads')
def extract_barcode(sam, barcode):
    """Print, as SAM with header on stdout, the mapped alignments of ``sam``
    whose read name carries the cell barcode ``barcode``.

    The barcode is taken from the ``CELL_<CB>:UMI_<MB>`` part of the name.
    """
    name_pattern = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)')
    source = AlignmentFile(sam, mode='r')
    sink = AlignmentFile("-", mode='wh', template=source)
    for record in source.fetch(until_eof=True):
        if record.is_unmapped:
            continue
        if name_pattern.match(record.qname).group('CB') == barcode:
            sink.write(record)
def extract_barcode(sam, barcode):
    """Write the mapped alignments tagged with one cell barcode to stdout.

    Read names are expected to contain ``CELL_<CB>:UMI_<MB>``; alignments
    whose ``CB`` part equals ``barcode`` are emitted as SAM (with header).
    """
    cell_umi_re = re.compile(".*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)")
    reader = AlignmentFile(sam, mode="r")
    writer = AlignmentFile("-", mode="wh", template=reader)

    mapped_reads = (
        aln for aln in reader.fetch(until_eof=True) if not aln.is_unmapped
    )
    for aln in mapped_reads:
        cell = cell_umi_re.match(aln.qname).group("CB")
        if cell != barcode:
            continue
        writer.write(aln)
def get_reads_with_bcs_sam(bc_file, sam_file, out_handle=stdout):
    """Copy reads from ``sam_file`` to ``out_handle`` depending on their barcode.

    Barcodes are loaded from ``bc_file``. A read is written when its barcode
    is not among them; reads for which a KeyError is raised while the
    barcode is extracted or the read is written are skipped silently.

    NOTE(review): as written, reads whose barcode IS in the list are the
    ones left out, which looks at odds with the function name -- confirm
    the intended direction of the filter.
    """
    listed_barcodes = parse_bc_list(bc_file)
    reader = open_samfile(sam_file)
    writer = SamFile(out_handle, "w", template=reader)

    for record in reader:
        try:
            barcode = get_bc_sam(record)
            if barcode not in listed_barcodes:
                writer.write(record)
        except KeyError:
            pass

    reader.close()
    writer.close()
def _sort(in_filename, out_filename):
    '''Custom sorts SAM file.

    All reads are loaded into memory and written, as SAM with header, in
    order of decreasing query length and then increasing reference start.
    Returns ``out_filename``.
    '''
    # Both files are closed on exit, including when sorting or writing fails.
    with AlignmentFile(in_filename, 'r') as sam_file:
        with AlignmentFile(out_filename, 'wh',
                           template=sam_file,
                           header=sam_file.header) as out_file:
            # sorted() consumes the file iterator directly; no intermediate
            # list is needed.
            for read in sorted(sam_file,
                               key=lambda x: (-x.query_length,
                                              x.reference_start)):
                out_file.write(read)

    return out_filename
def pairs_to_telbam(af_pairs: AlignmentFile, af_telbam: AlignmentFile):
    """Copy read pairs containing a telomere pattern to ``af_telbam``.

    Reads are consumed from ``af_pairs`` two at a time. Both reads of a pair
    are written when the sequence of either one contains ``TEL_PATS[0]`` or
    ``TEL_PATS[1]``.
    """
    def contains_tel_pattern(read):
        sequence = read.query_sequence
        return TEL_PATS[0] in sequence or TEL_PATS[1] in sequence

    reads = af_pairs.fetch(until_eof=True)
    for first in reads:
        # The partner is the next record of the same iterator.
        second = next(reads)
        # Short-circuit: the second read is inspected only if the first one
        # has no match.
        if contains_tel_pattern(first) or contains_tel_pattern(second):
            af_telbam.write(first)
            af_telbam.write(second)
def remove_idx_from_read_names(input_bam: Path):
    """
    Strip everything from the first ":" onwards from each read name,
    turning READNAME:ALIGN_IDX back into READNAME.

    Reads ``input_bam`` and writes the renamed alignments as BAM to stdout.
    The index is added in the first place because WhatsHap requires unique
    read names.
    """
    source = AlignmentFile(input_bam, "rb")
    sink = AlignmentFile("-", "wb", template=source)
    for record in source.fetch(until_eof=True):
        # Keep only the part before the first colon.
        record.query_name = record.query_name.partition(":")[0]
        sink.write(record)
    sink.close()
    source.close()
def downgrade_read_edges(in_fpath, out_fpath, read_start_size, read_end_size,
                         qual_to_substract=QUAL_TO_SUBSTRACT):
    """Write a BAM copy of ``in_fpath`` with downgraded read-edge qualities.

    Every read is passed through ``_downgrade_edge_qualities`` with the given
    edge sizes and ``qual_to_substract`` and then written to ``out_fpath``.

    Raises
    ------
    RuntimeError
        If a read already carries one of the "downgraded" tags.
    """
    # The context managers close both files -- finalising the output BAM --
    # on normal completion and when the RuntimeError below is raised.
    with AlignmentFile(in_fpath) as in_sam, \
            AlignmentFile(out_fpath, 'wb', template=in_sam) as out_sam:
        for aligned_read in in_sam:
            if (aligned_read.has_tag(LEFT_DOWNGRADED_TAG) or
                    aligned_read.has_tag(RIGTH_DOWNGRADED_TAG)):
                raise RuntimeError('Edge qualities already downgraded\n')

            _downgrade_edge_qualities(aligned_read, read_start_size,
                                      read_end_size,
                                      qual_to_substract=qual_to_substract)
            out_sam.write(aligned_read)
def add_idx_to_read_name(input_bam: Path):
    """
    Append a running index to every read name (READNAME -> READNAME:ALIGN_IDX)
    so that read names become 'unique'.

    Reads ``input_bam`` and writes the renamed alignments as BAM to stdout;
    the index starts at 0 and follows file order. WhatsHap requires unique
    read names.
    """
    source = AlignmentFile(input_bam, "rb")
    sink = AlignmentFile("-", "wb", template=source)
    for idx, record in enumerate(source.fetch(until_eof=True)):
        record.query_name = ":".join((record.query_name, str(idx)))
        sink.write(record)
    sink.close()
    source.close()
def bam(args, logger):
    """Shift the start coordinate of every alignment of a BAM file.

    Reads ``args.input``, adds to each alignment's reference start the delta
    found for that position among the modifications built from ``args.mod``,
    and writes the result to ``args.output`` with a header updated from
    ``args.chrom_sizes``. CIGAR strings are left untouched. Alignments for
    which the lookup raises IndexError or TypeError are reported on stdout
    and not written. Exits if ``args.chrom_sizes`` is not set.
    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")
    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)
    try:
        curr_chrom = ""
        for line in input_:
            chrom = input_.references[line.reference_id]
            if chrom != curr_chrom:
                # Reload the per-chromosome tables only when the chromosome
                # changes.
                curr_chrom = chrom
                positions, deltas = get_positions_and_deltas(chrom_mods,
                                                             curr_chrom,
                                                             logger)
            try:
                start_delta = find_delta(positions,
                                         deltas,
                                         int(line.reference_start))
                # The result is unused, but the call stays: an IndexError or
                # TypeError raised here makes the read get reported and
                # skipped below.
                build_modification_index(positions,
                                         deltas,
                                         line,
                                         start_delta)
                line.reference_start = int(line.reference_start) + start_delta
                output.write(line)
            # Single-argument print(...) produces identical output under
            # Python 2 and is valid syntax under Python 3.
            except IndexError:
                print("IndexError:  %s" % (line,))
            except TypeError:
                print("TypeError: %s" % (line,))
    finally:
        # Always finalise the output BAM and release the input.
        output.close()
        input_.close()
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags

    The annotations present in the read names are detected from the first
    alignment. Depending on them, the cell barcode is appended as ``XC``,
    the UMI as ``RX`` and the sample barcode as ``XS``. The result is
    written as SAM with header to stdout.
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations; the builtin
    # next() behaves the same on Python 2 and 3, so no version switch is
    # needed
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    # (SAM tag, regex group) pairs to add, fixed once for the whole file.
    tag_groups = [
        (tag, group)
        for annotation, tag, group in (("cellular", 'XC', 'CB'),
                                       ("molecular", 'RX', 'MB'),
                                       ("sample", 'XS', 'SB'))
        if annotation in annotations
    ]

    for count, aln in enumerate(track, start=1):
        if not count % 1000000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)

        if tag_groups:
            # One assignment instead of re-encoding the tag list per tag.
            aln.tags += [(tag, match.group(group))
                         for tag, group in tag_groups]

        out_file.write(aln)

    total_time = time.time() - start_time
    # Guard against a zero elapsed time on coarse clocks.
    alns_per_min = int(60. * count / total_time) if total_time else 0
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(total_time, alns_per_min))
    logger.info("Processed %d alignments." % count)
def bamtag(sam):
    ''' Convert a BAM/SAM with fastqtransformed read names to have UMI and
    cellular barcode tags

    The annotations present in the read names are detected from the first
    alignment. Depending on them, the cell barcode is appended as ``XC``,
    the UMI as ``RX`` and the sample barcode as ``XS``. The result is
    written as SAM with header to stdout.
    '''
    from pysam import AlignmentFile

    start_time = time.time()

    sam_file = open_bamfile(sam)
    out_file = AlignmentFile("-", "wh", template=sam_file)
    track = sam_file.fetch(until_eof=True)

    # peek at first alignment to determine the annotations; the builtin
    # next() works on Python 2 and 3, unlike the Python-2-only .next() method
    queryalignment = next(track)
    annotations = detect_alignment_annotations(queryalignment)
    track = itertools.chain([queryalignment], track)

    re_string = construct_transformed_regex(annotations)
    parser_re = re.compile(re_string)

    # (SAM tag, regex group) pairs to add, fixed once for the whole file.
    tag_groups = [
        (tag, group)
        for annotation, tag, group in (("cellular", 'XC', 'CB'),
                                       ("molecular", 'RX', 'MB'),
                                       ("sample", 'XS', 'SB'))
        if annotation in annotations
    ]

    for count, aln in enumerate(track, start=1):
        if not count % 100000:
            logger.info("Processed %d alignments." % count)

        match = parser_re.match(aln.qname)

        if tag_groups:
            # One assignment instead of re-encoding the tag list per tag.
            aln.tags += [(tag, match.group(group))
                         for tag, group in tag_groups]

        out_file.write(aln)

    total_time = time.time() - start_time
    # Guard against a zero elapsed time on coarse clocks.
    alns_per_min = int(60. * count / total_time) if total_time else 0
    logger.info('BAM tag conversion done - {:.3}s, {:,} alns/min'.format(
        total_time, alns_per_min))
    logger.info("Processed %d alignments." % count)
def deduplicate_reads(bamin, bamout, tag='CB'):
    """Performs deduplication within barcodes/cells.

    A read is dropped when its (reference id, start, strand, template
    length) equals that of the previously written read of the same barcode.

    Parameters
    ----------
    bamin : str
        Position sorted, indexed input bamfile.
    bamout : str
        Output file containing deduplicated reads.
    tag : str or callable
        Indicates the barcode tag or custom function to extract the barcode.
        Default: 'CB'

    Returns
    -------
    None
    """
    # Signature of the last written read, per barcode.
    last_barcode = {}
    barcoder = Barcoder(tag)

    # Both files are closed on exit so the output BAM is always finalised.
    with AlignmentFile(bamin, 'rb') as bamfile, \
            AlignmentFile(bamout, 'wb', template=bamfile) as output:
        for aln in bamfile.fetch():
            val = (aln.reference_id, aln.reference_start,
                   aln.is_reverse, aln.tlen)
            barcode = barcoder(aln)

            # if previous hash matches the current hash skip the read
            if barcode in last_barcode and last_barcode[barcode] == val:
                continue

            # First read of a barcode, or a new position for a known one.
            output.write(aln)
            last_barcode[barcode] = val
def atac(args, logger):
    """Shift alignment coordinates of a BAM file, anchoring on the 5' end.

    Reads ``args.input`` and writes ``args.output`` with a header updated
    from ``args.chrom_sizes``. For forward-strand reads the delta found for
    the reference start is added to the start. For reverse-strand reads the
    delta found for the reference end is added to the end, and the new start
    is that shifted end minus the length of the read sequence. Exits if
    ``args.chrom_sizes`` is not set.
    """
    if not args.chrom_sizes:
        exit("Chrom sizes required for bam conversion")
    chrom_mods = build_transform(args.mod, logger)
    input_ = AlignmentFile(args.input, 'rb')
    header = update_header(input_.header.as_dict(), args.chrom_sizes)
    output = AlignmentFile(args.output, 'wb', header=header)
    try:
        curr_chrom = ""
        for line in input_:
            chrom = input_.references[line.reference_id]
            if chrom != curr_chrom:
                # Reload the per-chromosome tables only when the chromosome
                # changes.
                curr_chrom = chrom
                positions, deltas = get_positions_and_deltas(
                    chrom_mods, curr_chrom, logger)

            if not line.is_reverse:
                start_delta = find_delta(positions, deltas,
                                         int(line.reference_start))
                line.reference_start = int(line.reference_start) + start_delta
            else:
                end_delta = find_delta(positions, deltas,
                                       int(line.reference_end))
                mapped_end = int(line.reference_end) + end_delta
                # Uses the sequence length, not line.reference_length.
                line.reference_start = mapped_end - len(line.seq)
            output.write(line)
    finally:
        # Always finalise the output BAM and release the input.
        output.close()
        input_.close()
def run_project_alignments(args):
    """ Project mapped sam file

    Every record of ``args.sam`` is written to ``args.out_sam``. Records
    with a chromosome get their start replaced by the position obtained from
    ``convert_position_on_haplotype_to_position_on_linear_ref``; records
    without one are copied unchanged and counted as unmapped.
    ``args.chromosomes`` is a comma-separated list of the chromosomes whose
    path files are loaded.
    """
    sam = args.sam
    chromosomes = args.chromosomes.split(",")
    graph_dir = args.data_dir

    linear_ref_paths = {}
    haplotype_paths = {}

    logging.info("Reading linear paths")
    for chromosome in tqdm(chromosomes):
        linear_ref_paths[chromosome] = NumpyIndexedInterval.from_file(
            graph_dir + chromosome + "_linear_pathv2.interval")
        haplotype_paths[chromosome] = NumpyIndexedInterval.from_file(
            args.linear_paths_base_name + "_" + chromosome + ".intervalcollection.indexed")

    logging.info("Converting")
    n_unmapped = 0
    # The header template and the output are both closed on exit, so the
    # output is flushed even if a projection fails.
    with AlignmentFile(sam) as header_template, \
            AlignmentFile(args.out_sam, "w", template=header_template) as out_sam:
        for sam_record in tqdm(read_sam(sam), total=number_of_lines_in_file(sam)):
            chromosome = sam_record.chromosome
            if chromosome is None:
                out_sam.write(sam_record.pysam_object)
                n_unmapped += 1
                continue
            projected_start = convert_position_on_haplotype_to_position_on_linear_ref(
                linear_ref_paths[chromosome],
                haplotype_paths[chromosome],
                sam_record.start)
            sam_record.set_start(projected_start)
            out_sam.write(sam_record.pysam_object)

    logging.info("%d sam records missed chromosome (unmapped)" % n_unmapped)
print("Alignment of query %s (length %d) : [%d - %d) to %s (length %d) : %s - %s. Identity: %.3f. Length on query (reference): %d (%d). Reverse: %r" % \ (r.query_name, init_length, query_alignment_start, query_alignment_end, \ alignment.get_reference_name(r.reference_id), alignment.lengths[r.reference_id], \ reference_start_str, reference_end_str, \ idy * 100., r.query_alignment_length, r.reference_length, r.is_reverse), \ file=sys.stderr) print( "%s\t%d\t%d\t%d\t%r\t%s\t%d\t%d\t%d\t%.3f" % (r.query_name, init_length, query_alignment_start, query_alignment_end, r.is_reverse, alignment.get_reference_name( r.reference_id), alignment.lengths[r.reference_id], r.reference_start, r.reference_end, idy * 100.)) if args.filtered: out_alignment.write(r) if args.bed: if r.reference_length < 2 * args.bed_trim: print("WARN reference span %d to small for used trim %d" % (r.reference_length, args.bed_trim), file=sys.stderr) else: print("%s\t%d\t%d\t%s" % (alignment.get_reference_name( r.reference_id), r.reference_start + args.bed_trim, r.reference_end - args.bed_trim, r.query_name), file=out_bed) if args.filtered: out_alignment.close()
def remove_chroms(inbam, outbam, rmchroms, log):
    """
    This function takes a bam-file and outputs a bam-file in which the
    specified chromosomes have been removed.

    A chromosome is removed when any entry of ``rmchroms`` occurs as a
    substring of its name. For example, rmchroms=['chrM', '_random']
    would remove 'chrM' as well as all random chromsomes.
    E.g. chr1_KI270706v1_random.

    The output header contains only the remaining 'SQ' entries. Unmapped
    reads are dropped. A tab-separated summary (one row per ``rmchroms``
    entry plus 'remaining', 'unmapped' and 'total') is written to ``log``.
    """
    with AlignmentFile(inbam, 'rb') as treatment:
        header = treatment.header

        new_chroms = []
        # tid_map is to reindex the chromosomes in the new bam file;
        # -1 marks a removed chromosome.
        tid_map = [-1] * len(header['SQ'])
        chr_to_remove_reason = {}

        # make new header with valid chromsomes
        for i, seq in enumerate(header['SQ']):
            # First pattern that matches this chromosome name, if any.
            reason = next(
                (chrom for chrom in rmchroms if chrom in seq['SN']), None)
            if reason is None:
                tid_map[i] = len(new_chroms)
                new_chroms.append(seq)
            else:
                chr_to_remove_reason[seq['SN']] = reason

        # A set gives constant-time membership tests in the read loop.
        kept_names = {seq['SN'] for seq in new_chroms}
        new_header = {'SQ': new_chroms}

        log_content = {chrom: 0 for chrom in rmchroms}
        log_content['remaining'] = 0
        log_content['unmapped'] = 0
        log_content['total'] = 0

        with AlignmentFile(outbam, 'wb', header=new_header) as bam_writer:
            # write new bam files containing only valid chromosomes
            for aln in treatment.fetch(until_eof=True):
                log_content['total'] += 1
                if aln.is_unmapped:
                    log_content['unmapped'] += 1
                    continue

                if aln.reference_name not in kept_names:
                    log_content[chr_to_remove_reason[aln.reference_name]] += 1
                    continue

                aln.reference_id = tid_map[aln.reference_id]
                # Remap the mate reference for every read that has one, not
                # only for proper pairs: an id left in the old numbering
                # would point at the wrong (or no) sequence of the new
                # header. It becomes -1 if the mate's chromosome was removed.
                mate_tid = aln.next_reference_id
                if mate_tid >= 0:
                    aln.next_reference_id = tid_map[mate_tid]
                bam_writer.write(aln)
                log_content['remaining'] += 1

    # write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
def remove_low_mapq_reads(inbam, outbam, minmapq, log):
    """
    This function takes a bam file and it produces a new bam file with
    aligments with a minimum mapping quality.

    For paired-end data, only alignments where both mates are mapped and
    reach the threshold are retained; mates are matched by read name. A
    tab-separated summary (below / above threshold, total) is written to
    ``log``.
    """
    log_content = {'below_mapq': 0, 'above_mapq': 0, 'total': 0}

    # Read name -> first mate seen, or None if that mate failed the filter.
    waiting_for_pair = {}

    # Both files are closed on exit so the output BAM is always finalised.
    with AlignmentFile(inbam, 'rb') as treatment, \
            AlignmentFile(outbam, 'wb', template=treatment) as bam_writer:
        for aln in treatment.fetch(until_eof=True):
            log_content['total'] += 1

            if not aln.is_paired:
                # single end
                if aln.mapping_quality >= minmapq:
                    bam_writer.write(aln)
                    log_content['above_mapq'] += 1
                else:
                    log_content['below_mapq'] += 1
                continue

            valid = aln.mapping_quality >= minmapq and not aln.is_unmapped
            # Mates share the read name. (The reference id is not usable as
            # a key: it is shared by every read on the same chromosome.)
            name = aln.query_name

            if name not in waiting_for_pair:
                # for the first mate that we encounter we get here and keep
                # the aln in the waiting list if it is valid
                if valid:
                    waiting_for_pair[name] = aln
                else:
                    # None marks an invalid first mate
                    waiting_for_pair[name] = None
                    log_content['below_mapq'] += 1
                continue

            # for the second mate that we encounter we get here; popping
            # clears the waiting list entry to save memory
            first_mate = waiting_for_pair.pop(name)
            if valid and first_mate is not None:
                # both mates satisfy the min mapq threshold and are mapped.
                # write them into the output file
                bam_writer.write(first_mate)
                bam_writer.write(aln)
                log_content['above_mapq'] += 2
            elif first_mate is None:
                # the first mate was already counted when it was rejected
                log_content['below_mapq'] += 1
            else:
                # the first mate was held back uncounted, so the failed pair
                # accounts for both reads
                log_content['below_mapq'] += 2

    # write log file
    with open(log, 'w') as f:
        f.write('Readgroup\tcounts\n')
        for icnt in log_content:
            f.write('{}\t{}\n'.format(icnt, log_content[icnt]))
def _processs_bam_file(bam_fname, metrics, mapq_th, skipped, segmentation=None, gap_th=1000000):
    """
    Extract data from BAM file into chunks of genome.

    This is a generator. Reads are fetched chromosome by chromosome, grouped
    per strand by cross-link position and barcode, and handed out in chunks
    once the sliding-window start has moved past their position. Counters on
    ``metrics`` are reset at the start and reported through the logger once
    all chromosomes are processed.

    Parameters
    ----------
    bam_fname : str
        BAM file with mapped reads.
    metrics : iCount.Metrics
        Metrics object for storing analysis metadata.
    mapq_th : int
        Ignore hits with MAPQ < mapq_th.
    skipped : str
        Output BAM file to store reads that do not map as expected by
        segmentation and reference genome sequence. If read's second start
        does not fall on any of segmentation borders, it is considered
        problematic. If segmentation is not provided, every read in two parts
        with gap longer than gap_th is not used (skipped). All such reads are
        reported to the user for further exploration.
    segmentation : str
        File with segmentation (obtained by ``iCount segment``).
    gap_th : int
        Reads with gaps less than gap_th are treated as if they have no gap.

    Yields
    ------
    tuple
        ``((chrom, strand), progress, reads)`` where ``strand`` is '+' or
        '-', ``progress`` is the processed fraction of the genome (rounded to
        four decimals, at most 1.0) and ``reads`` maps cross-link position to
        a dict from barcode to the list of read data.

    """
    metrics.all_recs = 0  # All records
    metrics.notmapped_recs = 0  # Not mapped records
    metrics.mapped_recs = 0  # Mapped records
    metrics.lowmapq_recs = 0  # Records with insufficient quality
    metrics.used_recs = 0  # Records used in analysis (all - unmapped - lowmapq)
    metrics.invalidrandomer_recs = 0  # Records with invalid randomer
    metrics.norandomer_recs = 0  # Records with no randomer
    metrics.bc_cn = {}  # Barcode counter
    metrics.strange_recs = 0  # Strange records (not expected by segmentation)
    # NOTE(review): the randomer, barcode and strange-record counters are only
    # initialised and reported in this function; presumably _get_read_data
    # (which receives ``metrics``) updates them -- confirm.

    def finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
        """Yield appropriate data.

        Moves every pending position smaller than ``start`` out of the two
        pending dicts (they are modified in place) and yields the moved
        entries, forward strand first, skipping a strand with nothing to
        hand out.
        """
        reads_to_process_fwd = {}
        # Iterate over a copy of the keys because entries are popped.
        for pos in list(reads_pending_fwd):
            if pos < start:
                reads_to_process_fwd[pos] = reads_pending_fwd.pop(pos)
        if reads_to_process_fwd:
            yield ((chrom, '+'), progress, reads_to_process_fwd)

        reads_to_process_rev = {}
        for pos in list(reads_pending_rev):
            if pos < start:
                reads_to_process_rev[pos] = reads_pending_rev.pop(pos)
        if reads_to_process_rev:
            yield ((chrom, '-'), progress, reads_to_process_rev)

    # Sum of the lengths of the chromosomes already completed.
    genome_done = 0
    ann_data = None
    LOGGER.info('Detecting cross-links...')
    with AlignmentFile(bam_fname, 'rb') as bamfile:
        # NOTE(review): this writer is not closed explicitly in this function.
        strange_bam = AlignmentFile(skipped, 'wb', header=bamfile.header)
        genome_size = sum([contig['LN'] for contig in bamfile.header['SQ']])
        for chrom in bamfile.references:
            chrom_len = bamfile.header['SQ'][bamfile.get_tid(chrom)]['LN']
            if segmentation:
                # pylint: disable=protected-access
                ann_data = iCount.genomes.segment._prepare_segmentation(segmentation, chrom)
            reads_pending_fwd = {}
            reads_pending_rev = {}
            read = None
            for read in bamfile.fetch(chrom):
                metrics.all_recs += 1
                if read.is_unmapped:
                    metrics.notmapped_recs += 1
                    continue

                metrics.mapped_recs += 1
                if read.mapping_quality < mapq_th:
                    metrics.lowmapq_recs += 1
                    continue

                metrics.used_recs += 1
                rdata = _get_read_data(
                    read, metrics, mapq_th, segmentation=ann_data, gap_th=gap_th)
                # The first four items describe the read, the rest is payload.
                (xlink_pos, barcode, is_strange, strand), read_data = rdata[0:4], rdata[4:]

                if is_strange:
                    strange_bam.write(read)
                else:
                    if strand == '+':
                        reads_pending_fwd.setdefault(
                            xlink_pos, {}).setdefault(barcode, []).append(read_data)
                    else:
                        reads_pending_rev.setdefault(
                            xlink_pos, {}).setdefault(barcode, []).append(read_data)

                # Sliding window start (smaller coordinate)
                start = 0 if read is None else (0 if not read.positions else read.positions[0])
                progress = round(min((genome_done + start) / genome_size, 1.0), 4)
                for data in finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
                    yield data

            # End of chromosome: hand out everything that is still pending.
            start = chrom_len
            progress = round(min((genome_done + start) / genome_size, 1.0), 4)
            for data in finalize(reads_pending_fwd, reads_pending_rev, start, chrom, progress):
                yield data

            genome_done += chrom_len

    # Report:
    LOGGER.info('All records in BAM file: %d', metrics.all_recs)
    LOGGER.info('Reads not mapped: %d', metrics.notmapped_recs)
    LOGGER.info('Mapped reads records (hits): %d', metrics.mapped_recs)
    LOGGER.info('Hits ignored because of low MAPQ: %d', metrics.lowmapq_recs)
    LOGGER.info('Records used for quantification: %d', metrics.used_recs)
    LOGGER.info('Records with invalid randomer info in header: %d', metrics.invalidrandomer_recs)
    LOGGER.info('Records with no randomer info: %d', metrics.norandomer_recs)
    LOGGER.info('Ten most frequent randomers:')
    top10 = sorted(
        [(count, barcode) for barcode, count in metrics.bc_cn.items()], reverse=True)[:10]
    for count, barcode in top10:
        LOGGER.info(' %s: %d', barcode, count)
    LOGGER.info('There are %d reads with second-start not falling on segmentation. They are '
                'reported in file: %s', metrics.strange_recs, skipped)
import sys
from pysam import AlignmentFile
from argparse import ArgumentParser

# Keep confidently spliced reads: MAPQ of at least 10, more than 50
# reference bases skipped in total (CIGAR 'N') and every aligned ('M')
# segment at least 6 bases long. Input and output default to stdin/stdout;
# the output is SAM with header.

valid_spliced_reads=0
problem_reads=0

parser = ArgumentParser()
parser.add_argument('infile', nargs='?', default='-')
parser.add_argument('outfile', nargs='?', default='-')
args = parser.parse_args()

infile = AlignmentFile(args.infile, 'r')
outfile = AlignmentFile(args.outfile, 'wh', template=infile)

for read in infile:
    if read.mapping_quality < 10:
        continue

    splice_len = 0
    min_edge = 1e6

    # cigartuples is None for a read without a CIGAR string; treat that as
    # "no operations" so the read is simply not reported.
    for cig_op, cig_len in read.cigartuples or ():
        if cig_op == 3:  # N
            splice_len += cig_len
        elif cig_op == 0:  # M
            min_edge = min(min_edge, cig_len)

    if splice_len > 50 and min_edge >= 6:
        outfile.write(read)
        valid_spliced_reads += 1
        # Report only when the counter has just reached a multiple of
        # 100000, so the message is not repeated for reads that are skipped.
        if valid_spliced_reads % 100000 == 0:
            sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )

sys.stderr.write("%d valid, %d problematic spliced reads\n" % (valid_spliced_reads, problem_reads) )

# Flush the output and release the input.
outfile.close()
infile.close()
class Writer(Thread):
    """
    Thread that consumes ``(read, result)`` pairs from ``iterator`` and writes
    every non-empty sequence either as FASTQ (``mode == 'wfq'``) or as an
    alignment record through a pysam ``AlignmentFile`` opened on ``fd``.

    For each written read a row is appended to the summary CSV and
    ``(read_id, samples)`` is appended to ``self.log``.
    """

    def __init__(self, mode, iterator, aligner, fd=sys.stdout, duplex=False, ref_fn=None, groups=None, group_key=None):
        """
        :param mode: pysam write mode, or ``'wfq'`` for FASTQ output
        :param iterator: iterable yielding ``(read, result_dict)`` pairs
        :param aligner: object exposing ``seq_names`` and ``seq(name)``; may
            be falsy, in which case the header carries no references
        :param fd: destination handed to ``AlignmentFile`` / ``write_fastq``
        :param duplex: when true, each ``read`` is indexed as a pair
        :param ref_fn: forwarded as ``reference_filename``
        :param groups: forwarded to ``sam_header``
        :param group_key: suffix used in the ``RG:Z`` tag
        """
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.duplex = duplex
        self.aligner = aligner
        self.iterator = iterator
        self.fastq = mode == 'wfq'
        self.group_key = group_key

        if aligner:
            reference_names = aligner.seq_names
            reference_lengths = [
                len(aligner.seq(name)) for name in aligner.seq_names
            ]
        else:
            reference_names = []
            reference_lengths = []

        # In FASTQ mode the file is still opened (plain 'w') but without a
        # SAM header; the records themselves go through write_fastq in run().
        self.output = AlignmentFile(
            fd, 'w' if self.fastq else self.mode, add_sam_header=not self.fastq,
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=reference_names,
                reference_lengths=reference_lengths,
                text=sam_header(groups),
            )
        )

    def run(self):
        """Drain the iterator, writing records and summary rows."""
        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, res in self.iterator:

                seq = res['sequence']
                qstring = res.get('qstring', '*')
                mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = res.get('mapping', False)
                mods_tags = res.get('mods', [])

                if self.duplex:
                    samples = len(read[0].signal) + len(read[1].signal)
                    read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
                else:
                    samples = len(read.signal)
                    read_id = read.read_id

                # NOTE(review): in duplex mode `read` is indexed as a pair
                # above, yet run_id / tagdata() are taken from `read` itself;
                # confirm the duplex iterator yields an object supporting both.
                tags = [
                    f'RG:Z:{read.run_id}_{self.group_key}',
                    f'qs:i:{round(mean_qscore)}',
                    *read.tagdata(),
                    *mods_tags,
                ]

                if len(seq):
                    if self.mode == 'wfq':
                        write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                    else:
                        self.output.write(
                            AlignedSegment.fromstring(
                                sam_record(read_id, seq, qstring, mapping, tags=tags),
                                self.output.header
                            )
                        )
                    if self.duplex:
                        summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                    else:
                        summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                    self.log.append((read_id, samples))

                else:
                    # Logger.warn is a deprecated alias of Logger.warning.
                    logger.warning("> skipping empty sequence %s", read_id)
def call_family_consensus(
        current_family_forward_size: int,
        current_family_reverse_size: int,
        current_family_size: int,
        debug: bool,
        debug_family_ids: List[str],
        debug_family_location: str,
        family_file_prefix: str,
        family_index: int,
        input_bam: pysam.AlignmentFile,
        output_bam: pysam.AlignmentFile,
        synthetic_read_prefix: str,
        temp_bam_filename_forward: str,
        temp_bam_filename_reverse: str,
        calling_method: str,
) -> None:
    """
    Call consensus read for family identifying if collapse is required and
    selecting if forward or reverse orientation should be used.

    Families with more than one read are collapsed with ``call_consensus``
    using the orientation that has more reads (forward wins ties). A family
    of size one or less has the first read of its temp file copied to the
    output unchanged (forward file if it holds exactly one read, otherwise
    the reverse file).

    Debug files are only saved for families that were actually collapsed,
    because a singleton family has no consensus read to record.

    :param current_family_forward_size: number of reads in forward orientation for this family
    :param current_family_reverse_size: number of reads in reverse orientation for this family
    :param current_family_size: total family size
    :param debug: debug mode
    :param debug_family_ids: families for which to generate debug files
    :param debug_family_location: location where to save the debug files
    :param family_file_prefix: prefix of family files
    :param family_index: index for the family
    :param input_bam: input bam file (opened)
    :param output_bam: output bam files
    :param synthetic_read_prefix: prefix for synthetic reads
    :param temp_bam_filename_forward: temp filename of file with forward reads
    :param temp_bam_filename_reverse: temp filename of file with reverse reads
    :param calling_method: method for base calling
    :return: None
    """
    # Stays None when no consensus is called, so the debug step below can
    # tell the two cases apart instead of hitting an unbound local.
    new_read = None

    if current_family_size > 1:
        if current_family_forward_size >= current_family_reverse_size:
            consensus_source = temp_bam_filename_forward
        else:
            consensus_source = temp_bam_filename_reverse

        new_read = call_consensus(
            consensus_source,
            new_read_name=f'{synthetic_read_prefix}{family_index}',
            # NOTE(review): the forward-based name is used for the sorted
            # temp file in both orientations, as in the original code;
            # confirm this is intended for the reverse case.
            temp_sorted_filename=f'{temp_bam_filename_forward}.sorted.bam',
            calling_method=calling_method)
        output_bam.write(new_read)
    else:
        if current_family_forward_size == 1:
            single_read_source = temp_bam_filename_forward
        else:
            single_read_source = temp_bam_filename_reverse

        # copy read, could cache last read and avoid re-opening file
        with pysam.AlignmentFile(single_read_source, "rb") as family_file:
            first_read = next(family_file)
            output_bam.write(first_read)

    # save information about specific families
    # NOTE(review): family_index is annotated int while debug_family_ids is
    # List[str]; confirm callers pass matching types or this never matches.
    if debug and new_read is not None and family_index in debug_family_ids:
        save_family_debug(debug_family_location, family_file_prefix,
                          family_index, input_bam, new_read,
                          temp_bam_filename_forward,
                          temp_bam_filename_reverse)
class CTCWriter(Thread):
    """
    CTC writer process that writes output numpy training data.

    Reads ``(read, ctc_data)`` pairs from ``iterator``. Reads whose mapping
    meets the coverage and accuracy thresholds, and whose reference span
    contains no ``N``, are written as alignment records and collected. On
    completion ``chunks.npy``, ``references.npy`` and
    ``reference_lengths.npy`` are saved.
    """

    def __init__(
            self, mode, iterator, aligner, fd=sys.stdout, min_coverage=0.90,
            min_accuracy=0.99, ref_fn=None, groups=None, group_key=None,
    ):
        """
        :param mode: pysam write mode (``'wfq'`` opens plain ``'w'`` without a SAM header)
        :param iterator: iterable yielding ``(read, ctc_data_dict)`` pairs
        :param aligner: object exposing ``seq_names`` and ``seq(...)``
        :param fd: destination handed to ``AlignmentFile``
        :param min_coverage: minimum ``(q_en - q_st) / len(seq)`` to keep a read
        :param min_accuracy: minimum ``mlen / blen`` to keep a read
        :param ref_fn: forwarded as ``reference_filename``
        :param groups: forwarded to ``sam_header``
        :param group_key: stored on the instance
        """
        super().__init__()
        self.fd = fd
        self.log = []
        self.mode = mode
        self.aligner = aligner
        self.iterator = iterator
        self.group_key = group_key
        self.min_coverage = min_coverage
        self.min_accuracy = min_accuracy
        self.output = AlignmentFile(
            fd, 'w' if self.mode == 'wfq' else self.mode, add_sam_header=self.mode != 'wfq',
            reference_filename=ref_fn,
            header=AlignmentHeader.from_references(
                reference_names=aligner.seq_names,
                reference_lengths=[len(aligner.seq(name)) for name in aligner.seq_names],
                text=sam_header(groups),
            )
        )

    def run(self):
        """Filter the incoming reads, write records and save the training arrays."""
        chunks = []
        targets = []
        lengths = []

        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, ctc_data in self.iterator:

                seq = ctc_data['sequence']
                qstring = ctc_data['qstring']
                mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = ctc_data.get('mapping', False)

                self.log.append((read.read_id, len(read.signal)))

                # `mapping` is False when the key is absent (the .get default)
                # and may be None; neither has alignment attributes, so both
                # must be skipped before mapping.q_en etc. are touched.
                if len(seq) == 0 or mapping is None or mapping is False:
                    continue

                cov = (mapping.q_en - mapping.q_st) / len(seq)
                acc = mapping.mlen / mapping.blen
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

                if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                    continue

                self.output.write(
                    AlignedSegment.fromstring(
                        sam_record(read.read_id, seq, qstring, mapping),
                        self.output.header
                    )
                )
                summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                if mapping.strand == -1:
                    refseq = mappy.revcomp(refseq)

                # Encode A/C/G/T as 1/2/3/4 (code points 65/67/71/84).
                target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})]
                targets.append(target)
                chunks.append(read.signal)
                lengths.append(len(target))

        if len(chunks) == 0:
            sys.stderr.write("> no suitable ctc data to write\n")
            return

        chunks = np.array(chunks, dtype=np.float16)
        # Targets are right-padded with zeros up to the longest target.
        targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
        for idx, target in enumerate(targets):
            targets_[idx, :len(target)] = target
        lengths = np.array(lengths, dtype=np.uint16)
        indices = np.random.permutation(typical_indices(lengths))

        chunks = chunks[indices]
        targets_ = targets_[indices]
        lengths = lengths[indices]

        # Rewrite the summary file with the same selection and order as the arrays.
        summary_table = pd.read_csv(summary_file(), sep='\t')
        summary_table.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

        output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"), chunks)
        np.save(os.path.join(output_directory, "references.npy"), targets_)
        np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write(" - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape)))
        sys.stderr.write(" - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape)))
        sys.stderr.write(" - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape)))

    def stop(self):
        """Block until the writer thread has finished."""
        self.join()
def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    Reads are taken from a priority queue in order of lowest priority
    first. A read whose mismatch fraction is within C{self.cutoff} is
    written to the consensus SAM file and its bases at significant offsets
    are incorporated into the consensus; other reads are written to the
    non-consensus SAM file. A per-offset consensus report and a consensus
    FASTA file are then written to C{outputDir}.

    @param referenceId: The C{str} id of the reference sequence to analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in C{alignmentFile}
        or if no significant offsets are found. Else, a C{dict} containing
        the significant offsets and the consensus sequence that best
        matches C{referenceId}.
    """
    analysis = self.initialReferenceIdAnalysis(
        referenceId, alignmentFile, outputDir)

    if analysis:
        (genomeLength, alignedReads, readCountAtOffset,
         baseCountAtOffset, readsAtOffset, significantOffsets,
         samFilter, paddedSAM) = analysis
    else:
        return

    insignificantOffsets = set(
        range(genomeLength)) - set(significantOffsets)

    reference = self.referenceGenomes[referenceId]
    referenceSequence = reference.sequence

    # The consensus starts out as the reference; the reference bases are
    # removed again once all reads have been processed.
    consensus = []
    for base in referenceSequence:
        ob = OffsetBases()
        ob.incorporateBase(base)
        consensus.append(ob)

    readQueue = PriorityQueue()
    self.updatePriorityQueue(readQueue, alignedReads, consensus,
                             significantOffsets)

    consensusFilename = join(outputDir, 'reference-consensus.sam')
    nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
    self.report(' Writing consensus SAM to', consensusFilename)
    self.report(' Writing non-consensus SAM to', nonConsensusFilename)

    # Context managers close both output files even if processing fails.
    with samfile(alignmentFile) as sam, \
            AlignmentFile(consensusFilename, mode='w',
                          template=sam) as consensusAlignment, \
            AlignmentFile(nonConsensusFilename, mode='w',
                          template=sam) as nonConsensusAlignment:

        # Reads with no significant offsets get written to both output
        # files.
        readsWithNoSignificantOffsetsCount = 0
        for read in alignedReads:
            if not read.significantOffsets:
                readsWithNoSignificantOffsetsCount += 1
                consensusAlignment.write(read.alignment)
                nonConsensusAlignment.write(read.alignment)

                for offset in insignificantOffsets:
                    base = read.base(offset)
                    if base is not None:
                        consensus[offset].incorporateBase(base)

        self.report(' %d read%s did not overlap any significant offsets' %
                    (readsWithNoSignificantOffsetsCount,
                     s(readsWithNoSignificantOffsetsCount)))

        readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
        cutoff = self.cutoff
        while readQueue:
            mismatchFraction, _ = readQueue.lowestPriority()
            read = readQueue.pop()
            if mismatchFraction <= cutoff:
                # We want this read. Incorporate it into the consensus.
                readsMatchingConsensusCount += 1
                consensusAlignment.write(read.alignment)
                affectedReads = set()
                for offset in read.significantOffsets:
                    readBase = read.base(offset)
                    consensus[offset].incorporateBase(readBase)
                    for readAtOffset in readsAtOffset[offset]:
                        if readAtOffset in readQueue:
                            affectedReads.add(readAtOffset)
                # The consensus changed at this read's offsets, so reads
                # still queued that share those offsets are re-prioritised.
                self.updatePriorityQueue(readQueue, affectedReads,
                                         consensus, significantOffsets)
            else:
                readsNotMatchingConsensusCount += 1
                nonConsensusAlignment.write(read.alignment)

    self.report(
        ' %d read%s matched the consensus, %d did not.' %
        (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
         readsNotMatchingConsensusCount))

    # Remove the reference bases from the consensus.
    for offset, base in enumerate(referenceSequence):
        consensus[offset].unincorporateBase(base)

    consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
    self.report(' Writing consensus info to', consensusInfoFilename)
    with open(consensusInfoFilename, 'w') as fp:
        consensusSequence = []
        for offset in range(genomeLength):
            # Take a copy of the commonest set because we may pop from
            # it below.
            commonest = set(consensus[offset].commonest)
            referenceBase = referenceSequence[offset]

            if len(commonest) > 1:
                nucleotides = ' Nucleotides: %s' % (
                    consensus[offset].baseCountsToStr())
            else:
                nucleotides = ''

            if referenceBase in commonest:
                consensusBase = referenceBase
            elif not commonest:
                # Nothing in the included reads covers this offset.
                consensusBase = '-'
            else:
                # Either a single commonest base, or a draw in which the
                # reference base is not included and so cannot be used to
                # break the draw. Every path assigns consensusBase, so a
                # value can never leak over from the previous offset.
                consensusBase = commonest.pop()

            consensusSequence.append(consensusBase)

            mismatch = '' if referenceBase == consensusBase else (
                ' Mismatch (reference has %s)' % referenceBase)

            print('%d: %s%s%s' % (
                offset + 1, consensusBase, mismatch, nucleotides), file=fp)

    consensusRead = Read('gready-consensus-%s' % referenceId,
                         ''.join(consensusSequence))
    consensusFilename = join(outputDir, 'reference-consensus.fasta')
    self.report(' Writing gready consensus info to', consensusFilename)
    Reads([consensusRead]).save(consensusFilename)

    return {
        'consensusRead': consensusRead,
        'significantOffsets': significantOffsets,
    }