def main(args):
    """Collect selected reads from a SAM/BAM file and print them in batches.

    Attributes read from ``args``:
        samformat: truthy to open the input as SAM text, otherwise as BAM.
        bamfile: path of the alignment file to read.
        read_pair: 1 keeps first-in-pair reads, 2 keeps second-in-pair
            reads, 0 keeps every read; any other value keeps nothing.

    Only reads for which ``pair_is_aligned(aln, ref_ids)`` is true are
    considered. Kept reads are handed to ``print_reads`` whenever 10000 or
    more have accumulated, and once more after the last read.
    """
    # Honour the requested format: the mode used to be hard-coded to "rb",
    # which left `option` unused and made SAM text input unreadable.
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    try:
        ref_ids = [samfile.gettid(r) for r in samfile.references]
        reads_to_print = []
        # Iterates over each read instead of each contig.
        for aln in samfile.fetch(until_eof=True):
            if pair_is_aligned(aln, ref_ids):
                if args.read_pair == 1 and aln.is_read1:
                    reads_to_print.append(aln)
                elif args.read_pair == 2 and aln.is_read2:
                    reads_to_print.append(aln)
                elif args.read_pair == 0:
                    reads_to_print.append(aln)
            if len(reads_to_print) >= 10000:
                # Flush the reads collected so far to bound memory use.
                print_reads(reads_to_print)
                reads_to_print = []
        print_reads(reads_to_print)
    finally:
        # Release the input handle even if reading or printing fails.
        samfile.close()
def main(args):
    """Route aligned reads of a SAM/BAM file to R1 / R2 / unpaired outputs.

    Attributes read from ``args``:
        samformat: truthy to open the input as SAM text, otherwise as BAM.
        bamfile: path of the alignment file to read.
        R1, R2, u: destinations passed through to ``flush_reads`` and
            ``print_reads`` for first-in-pair reads, second-in-pair reads
            and reads whose mate is not aligned, respectively.

    Reads whose own reference id is not one of the file's references are
    skipped entirely.
    """
    option = "r" if args.samformat else "rb"
    samfile = Samfile(args.bamfile, option)
    try:
        # A set, because membership is tested up to twice for every read.
        ref_ids = set(samfile.gettid(r) for r in samfile.references)
        reads_to_print_1 = []
        reads_to_print_2 = []
        reads_to_print_u = []
        # Iterates over each read instead of each contig.
        for aln in samfile.fetch(until_eof=True):
            if aln.tid not in ref_ids:
                continue  # this read is not aligned
            if aln.rnext in ref_ids:
                # The mate is also aligned.
                # flush_reads returns the list to keep accumulating into.
                if aln.is_read1:
                    reads_to_print_1.append(aln)
                    reads_to_print_1 = flush_reads(reads_to_print_1, args.R1)
                elif aln.is_read2:
                    reads_to_print_2.append(aln)
                    reads_to_print_2 = flush_reads(reads_to_print_2, args.R2)
            else:
                # NOTE(review): the collapsed source does not show which `if`
                # this `else` pairs with; it is attached to the mate check so
                # that reads with an unaligned mate go to the `u` output.
                reads_to_print_u.append(aln)
                reads_to_print_u = flush_reads(reads_to_print_u, args.u)
        # Emit whatever is left after the last flush.
        print_reads(reads_to_print_1, args.R1)
        print_reads(reads_to_print_2, args.R2)
        print_reads(reads_to_print_u, args.u)
    finally:
        # Release the input handle even if reading or printing fails.
        samfile.close()
def _load_msa_fastas(paths):
    """Load each MSA fasta; return (fastas, first-contig lengths, all names)."""
    fastas = []
    ref_lens = []
    target_ref_set = set()
    for fn in paths:
        with open(fn) as fp:
            fasta = Fasta(fp)
            fastas.append(fasta)
            if len(fasta.contigs) == 0:
                logging.error('Fasta file %s has no contigs', fn)
                raise Exception('No contigs')
            ref_lens.append(len(fasta.contigs[0]))
            target_ref_set.update(fasta.names)
    return fastas, ref_lens, target_ref_set


def _transfer_reference(sam, out, refname, out_tid, skip_flag):
    """Copy the records of `refname` from `sam` to `out`, remapping tids."""
    src_tid = sam.gettid(refname)
    for rec in sam.fetch(reference=refname):
        if rec.flag & skip_flag:
            continue
        a = rec.__copy__()
        a.reference_id = out_tid
        if a.next_reference_id == src_tid:
            # Pair on the same reference: follow it to the new tid.
            a.next_reference_id = out_tid
        else:
            # Mate is elsewhere (or unmapped): unpair.
            a.next_reference_id = -1
            a.next_reference_start = -1
        out.write(a)


def bam_surject_msa(args):
    """Surject alignments onto MSA consensus references and write a new file.

    Caveats:
        - flags keep their original statuses
        - MD, NM, and AS tags keep their original values
        - mates are given as unmapped
        - the same records are emitted

    Attributes read from ``args``: ``bam``, ``msa_fastas``, ``refnames``
    (``None`` for generated ``consensus<i>`` names), ``skip_flag``,
    ``keep_rest``, ``check`` and ``output`` (a ``.bam`` suffix selects BAM
    output, anything else SAM with header).

    Raises:
        AssertionError: if the number of refnames differs from the number
            of msa_fastas.
        Exception: if a fasta has no contigs, or a converted cigar has a
            different query length than the original one.
    """
    skip_flag = args.skip_flag
    sam = Samfile(args.bam)
    try:
        mapped_ref_set = set(sam.references)

        # setup output
        if args.refnames is None:
            refnames = ['consensus{0}'.format(i)
                        for i in range(len(args.msa_fastas))]
        else:
            # Copy: the list is extended below and must not alter args.
            refnames = list(args.refnames)
        assert len(refnames) == len(
            args.msa_fastas
        ), 'The number of refnames should be the same as that of msa_fastas.'

        logging.info('Loading MSA fastas')
        logging.info('Skip flag: %s', args.skip_flag)
        fastas, ref_lens, target_ref_set = _load_msa_fastas(args.msa_fastas)

        rest_refs = [r for r in sam.references if r not in target_ref_set]
        logging.info('%s are included in surjection targets.',
                     len(target_ref_set))
        logging.info('%s are not included in surjection targets.',
                     len(rest_refs))
        if args.keep_rest:
            logging.info('Rest of reference will be kept in surjected BAM file')
            org_ref_len_map = dict(zip(sam.references, sam.lengths))
            refnames.extend(rest_refs)
            ref_lens.extend(org_ref_len_map[r] for r in rest_refs)
            # None marks a reference that is copied through unchanged.
            fastas.extend(None for _ in rest_refs)

        logging.info('Setting up output BAMs')
        mode = 'wb' if args.output.endswith('.bam') else 'wh'
        # The header needs the full name/length lists built above; the
        # previous singular `refname` / `ref_length` were unbound here.
        out = pysam.Samfile(args.output, mode=mode,
                            reference_names=refnames,
                            reference_lengths=ref_lens)
        try:
            # iteration
            for refname, fasta in zip(refnames, fastas):
                out_tid = out.gettid(refname)
                if fasta is None:
                    logging.info('Transfering %s', refname)
                    _transfer_reference(sam, out, refname, out_tid, skip_flag)
                    continue

                logging.info('Surjecing to %s', refname)
                query_refs = fasta.names
                cc = _CigarChecker() if args.check else None
                for qref in query_refs:
                    if qref not in mapped_ref_set:
                        logging.warning(
                            '%s is not found in original BAM file', qref)
                        continue
                    # NOTE(review): `rec`, `mc` and `q_aln` are not bound
                    # anywhere in this function as it appears here; a
                    # per-record loop over `qref` and the MSA coordinate
                    # converter setup seem to be missing. Restore them
                    # before relying on this branch - TODO confirm.
                    a = rec.__copy__()
                    if not rec.is_unmapped:
                        org_cigar = Cigar(rec.cigartuples)
                        pos, cigar = mc.convert(rec.pos, org_cigar)
                        if org_cigar.query_length != cigar.query_length:
                            logging.error('Invalid cigar conversion for %s',
                                          rec.qname)
                            logging.error('org %s %s %s', rec.pos, org_cigar,
                                          org_cigar.query_length)
                            logging.error('new %s %s %s', pos, cigar,
                                          cigar.query_length)
                            s1 = pos
                            e1 = mc.get_pos(rec.pos + cigar.ref_length)
                            logging.error('ref %s-%s %s', s1, e1,
                                          mc.get_ref_cigar(s1, e1))
                            logging.error('read %s', rec.seq)
                            logging.error('qref %s', q_aln.seq[s1:e1])
                            raise Exception('Incompatible Cigar')
                        if cc:
                            cc.check(rec, pos, cigar, org_cigar, mc, q_aln)
                        a.cigar = cigar.values
                        a.reference_start = pos
                    a.reference_id = out_tid
                    a.next_reference_id = -1  # this is required
                    a.next_reference_start = -1  # this is required
                    out.write(a)
        finally:
            # Closing finalizes the output file, also on error.
            out.close()
    finally:
        sam.close()