def pesam2bed(input_file, chrlen_file, output_file, logfile, errfile,
              filter_function=check_good_read):
    '''
    Convert an aligned paired-end BAM file into a BED file of fragments,
    skipping reads rejected by the filtering function.

    The alignment file is scanned twice: the first pass applies
    filter_function to every record, the second pass writes one BED line
    per accepted record.

    Input:
        input_file - aligned (unsorted) paired end BAM
        chrlen_file - TSV file of format chromosome<TAB>length
        output_file - output file (BED file with start and end
            corresponding to both ends of the PE fragment)
        logfile, errfile - open, writable file-like objects used for
            progress and error messages
        filter_function - function that accepts an aligned read
            (pysam.AlignedSegment) and returns true, false or weight of
            the fragment. Default: check_good_read
            Note: the function can return weight, if trying to accommodate
            reads from repetitive sequences
    Output:
        None (written into output file). Each line is
        chromosome, start, end, read name, weight * 100, strand
        separated by tabs. Coordinates are clamped to [0, chromosome length].

    Note: if writing a record raises ValueError, the offending record is
    reported to errfile and the process exits via sys.exit(-1).

    See also: check_good_read
    '''
    samfile = AlignmentFile(input_file)
    try:
        # First pass: one verdict (or weight) per alignment record, in file
        # order, so it can be indexed by record position in the second pass.
        # NOTE(review): the original comment states that check_good_read also
        # checks whether the read is R1 -- presumably this is why only one
        # line per fragment is emitted; confirm against check_good_read.
        take_read = [filter_function(read) for read in samfile]
        samfile.reset()  # rewind for the second pass

        logfile.write("%d out of %d reads passed filtering\n" %
                      (sum(take_read), len(take_read) // 2))
        logfile.flush()

        chrlens = genomic_utils.get_chr_lens(chrlen_file)

        with open(output_file, 'w') as outfile:
            for i, read in enumerate(samfile):
                count = i + 1
                if count % 1000000 == 0:
                    logfile.write("pesam2bed: %s, Read # %d\n" %
                                  (input_file, count))
                    logfile.flush()

                weight = take_read[i]
                if weight == 0:
                    continue

                # Only the raw fragment coordinates depend on the strand.
                if read.is_reverse:
                    strand = '-'
                    rend = read.reference_end
                    rstart = rend - abs(read.template_length)
                else:
                    strand = '+'
                    rstart = read.reference_start
                    rend = rstart + read.template_length

                chrom = samfile.getrname(read.reference_id)
                # Clamp the fragment to the chromosome boundaries.
                start = max(0, rstart)
                end = min(rend, chrlens[chrom])
                if start > end:
                    continue

                try:
                    outfile.write(chrom)
                    outfile.write('\t')
                    outfile.write('%d\t%d\t%s\t%d\t%s\n' %
                                  (start, end, read.qname, weight * 100,
                                   strand))
                except ValueError:
                    errfile.write(
                        "***** pesam2bed failed with the following value\n")
                    errfile.write(chrom)
                    # Report the actual strand of the failing record.
                    errfile.write(' %d\t%d\t%s\t%d\t%s\n' %
                                  (start, end, read.qname, weight * 100,
                                   strand))
                    errfile.flush()
                    sys.exit(-1)
    finally:
        # Release the alignment file handle on every exit path.
        samfile.close()