def _next_non_secondary(bam_in):
    """Return the next record from ``bam_in`` that is not a secondary alignment.

    Raises StopIteration when ``bam_in`` is exhausted.
    """
    while True:
        # The builtin next() works with both Python 2 (.next) and
        # Python 3 (__next__) iterators, unlike calling bam_in.next().
        record = next(bam_in)
        if not record.is_secondary:
            return record


def _fill_read_fields(info, prefix, read, references):
    """Store the per-read fields of ``read`` in ``info`` under ``<prefix>_*`` keys.

    Returns the reference name of the read ('' when ``read.tid`` is -1).
    """
    chrom = '' if read.tid == -1 else references[read.tid]
    seq = read.seq
    info[prefix + '_chrom'] = chrom
    info[prefix + '_pos'] = read.pos
    info[prefix + '_direct'] = not read.is_reverse
    info[prefix + '_min_qual'] = tk_fasta.get_min_qual(read.qual)
    info[prefix + '_seq_len'] = len(seq) if seq is not None else 0
    info[prefix + '_contains_N'] = seq is not None and 'N' in seq
    return chrom


def read_pair_info_iter(bam_in, target_regions, references):
    """Yield one summary dict per read pair taken from ``bam_in``.

    Records are consumed two at a time (secondary alignments are skipped),
    so the two reads of a pair must be adjacent in ``bam_in``; this is
    checked by asserting that the first whitespace-delimited token of both
    query names is equal.  If the input ends after the first record of a
    pair, that record is dropped and iteration stops.

    Args:
        bam_in: iterator over alignment records (objects exposing
            ``qname``, ``tid``, ``pos``, ``aend``, ``seq``, ``qual``,
            ``mapq`` and the ``is_*`` flags used below).
        target_regions: mapping from reference name to a regions object
            accepted by ``get_read_regions_dist``, or None to skip the
            target-distance computation.
        references: sequence of reference names indexed by ``tid``.

    Yields:
        dict with per-read (``r1_*`` / ``r2_*``), barcode, sample-index,
        target-distance, CIGAR and base-quality summary fields.
    """
    # Distance reported when the pair's chromosome has no target regions.
    no_target_dist = 100000000

    while True:
        try:
            r1 = _next_non_secondary(bam_in)
            r2 = _next_non_secondary(bam_in)
        except StopIteration:
            break

        name1 = r1.qname.split()[0]
        name2 = r2.qname.split()[0]
        assert name1 == name2, (
            'adjacent records are not a read pair: %r != %r' % (name1, name2))

        # read1/read2: ordered by the is_read1 flag of the first record.
        if r1.is_read1:
            read1, read2 = r1, r2
        else:
            read1, read2 = r2, r1

        # pos1/pos2: ordered by position when both are on the same reference.
        if r1.tid == r2.tid and r2.pos < r1.pos:
            pos1, pos2 = r2, r1
        else:
            pos1, pos2 = r1, r2

        read_pair_info = {}
        read_pair_info['name'] = name1

        if read1.is_read1:
            read1_index, read2_index = 1, 2
        else:
            read1_index, read2_index = 2, 1
        read_pair_info['r1_index'] = read1_index
        read_pair_info['r2_index'] = read2_index

        read_pair_info['is_duplicate'] = (
            read1.is_duplicate or read2.is_duplicate)
        read_pair_info['r1_mapped'] = not read1.is_unmapped
        read_pair_info['r2_mapped'] = not read2.is_unmapped
        read_pair_info['r1_mapq'] = read1.mapq
        read_pair_info['r2_mapq'] = read2.mapq

        ref1 = _fill_read_fields(read_pair_info, 'r1', read1, references)
        ref2 = _fill_read_fields(read_pair_info, 'r2', read2, references)

        #
        # TODO: it would be preferable to add override ability in the
        # tk_io calls, but jedna doesn't have its own tenkit branch
        #
        # Barcode and sample-index values are all read from read1.
        read_raw_bc = crdna_io.get_read_raw_barcode(read1)
        if read_raw_bc is None:
            read_raw_bc = ''
        read_bc = crdna_io.get_read_barcode(read1)
        if read_bc is None:
            read_bc = ''
        read_bc_qual = crdna_io.get_read_barcode_qual(read1)
        read_sample_index = tk_io.get_read_sample_index(read1)
        if read_sample_index is None:
            read_sample_index = ''
        read_sample_index_qual = tk_io.get_read_sample_index_qual(read1)

        read_pair_info['10X_raw_bc_len'] = len(read_raw_bc)
        read_pair_info['sample_index_len'] = len(read_sample_index)
        read_pair_info['10X_raw_bc'] = read_raw_bc
        read_pair_info['10X_called_bc'] = read_bc
        read_pair_info['sample_index'] = read_sample_index
        read_pair_info['10X_bc_min_qual'] = tk_fasta.get_min_qual(
            read_bc_qual)
        read_pair_info['10X_bc_mean_qual'] = tk_fasta.get_mean_qual(
            read_bc_qual)
        if read_sample_index:
            read_pair_info['sample_index_min_qual'] = tk_fasta.get_min_qual(
                read_sample_index_qual)
            read_pair_info['sample_index_mean_qual'] = tk_fasta.get_mean_qual(
                read_sample_index_qual)
        else:
            read_pair_info['sample_index_min_qual'] = None
            read_pair_info['sample_index_mean_qual'] = None

        if ref1 != ref2 or target_regions is None:
            read_pair_targ_dist = None
        else:
            left_pos = pos1.pos
            right_pos = pos2.pos + (
                len(pos2.seq) if pos2.seq is not None else 0)
            regions = target_regions.get(ref1, None)
            if regions is None:
                # No targets on this chrom -- distance is large positive number
                read_pair_targ_dist = no_target_dist
            else:
                read_pair_targ_dist = get_read_regions_dist(
                    left_pos, right_pos, regions)
        read_pair_info['read_pair_targ_dist'] = read_pair_targ_dist

        r1_mapped, r1_soft_clipped = cigar_count_bases(read1)
        r2_mapped, r2_soft_clipped = cigar_count_bases(read2)
        read_pair_info['mapped_bases'] = r1_mapped + r2_mapped
        read_pair_info['soft_clipped_bases'] = (
            r1_soft_clipped + r2_soft_clipped)

        # Insert length is only defined when both reads are mapped to the
        # same reference.
        both_mapped = not read1.is_unmapped and not read2.is_unmapped
        if both_mapped and read1.tid == read2.tid:
            read_pair_info['insert_length'] = pos2.aend - pos1.pos
        else:
            read_pair_info['insert_length'] = None

        read_pair_info['r1_q20_bases'] = tk_fasta.get_bases_qual(
            read1.qual, 20)
        read_pair_info['r1_q30_bases'] = tk_fasta.get_bases_qual(
            read1.qual, 30)
        read_pair_info['r2_q20_bases'] = tk_fasta.get_bases_qual(
            read2.qual, 20)
        read_pair_info['r2_q30_bases'] = tk_fasta.get_bases_qual(
            read2.qual, 30)
        read_pair_info['si_q20_bases'] = tk_fasta.get_bases_qual(
            read_sample_index_qual, 20)
        read_pair_info['si_q30_bases'] = tk_fasta.get_bases_qual(
            read_sample_index_qual, 30)
        read_pair_info['bc_q20_bases'] = tk_fasta.get_bases_qual(
            read_bc_qual, 20)
        read_pair_info['bc_q30_bases'] = tk_fasta.get_bases_qual(
            read_bc_qual, 30)

        yield read_pair_info
def min_qual_below(qual, threshold):
    """Tell whether a read's minimum base quality is under ``threshold``.

    qual: a read's qual string
    threshold: quality cutoff; when None no check is made and the
        result is False.
    """
    if threshold is None:
        return False
    lowest = tk_fasta.get_min_qual(qual)
    return lowest < threshold