# NOTE(review): this function is defined a second time, with the same body,
# further down in this file; presumably the later definition is the one the
# module ends up binding. Consider removing one of the two.
def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
        nongenomic A onward.

        mapping: alignment record. It is modified in place (pos, cigar,
            tags) and also returned. Records that are unmapped or contain
            an indel are returned untouched.
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result is compared against a single base ('A' or 'T').

        Reverse-strand mappings are examined for a poly-T run at the start
        of mapping.seq, forward-strand mappings for a poly-A run at the end
        of mapping.seq.

        Raises ValueError if no reference position could be determined for
        the (possibly trimmed) alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    first_ref_index = None
    bases_to_trim = 0

    if mapping.is_reverse:
        # presumably the read index of the last base of the leading poly-T
        # run - TODO confirm against find_poly_T
        poly_T_end = find_poly_T(mapping.seq)

        # Walk from the far end of the read back towards the poly-T run.
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue

            if read_index > poly_T_end:
                # Not part of the poly-T run yet: just remember the
                # reference position.
                first_ref_index = ref_index
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # This base and everything before it in the read is
                # nongenomic.
                bases_to_trim = read_index + 1
                break

            # first_ref_index needs to be set to the last position
            # that passed that 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the end of a forward mapping never moves its start.
        first_ref_index = mapping.pos

        # presumably the read index of the first base of the trailing
        # poly-A run - TODO confirm against find_poly_A
        poly_A_start = find_poly_A(mapping.seq)

        for read_index, ref_index in mapping.aligned_pairs:
            # A read_index of None is a skip from splicing. The explicit
            # test replaces a reliance on Python 2 ordering None below
            # every integer.
            if read_index is None or read_index < poly_A_start:
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = len(mapping.seq) - read_index
                break

    if first_ref_index is None:
        # Dump the offending record before failing. The parenthesised form
        # prints the same thing under the Python 2 print statement.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = len(mapping.seq) - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]

        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) guarantees a concrete
        # list is assigned back.
        mapping.tags = [tag for tag in mapping.tags if tag[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
# NOTE(review): this function is defined a second time, with the same body,
# further down in this file; presumably the later definition is the one the
# module ends up binding. Consider removing one of the two.
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Remove all consecutive Q30+ mismatches from the beginning of alignments,
        under the assumption that these represent untemplated additions during
        reverse transcription. Characterize the mismatches into type_counts.

        mapping: alignment record. Unmapped records and records containing
            an indel get a nongenomic length of 0 and are returned as is.
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result is compared against a single read base.
        type_counts: mapping supporting "+= 1" on missing keys. Every
            aligned position (not only the trimmed ones) increments the key
            (qlen, position from the start of the read, quality,
            reference base index, read base index).

        Returns the original record when nothing is trimmed. Otherwise
        returns a new pysam.AlignedRead carrying only qname, tid, pos,
        is_reverse, is_secondary, mapq, seq, qual, rnext, pnext and cigar;
        tags and the remaining flags are not copied.

        Raises ValueError if every aligned base is a Q30+ mismatch, so that
        no trim point is found.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    # These do not change below, so read them once instead of once per
    # aligned base.
    is_reverse = mapping.is_reverse
    seq = mapping.seq
    qlen = mapping.qlen

    if is_reverse:
        # The start of a reverse-strand read is the end of its alignment.
        aligned_pairs = mapping.aligned_pairs[::-1]
        index_lookup = utilities.base_to_complement_index
    else:
        aligned_pairs = mapping.aligned_pairs
        index_lookup = utilities.base_to_index

    decoded_qual = fastq.decode_sanger(mapping.qual)

    bases_to_trim = 0
    found_trim_point = False
    first_ref_index = None
    for read_index, ref_index in aligned_pairs:
        if read_index is None:
            # This shouldn't be able to be triggered since alignments
            # containing indels are ruled out above.
            continue

        # Position counted from the start of the read as sequenced.
        if is_reverse:
            corrected_read_index = qlen - 1 - read_index
        else:
            corrected_read_index = read_index

        ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
        read_base = seq[read_index]
        read_qual = decoded_qual[read_index]
        coords = (
            qlen,
            corrected_read_index,
            read_qual,
            index_lookup[ref_base],
            index_lookup[read_base],
        )
        type_counts[coords] += 1

        if not found_trim_point:
            if read_base != ref_base and read_qual >= 30:
                bases_to_trim += 1
            else:
                # First base that is a match or a low-quality mismatch:
                # trimming stops here.
                first_ref_index = ref_index
                found_trim_point = True

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if bases_to_trim == 0:
        trimmed_mapping = mapping
    else:
        trimmed_mapping = pysam.AlignedRead()
        trimmed_mapping.qname = mapping.qname
        trimmed_mapping.tid = mapping.tid

        # first_ref_index has been set above to be the index of the
        # reference base aligned to the first non-trimmed base in the
        # read. If the mapping is forward, this will be the new pos.
        # If the mapping is reverse, the pos won't change.
        if is_reverse:
            first_ref_index = mapping.pos
        trimmed_mapping.pos = first_ref_index

        trimmed_mapping.is_reverse = is_reverse
        trimmed_mapping.is_secondary = mapping.is_secondary
        trimmed_mapping.mapq = mapping.mapq

        if is_reverse:
            # bases_to_trim is never zero here, so there is no danger
            # of minus zero
            trimmed_slice = slice(None, -bases_to_trim)
        else:
            trimmed_slice = slice(bases_to_trim, None)

        trimmed_mapping.seq = seq[trimmed_slice]
        trimmed_mapping.qual = mapping.qual[trimmed_slice]
        trimmed_mapping.rnext = -1
        trimmed_mapping.pnext = -1

        trimmed_length = len(seq) - bases_to_trim
        if is_reverse:
            # Remove blocks from the end
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
        else:
            # Remove blocks from the beginning
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
        trimmed_mapping.cigar = trimmed_cigar

    return trimmed_mapping
# NOTE(review): a definition of this function with the same body also
# appears earlier in this file; presumably this later one is what the module
# ends up binding. Consider removing one of the two.
def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
        nongenomic A onward.

        mapping: alignment record. It is modified in place (pos, cigar,
            tags) and also returned. Records that are unmapped or contain
            an indel are returned untouched.
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result is compared against a single base ('A' or 'T').

        Reverse-strand mappings are examined for a poly-T run at the start
        of mapping.seq, forward-strand mappings for a poly-A run at the end
        of mapping.seq.

        Raises ValueError if no reference position could be determined for
        the (possibly trimmed) alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    first_ref_index = None
    bases_to_trim = 0

    if mapping.is_reverse:
        # presumably the read index of the last base of the leading poly-T
        # run - TODO confirm against find_poly_T
        poly_T_end = find_poly_T(mapping.seq)

        # Walk from the far end of the read back towards the poly-T run.
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue

            if read_index > poly_T_end:
                # Not part of the poly-T run yet: just remember the
                # reference position.
                first_ref_index = ref_index
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # This base and everything before it in the read is
                # nongenomic.
                bases_to_trim = read_index + 1
                break

            # first_ref_index needs to be set to the last position
            # that passed that 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the end of a forward mapping never moves its start.
        first_ref_index = mapping.pos

        # presumably the read index of the first base of the trailing
        # poly-A run - TODO confirm against find_poly_A
        poly_A_start = find_poly_A(mapping.seq)

        for read_index, ref_index in mapping.aligned_pairs:
            # A read_index of None is a skip from splicing. The explicit
            # test replaces a reliance on Python 2 ordering None below
            # every integer.
            if read_index is None or read_index < poly_A_start:
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = len(mapping.seq) - read_index
                break

    if first_ref_index is None:
        # Dump the offending record before failing. The parenthesised form
        # prints the same thing under the Python 2 print statement.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = len(mapping.seq) - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]

        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) guarantees a concrete
        # list is assigned back.
        mapping.tags = [tag for tag in mapping.tags if tag[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
def filter_mappings(
    mappings,
    minimum_mapq=42,
    max_insert_length=1000,
    counts_dict=None,
    verbose=False,
    unmapped_fns=None,
):
    ''' Filters out unmapped, nonuniquely mapped, or discordantly mapped reads.

        This is a generator: it yields (R1, R2) tuples for the pairs that
        survive every filter.

        mappings: iterable of alignment records. Records are grouped by
            qname with utilities.group_by and every group must hold exactly
            two records (one read1, one read2).
        minimum_mapq: a pair is counted as nonunique when either mate has a
            mapq below this value.
        max_insert_length: passed through to is_discordant.
        counts_dict: optional dict, updated with the collected counts once
            all of mappings has been consumed. It is not updated if the
            generator is abandoned early or an error is raised.
        verbose: log unmapped and nonunique pairs with logging.info.
        unmapped_fns: optional (R1 file name, R2 file name) pair; the reads
            of every pair with an unmapped mate are written to these files.

        Raises ValueError if a qname group does not consist of exactly one
        read1 and one read2 record.
    '''
    pair_counts = {
        'total': 0,
        'unmapped': 0,
        'indel': 0,
        'nonunique': 0,
        'discordant': 0,
        'disoriented': 0,
        'unique': Counter(),
        'mapqs': Counter(),
        'fragment_lengths': Counter(),
        'tids': Counter(),
    }

    R1_unmapped_fh = None
    R2_unmapped_fh = None

    # The try/finally guarantees that the output files are flushed and
    # closed when the generator finishes, fails, or is closed early.
    try:
        if unmapped_fns:
            R1_unmapped_fn, R2_unmapped_fn = unmapped_fns
            R1_unmapped_fh = open(R1_unmapped_fn, 'w')
            R2_unmapped_fh = open(R2_unmapped_fn, 'w')

        for _, aligned_pair in utilities.group_by(mappings, key=lambda m: m.qname):
            if len(aligned_pair) != 2:
                raise ValueError(len(aligned_pair))

            pair_counts['total'] += 1

            R1_aligned, R2_aligned = aligned_pair
            # If R2 is mapped but R1 isn't, R2 gets reported first.
            if not R1_aligned.is_read1:
                R1_aligned, R2_aligned = R2_aligned, R1_aligned

            if (not R1_aligned.is_read1) or (not R2_aligned.is_read2):
                raise ValueError(R1_aligned, R2_aligned)

            # mapq is tallied for both mates of every pair, kept or not.
            pair_counts['mapqs'][R1_aligned.mapq] += 1
            pair_counts['mapqs'][R2_aligned.mapq] += 1

            if R1_aligned.is_unmapped or R2_aligned.is_unmapped:
                pair_counts['unmapped'] += 1
                if verbose:
                    logging.info('{0} was unmapped'.format(R1_aligned.qname))

                if unmapped_fns:
                    R1_read = sam.mapping_to_Read(R1_aligned)
                    R2_read = sam.mapping_to_Read(R2_aligned)
                    R1_unmapped_fh.write(str(R1_read))
                    R2_unmapped_fh.write(str(R2_read))
            elif is_discordant(R1_aligned, R2_aligned, max_insert_length):
                pair_counts['discordant'] += 1
            else:
                pair_counts['tids'][R1_aligned.tid] += 1

                if is_disoriented(R1_aligned, R2_aligned):
                    pair_counts['disoriented'] += 1
                elif R1_aligned.mapq < minimum_mapq or R2_aligned.mapq < minimum_mapq:
                    pair_counts['nonunique'] += 1
                    if verbose:
                        logging.info('{0} was nonunique, {1}, {2}'.format(
                            R1_aligned.qname, R1_aligned.mapq, R2_aligned.mapq))
                else:
                    pair_counts['unique'][R1_aligned.tid] += 1

                    fragment_length = abs(R1_aligned.tlen)
                    pair_counts['fragment_lengths'][fragment_length] += 1

                    # Pairs with indels are counted but still yielded.
                    if (sam.contains_indel_pysam(R1_aligned)
                            or sam.contains_indel_pysam(R2_aligned)):
                        pair_counts['indel'] += 1

                    yield R1_aligned, R2_aligned
    finally:
        for unmapped_fh in (R1_unmapped_fh, R2_unmapped_fh):
            if unmapped_fh is not None:
                unmapped_fh.close()

    if counts_dict is not None:
        counts_dict.update(pair_counts)
# NOTE(review): a definition of this function with the same body also
# appears earlier in this file; presumably this later one is what the module
# ends up binding. Consider removing one of the two.
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Remove all consecutive Q30+ mismatches from the beginning of alignments,
        under the assumption that these represent untemplated additions during
        reverse transcription. Characterize the mismatches into type_counts.

        mapping: alignment record. Unmapped records and records containing
            an indel get a nongenomic length of 0 and are returned as is.
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result is compared against a single read base.
        type_counts: mapping supporting "+= 1" on missing keys. Every
            aligned position (not only the trimmed ones) increments the key
            (qlen, position from the start of the read, quality,
            reference base index, read base index).

        Returns the original record when nothing is trimmed. Otherwise
        returns a new pysam.AlignedRead carrying only qname, tid, pos,
        is_reverse, is_secondary, mapq, seq, qual, rnext, pnext and cigar;
        tags and the remaining flags are not copied.

        Raises ValueError if every aligned base is a Q30+ mismatch, so that
        no trim point is found.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    # These do not change below, so read them once instead of once per
    # aligned base.
    is_reverse = mapping.is_reverse
    seq = mapping.seq
    qlen = mapping.qlen

    if is_reverse:
        # The start of a reverse-strand read is the end of its alignment.
        aligned_pairs = mapping.aligned_pairs[::-1]
        index_lookup = utilities.base_to_complement_index
    else:
        aligned_pairs = mapping.aligned_pairs
        index_lookup = utilities.base_to_index

    decoded_qual = fastq.decode_sanger(mapping.qual)

    bases_to_trim = 0
    found_trim_point = False
    first_ref_index = None
    for read_index, ref_index in aligned_pairs:
        if read_index is None:
            # This shouldn't be able to be triggered since alignments
            # containing indels are ruled out above.
            continue

        # Position counted from the start of the read as sequenced.
        if is_reverse:
            corrected_read_index = qlen - 1 - read_index
        else:
            corrected_read_index = read_index

        ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
        read_base = seq[read_index]
        read_qual = decoded_qual[read_index]
        coords = (
            qlen,
            corrected_read_index,
            read_qual,
            index_lookup[ref_base],
            index_lookup[read_base],
        )
        type_counts[coords] += 1

        if not found_trim_point:
            if read_base != ref_base and read_qual >= 30:
                bases_to_trim += 1
            else:
                # First base that is a match or a low-quality mismatch:
                # trimming stops here.
                first_ref_index = ref_index
                found_trim_point = True

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if bases_to_trim == 0:
        trimmed_mapping = mapping
    else:
        trimmed_mapping = pysam.AlignedRead()
        trimmed_mapping.qname = mapping.qname
        trimmed_mapping.tid = mapping.tid

        # first_ref_index has been set above to be the index of the
        # reference base aligned to the first non-trimmed base in the
        # read. If the mapping is forward, this will be the new pos.
        # If the mapping is reverse, the pos won't change.
        if is_reverse:
            first_ref_index = mapping.pos
        trimmed_mapping.pos = first_ref_index

        trimmed_mapping.is_reverse = is_reverse
        trimmed_mapping.is_secondary = mapping.is_secondary
        trimmed_mapping.mapq = mapping.mapq

        if is_reverse:
            # bases_to_trim is never zero here, so there is no danger
            # of minus zero
            trimmed_slice = slice(None, -bases_to_trim)
        else:
            trimmed_slice = slice(bases_to_trim, None)

        trimmed_mapping.seq = seq[trimmed_slice]
        trimmed_mapping.qual = mapping.qual[trimmed_slice]
        trimmed_mapping.rnext = -1
        trimmed_mapping.pnext = -1

        trimmed_length = len(seq) - bases_to_trim
        if is_reverse:
            # Remove blocks from the end
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
        else:
            # Remove blocks from the beginning
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
        trimmed_mapping.cigar = trimmed_cigar

    return trimmed_mapping