def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
        nongenomic A onward.

        Arguments:
            mapping: pysam alignment. It is modified in place (pos, cigar and
                tags) and is also the return value.
            region_fetcher: callable invoked as
                region_fetcher(tid, start, end); used here to look up a
                single reference base at a time.

        Alignments that are unmapped or contain indels are returned without
        any change. Otherwise the MD tag is dropped (when the mapping has
        tags) and the number of trimmed bases is handed to
        set_nongenomic_length.

        Raises ValueError if no reference position could be determined for
        the start of the retained alignment.

        NOTE(review): a function with this same name is defined again later
        in this file; at import time the later definition replaces this one.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    seq_length = len(mapping.seq)
    bases_to_trim = 0

    if mapping.is_reverse:
        # For reverse-strand mappings the tail is looked for as a poly-T run
        # at the start of the stored sequence, so the pairs are walked from
        # right to left.
        # NOTE(review): presumably find_poly_T returns the index of the last
        # base of that leading run - confirm against its definition.
        first_ref_index = None
        poly_T_end = find_poly_T(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue

            if read_index > poly_T_end:
                # Still to the right of the poly-T run; remember the
                # position in case the whole run turns out to be nongenomic.
                first_ref_index = ref_index
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # Everything from the start of the read through this base
                # is nongenomic.
                bases_to_trim = read_index + 1
                break

            # first_ref_index needs to be set to the last position
            # that passed the 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the right end never moves the leftmost position.
        first_ref_index = mapping.pos
        poly_A_start = find_poly_A(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs:
            # read_index is None for reference-only positions (splicing
            # skips). Python 2 orders None below every int, which is what
            # used to skip these implicitly; Python 3 raises TypeError on
            # that comparison, so test for None explicitly.
            if read_index is None or read_index < poly_A_start:
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = seq_length - read_index
                break

    if first_ref_index is None:
        # Parenthesised so that the statement is valid on Python 2 and 3;
        # the output is unchanged on Python 2.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = seq_length - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]
        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    # NOTE(review): the MD tag is dropped whether or not any bases were
    # trimmed - confirm that this is the intended scope.
    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) yields a list on both
        # Python 2 and Python 3.
        mapping.tags = [tag for tag in mapping.tags if tag[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Strip the leading run of Q30+ mismatches from an alignment.

        Such mismatches are assumed to be untemplated additions made during
        reverse transcription. Every aligned read position that is visited
        is also tallied in type_counts under the key
        (qlen, strand-corrected read index, quality, ref base index,
        read base index).

        Unmapped alignments and alignments containing indels are returned
        unchanged after set_nongenomic_length(mapping, 0). If nothing needs
        trimming the original mapping is returned; otherwise a new
        pysam.AlignedRead holding the retained part is returned.

        Raises ValueError if every visited position was a trimmed mismatch.

        NOTE(review): a function with this same name is defined again later
        in this file; at import time the later definition replaces this one.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    on_reverse_strand = mapping.is_reverse

    # Walk the pairs starting from the end of the alignment being examined,
    # which is the right end for reverse-strand mappings.
    pairs = mapping.aligned_pairs
    if on_reverse_strand:
        pairs = pairs[::-1]
        to_index = utilities.base_to_complement_index
    else:
        to_index = utilities.base_to_index

    quals = fastq.decode_sanger(mapping.qual)

    num_leading_mismatches = 0
    in_leading_run = True
    first_ref_index = None

    for read_pos, ref_pos in pairs:
        if read_pos is None:
            # Not expected to happen: alignments with indels were already
            # sent back above.
            continue

        if on_reverse_strand:
            oriented_pos = mapping.qlen - 1 - read_pos
        else:
            oriented_pos = read_pos

        ref_base = region_fetcher(mapping.tid, ref_pos, ref_pos + 1)
        read_base = mapping.seq[read_pos]
        qual = quals[read_pos]

        key = (
            mapping.qlen,
            oriented_pos,
            qual,
            to_index[ref_base],
            to_index[read_base],
        )
        type_counts[key] += 1

        if in_leading_run:
            confident_mismatch = read_base != ref_base and qual >= 30
            if confident_mismatch:
                num_leading_mismatches += 1
            else:
                # First position that is kept; the run is over.
                first_ref_index = ref_pos
                in_leading_run = False

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if num_leading_mismatches == 0:
        return mapping

    # Reverse-strand reads lose bases from the right end of the stored
    # sequence (the count is nonzero here, so the negative stop is safe);
    # forward-strand reads lose them from the left end.
    if on_reverse_strand:
        keep_start, keep_stop = None, -num_leading_mismatches
    else:
        keep_start, keep_stop = num_leading_mismatches, None

    trimmed = pysam.AlignedRead()
    trimmed.qname = mapping.qname
    trimmed.tid = mapping.tid
    # first_ref_index is the reference position aligned to the first kept
    # base, which becomes the new pos for forward mappings. Trimming the
    # right end of a reverse mapping leaves pos where it was.
    trimmed.pos = mapping.pos if on_reverse_strand else first_ref_index
    trimmed.is_reverse = mapping.is_reverse
    trimmed.is_secondary = mapping.is_secondary
    trimmed.mapq = mapping.mapq
    trimmed.seq = mapping.seq[keep_start:keep_stop]
    trimmed.qual = mapping.qual[keep_start:keep_stop]
    trimmed.rnext = -1
    trimmed.pnext = -1

    kept_length = len(mapping.seq) - num_leading_mismatches
    if on_reverse_strand:
        # Drop cigar blocks from the end.
        truncate = sam.truncate_cigar_blocks_up_to
    else:
        # Drop cigar blocks from the beginning.
        truncate = sam.truncate_cigar_blocks_from_beginning
    trimmed.cigar = truncate(mapping.cigar, kept_length)

    return trimmed
def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
        nongenomic A onward.

        Arguments:
            mapping: pysam alignment. It is modified in place (pos, cigar and
                tags) and is also the return value.
            region_fetcher: callable invoked as
                region_fetcher(tid, start, end); used here to look up a
                single reference base at a time.

        Alignments that are unmapped or contain indels are returned without
        any change. Otherwise the MD tag is dropped (when the mapping has
        tags) and the number of trimmed bases is handed to
        set_nongenomic_length.

        Raises ValueError if no reference position could be determined for
        the start of the retained alignment.

        NOTE(review): a function with this same name is also defined earlier
        in this file; this later definition is the one in effect after
        import.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    seq_length = len(mapping.seq)
    bases_to_trim = 0

    if mapping.is_reverse:
        # For reverse-strand mappings the tail is looked for as a poly-T run
        # at the start of the stored sequence, so the pairs are walked from
        # right to left.
        # NOTE(review): presumably find_poly_T returns the index of the last
        # base of that leading run - confirm against its definition.
        first_ref_index = None
        poly_T_end = find_poly_T(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue

            if read_index > poly_T_end:
                # Still to the right of the poly-T run; remember the
                # position in case the whole run turns out to be nongenomic.
                first_ref_index = ref_index
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # Everything from the start of the read through this base
                # is nongenomic.
                bases_to_trim = read_index + 1
                break

            # first_ref_index needs to be set to the last position
            # that passed the 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the right end never moves the leftmost position.
        first_ref_index = mapping.pos
        poly_A_start = find_poly_A(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs:
            # read_index is None for reference-only positions (splicing
            # skips). Python 2 orders None below every int, which is what
            # used to skip these implicitly; Python 3 raises TypeError on
            # that comparison, so test for None explicitly.
            if read_index is None or read_index < poly_A_start:
                continue

            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = seq_length - read_index
                break

    if first_ref_index is None:
        # Parenthesised so that the statement is valid on Python 2 and 3;
        # the output is unchanged on Python 2.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = seq_length - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]
        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    # NOTE(review): the MD tag is dropped whether or not any bases were
    # trimmed - confirm that this is the intended scope.
    if mapping.tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) yields a list on both
        # Python 2 and Python 3.
        mapping.tags = [tag for tag in mapping.tags if tag[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
def combine_paired_mappings(R1_mapping, R2_mapping, verbose=False):
    ''' Takes two pysam mappings representing opposite ends of a fragment and
        combines them into one mapping, (ab)using BAM_CREF_SKIP to bridge the
        gap (if any) between them.

        Arguments:
            R1_mapping, R2_mapping: pysam mappings of the two reads. Both
                must carry an MD tag; it is looked up directly from the
                tags.
            verbose: if True, log details whenever the two mappings disagree
                about the cigar of their overlap.

        Returns a new pysam.AlignedRead, or False when the two mappings
        disagree in their overlap and neither realignment produces fewer
        mismatches than the other.

        When the reads overlap, only one copy of the overlap is kept in the
        combined seq/qual/cigar/MD. The seq and qual of the other copy are
        stored in the 'Xs' and 'Xq' tags and the side that was kept
        ('left' or 'right') in the 'Xw' tag.
    '''
    # The strand of R1 decides which read is leftmost on the reference.
    # NOTE(review): if the strand is neither '+' nor '-', left_mapping and
    # right_mapping are never bound and the next statement fails.
    R1_strand = sam.get_strand(R1_mapping)
    if R1_strand == '+':
        left_mapping, right_mapping = R1_mapping, R2_mapping
    elif R1_strand == '-':
        left_mapping, right_mapping = R2_mapping, R1_mapping

    left_md = dict(left_mapping.tags)['MD']
    right_md = dict(right_mapping.tags)['MD']

    # --- Split the right mapping into 'overlap' and 'after overlap'. ---
    right_aligned_pairs = sam.cigar_to_aligned_pairs(right_mapping.cigar, right_mapping.reference_start)
    # Index of the first pair whose reference position is at or beyond the
    # end of the left mapping. The default (no such pair) treats the whole
    # right mapping as overlap.
    right_after_overlap_pair_index = len(right_aligned_pairs)
    for i, (read, ref) in enumerate(right_aligned_pairs):
        if ref != None and ref >= left_mapping.aend:
            right_after_overlap_pair_index = i
            break

    right_overlap_pairs = right_aligned_pairs[:right_after_overlap_pair_index]
    right_after_overlap_pairs = right_aligned_pairs[right_after_overlap_pair_index:]

    # NOTE(review): presumably cigar_to_aligned_pairs marks skipped
    # (spliced) positions with 'N' in the read slot - confirm.
    right_reads_after = [read for read, ref in right_after_overlap_pairs if read != None and read != 'N']
    right_refs_after = [ref for read, ref in right_after_overlap_pairs if ref != None]

    right_overlap_cigar = sam.aligned_pairs_to_cigar(right_overlap_pairs)
    right_after_overlap_cigar = sam.aligned_pairs_to_cigar(right_after_overlap_pairs)

    right_after_overlap_md = sam.truncate_md_string_from_beginning(right_md, len(right_refs_after))

    # Read bases that are not part of the 'after overlap' tail belong to
    # the overlap.
    right_after_overlap_read_start = len(right_mapping.seq) - len(right_reads_after)
    right_overlap_seq = right_mapping.seq[:right_after_overlap_read_start]
    right_overlap_qual = right_mapping.qual[:right_after_overlap_read_start]
    right_after_overlap_seq = right_mapping.seq[right_after_overlap_read_start:]
    right_after_overlap_qual = right_mapping.qual[right_after_overlap_read_start:]

    # --- Split the left mapping into 'before overlap' and 'overlap'. ---
    left_aligned_pairs = sam.cigar_to_aligned_pairs(left_mapping.cigar, left_mapping.reference_start)
    # Index of the last pair whose reference position is before the start
    # of the right mapping. The default of -1 (no such pair) treats the
    # whole left mapping as overlap.
    left_before_overlap_pair_index = -1
    for i, (read, ref) in list(enumerate(left_aligned_pairs))[::-1]:
        if ref != None and ref < right_mapping.pos:
            left_before_overlap_pair_index = i
            break

    left_overlap_pairs = left_aligned_pairs[left_before_overlap_pair_index + 1:]
    left_before_overlap_pairs = left_aligned_pairs[:left_before_overlap_pair_index + 1]

    left_reads_before = [read for read, ref in left_before_overlap_pairs if read != None and read != 'N']
    left_refs_before = [ref for read, ref in left_before_overlap_pairs if ref != None]

    left_overlap_cigar = sam.aligned_pairs_to_cigar(left_overlap_pairs)
    left_before_overlap_cigar = sam.aligned_pairs_to_cigar(left_before_overlap_pairs)

    left_before_overlap_md = sam.truncate_md_string_up_to(left_md, len(left_refs_before))

    left_overlap_read_start = len(left_reads_before)
    left_overlap_seq = left_mapping.seq[left_overlap_read_start:]
    left_overlap_qual = left_mapping.qual[left_overlap_read_start:]
    left_before_overlap_seq = left_mapping.seq[:left_overlap_read_start]
    left_before_overlap_qual = left_mapping.qual[:left_overlap_read_start]

    # --- Decide which mapping supplies the overlapping part. ---
    if left_overlap_pairs or right_overlap_pairs:
        # NOTE(review): with gap_length 0 a zero-length BAM_CREF_SKIP block
        # is still inserted into the combined cigar below - confirm that
        # downstream code tolerates it.
        gap_length = 0

        left_has_splicing = sam.contains_splicing(left_mapping)
        right_has_splicing = sam.contains_splicing(right_mapping)

        if left_overlap_cigar == right_overlap_cigar:
            # If the two mappings agree about the location of indels in their overlap,
            # use the seq from the mapping with the higher average quality in the
            # overlap. (Ties go to the right mapping.)
            left_mean_qual = np.mean(fastq.decode_sanger(left_overlap_qual))
            right_mean_qual = np.mean(fastq.decode_sanger(right_overlap_qual))
            if left_mean_qual > right_mean_qual:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        elif left_has_splicing != right_has_splicing:
            # A temporary(?) heuristic - if one read has splicing and the other
            # doesn't, use the overlap from the one with splicing under the
            # assumption that the other just has a few bases overhanging the
            # splice junction.
            if left_has_splicing:
                use_overlap_from = 'left'
            else:
                use_overlap_from = 'right'
        else:
            # If the two mappings disagree about the location of indels in their overlap,
            # we need a heuristic for picking which mapping we believe reflects the
            # true structure of the input fragment. The most innocuous explanation
            # is that a 'true' indel happened to lie close to the edge of one of the
            # mappings. A more problematic situation is a 'false' indel (that is,
            # produced during cluster generation or sequencing-by-synthesis, NOT
            # template production). Our strategy is: realign the overlapping part of
            # left mapping starting from the left edge of the overlap according to the
            # cigar of the right mapping and realign the overlapping part of the right
            # mapping starting from the right edge of the overlap according to the cigar
            # of the left mapping. Count the number of mismatches produced by each.
            # If the left overlap can accommodate the right cigar with fewer mismatches,
            # use the right cigar and seq. If the right overlap can accommodate the left
            # cigar with fewer mismatches, use the left cigar and seq.

            # The leftmost aligned_pair from the right mapping is guaranteed by the
            # mapping process to not involve a gap.
            _, overlap_ref_start = right_overlap_pairs[0]
            # Similarly, the rightmost aligned_pair from the left mapping can't be a
            # gap.
            _, overlap_ref_end = left_overlap_pairs[-1]

            realigned_left_cigar = sam.truncate_cigar_blocks_up_to(right_mapping.cigar, len(left_overlap_seq))
            realigned_right_cigar = sam.truncate_cigar_blocks_from_beginning(left_mapping.cigar, len(right_overlap_seq))

            ref_dict = sam.merge_ref_dicts(sam.ref_dict_from_mapping(left_mapping), sam.ref_dict_from_mapping(right_mapping),)

            try:
                left_using_right_mismatches = realigned_mismatches(left_overlap_seq, overlap_ref_start, realigned_left_cigar, ref_dict)
                right_using_left_mismatches = realigned_mismatches_backwards(right_overlap_seq, overlap_ref_end, realigned_right_cigar, ref_dict)
            except ValueError:
                # Show the offending pair before letting the error propagate.
                print left_mapping
                print right_mapping
                raise

            if verbose:
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(len(right_using_left_mismatches)))

            if len(left_using_right_mismatches) < len(right_using_left_mismatches):
                use_overlap_from = 'right'
            elif len(right_using_left_mismatches) < len(left_using_right_mismatches):
                use_overlap_from = 'left'
            else:
                # Neither realignment is better: log the details (regardless
                # of verbose) and report failure to the caller.
                logging.info('disagreements in {0}'.format(left_mapping.qname))
                logging.info('left overlap cigar is {0}'.format(str(left_overlap_cigar)))
                logging.info('right overlap cigar is {0}'.format(str(right_overlap_cigar)))
                logging.info('left_using_right_mismatches - {0}'.format(len(left_using_right_mismatches)))
                logging.info('right_using_left_mismatches - {0}'.format(len(right_using_left_mismatches)))
                logging.info('ambiguous disagreement')
                return False
    else:
        gap_length = right_mapping.pos - left_mapping.aend
        # It doesn't matter what use_overlap_from is set to; there is no overlap
        use_overlap_from = 'left'

    # --- Assemble the combined mapping. ---
    combined_mapping = pysam.AlignedRead()
    combined_mapping.qname = left_mapping.qname
    combined_mapping.tid = left_mapping.tid
    # The combined mapping gets the lower of the two mapping qualities.
    combined_mapping.mapq = min(left_mapping.mapq, right_mapping.mapq)
    combined_mapping.rnext = -1
    combined_mapping.pnext = -1
    combined_mapping.pos = left_mapping.pos
    # is_reverse is only assigned for '-' fragments; otherwise it keeps
    # whatever value a fresh AlignedRead has.
    if R1_strand == '-':
        combined_mapping.is_reverse = True

    gap_cigar = [(sam.BAM_CREF_SKIP, gap_length)]

    if use_overlap_from == 'left':
        # Whole left mapping followed by the non-overlapping tail of the
        # right mapping; the right copy of the overlap goes into the tags.
        combined_mapping.seq = left_mapping.seq + right_after_overlap_seq
        combined_mapping.qual = left_mapping.qual + right_after_overlap_qual
        combined_mapping.cigar = left_mapping.cigar + gap_cigar + right_after_overlap_cigar
        combined_md = sam.combine_md_strings(left_md, right_after_overlap_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = right_overlap_seq
        overlap_qual_tag = right_overlap_qual
    elif use_overlap_from == 'right':
        # Non-overlapping head of the left mapping followed by the whole
        # right mapping; the left copy of the overlap goes into the tags.
        combined_mapping.seq = left_before_overlap_seq + right_mapping.seq
        combined_mapping.qual = left_before_overlap_qual + right_mapping.qual
        combined_mapping.cigar = left_before_overlap_cigar + gap_cigar + right_mapping.cigar
        combined_md = sam.combine_md_strings(left_before_overlap_md, right_md)
        combined_mapping.setTag('MD', combined_md)

        overlap_seq_tag = left_overlap_seq
        overlap_qual_tag = left_overlap_qual

    if len(overlap_seq_tag) > 0:
        # Having empty tags causes problems, so don't create them.
        combined_mapping.setTag('Xs', overlap_seq_tag)
        combined_mapping.setTag('Xq', overlap_qual_tag)
        combined_mapping.setTag('Xw', use_overlap_from)

    return combined_mapping
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Strip the leading run of Q30+ mismatches from an alignment.

        Such mismatches are assumed to be untemplated additions made during
        reverse transcription. Every aligned read position that is visited
        is also tallied in type_counts under the key
        (qlen, strand-corrected read index, quality, ref base index,
        read base index).

        Unmapped alignments and alignments containing indels are returned
        unchanged after set_nongenomic_length(mapping, 0). If nothing needs
        trimming the original mapping is returned; otherwise a new
        pysam.AlignedRead holding the retained part is returned.

        Raises ValueError if every visited position was a trimmed mismatch.

        NOTE(review): a function with this same name is also defined earlier
        in this file; this later definition is the one in effect after
        import.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    on_reverse_strand = mapping.is_reverse

    # Walk the pairs starting from the end of the alignment being examined,
    # which is the right end for reverse-strand mappings.
    pairs = mapping.aligned_pairs
    if on_reverse_strand:
        pairs = pairs[::-1]
        to_index = utilities.base_to_complement_index
    else:
        to_index = utilities.base_to_index

    quals = fastq.decode_sanger(mapping.qual)

    num_leading_mismatches = 0
    in_leading_run = True
    first_ref_index = None

    for read_pos, ref_pos in pairs:
        if read_pos is None:
            # Not expected to happen: alignments with indels were already
            # sent back above.
            continue

        if on_reverse_strand:
            oriented_pos = mapping.qlen - 1 - read_pos
        else:
            oriented_pos = read_pos

        ref_base = region_fetcher(mapping.tid, ref_pos, ref_pos + 1)
        read_base = mapping.seq[read_pos]
        qual = quals[read_pos]

        key = (
            mapping.qlen,
            oriented_pos,
            qual,
            to_index[ref_base],
            to_index[read_base],
        )
        type_counts[key] += 1

        if in_leading_run:
            confident_mismatch = read_base != ref_base and qual >= 30
            if confident_mismatch:
                num_leading_mismatches += 1
            else:
                # First position that is kept; the run is over.
                first_ref_index = ref_pos
                in_leading_run = False

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if num_leading_mismatches == 0:
        return mapping

    # Reverse-strand reads lose bases from the right end of the stored
    # sequence (the count is nonzero here, so the negative stop is safe);
    # forward-strand reads lose them from the left end.
    if on_reverse_strand:
        keep_start, keep_stop = None, -num_leading_mismatches
    else:
        keep_start, keep_stop = num_leading_mismatches, None

    trimmed = pysam.AlignedRead()
    trimmed.qname = mapping.qname
    trimmed.tid = mapping.tid
    # first_ref_index is the reference position aligned to the first kept
    # base, which becomes the new pos for forward mappings. Trimming the
    # right end of a reverse mapping leaves pos where it was.
    trimmed.pos = mapping.pos if on_reverse_strand else first_ref_index
    trimmed.is_reverse = mapping.is_reverse
    trimmed.is_secondary = mapping.is_secondary
    trimmed.mapq = mapping.mapq
    trimmed.seq = mapping.seq[keep_start:keep_stop]
    trimmed.qual = mapping.qual[keep_start:keep_stop]
    trimmed.rnext = -1
    trimmed.pnext = -1

    kept_length = len(mapping.seq) - num_leading_mismatches
    if on_reverse_strand:
        # Drop cigar blocks from the end.
        truncate = sam.truncate_cigar_blocks_up_to
    else:
        # Drop cigar blocks from the beginning.
        truncate = sam.truncate_cigar_blocks_from_beginning
    trimmed.cigar = truncate(mapping.cigar, kept_length)

    return trimmed