def test_match_probability():
    """Compare ``match_probability(3, 5)`` with a hand-computed value.

    The expected value is the sum of the binomial terms for exactly 3, 4
    and 5 successes out of 5 trials with a per-trial probability of 0.25.
    """
    aligner = InsertAligner('TTAGACATAT', 'CAGTGGAGTA')
    matches, size = 3, 5
    # 120 == 5!, so 120 / (6 * 2) == C(5, 3) and 120 / 24 == C(5, 4).
    exactly_three = (120 / (6 * 2)) * (0.25 ** 3) * (0.75 ** 2)
    exactly_four = (120 / 24) * (0.25 ** 4) * 0.75
    exactly_five = 0.25 ** 5
    expected = exactly_three + exactly_four + exactly_five
    observed = aligner.match_probability(matches, size)
    assert approx_equal(observed, expected, 0.0001)
def test_short_adapter_overlap():
    """Both reads of the pair should report a 2-base adapter match at 28."""
    adapters = ('TTAGACATAT', 'CAGTGGAGTA')
    reads = (
        'GACAGGCCGTTTGAATGTTGACGGGATGTT',
        'CATCCCGTCAACATTCAAACGGCCTGTCCA',
    )
    result = InsertAligner(*adapters).match_insert(*reads)
    # match_insert yields (insert match, adapter match 1, adapter match 2);
    # only the two adapter matches are checked here.
    _, first, second = result
    for adapter_match in (first, second):
        assert adapter_match.rstart == 28
        assert adapter_match.length == 2
def test_insert_align():
    """A 20-base insert followed by 10 adapter bases is found on both reads."""
    adapter_a = 'TTAGACATATGG'
    adapter_b = 'CAGTGGAGTATA'
    # Each read is a 20-base insert followed by the first 10 adapter bases.
    read_a = 'AGTCGAGCCCATTGCAGACT' + adapter_a[:10]
    read_b = 'AGTCTGCAATGGGCTCGACT' + adapter_b[:10]
    result = InsertAligner(adapter_a, adapter_b).match_insert(read_a, read_b)
    _, first, second = result
    for adapter_match in (first, second):
        assert adapter_match.rstart == 20
        assert adapter_match.length == 10
def __init__(
        self, adapter1, adapter2, action='trim', mismatch_action=None,
        symmetric=True, min_insert_overlap=1, **aligner_args):
    """Store the adapters and build the insert aligner.

    Args:
        adapter1, adapter2: Adapter objects. Each must expose a
            ``sequence`` attribute, which is handed to
            :class:`InsertAligner`.
        action: Stored unchanged as ``self.action``.
        mismatch_action: Forwarded to ``ErrorCorrectorMixin.__init__``.
        symmetric: Stored unchanged as ``self.symmetric``.
        min_insert_overlap: Forwarded to :class:`InsertAligner` and also
            kept as ``self.min_insert_len``.
        aligner_args: Extra keyword arguments for :class:`InsertAligner`.
    """
    ErrorCorrectorMixin.__init__(self, mismatch_action)
    self.adapter1, self.adapter2 = adapter1, adapter2
    insert_aligner = InsertAligner(
        adapter1.sequence,
        adapter2.sequence,
        min_insert_overlap=min_insert_overlap,
        **aligner_args)
    self.aligner = insert_aligner
    self.min_insert_len = min_insert_overlap
    self.action = action
    self.symmetric = symmetric
    # One counter per read of the pair, both starting at zero.
    self.with_adapters = [0, 0]
def __init__(self, adapter1, adapter2, action='trim', mismatch_action=None,
             symmetric=True, min_insert_overlap=1, **aligner_args):
    """Initialise the error-correction mixin, the aligner and the settings.

    Args:
        adapter1, adapter2: Adapter objects; their ``sequence`` attributes
            are passed to :class:`InsertAligner`.
        action: Value kept in ``self.action``.
        mismatch_action: Passed on to ``ErrorCorrectorMixin.__init__``.
        symmetric: Value kept in ``self.symmetric``.
        min_insert_overlap: Given to the aligner and kept in
            ``self.min_insert_len``.
        aligner_args: Additional keyword arguments for the aligner.
    """
    ErrorCorrectorMixin.__init__(self, mismatch_action)

    self.adapter1 = adapter1
    self.adapter2 = adapter2

    sequences = (adapter1.sequence, adapter2.sequence)
    self.aligner = InsertAligner(
        *sequences, min_insert_overlap=min_insert_overlap, **aligner_args)

    self.min_insert_len = min_insert_overlap
    self.action = action
    self.symmetric = symmetric
    # Two zeroed counters, indexed by read position within the pair.
    self.with_adapters = [0, 0]
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """Paired-end adapter cutter that looks for insert overlap first.

    The read pair is handed to an :class:`InsertAligner`; only when that
    yields no match are the two adapters matched against their reads
    individually.

    Args:
        adapter1, adapter2: Adapters.
        action: What to do on an adapter match: 'trim', 'mask' (pad the
            trimmed read back to its original length with N's), 'lower'
            (not implemented yet; currently the read is simply trimmed),
            or None (leave the read untouched).
        mismatch_action: How to deal with mismatches. See
            :class:`ErrorCorrectorMixin`.
        symmetric: Whether to assume that the adapter should appear in the
            same place on overlapping reads.
        min_insert_overlap: Minimum overlap required between reads to be
            considered an insert match.
        aligner_args: Additional arguments to :class:`InsertAligner`.
    """

    def __init__(
            self, adapter1, adapter2, action='trim', mismatch_action=None,
            symmetric=True, min_insert_overlap=1, **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1, self.adapter2 = adapter1, adapter2
        self.aligner = InsertAligner(
            adapter1.sequence,
            adapter2.sequence,
            min_insert_overlap=min_insert_overlap,
            **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Per-read counters, incremented by trim().
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Process one read pair and return the (possibly trimmed) pair.

        Pairs in which either read is shorter than ``min_insert_len`` are
        returned unchanged.
        """
        len1, len2 = len(read1), len(read2)
        if len1 < self.min_insert_len or len2 < self.min_insert_len:
            return (read1, read2)

        def overlap_at(rstart):
            # Synthetic insert match handed to correct_errors() when the
            # aligner itself did not produce one.
            return (len2 - rstart, len2, 0, rstart)

        found = self.aligner.match_insert(read1.sequence, read2.sequence)
        has_overlap = found is not None
        read1.insert_overlap = has_overlap
        read2.insert_overlap = has_overlap

        insert_match = None
        do_correction = False

        if found:
            insert_match, adapter_match1, adapter_match2 = found
            # NOTE(review): index 5 is presumably the mismatch count of the
            # insert alignment -- confirm against InsertAligner.
            do_correction = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # Adapters starting at the same offset on both reads are treated
            # as complementary, which enables error correction.
            complementary = (
                self.mismatch_action and adapter_match1 and adapter_match2
                and adapter_match1.rstart == adapter_match2.rstart)
            if complementary:
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        # Exactly one adapter alignment succeeded: in symmetric mode, reuse
        # it for the other read.
        if self.symmetric and bool(adapter_match1) != bool(adapter_match2):
            if adapter_match1:
                adapter_match2 = self._mirror_match(adapter_match1, len2)
            else:
                adapter_match1 = self._mirror_match(adapter_match2, len1)

            if (self.mismatch_action and not insert_match
                    and adapter_match1 and adapter_match2):
                # Assume the symmetric read segments overlap and correct
                # errors on that basis.
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        if do_correction:
            self.correct_errors(read1, read2, insert_match)

        trimmed1 = self.trim(read1, self.adapter1, adapter_match1, 0)
        trimmed2 = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (trimmed1, trimmed2)

    @staticmethod
    def _mirror_match(match, read_len):
        """Return a copy of ``match`` for the mate read, or None.

        None is returned when the match starts beyond ``read_len``.
        """
        if match.rstart > read_len:
            return None
        mirrored = match.copy()
        # Intended for unequal-length reads where this read is the shorter
        # one: move the match end to the read length. The 'matches' and
        # 'errors' attributes will then be wrong, but that should not matter.
        # NOTE(review): the condition fires when rstop is *below* read_len,
        # which looks at odds with the description above -- confirm.
        if mirrored.rstop < read_len:
            gap = read_len - mirrored.rstop
            mirrored.astop -= gap
            mirrored.rstop = read_len
        return mirrored

    def trim(self, read, adapter, match, read_idx):
        """Apply the configured action for one adapter match.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means "no adapter".
            read_idx: 0 or 1, the slot of ``with_adapters`` to increment.

        Returns:
            ``read`` itself when nothing is cut, otherwise the object
            returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        untouched = self.action is None or match.rstart >= len(read)
        if untouched:
            result = read
        else:
            result = adapter.trimmed(match)
            if self.action == 'mask':
                # Pad with N up to the original length and restore the
                # original qualities.
                padding = 'N' * (len(read) - len(result))
                result.sequence = result.sequence + padding
                result.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        result.match = match
        result.match_info = [match.get_info_record()]
        self.with_adapters[read_idx] += 1
        return result

    def summarize(self):
        """Return a summary dict.

        The dict holds the per-read adapter counters and one
        ``{name: summary}`` dict per adapter; error-correction statistics
        are merged in when ``mismatch_action`` is set.
        """
        per_adapter = tuple(
            {adapter.name: adapter.summarize()}
            for adapter in (self.adapter1, self.adapter2))
        report = {
            'records_with_adapters': self.with_adapters,
            'adapters': per_adapter,
        }
        if self.mismatch_action:
            report.update(ErrorCorrectorMixin.summarize(self))
        return report
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """Paired-end adapter cutter that looks for insert overlap first.

    An :class:`InsertAligner` is tried on the read pair; when it finds no
    match, each adapter is matched against its own read instead.
    """

    def __init__(
            self, adapter1, adapter2, action='trim', mismatch_action=None,
            symmetric=True, min_insert_overlap=1, **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1, self.adapter2 = adapter1, adapter2
        self.aligner = InsertAligner(
            adapter1.sequence,
            adapter2.sequence,
            min_insert_overlap=min_insert_overlap,
            **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Per-read counters, incremented by trim().
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Process one read pair and return the (possibly trimmed) pair.

        Pairs in which either read is shorter than ``min_insert_len`` are
        returned unchanged.
        """
        len1, len2 = len(read1), len(read2)
        if len1 < self.min_insert_len or len2 < self.min_insert_len:
            return (read1, read2)

        def overlap_at(rstart):
            # Synthetic insert match handed to correct_errors() when the
            # aligner itself did not produce one.
            return (len2 - rstart, len2, 0, rstart)

        found = self.aligner.match_insert(read1.sequence, read2.sequence)
        has_overlap = found is not None
        read1.insert_overlap = has_overlap
        read2.insert_overlap = has_overlap

        insert_match = None
        do_correction = False

        if found:
            insert_match, adapter_match1, adapter_match2 = found
            # NOTE(review): index 5 is presumably the mismatch count of the
            # insert alignment -- confirm against InsertAligner.
            do_correction = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # Adapters starting at the same offset on both reads are treated
            # as complementary, which enables error correction.
            complementary = (
                self.mismatch_action and adapter_match1 and adapter_match2
                and adapter_match1.rstart == adapter_match2.rstart)
            if complementary:
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        # Exactly one adapter alignment succeeded: in symmetric mode,
        # duplicate the good alignment for the other read.
        if self.symmetric and bool(adapter_match1) != bool(adapter_match2):
            if adapter_match1:
                adapter_match2 = adapter_match1.copy()
            else:
                adapter_match1 = adapter_match2.copy()

            if self.mismatch_action and not insert_match:
                # Assume the symmetric read segments overlap and correct
                # errors on that basis.
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        if do_correction:
            self.correct_errors(read1, read2, insert_match)

        trimmed1 = self.trim(read1, self.adapter1, adapter_match1, 0)
        trimmed2 = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (trimmed1, trimmed2)

    def trim(self, read, adapter, match, read_idx):
        """Apply the configured action for one adapter match.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means "no adapter".
            read_idx: 0 or 1, the slot of ``with_adapters`` to increment.

        Returns:
            ``read`` itself when nothing is cut, otherwise the object
            returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        untouched = self.action is None or match.rstart >= len(read)
        if untouched:
            result = read
        else:
            result = adapter.trimmed(match)
            if self.action == 'mask':
                # Pad with N up to the original length and restore the
                # original qualities.
                padding = 'N' * (len(read) - len(result))
                result.sequence = result.sequence + padding
                result.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        result.match = match
        result.match_info = [match.get_info_record()]
        self.with_adapters[read_idx] += 1
        return result
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """AdapterCutter that uses InsertAligner to first try to identify
    insert overlap before falling back to semi-global adapter alignment.

    Args:
        adapter1, adapter2: Adapters; their ``sequence`` attributes are
            passed to :class:`InsertAligner`.
        action: What to do on an adapter match: 'trim', 'mask' (pad the
            trimmed read back to its original length with N's), 'lower'
            (not implemented yet; currently the read is simply trimmed),
            or None (leave the read untouched).
        mismatch_action: Passed to :class:`ErrorCorrectorMixin`; when
            truthy, error correction is attempted on overlapping reads.
        symmetric: When exactly one adapter alignment succeeds, reuse it
            for the other read.
        min_insert_overlap: Forwarded to :class:`InsertAligner`; reads
            shorter than this are returned unprocessed.
        aligner_args: Additional arguments to :class:`InsertAligner`.
    """

    def __init__(self, adapter1, adapter2, action='trim',
                 mismatch_action=None, symmetric=True, min_insert_overlap=1,
                 **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1 = adapter1
        self.adapter2 = adapter2
        self.aligner = InsertAligner(
            adapter1.sequence, adapter2.sequence,
            min_insert_overlap=min_insert_overlap, **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Per-read counters, incremented by trim().
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Process one read pair and return the (possibly trimmed) pair.

        Pairs in which either read is shorter than ``min_insert_len`` are
        returned unchanged.
        """
        read_lengths = [len(r) for r in (read1, read2)]
        if any(l < self.min_insert_len for l in read_lengths):
            return (read1, read2)

        match = self.aligner.match_insert(read1.sequence, read2.sequence)
        read1.insert_overlap = read2.insert_overlap = (match is not None)
        insert_match = None
        correct_errors = False

        if match:
            insert_match, adapter_match1, adapter_match2 = match
            # NOTE(review): index 5 is presumably the mismatch count of the
            # insert alignment -- confirm against InsertAligner.
            correct_errors = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # If the adapter matches are complementary, perform error
            # correction
            if (self.mismatch_action and adapter_match1 and adapter_match2
                    and adapter_match1.rstart == adapter_match2.rstart):
                insert_match = (
                    read_lengths[1] - adapter_match1.rstart,
                    read_lengths[1], 0, adapter_match1.rstart)
                correct_errors = True

        # If exactly one of the two alignments failed and symmetric is True,
        # duplicate the good alignment
        if self.symmetric and sum(
                bool(m) for m in (adapter_match1, adapter_match2)) == 1:
            if adapter_match1:
                adapter_match2 = adapter_match1.copy()
            else:
                adapter_match1 = adapter_match2.copy()

            if self.mismatch_action and not insert_match:
                # Assume that the symmetric read segments overlap and
                # perform error correction
                insert_match = (
                    read_lengths[1] - adapter_match1.rstart,
                    read_lengths[1], 0, adapter_match1.rstart)
                correct_errors = True

        if correct_errors:
            self.correct_errors(read1, read2, insert_match)

        return (
            self.trim(read1, self.adapter1, adapter_match1, 0),
            self.trim(read2, self.adapter2, adapter_match2, 1)
        )

    def trim(self, read, adapter, match, read_idx):
        """Trim an adapter from a read.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means "no adapter".
            read_idx: 0 or 1, the slot of ``with_adapters`` to increment.

        Returns:
            ``read`` itself when nothing is cut, otherwise the object
            returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        if self.action is None or match.rstart >= len(read):
            trimmed_read = read
        else:
            trimmed_read = adapter.trimmed(match)
            if self.action == 'mask':
                # Pad the trimmed sequence back to the original length with
                # N. The repeat count must be parenthesised: without the
                # parentheses the expression is (str - int), which raises
                # TypeError.
                masked_sequence = trimmed_read.sequence
                masked_sequence += 'N' * (len(read) - len(trimmed_read))
                # set masked sequence as sequence with original quality
                trimmed_read.sequence = masked_sequence
                trimmed_read.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        trimmed_read.match = match
        trimmed_read.match_info = [match.get_info_record()]

        self.with_adapters[read_idx] += 1
        return trimmed_read
class InsertAdapterCutter(ReadPairModifier, ErrorCorrectorMixin):
    """Paired-end adapter cutter that looks for insert overlap first.

    The read pair is handed to an :class:`InsertAligner`; only when that
    yields no match are the two adapters matched against their reads
    individually.

    Args:
        adapter1, adapter2: Adapters.
        action: What to do on an adapter match: 'trim', 'mask' (pad the
            trimmed read back to its original length with N's), 'lower'
            (not implemented yet; currently the read is simply trimmed),
            or None (leave the read untouched).
        mismatch_action: How to deal with mismatches. See
            :class:`ErrorCorrectorMixin`.
        symmetric: Whether to assume that the adapter should appear in the
            same place on overlapping reads.
        min_insert_overlap: Minimum overlap required between reads to be
            considered an insert match.
        aligner_args: Additional arguments to :class:`InsertAligner`.
    """

    def __init__(
            self, adapter1, adapter2, action='trim', mismatch_action=None,
            symmetric=True, min_insert_overlap=1, **aligner_args):
        ErrorCorrectorMixin.__init__(self, mismatch_action)
        self.adapter1, self.adapter2 = adapter1, adapter2
        self.aligner = InsertAligner(
            adapter1.sequence,
            adapter2.sequence,
            min_insert_overlap=min_insert_overlap,
            **aligner_args)
        self.min_insert_len = min_insert_overlap
        self.action = action
        self.symmetric = symmetric
        # Per-read counters, incremented by trim().
        self.with_adapters = [0, 0]

    def __call__(self, read1, read2):
        """Process one read pair and return the (possibly trimmed) pair.

        Pairs in which either read is shorter than ``min_insert_len`` are
        returned unchanged.
        """
        len1, len2 = len(read1), len(read2)
        if len1 < self.min_insert_len or len2 < self.min_insert_len:
            return (read1, read2)

        def overlap_at(rstart):
            # Synthetic insert match handed to correct_errors() when the
            # aligner itself did not produce one.
            return (len2 - rstart, len2, 0, rstart)

        found = self.aligner.match_insert(read1.sequence, read2.sequence)
        has_overlap = found is not None
        read1.insert_overlap = has_overlap
        read2.insert_overlap = has_overlap

        insert_match = None
        do_correction = False

        if found:
            insert_match, adapter_match1, adapter_match2 = found
            # NOTE(review): index 5 is presumably the mismatch count of the
            # insert alignment -- confirm against InsertAligner.
            do_correction = self.mismatch_action and insert_match[5] > 0
        else:
            adapter_match1 = self.adapter1.match_to(read1)
            adapter_match2 = self.adapter2.match_to(read2)
            # Adapters starting at the same offset on both reads are treated
            # as complementary, which enables error correction.
            complementary = (
                self.mismatch_action and adapter_match1 and adapter_match2
                and adapter_match1.rstart == adapter_match2.rstart)
            if complementary:
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        # Exactly one adapter alignment succeeded: in symmetric mode, reuse
        # it for the other read.
        if self.symmetric and bool(adapter_match1) != bool(adapter_match2):
            if adapter_match1:
                adapter_match2 = self._mirror_match(adapter_match1, len2)
            else:
                adapter_match1 = self._mirror_match(adapter_match2, len1)

            if (self.mismatch_action and not insert_match
                    and adapter_match1 and adapter_match2):
                # Assume the symmetric read segments overlap and correct
                # errors on that basis.
                insert_match = overlap_at(adapter_match1.rstart)
                do_correction = True

        if do_correction:
            self.correct_errors(
                read1, read2, insert_match, truncate_seqs=True)

        trimmed1 = self.trim(read1, self.adapter1, adapter_match1, 0)
        trimmed2 = self.trim(read2, self.adapter2, adapter_match2, 1)
        return (trimmed1, trimmed2)

    @staticmethod
    def _mirror_match(match, read_len):
        """Return a copy of ``match`` for the mate read, or None.

        None is returned when the match starts beyond ``read_len``.
        """
        if match.rstart > read_len:
            return None
        mirrored = match.copy()
        # Intended for unequal-length reads where this read is the shorter
        # one: move the match end to the read length. The 'matches' and
        # 'errors' attributes will then be wrong, but that should not matter.
        # NOTE(review): the condition fires when rstop is *below* read_len,
        # which looks at odds with the description above -- confirm.
        if mirrored.rstop < read_len:
            gap = read_len - mirrored.rstop
            mirrored.astop -= gap
            mirrored.rstop = read_len
        return mirrored

    def trim(self, read, adapter, match, read_idx):
        """Apply the configured action for one adapter match.

        Args:
            read: The read to trim from.
            adapter: The Adapter to trim.
            match: The match details; a falsy value means "no adapter".
            read_idx: 0 or 1, the slot of ``with_adapters`` to increment.

        Returns:
            ``read`` itself when nothing is cut, otherwise the object
            returned by ``adapter.trimmed``.
        """
        if not match:
            read.match = None
            read.match_info = None
            return read

        match.adapter = adapter
        match.read = read
        match.front = False

        untouched = self.action is None or match.rstart >= len(read)
        if untouched:
            result = read
        else:
            result = adapter.trimmed(match)
            if self.action == 'mask':
                # Pad with N up to the original length and restore the
                # original qualities.
                padding = 'N' * (len(read) - len(result))
                result.sequence = result.sequence + padding
                result.qualities = read.qualities
            elif self.action == 'lower':
                # TODO: offer option to mask with lower-case of trimmed base
                # This will happen as part of the refactoring to modify
                # Sequences in-place.
                pass

        result.match = match
        result.match_info = [match.get_info_record()]
        self.with_adapters[read_idx] += 1
        return result

    def summarize(self):
        """Return a summary dict.

        The dict holds the per-read adapter counters and one
        ``{name: summary}`` dict per adapter; error-correction statistics
        are merged in when ``mismatch_action`` is set.
        """
        per_adapter = tuple(
            {adapter.name: adapter.summarize()}
            for adapter in (self.adapter1, self.adapter2))
        report = {
            'records_with_adapters': self.with_adapters,
            'adapters': per_adapter,
        }
        if self.mismatch_action:
            report.update(ErrorCorrectorMixin.summarize(self))
        return report