def test_slice(self):
    # Exercises slice_seq on SeqItem-backed sequences in fasta, fastq and
    # fastq-illumina formats, plus slices with an open start or stop.

    # fasta: the residues line is cut, the header line is kept
    fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n'])
    fasta_seq = SeqWrapper(SEQITEM, fasta_item, 'fasta')
    wanted_item = SeqItem(name='s1', lines=['>s1\n', 'CTGG\n'])
    wanted_seq = SeqWrapper(SEQITEM, wanted_item, 'fasta')
    assert slice_seq(fasta_seq, 1, 5) == wanted_seq

    # fastq: residues and qualities are cut together
    fastq_item = SeqItem(name='seq',
                         lines=['@seq\n', 'aata\n', '+\n', '!?!?\n'])
    fastq_seq = SeqWrapper(SEQITEM, fastq_item, 'fastq')
    fastq_slice = slice_seq(fastq_seq, 1, 3)
    assert list(get_int_qualities(fastq_slice)) == [30, 0]
    assert get_str_seq(fastq_slice) == 'at'
    assert fastq_slice.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # fastq-illumina: the slice must agree with slicing the plain string
    illumina_item = SeqItem(name='seq',
                            lines=['@seq\n', 'aaatcaaa\n', '+\n',
                                   '@AAABBBB\n'])
    illumina_seq = SeqWrapper(SEQITEM, illumina_item, 'fastq-illumina')
    illumina_slice = slice_seq(illumina_seq, 1, 5)
    assert list(get_int_qualities(illumina_slice)) == [1, 1, 1, 2]
    assert get_str_seq(illumina_slice) == get_str_seq(illumina_seq)[1: 5]

    # open-ended slices: stop is None, then start is None
    residues = 'aCTG'
    open_seq = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', residues]),
                          'fasta')
    assert get_str_seq(slice_seq(open_seq, 1, None)) == residues[1:]
    assert get_str_seq(slice_seq(open_seq, None, 1)) == residues[:1]
def test_slice(self):
    # slice_seq is checked against fasta, fastq and fastq-illumina
    # SeqItem sequences, and with None as either slice bound.

    def wrap(item, file_format):
        # Every fixture in this test is a SEQITEM-kind wrapper.
        return SeqWrapper(SEQITEM, item, file_format)

    # fasta
    whole = wrap(SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']), 'fasta')
    expected = wrap(SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']), 'fasta')
    assert slice_seq(whole, 1, 5) == expected

    # fastq
    fastq_lines = ['@seq\n', 'aata\n', '+\n', '!?!?\n']
    trimmed = slice_seq(wrap(SeqItem(name='seq', lines=fastq_lines),
                             'fastq'),
                        1, 3)
    assert list(get_int_qualities(trimmed)) == [30, 0]
    assert get_str_seq(trimmed) == 'at'
    assert trimmed.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # fastq-illumina
    illumina_lines = ['@seq\n', 'aaatcaaa\n', '+\n', '@AAABBBB\n']
    source = wrap(SeqItem(name='seq', lines=illumina_lines),
                  'fastq-illumina')
    piece = slice_seq(source, 1, 5)
    assert list(get_int_qualities(piece)) == [1, 1, 1, 2]
    assert get_str_seq(piece) == get_str_seq(source)[1:5]

    # None as the stop and None as the start
    unbounded = wrap(SeqItem('seq', ['>seq\n', 'aCTG']), 'fasta')
    tail = get_str_seq(slice_seq(unbounded, 1, None))
    head = get_str_seq(slice_seq(unbounded, None, 1))
    assert tail == 'aCTG'[1:]
    assert head == 'aCTG'[:1]
def _do_trim(self, seq):
    """It trims (or masks) the segments recommended in the seq annotations.

    The entry stored under TRIMMING_RECOMMENDATIONS in the mapping
    returned by get_annotations(seq) is read and then deleted from that
    mapping.  The segments found for every kind in TRIMMING_KINDS are
    pooled and handled in one of two ways:

    * self.mask is true: the result of _mask_sequence(seq, segments) is
      returned.
    * otherwise: the seq is sliced to the limits returned by
      get_longest_complementary_segment (the upper limit is inclusive,
      hence the + 1 when slicing).

    Returns the seq as given when it carries no trimming recommendations
    or, in trimming mode, when there are no segments to remove.  Returns
    None when get_longest_complementary_segment returns None, i.e. no
    sequence is left after trimming.
    """
    annots = get_annotations(seq)
    if TRIMMING_RECOMMENDATIONS not in annots:
        return seq

    # The recommendations are removed from the annotations as soon as
    # they are read; the key is known to be present at this point.
    trim_rec = annots[TRIMMING_RECOMMENDATIONS]
    del annots[TRIMMING_RECOMMENDATIONS]

    # Pool the segments of every trimming kind into one list.
    trim_segments = []
    for trim_kind in TRIMMING_KINDS:
        trim_segments.extend(trim_rec.get(trim_kind, []))

    # masking
    if self.mask:
        return _mask_sequence(seq, trim_segments)

    # trimming
    if not trim_segments:
        return seq

    trim_limits = get_longest_complementary_segment(trim_segments,
                                                    get_length(seq))
    if trim_limits is None:
        # there's no sequence left
        return None

    if trim_limits:
        seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
    return seq
class MatePairSplitter(object):
    """It splits the input sequences with the provided linkers.

    An instance is a callable: given some sequences it looks for the
    linkers inside them and returns the pieces left once the matched
    linker stretches are removed.
    """

    def __init__(self, linkers=None):
        """Store the linkers used to split the reads.

        linkers -- iterable with the linker sequences.  When None, the
        'LINKERS' setting is read and every entry is turned into a
        SeqItem named after its position and tagged as fasta.
        """
        if linkers is None:
            linkers = get_setting('LINKERS')
            linkers = [SeqItem(str(index), '>%d\n%s\n' % (index, linker))
                       for index, linker in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        """It splits a list of sequences with the provided linkers.

        The sequences are written to a fasta file that is matched
        against self.linkers with BlasterForFewSubjects (blastn,
        'blastn-short' task).  Matches need at least 13 residues and an
        identity of at least 87.0.

        Returns a new list: reads for which the matcher returns None
        are passed through as they are, the others are replaced by the
        pieces produced by _split_by_mate_linker.
        """
        # NOTE(review): seqs is handed to write_seqs and then looped over
        # again below, so a one-shot iterator would presumably be
        # exhausted by then -- confirm that callers always pass a list.
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is None:
                new_seqs.append(seq)
            else:
                new_seqs.extend(self._split_by_mate_linker(seq, segments))
        return new_seqs

    @staticmethod
    def _unmatched_ranges(segments, seq_len):
        """Return the (start, stop) slice bounds of the parts between linkers.

        segments -- (start, end) linker positions with inclusive ends;
        assumed to be sorted along the sequence -- TODO confirm against
        the matcher.
        seq_len -- length of the sequence being split.

        A linker that starts at position 0 has nothing before it, so it
        yields no range but still moves the start of the next range past
        its end.  The last range is open ended (stop is None) and is left
        out when the last linker ends on the last residue.
        """
        ranges = []
        next_start = 0
        for segment_start, segment_end in segments:
            if segment_start != 0:
                ranges.append((next_start, segment_start))
            # Always advance, even for a linker at the very beginning;
            # otherwise the following part would include that linker.
            next_start = segment_end + 1
        # Ends are inclusive: a last linker ending at seq_len - 1 leaves
        # next_start == seq_len and therefore nothing behind it.
        if next_start != seq_len:
            ranges.append((next_start, None))
        return ranges

    def _split_by_mate_linker(self, seq, matched_segments):
        """It splits the seq using the matched linker segments.

        matched_segments -- a (segments, is_partial) pair.  segments
        holds the (start, end) positions of the linkers inside seq, ends
        inclusive.  When is_partial is true, the two pieces of a single
        internal match get '_pl' added to their names.

        The pair is taken as one positional argument and unpacked here;
        tuple unpacking in the signature is not valid Python 3 syntax.

        Returns a list of sequences:

        * no segments: a copy of seq.
        * one segment touching the start or the end: the remaining side.
        * one internal segment: both sides, named name + r'\1' and
          name + r'\2'.
        * several segments: every part between linkers, named
          name + '_mlc.partN' with N starting at 1.

        Raises RuntimeError when a single segment that does not start at
        position 0 ends beyond the end of the sequence.
        """
        segments, is_partial = matched_segments

        if not segments:
            return [copy_seq(seq)]

        if len(segments) == 1:
            segment_start, segment_end = segments[0][0], segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                return [slice_seq(seq, segment_end + 1, None)]
            if segment_end == seq_end:
                return [slice_seq(seq, None, segment_start)]
            if segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)

            name = get_name(seq)
            if is_partial:
                name += '_pl'
            left = slice_seq(seq, None, segment_start)
            right = slice_seq(seq, segment_end + 1, None)
            return [copy_seq(left, name=name + r'\1'),
                    copy_seq(right, name=name + r'\2')]

        # Several linkers: keep every stretch between them.
        base_name = get_name(seq)
        ranges = self._unmatched_ranges(segments, get_length(seq))
        parts = []
        for counter, (start, stop) in enumerate(ranges, 1):
            part = slice_seq(seq, start, stop)
            part_name = base_name + '_mlc.part{0:d}'.format(counter)
            parts.append(copy_seq(part, name=part_name))
        return parts