def test_copy(self):
    """Check that copy_seq swaps the sequence for every SeqItem flavour."""
    # fasta: both sequence lines are replaced by the single new one and the
    # annotations are carried over
    fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                         annotations={'a': 'b'})
    source = SeqWrapper(SEQITEM, fasta_item, 'fasta')
    copied = copy_seq(source, seq='ACTG')
    expected = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n'],
                       annotations={'a': 'b'})
    assert copied.object == expected
    # the copy must not share the item nor its list of lines with the source
    assert source.object is not copied.object
    assert source.object.lines is not copied.object.lines

    # fastq: only the sequence line changes, the quality line is kept
    fastq_item = SeqItem(name='seq',
                         lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
    source = SeqWrapper(SEQITEM, fastq_item, 'fastq')
    copied = copy_seq(source, seq='ACTG')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])
    assert copied.object == expected

    # multiline fastq: the copy is expected as a single-line record, with
    # the two quality lines joined
    multiline_item = SeqItem(name='seq',
                             lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n',
                                    '@AAA\n', 'BBBB\n'])
    source = SeqWrapper(SEQITEM, multiline_item, 'fastq-illumina-multiline')
    copied = copy_seq(source, seq='ACTGactg')
    expected = SeqItem(name='seq',
                       lines=['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
    assert copied.object == expected
def test_copy(self):
    """copy_seq should replace the sequence in fasta and fastq SeqItems."""
    # --- fasta ---
    wrapped = SeqWrapper(
        SEQITEM,
        SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                annotations={'a': 'b'}),
        'fasta')
    duplicate = copy_seq(wrapped, seq='ACTG')
    assert duplicate.object == SeqItem(name='s1', lines=['>s1\n', 'ACTG\n'],
                                       annotations={'a': 'b'})
    # neither the SeqItem nor its lines list may be shared with the original
    assert wrapped.object is not duplicate.object
    assert wrapped.object.lines is not duplicate.object.lines

    # --- fastq ---
    wrapped = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '!???\n']),
        'fastq')
    duplicate = copy_seq(wrapped, seq='ACTG')
    assert duplicate.object == SeqItem(
        name='seq', lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])

    # --- multiline fastq: the expected copy has one sequence line and one
    # quality line ---
    wrapped = SeqWrapper(
        SEQITEM,
        SeqItem(name='seq',
                lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                       'BBBB\n']),
        'fastq-illumina-multiline')
    duplicate = copy_seq(wrapped, seq='ACTGactg')
    assert duplicate.object == SeqItem(
        name='seq', lines=['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
def test_change_name(self):
    """Check that copy_seq rewrites the title line when given a new name."""
    # fastq: the '@' line gets the new name and the expected '+' line no
    # longer repeats the name
    fastq_item = SeqItem(name='seq',
                         lines=['@seq\n', 'aaaa\n', '+seq\n', '!???\n'])
    wrapped = SeqWrapper(SEQITEM, fastq_item, 'fastq')
    renamed = copy_seq(wrapped, name='seq2')
    expected = ('seq2', ['@seq2\n', 'aaaa\n', '+\n', '!???\n'], {})
    assert renamed.object == expected

    # fasta: only the '>' line changes
    fasta_item = SeqItem(name='seq', lines=['>seq\n', 'aaaa\n'])
    wrapped = SeqWrapper(SEQITEM, fasta_item, 'fasta')
    renamed = copy_seq(wrapped, name='seq2')
    expected = ('seq2', ['>seq2\n', 'aaaa\n'], {})
    assert renamed.object == expected
def __call__(self, seqs):
    """Return copies of the given seqs with the letter case changed.

    The conversion is selected by ``self.action`` (UPPERCASE, LOWERCASE or
    SWAPCASE). Any other action raises NotImplementedError as soon as the
    first sequence is processed.
    """
    wanted_case = self.action
    converted_seqs = []
    for seq in seqs:
        letters = get_str_seq(seq)
        if wanted_case == UPPERCASE:
            letters = letters.upper()
        elif wanted_case == LOWERCASE:
            letters = letters.lower()
        elif wanted_case == SWAPCASE:
            letters = letters.swapcase()
        else:
            raise NotImplementedError()
        # the input sequence is left untouched, a modified copy is returned
        converted_seqs.append(copy_seq(seq, seq=letters))
    return converted_seqs
def _mask_sequence(seq, segments):
    """Return a copy of seq with the given segments in lowercase.

    segments holds the regions to mask. When it is empty the very same seq
    object is returned, not a copy.

    The segments are merged and then completed with get_all_segments.
    Each resulting item is read as ((start, end), is_masked) with an
    inclusive end; the sequence is rebuilt piece by piece, lowercasing the
    pieces whose flag is true.
    NOTE(review): assumes get_all_segments covers the whole sequence with
    consecutive segments -- confirm against its implementation.
    """
    if not segments:
        return seq

    segments = merge_overlaping_segments(segments)
    segments = get_all_segments(segments, get_length(seq))

    str_seq = get_str_seq(seq)
    # Collect the pieces and join them once instead of growing a string
    # with += inside the loop.
    pieces = []
    for segment in segments:
        start = segment[0][0]
        end = segment[0][1] + 1    # segment ends are inclusive
        piece = str_seq[start:end]
        if segment[1]:
            piece = piece.lower()
        pieces.append(piece)
    new_seq = ''.join(pieces)

    if seq.kind == SEQRECORD:
        # SeqRecords need a Seq object that keeps the original alphabet
        new_seq = Seq(new_seq, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=new_seq)
def __call__(self, seqs):
    """Trim or mask the given seqs following their trimming recommendations.

    Sequences without TRIMMING_RECOMMENDATIONS in their annotations are
    copied as they are. For the others the recommendations are removed from
    the annotations and the recommended segments of every kind in
    TRIMMING_KINDS are either masked (when ``self.mask`` is true) or trimmed
    away. Sequences left with nothing after the trimming are dropped.
    """
    mask = self.mask
    processed_seqs = []
    for seq in seqs:
        annots = get_annotations(seq)
        if TRIMMING_RECOMMENDATIONS not in annots:
            processed_seqs.append(copy_seq(seq))
            continue

        # the recommendations are consumed here, so they are removed from
        # the annotations
        trim_rec = annots[TRIMMING_RECOMMENDATIONS]
        del annots[TRIMMING_RECOMMENDATIONS]

        trim_segments = [segment
                         for trim_kind in TRIMMING_KINDS
                         for segment in trim_rec.get(trim_kind, [])]

        if mask:
            processed_seqs.append(_mask_sequence(seq, trim_segments))
            continue

        # trimming
        if trim_segments:
            trim_limits = get_longest_complementary_segment(trim_segments,
                                                            get_length(seq))
            if trim_limits is None:
                # there's no sequence left
                continue
            if trim_limits:
                seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
        processed_seqs.append(seq)
    return processed_seqs
class MatePairSplitter(object):
    """It splits the input sequences with the provided linkers."""

    def __init__(self, linkers=None):
        """Store the linkers used to split the reads.

        linkers: iterable with the linker sequences. When None the linkers
        are taken from the 'LINKERS' setting and wrapped as fasta SeqItems.
        """
        if linkers is None:
            linkers = get_setting('LINKERS')
            # NOTE(review): the SeqItem lines are given here as one string,
            # not as a list of lines -- confirm that this is intended.
            linkers = [SeqItem(str(index), '>%d\n%s\n' % (index, linker))
                       for index, linker in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        """Split a list of sequences with the provided linkers.

        seqs is traversed twice (once to write the fasta file and once to
        split), so it has to be a re-iterable collection, not a one-shot
        iterator.

        Returns a list with the resulting sequences. Reads for which the
        matcher returns None are kept as they are.
        """
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            # extend avoids rebinding the loop variable seq
            new_seqs.extend(split_seqs)
        return new_seqs

    def _split_by_mate_linker(self, seq, segments_and_partial):
        """Split seq using the linker segments.

        segments_and_partial: a (segments, is_partial) pair. segments is a
        list of (start, end) linker locations with inclusive ends and
        is_partial tells whether the match was elongated. The pair is
        unpacked in the body because tuple parameters are a syntax error in
        Python 3.

        Returns a list with the sequences found around the linkers.
        Raises RuntimeError if a single linker ends after the sequence end.
        """
        segments, is_partial = segments_and_partial

        if not segments:
            return [copy_seq(seq)]

        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                # linker at the beginning: keep what follows
                return [slice_seq(seq, segment_end + 1, None)]
            if segment_end == seq_end:
                # linker at the end: keep what precedes
                return [slice_seq(seq, None, segment_start)]
            if segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)

            new_seq1 = slice_seq(seq, None, segment_start)
            new_seq2 = slice_seq(seq, segment_end + 1, None)
            name = get_name(seq)
            if is_partial:
                name += '_pl'
            new_seq1 = copy_seq(new_seq1, name=name + r'\1')
            new_seq2 = copy_seq(new_seq2, name=name + r'\2')
            return [new_seq1, new_seq2]

        # several linkers: one part per stretch found between the linkers
        parts = []
        counter = 1
        seq_start = 0
        for segment_start, segment_end in segments:
            if segment_start != 0:
                new_seq = slice_seq(seq, seq_start, segment_start)
                seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                parts.append(copy_seq(new_seq, name=seq_name))
                counter += 1
            # The next part starts after this linker, also when the linker
            # sits at the very beginning and yields no part of its own.
            seq_start = segment_end + 1

        # Segment ends are inclusive, so the last linker reaches the end of
        # the read when it ends at length - 1; only then nothing is left.
        if segment_end != get_length(seq) - 1:
            new_seq = slice_seq(seq, segment_end + 1, None)
            name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
            parts.append(copy_seq(new_seq, name=name))
        return parts