Exemplo n.º 1
0
    def test_slice(self):
        'It tests slice_seq with fasta, fastq and multiline fastq SeqItems.'
        # fasta: the requested slice spans both sequence lines
        fasta = SeqWrapper(
            SEQITEM,
            SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
            'fasta')
        expected = SeqWrapper(
            SEQITEM,
            SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
            'fasta')
        assert slice_seq(fasta, 1, 5) == expected

        # fastq: residues and qualities have to be sliced together
        fastq = SeqWrapper(
            SEQITEM,
            SeqItem(name='seq', lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
            'fastq')
        sliced = slice_seq(fastq, 1, 3)
        assert list(get_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline fastq: the slice crosses the line boundary
        multiline_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                           'BBBB\n']
        multiline = SeqWrapper(SEQITEM,
                               SeqItem(name='seq', lines=multiline_lines),
                               'fastq-illumina-multiline')
        sliced = slice_seq(multiline, 1, 5)
        assert list(get_qualities(sliced)) == [1, 1, 1, 2]
        assert get_str_seq(sliced) == get_str_seq(multiline)[1:5]

        # open ended slices: either the stop or the start is None
        residues = 'aCTG'
        open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                                'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == residues[1:]
        assert get_str_seq(slice_seq(open_ended, None, 1)) == residues[:1]
Exemplo n.º 2
0
    def test_slice(self):
        'It tests slice_seq with fasta, fastq and multiline fastq SeqItems.'
        # fasta input whose slice covers residues from both sequence lines
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta_seq = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        expected_item = SeqItem(name='s1', lines=['>s1\n', 'CTGG\n'])
        expected_seq = SeqWrapper(SEQITEM, expected_item, 'fasta')
        assert slice_seq(fasta_seq, 1, 5) == expected_seq

        # single line fastq: the qualities follow the residues
        fastq_item = SeqItem(name='seq',
                             lines=['@seq\n', 'aata\n', '+\n', '!?!?\n'])
        fastq_slice = slice_seq(SeqWrapper(SEQITEM, fastq_item, 'fastq'),
                                1, 3)
        assert list(get_qualities(fastq_slice)) == [30, 0]
        assert get_str_seq(fastq_slice) == 'at'
        assert fastq_slice.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline illumina fastq
        multi_item = SeqItem(
            name='seq',
            lines=['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n', 'BBBB\n'])
        multi_seq = SeqWrapper(SEQITEM, multi_item,
                               'fastq-illumina-multiline')
        multi_slice = slice_seq(multi_seq, 1, 5)
        assert list(get_qualities(multi_slice)) == [1, 1, 1, 2]
        assert get_str_seq(multi_slice) == get_str_seq(multi_seq)[1:5]

        # None is accepted as the stop and as the start of the slice
        plain_seq = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                               'fasta')
        tail = get_str_seq(slice_seq(plain_seq, 1, None))
        assert tail == 'aCTG'[1:]

        head = get_str_seq(slice_seq(plain_seq, None, 1))
        assert head == 'aCTG'[:1]
Exemplo n.º 3
0
    def _do_trim(self, seq):
        '''It trims or masks the given seq following its recommendations.

        The trimming recommendations are read from the mapping returned by
        get_annotations and deleted from it once read.  A seq without
        recommendations is returned untouched.  When self.mask is set the
        recommended segments are masked, otherwise the seq is sliced to
        the limits given by get_longest_complementary_segment.

        It returns the processed seq or None when no sequence is left.
        '''
        annots = get_annotations(seq)
        if TRIMMING_RECOMMENDATIONS not in annots:
            return seq

        # The recommendations are consumed here, so they are removed from
        # the annotations.  The key is known to be present at this point.
        trim_rec = annots[TRIMMING_RECOMMENDATIONS]
        del annots[TRIMMING_RECOMMENDATIONS]

        # all the kinds of recommendations are pooled in one list
        trim_segments = []
        for trim_kind in TRIMMING_KINDS:
            trim_segments.extend(trim_rec.get(trim_kind, []))

        # masking
        if self.mask:
            return _mask_sequence(seq, trim_segments)

        # trimming
        if not trim_segments:
            return seq

        trim_limits = get_longest_complementary_segment(trim_segments,
                                                        get_length(seq))
        if trim_limits is None:
            # there's no sequence left
            return None

        if trim_limits:
            # the second limit gets a + 1 to be used as the slice stop
            seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
        return seq
Exemplo n.º 4
0
    def _do_trim(self, seq):
        '''It trims or masks the given seq following its recommendations.

        The trimming recommendations are read from the mapping returned by
        get_annotations and deleted from it once read.  A seq without
        recommendations is returned untouched.  When self.mask is set the
        recommended segments are masked, otherwise the seq is sliced to
        the limits given by get_longest_complementary_segment.

        It returns the processed seq or None when no sequence is left.
        '''
        annots = get_annotations(seq)
        if TRIMMING_RECOMMENDATIONS not in annots:
            return seq

        # The key is known to be present here, so it can be read and
        # removed without checking for it a second time.
        trim_rec = annots[TRIMMING_RECOMMENDATIONS]
        del annots[TRIMMING_RECOMMENDATIONS]

        # the segments of every trimming kind are gathered together
        trim_segments = []
        for trim_kind in TRIMMING_KINDS:
            trim_segments.extend(trim_rec.get(trim_kind, []))

        if self.mask:
            # masking
            return _mask_sequence(seq, trim_segments)

        if not trim_segments:
            # nothing to trim
            return seq

        # trimming
        trim_limits = get_longest_complementary_segment(trim_segments,
                                                        get_length(seq))
        if trim_limits is None:
            # there's no sequence left
            return None

        if trim_limits:
            # the second limit gets a + 1 to be used as the slice stop
            seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)
        return seq
Exemplo n.º 5
0
    def __call__(self, seqs):
        '''It trims or masks the edges of the given seqs.

        For every seq the trimming recommendations are read from the
        mapping returned by get_annotations and deleted from it.  A seq
        without recommendations is added to the result as a copy.  When
        self.mask is set the recommended segments are masked, otherwise
        the seq is sliced to the limits given by
        get_longest_complementary_segment.  The seqs for which no sequence
        is left are dropped.

        It returns a list with the processed seqs.
        '''
        mask = self.mask
        processed_seqs = []
        for seq in seqs:
            annots = get_annotations(seq)
            if TRIMMING_RECOMMENDATIONS not in annots:
                processed_seqs.append(copy_seq(seq))
                continue

            # The key is known to be present here, so it can be read and
            # removed without checking for it a second time.
            trim_rec = annots[TRIMMING_RECOMMENDATIONS]
            del annots[TRIMMING_RECOMMENDATIONS]

            # the segments of every trimming kind are gathered together
            trim_segments = []
            for trim_kind in TRIMMING_KINDS:
                trim_segments.extend(trim_rec.get(trim_kind, []))

            if mask:
                # masking
                seq = _mask_sequence(seq, trim_segments)
            elif trim_segments:
                # trimming
                trim_limits = get_longest_complementary_segment(
                                                trim_segments, get_length(seq))
                if trim_limits is None:
                    # there's no sequence left
                    continue
                if trim_limits:
                    # the second limit gets a + 1 to be used as slice stop
                    seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)

            processed_seqs.append(seq)

        return processed_seqs
Exemplo n.º 6
0
class MatePairSplitter(object):
    'It splits the input sequences with the provided linkers.'

    def __init__(self, linkers=None):
        '''It stores the linkers that will be looked for in the seqs.

        When linkers is None they are built from the LINKERS setting, one
        fasta SeqItem per linker, named after its index in the setting.
        '''
        if linkers is None:
            linkers = get_setting('LINKERS')
            # NOTE(review): lines is given here as one string and not as a
            # list of lines; confirm that the SeqItem consumers accept it.
            linkers = [SeqItem(str(index), '>%d\n%s\n' % (index, linker))
                       for index, linker in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        '''It splits a list of sequences with the provided linkers.

        seqs is used twice, once to write the fasta file that is matched
        against the linkers and once to do the splitting, so it should be
        a list and not a one-shot iterator.

        It returns a list with the resulting seqs.  The seqs for which the
        matcher reports None are kept as they are.
        '''
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is None:
                new_seqs.append(seq)
            else:
                # extend is used to avoid an inner loop that would rebind
                # the seq loop variable
                new_seqs.extend(self._split_by_mate_linker(seq, segments))
        return new_seqs

    def _split_by_mate_linker(self, seq, segments_info):
        '''It splits the seq using the matched linker segments.

        segments_info is a (segments, is_partial) tuple.  segments is a
        list of (start, end) linker locations with an inclusive end, that
        are assumed to be sorted by position (TODO confirm with the
        matcher).  is_partial is only used to add the _pl suffix to the
        names of the two seqs obtained from an internal single linker.

        It returns a list of seqs.  It raises a RuntimeError when a single
        segment ends after the sequence end.
        '''
        # The tuple is unpacked here because tuple parameters are not
        # valid syntax in Python 3.
        segments, is_partial = segments_info

        if not segments:
            return [copy_seq(seq)]

        seq_end = get_length(seq) - 1

        if len(segments) == 1:
            segment_start, segment_end = segments[0]
            if segment_start == 0:
                # linker at the beginning, only the right part is kept
                return [slice_seq(seq, segment_end + 1, None)]
            if segment_end == seq_end:
                # linker at the end, only the left part is kept
                return [slice_seq(seq, None, segment_start)]
            if segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)

            # internal linker, the seq is split into two mates
            new_seq1 = slice_seq(seq, None, segment_start)
            new_seq2 = slice_seq(seq, segment_end + 1, None)
            name = get_name(seq)
            if is_partial:
                name += '_pl'
            new_seq1 = copy_seq(new_seq1, name=name + r'\1')
            new_seq2 = copy_seq(new_seq2, name=name + r'\2')
            return [new_seq1, new_seq2]

        # several linkers, every region between them is a new part
        seqs = []
        counter = 1
        seq_start = 0
        for segment_start, segment_end in segments:
            if segment_start == 0:
                # There is nothing before a linker located at the
                # beginning, but the next part has to start after it.
                seq_start = segment_end + 1
                continue
            new_seq = slice_seq(seq, seq_start, segment_start)
            seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
            seqs.append(copy_seq(new_seq, name=seq_name))
            counter += 1
            seq_start = segment_end + 1

        # The segment ends are inclusive, so there is only a trailing part
        # when the last linker ends before the last residue.
        if segment_end < seq_end:
            new_seq = slice_seq(seq, segment_end + 1, None)
            seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
            seqs.append(copy_seq(new_seq, name=seq_name))
        return seqs