Example #1
    def test_len(self):
        # with fasta
        seq = SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fasta')
        assert get_length(seq) == 8

        # with fastq
        seq = SeqItem(name='seq', lines=['@seq\n', 'aaaa\n', '+\n', '????\n'])
        seq = SeqWrapper(SEQITEM, seq, 'fastq')
        assert get_length(seq) == 4
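A rough sketch of what get_length has to compute for the two SeqItem layouts used in this test (hypothetical helper, not the ngs_crumbs implementation): sum the residue lines for a FASTA record, take the single residue line for a four-line FASTQ record.

def seqitem_length(lines, file_format):
    # Hypothetical illustration only; the real get_length may work differently.
    if file_format == 'fasta':
        # skip the '>' title line and sum the residue lines
        return sum(len(line.rstrip('\n')) for line in lines[1:])
    if file_format == 'fastq':
        # title, residues, '+', qualities; single-line records assumed
        return len(lines[1].rstrip('\n'))
    raise ValueError('unsupported format: %s' % file_format)

assert seqitem_length(['>s1\n', 'ACTGGTAC\n'], 'fasta') == 8
assert seqitem_length(['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq') == 4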
Example #2
File: trim.py Project: fw1121/ngs_crumbs
    def _do_trim(self, seq):
        'It trims the edges of the given seqs.'
        annots = get_annotations(seq)
        if TRIMMING_RECOMMENDATIONS not in annots:
            return seq

        trim_rec = annots[TRIMMING_RECOMMENDATIONS]
        # the recommendations are consumed below, so remove them from the
        # annotations
        del annots[TRIMMING_RECOMMENDATIONS]

        trim_segments = []
        for trim_kind in TRIMMING_KINDS:
            trim_segments.extend(trim_rec.get(trim_kind, []))

        # masking
        if self.mask:
            seq = _mask_sequence(seq, trim_segments)
        else:
            # trimming
            if trim_segments:
                trim_limits = get_longest_complementary_segment(
                                            trim_segments, get_length(seq))
                if trim_limits is None:
                    # there's no sequence left
                    return None
            else:
                trim_limits = []

            if trim_limits:
                seq = slice_seq(seq, trim_limits[0], trim_limits[1] + 1)

        return seq
Example #3
File: trim.py Project: fw1121/ngs_crumbs
    def _do_trim(self, aligned_reads):
        max_clipping = self.max_clipping
        primary_alignment = _get_primary_alignment(aligned_reads)
        _5end = _get_longest_5end_alinged_read(aligned_reads, max_clipping)
        seq = alignedread_to_seqitem(primary_alignment)
        segments = None
        if _5end is not None:
            if not _read_is_totally_mapped([_5end], max_clipping):
                if not _5end.is_reverse:
                    qend = _get_qend(_5end)
                else:
                    qend = get_length(seq) - _get_qstart(_5end)
                segments = [(qend, get_length(seq) - 1)]
        if segments is not None:
            _add_trim_segments(segments, seq, kind=OTHER)
        return seq
Example #4
File: trim.py Project: fw1121/ngs_crumbs
    def _do_trim(self, seq):
        'It trims the masked segments of the SeqWrappers.'
        segments = self._matcher.get_matched_segments_for_read(get_name(seq))
        if segments is not None:
            segments = [(segment[0], get_length(seq) - 1)
                        for segment in segments[0]]
            _add_trim_segments(segments, seq, kind=OTHER)
        return seq
Example #5
def count_seqs(seqs):
    'It counts the number of sequences and the total length.'
    num_seqs = 0
    total_len = 0
    for seq in seqs:
        total_len += get_length(seq)
        num_seqs += 1

    return {'num_seqs': num_seqs, 'total_length': total_len}
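count_seqs can be driven with the in-memory wrappers from Example #1; a minimal usage sketch, assuming count_seqs, SeqItem, SeqWrapper and SEQITEM are imported from their ngs_crumbs modules:

seqs = [SeqWrapper(SEQITEM, SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n']),
                   'fasta'),
        SeqWrapper(SEQITEM, SeqItem(name='s2', lines=['>s2\n', 'ACTG\n']),
                   'fasta')]
assert count_seqs(seqs) == {'num_seqs': 2, 'total_length': 12}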
Example #6
File: trim.py Project: fw1121/ngs_crumbs
    def _do_trim(self, seq):
        'It trims the edges of the given seqs.'
        left = self.left
        right = self.right
        segments = [(0, left - 1)] if left else []
        if right:
            seq_len = get_length(seq)
            segments.append((seq_len - right, seq_len - 1))
        _add_trim_segments(segments, seq, kind=OTHER)
        return seq
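The segment arithmetic above can be checked with plain integers; the helper below is a standalone sketch (no ngs_crumbs calls) using the same inclusive, 0-based (start, end) coordinates:

def edge_trim_segments(seq_len, left, right):
    # regions to trim: the first `left` and the last `right` residues
    segments = [(0, left - 1)] if left else []
    if right:
        segments.append((seq_len - right, seq_len - 1))
    return segments

assert edge_trim_segments(10, 3, 2) == [(0, 2), (8, 9)]
assert edge_trim_segments(10, 0, 2) == [(8, 9)]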
Example #7
    def _do_check(self, seq):
        min_ = self.min
        max_ = self.max
        length = (uppercase_length(get_str_seq(seq)) if self.ignore_masked
                  else get_length(seq))

        passed = True
        if min_ is not None and length < min_:
            passed = False
        if max_ is not None and length > max_:
            passed = False
        return passed
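The check above passes a sequence when its length (optionally counted over uppercase residues only) lies inside the configured bounds; a standalone restatement with plain integers (hypothetical helper, not part of ngs_crumbs):

def passes_length_check(length, min_=None, max_=None):
    # a None bound means no limit on that side
    if min_ is not None and length < min_:
        return False
    if max_ is not None and length > max_:
        return False
    return True

assert passes_length_check(50, min_=40, max_=60)
assert not passes_length_check(30, min_=40)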
Example #8
File: trim.py Project: fw1121/ngs_crumbs
def _mask_sequence(seq, segments):
    'It masks the given segments of the sequence'

    if not segments:
        return seq
    segments = merge_overlaping_segments(segments)
    segments = get_all_segments(segments, get_length(seq))
    str_seq = get_str_seq(seq)
    new_seq = ''
    for segment in segments:
        start = segment[0][0]
        end = segment[0][1] + 1
        str_seq_ = str_seq[start:end]

        if segment[1]:
            str_seq_ = str_seq_.lower()
        new_seq += str_seq_
    if seq.kind == SEQRECORD:
        new_seq = Seq(new_seq, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=new_seq)
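_mask_sequence delegates segment merging and complement computation to merge_overlaping_segments and get_all_segments; the effect on the residues amounts to lowercasing the masked regions. A plain-string sketch of that step (hypothetical helper, assuming inclusive 0-based (start, end) segments):

def mask_string(str_seq, segments):
    # lowercase the residues inside each inclusive segment, keep the rest as-is
    chars = list(str_seq)
    for start, end in segments:
        chars[start:end + 1] = [char.lower() for char in chars[start:end + 1]]
    return ''.join(chars)

assert mask_string('ACTGGTAC', [(0, 1), (6, 7)]) == 'acTGGTac'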
Example #9
    def _get_chrom_lengths(self):
        chrom_lens = OrderedDict()
        if self._ref_fhand is None:
            vcf_fhand = gzip.open(self._reader.fhand.name)
            for line in vcf_fhand:
                line = line.strip()
                if line.startswith('#'):
                    continue
                items = line.split()
                chrom = items[0]
                loc = int(items[1])
                if chrom not in chrom_lens:
                    chrom_lens[chrom] = loc
                else:
                    if loc > chrom_lens[chrom]:
                        chrom_lens[chrom] = loc

        else:
            for read in read_seqs([self._ref_fhand]):
                chrom_lens[get_name(read)] = get_length(read)
        return chrom_lens
Example #10
def _get_seq_lengths(fhand):
    return {get_name(seq): get_length(seq) for seq in read_seqs([fhand])}
Example #11
def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    'It calculates some stats for the given seqs.'
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_int_qualities(seq)
        except AttributeError:
            quals = []
        for index, qual in enumerate(quals):
            quals_per_pos.append(index + 1, qual)
        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)
        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    nxs = sorted(nxs) if nxs else []
    for nx in sorted(nxs):
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # aggregate quals
    if quals_per_pos:
        quals = quals_per_pos.aggregated_array
        quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        q30 = quals.count_relative_to_value(30, operator.ge) / quals.count
        q30 *= 100

        q20 = quals.count_relative_to_value(20, operator.ge) / quals.count
        q20 *= 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer distribution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}
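calculate_sequence_stats returns a dict of pre-formatted report strings rather than numbers; a minimal usage sketch (hypothetical file name, assuming read_seqs and calculate_sequence_stats are imported from their ngs_crumbs modules):

with open('reads.fastq') as fhand:
    stats = calculate_sequence_stats(read_seqs([fhand]), nxs=[50, 90])
for section in ('length', 'quality', 'qual_boxplot', 'nucl_freq', 'kmer',
                'dustscore'):
    if stats[section]:
        print(stats[section])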
Example #12
class MatePairSplitter(object):
    'It splits the input sequences with the provided linkers.'

    def __init__(self, linkers=None):
        'The initiator'
        if linkers is None:
            linkers = get_setting('LINKERS')
            linkers = [
                SeqItem(str(i), '>%d\n%s\n' % (i, l))
                for i, l in enumerate(linkers)
            ]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        min_identity = 87.0
        min_len = 13
        filters = [{
            'kind': 'min_length',
            'min_num_residues': min_len,
            'length_in_query': False,
            'filter_match_parts': True
        }, {
            'kind': 'score_threshold',
            'score_key': 'identity',
            'min_score': min_identity
        }]

        matcher = BlasterForFewSubjects(seq_fhand.name,
                                        self.linkers,
                                        program='blastn',
                                        filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            for seq in split_seqs:
                new_seqs.append(seq)
        return new_seqs

    def _split_by_mate_linker(self, seq, (segments, is_partial)):
        'It splits the seqs using segments'

        if not segments:
            return [copy_seq(seq)]

        elongated_match = is_partial
        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                new_seq = slice_seq(seq, segment_end + 1, None)
                return [new_seq]
            elif segment_end == seq_end:
                new_seq = slice_seq(seq, None, segment_start)
                return [new_seq]
            elif segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)
            else:
                new_seq1 = slice_seq(seq, None, segment_start)
                new_seq2 = slice_seq(seq, segment_end + 1, None)
                if elongated_match:
                    name = get_name(seq) + '_pl'
                else:
                    name = get_name(seq)
                new_seq1 = copy_seq(new_seq1, name=name + r'\1')
                new_seq2 = copy_seq(new_seq2, name=name + r'\2')
                return [new_seq1, new_seq2]
        else:
            seqs = []
            counter = 1
            seq_start = 0
            for segment_start, segment_end in segments:
                if segment_start == 0:
                    continue
                new_seq = slice_seq(seq, seq_start, segment_start)
                seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                new_seq = copy_seq(new_seq, name=seq_name)
                seqs.append(new_seq)
                counter += 1
                seq_start = segment_end + 1
            else:
                if segment_end != get_length(seq) + 1:
                    new_seq = slice_seq(seq, segment_end + 1, None)
                    name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                    new_seq = copy_seq(new_seq, name=name)
                    seqs.append(new_seq)
            return seqs
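When a single linker match falls in the middle of the read, the split above reduces to two plain slices around the matched segment; a worked illustration with a plain string (inclusive 0-based match coordinates assumed):

read = 'AAAATTTGGG'                      # 10 residues, linker match at (4, 6)
segment_start, segment_end = 4, 6
left_part = read[:segment_start]         # residues before the linker: 'AAAA'
right_part = read[segment_end + 1:]      # residues after the linker: 'GGG'
assert (left_part, right_part) == ('AAAA', 'GGG')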