예제 #1
0
    def test_rolling_window(self):
        '''We get the items along a rolling window.

        The same expectations are checked for a sliceable series (a str) and
        for a one-shot iterator over it, with and without a step.
        '''
        def windows(items, *args):
            'It returns every window of items joined into a string'
            return [''.join(win) for win in rolling_window(items, *args)]

        digits = '12345'
        # (window size, expected windows). A window longer than the input
        # yields nothing at all.
        cases = [(3, ['123', '234', '345']), (6, []), (5, ['12345'])]

        # with series
        for size, expected in cases:
            assert windows(digits, size) == expected

        # with iterator: a fresh iterator is needed for every call
        for size, expected in cases:
            assert windows(iter(digits), size) == expected

        # with step: window of 4 items advancing 2 items each time
        stepped_cases = [('1234567890', ['1234', '3456', '5678', '7890']),
                         ('123456789', ['1234', '3456', '5678']),
                         ('12345678', ['1234', '3456', '5678']),
                         ('1234567', ['1234', '3456'])]
        for serie, exp in stepped_cases:
            from_serie = windows(serie, 4, 2)
            assert from_serie == exp

            # the iterator code path has to agree with the series one
            from_iterator = windows(iter(serie), 4, 2)
            assert from_serie == from_iterator
예제 #2
0
    def test_rolling_window(self):
        '''We get the items along a rolling window.

        The same expectations are checked for a sliceable series (a str) and
        for a one-shot iterator over it, with and without a step.
        '''
        def windows(items, *args):
            'It returns every window of items joined into a string'
            return [''.join(win) for win in rolling_window(items, *args)]

        digits = '12345'
        # (window size, expected windows). A window longer than the input
        # yields nothing at all.
        cases = [(3, ['123', '234', '345']), (6, []), (5, ['12345'])]

        # with series
        for size, expected in cases:
            assert windows(digits, size) == expected

        # with iterator: a fresh iterator is needed for every call
        for size, expected in cases:
            assert windows(iter(digits), size) == expected

        # with step: window of 4 items advancing 2 items each time
        stepped_cases = [('1234567890', ['1234', '3456', '5678', '7890']),
                         ('123456789', ['1234', '3456', '5678']),
                         ('12345678', ['1234', '3456', '5678']),
                         ('1234567', ['1234', '3456'])]
        for serie, exp in stepped_cases:
            from_serie = windows(serie, 4, 2)
            assert from_serie == exp

            # the iterator code path has to agree with the series one
            from_iterator = windows(iter(serie), 4, 2)
            assert from_serie == from_iterator
예제 #3
0
def _calculate_rawscore(string):
    '''It returns a non-normalized dustscore.

    Every window of three items of the given string is upper-cased and
    counted. Each distinct triplet seen tc times contributes
    tc * (tc - 1) / 2 to the score, so only repeated triplets add to it.
    When there are no triplets at all the result is the integer 0.
    '''
    triplet_counts = Counter()
    for triplet in rolling_window(string, 3):
        # It should do something with non ATCG, but we sacrifice purity for
        # speed. Maybe we should reconsider this
        triplet_counts[triplet.upper()] += 1

    # values() is available in both Python 2 and 3, viewvalues() only exists
    # in Python 2. The summed result is the same.
    return sum(tc * (tc - 1) * 0.5 for tc in triplet_counts.values())
예제 #4
0
def _get_bad_quality_segments(quals, window, threshold, trim_left=True,
                              trim_right=True):
    '''It returns the low quality segments found at the ends of quals.

    The mean quality is calculated for every rolling window. The good region
    is grown from the window with the highest mean towards both ends until a
    window with a mean below the threshold is found. The segments are
    (start, end) tuples with both positions included.

    It returns a single segment that covers everything when no window could
    be built or when even the best window is below the threshold. It returns
    None when there is nothing to trim. When trim_left or trim_right is False
    that side is not scanned and the good region reaches that end of the
    window means.

    The algorithm is similar to the one used by qclip in Staden.
    '''
    def mean_of(values):
        'It returns the float mean, or nan when there are no values'
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    wquals = []
    for win_quals in rolling_window(quals, window):
        wquals.append(mean_of(win_quals))

    last_pos = len(quals) - 1

    # Not even one complete window: everything is considered bad
    if not wquals:
        return [(0, last_pos)]

    # The first window with the highest mean seeds the good region
    index_max, max_val = max(enumerate(wquals), key=itemgetter(1))
    if max_val < threshold:
        return [(0, last_pos)]

    # Walk to the left of the best window until a bad window shows up, the
    # good region starts in the window right after it
    first_good_win = 0
    if trim_left:
        win_index = index_max - 1
        while win_index >= 0:
            if wquals[win_index] < threshold:
                first_good_win = win_index + 1
                break
            win_index -= 1

    # Same walk to the right, the good region ends in the window right
    # before the first bad one
    last_good_win = len(wquals) - 1
    if trim_right:
        for win_index in range(index_max, len(wquals)):
            if wquals[win_index] < threshold:
                last_good_win = win_index - 1
                break

    # From window indexes to positions: a window covers window positions
    # starting at its own index
    right = last_good_win + window - 1

    segments = []
    if first_good_win:
        segments.append((0, first_good_win - 1))
    if right < last_pos:
        segments.append((right + 1, last_pos))
    return segments or None
예제 #5
0
파일: trim.py 프로젝트: milw/seq_crumbs
def _get_bad_quality_segments(quals, window, threshold, trim_left=True,
                              trim_right=True):
    '''It returns the low quality segments found at the ends of quals.

    The mean quality is calculated for every rolling window. The good region
    is grown from the window with the highest mean towards both ends until a
    window with a mean below the threshold is found. The segments are
    (start, end) tuples with both positions included.

    It returns a single segment that covers everything when no window could
    be built or when even the best window is below the threshold. It returns
    None when there is nothing to trim. When trim_left or trim_right is False
    that side is not scanned and the good region reaches that end of the
    window means.

    The algorithm is similar to the one used by qclip in Staden.
    '''
    def mean_of(values):
        'It returns the float mean, or nan when there are no values'
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    wquals = []
    for win_quals in rolling_window(quals, window):
        wquals.append(mean_of(win_quals))

    last_pos = len(quals) - 1

    # Not even one complete window: everything is considered bad
    if not wquals:
        return [(0, last_pos)]

    # The first window with the highest mean seeds the good region
    index_max, max_val = max(enumerate(wquals), key=itemgetter(1))
    if max_val < threshold:
        return [(0, last_pos)]

    # Walk to the left of the best window until a bad window shows up, the
    # good region starts in the window right after it
    first_good_win = 0
    if trim_left:
        win_index = index_max - 1
        while win_index >= 0:
            if wquals[win_index] < threshold:
                first_good_win = win_index + 1
                break
            win_index -= 1

    # Same walk to the right, the good region ends in the window right
    # before the first bad one
    last_good_win = len(wquals) - 1
    if trim_right:
        for win_index in range(index_max, len(wquals)):
            if wquals[win_index] < threshold:
                last_good_win = win_index - 1
                break

    # From window indexes to positions: a window covers window positions
    # starting at its own index
    right = last_good_win + window - 1

    segments = []
    if first_good_win:
        segments.append((0, first_good_win - 1))
    if right < last_pos:
        segments.append((right + 1, last_pos))
    return segments or None
예제 #6
0
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA
    Sequences"
    doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    The sequence is scored in windows of DUST_WINDOWSIZE items taken every
    DUST_WINDOWSTEP items (both read with get_setting). The stretch that
    starts after the last window start is scored as well when it is longer
    than 5 items. The result is the mean of those scores scaled by 100 / 31.

    It returns 0 for sequences of length 3 and None for any other sequence
    with 5 items or fewer.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # The length has to be compared, not the sequence itself. Comparing the
    # str with 5 is always True in Python 2 (and a TypeError in Python 3), so
    # short remainders added bogus scores or divided by zero below.
    remaining_length = len(remaining_seq)
    if remaining_length > 5:
        score = _calculate_rawscore(remaining_seq)
        # The remainder is shorter than a full window, so its score is
        # normalized by its own length
        dustscore = (score / (remaining_length - 3) * (windowsize - 2) /
                     (remaining_length - 2))
        dustscores.append(dustscore)

    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
예제 #7
0
    def test_rolling_window(self):
        '''We get the items along a rolling window.

        The same expectations are checked for a sliceable series (a str) and
        for a one-shot iterator over it.
        '''
        def windows(items, size):
            'It returns every window of items joined into a string'
            return [''.join(win) for win in rolling_window(items, size)]

        digits = '12345'
        # (window size, expected windows). A window longer than the input
        # yields nothing at all.
        cases = [(3, ['123', '234', '345']), (6, []), (5, ['12345'])]

        # with series
        for size, expected in cases:
            assert windows(digits, size) == expected

        # with iterator: a fresh iterator is needed for every call
        for size, expected in cases:
            assert windows(iter(digits), size) == expected
예제 #8
0
 def count_seq(self, serie):
     '''It adds the kmers of the given iterable/serie to the counter.

     Every window of self._kmer_size items yielded by rolling_window
     increases its entry in self._counter by one.
     '''
     kmer_size = self._kmer_size
     for window in rolling_window(serie, kmer_size):
         self._counter[window] += 1
예제 #9
0
 def count_seq(self, serie):
     '''It adds the kmers of the given iterable/serie to the counter.

     Every window of self._kmer_size items yielded by rolling_window
     increases its entry in self._counter by one.
     '''
     kmer_size = self._kmer_size
     for window in rolling_window(serie, kmer_size):
         self._counter[window] += 1