def test_rolling_window(self):
    'Rolling windows are produced for sequences, iterators and custom steps'

    def joined_windows(items, *window_args):
        # Every window is collapsed into a str so it can be compared easily
        return [''.join(window)
                for window in rolling_window(items, *window_args)]

    digits = '12345'

    # The same checks are run first with the plain series and afterwards
    # with a fresh iterator for each call
    for prepare in (lambda text: text, iter):
        assert joined_windows(prepare(digits), 3) == ['123', '234', '345']
        # a window longer than the input yields nothing
        assert not joined_windows(prepare(digits), 6)
        assert joined_windows(prepare(digits), 5) == ['12345']

    # with step
    step_cases = [
        ('1234567890', ['1234', '3456', '5678', '7890']),
        ('123456789', ['1234', '3456', '5678']),
        ('12345678', ['1234', '3456', '5678']),
        ('1234567', ['1234', '3456']),
    ]
    for text, expected_windows in step_cases:
        from_series = joined_windows(text, 4, 2)
        assert from_series == expected_windows
        # the iterator has to give the same windows as the series
        from_iterator = joined_windows(iter(text), 4, 2)
        assert from_series == from_iterator
def _calculate_rawscore(string):
    '''It returns the raw (non-normalized) dust score of the given string.

    Every window of three consecutive items is upper-cased and counted, and
    each distinct triplet seen n times adds n * (n - 1) / 2 to the score.
    '''
    # Non ATCG letters are counted like any other letter: purity is traded
    # for speed here, and that decision may deserve to be reconsidered.
    occurrences = Counter(triplet.upper()
                          for triplet in rolling_window(string, 3))
    return sum(times * (times - 1) * 0.5
               for times in occurrences.viewvalues())
def _get_bad_quality_segments(quals, window, threshold, trim_left=True,
                              trim_right=True):
    '''It returns the low quality regions that flank the best quality one.

    The mean quality of every rolling window of size window is calculated.
    Starting at the window with the highest mean, the good region is grown to
    each side until a window with a mean below the threshold is found. The
    regions left outside are returned as a list of (start, end) tuples, both
    positions included.

    When no window could be built, or when the best window does not reach the
    threshold, the whole sequence is returned as a single segment. When there
    is nothing to trim None is returned.

    If trim_left or trim_right is False no search is done on that side and
    the good region reaches the first or the last window respectively.

    The algorithm is similar to the one used by qclip in Staden.
    '''

    def window_mean(values):
        # an empty window has no mean
        if len(values) > 0:
            return float(sum(values)) / len(values)
        return float('nan')

    means = [window_mean(values) for values in rolling_window(quals, window)]
    last_pos = len(quals) - 1

    if not means:
        return [(0, last_pos)]

    best_window, best_mean = max(enumerate(means), key=itemgetter(1))
    if best_mean < threshold:
        return [(0, last_pos)]

    # Index of the first window that belongs to the good region
    first_good = 0
    if trim_left:
        for index in range(best_window - 1, -1, -1):
            if means[index] < threshold:
                first_good = index + 1
                break

    # Index of the last window that belongs to the good region
    last_good = len(means) - 1
    if trim_right:
        for index in range(best_window, len(means)):
            if means[index] < threshold:
                last_good = index - 1
                break

    # A window index is the position of its first item, so the good region
    # ends at the last item covered by the last good window
    good_end = last_good + window - 1

    segments = []
    if first_good:
        segments.append((0, first_good - 1))
    if good_end < last_pos:
        segments.append((good_end + 1, last_pos))

    return segments if segments else None
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA
    Sequences"
    doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    The sequence is scored in windows of DUST_WINDOWSIZE letters taken every
    DUST_WINDOWSTEP letters. The letters left after the start of the last
    window are scored as an extra, shorter window only if there are more
    than 5 of them. The result is the mean of all the window scores rescaled
    so the maximum is 100.

    It returns 0 for a sequence of exactly 3 letters and None for any other
    sequence with 5 letters or fewer.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # The length of the tail has to be compared, not the tail itself.
    # Comparing the str with 5 is always True in Python 2 (and a TypeError
    # in Python 3), so too short tails were scored and that ended up in a
    # ZeroDivisionError for tails of 2 or 3 letters.
    remaining_length = len(remaining_seq)
    if remaining_length > 5:
        score = _calculate_rawscore(remaining_seq)
        # The score of the shorter window is scaled to make it comparable
        # with the scores of the full size windows
        dustscore = (score / (remaining_length - 3) * (windowsize - 2) /
                     (remaining_length - 2))
        dustscores.append(dustscore)

    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
def test_rolling_window(self):
    'Rolling windows are produced for sequences and for iterators'

    def joined_windows(items, size):
        # Every window is collapsed into a str so it can be compared easily
        return [''.join(window) for window in rolling_window(items, size)]

    digits = '12345'

    # The same checks are run first with the plain series and afterwards
    # with a fresh iterator for each call
    for prepare in (lambda text: text, iter):
        assert joined_windows(prepare(digits), 3) == ['123', '234', '345']
        # a window longer than the input yields nothing
        assert not joined_windows(prepare(digits), 6)
        assert joined_windows(prepare(digits), 5) == ['12345']
def count_seq(self, serie):
    '''It counts the kmers found in the given iterable/serie.

    Every rolling window of self._kmer_size items adds one to its entry in
    self._counter.
    '''
    kmers = rolling_window(serie, self._kmer_size)
    counts = self._counter
    for kmer in kmers:
        counts[kmer] += 1