def find_in_index_range(start_index):
    """Return the index of the next full occurrence of the subsequence.

    Scans forward from ``start_index``: each position where ``first_item``
    is found (via ``sequence.index``, bounded by ``first_item_last_index``)
    is checked item-by-item against the rest of ``subsequence``.

    Relies on ``sequence``, ``subsequence``, ``first_item`` and
    ``first_item_last_index`` being available from the surrounding scope.

    Returns -1 when no further occurrence exists.
    """
    needle_len = len(subsequence)
    search_from = start_index
    while True:
        try:
            anchor = sequence.index(first_item, search_from,
                                    first_item_last_index)
        except ValueError:
            # the first item does not appear again in the allowed range
            return -1
        # the next attempt (if needed) resumes just after this anchor
        search_from = anchor + 1

        mismatch_found = any(
            sequence[anchor + offset] != subsequence[offset]
            for offset in xrange(1, needle_len)
        )
        if not mismatch_found:
            return anchor
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Yield near-matches of subsequence in sequence, anchored on n-grams.

    The subsequence is walked in steps of
    ``len(subsequence) // (max_l_dist + 1)`` items; each such n-gram is
    looked up with ``search_exact`` and every index it produces is used as
    an anchor.  ``_expand`` is then asked to extend the anchor to the right
    and, with whatever distance budget remains, to the left.  An anchor is
    dropped as soon as either expansion reports a distance of ``None``.

    Each yielded ``Match`` also carries the matched slice of ``sequence``.

    Raises ValueError (when iteration starts) if the n-gram length is zero.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    for anchor in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        anchor_end = anchor + ngram_len

        # the parts of the subsequence on either side of this n-gram; the
        # left part is reversed so that it can be expanded outwards
        prefix_reversed = subsequence[:anchor][::-1]
        suffix = subsequence[anchor_end:]

        # only look for the n-gram where a whole match could still fit
        window_start = max(0, anchor - max_l_dist)
        window_end = min(
            seq_len,
            seq_len - subseq_len + anchor_end + max_l_dist,
        )
        ngram = subsequence[anchor:anchor_end]

        for hit in search_exact(ngram, sequence, window_start, window_end):
            # expand to the right of the n-gram first
            right_from = hit + ngram_len
            right_to = hit - anchor + subseq_len + max_l_dist
            dist_right, right_size = _expand(
                suffix, sequence[right_from:right_to], max_l_dist)
            if dist_right is None:
                continue

            # then expand to the left with the remaining distance budget
            left_budget = max_l_dist - dist_right
            left_from = max(0, hit - anchor - left_budget)
            dist_left, left_size = _expand(
                prefix_reversed, sequence[left_from:hit][::-1], left_budget)
            if dist_left is None:
                continue

            total_dist = dist_left + dist_right
            assert total_dist <= max_l_dist

            match_start = hit - left_size
            match_end = hit + ngram_len + right_size
            yield Match(match_start, match_end, total_dist,
                        matched=sequence[match_start:match_end])
def find_near_matches_generic_ngrams(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence using n-grams.

    Near-matches are parts of the sequence which, relative to the
    subsequence, stay within these limits:

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Each n-gram of the subsequence is located with ``search_exact``; the
    region of the sequence around every hit is then handed to
    ``find_near_matches_generic_linear_programming`` and the resulting
    matches are shifted back into coordinates of the full sequence.

    Raises ValueError (when iteration starts) if the subsequence is empty
    or the n-gram length is zero.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_l_dist = search_params.max_l_dist
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError(
            'the subsequence length must be greater than max_l_dist')

    for anchor in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        anchor_end = anchor + ngram_len
        window_start = max(0, anchor - max_l_dist)
        window_end = min(
            seq_len,
            seq_len - subseq_len + anchor_end + max_l_dist,
        )
        ngram = subsequence[anchor:anchor_end]

        for hit in search_exact(ngram, sequence, window_start, window_end):
            # where the subsequence would start if it aligned exactly
            aligned_start = hit - anchor
            # offset of the searched region within the full sequence
            offset = max(0, aligned_start - max_l_dist)
            region = sequence[offset:aligned_start + subseq_len + max_l_dist]

            region_matches = find_near_matches_generic_linear_programming(
                subsequence, region, search_params)
            for match in region_matches:
                yield attr.evolve(
                    match,
                    start=match.start + offset,
                    end=match.end + offset,
                )
def find_near_matches_levenshtein_ngrams(subsequence, sequence, max_l_dist):
    """Yield near-matches of subsequence in sequence, anchored on n-grams.

    The subsequence is walked in steps of
    ``len(subsequence) // (max_l_dist + 1)`` items.  Every index that
    ``search_exact`` produces for such an n-gram serves as an anchor, which
    ``_expand`` tries to extend rightwards and then leftwards (the left
    expansion only gets the distance budget left over by the right one).
    Anchors for which either expansion reports ``None`` are skipped.

    Raises ValueError (when iteration starts) if the n-gram length is zero.
    """
    subseq_len = len(subsequence)
    seq_len = len(sequence)

    ngram_len = subseq_len // (max_l_dist + 1)
    if ngram_len == 0:
        raise ValueError('the subsequence length must be greater than max_l_dist')

    for anchor in xrange(0, subseq_len - ngram_len + 1, ngram_len):
        anchor_end = anchor + ngram_len

        # left part is reversed so that expansion proceeds outwards
        prefix_reversed = subsequence[:anchor][::-1]
        suffix = subsequence[anchor_end:]

        # restrict the exact search to where a whole match could fit
        window_start = max(0, anchor - max_l_dist)
        window_end = min(
            seq_len,
            seq_len - subseq_len + anchor_end + max_l_dist,
        )
        ngram = subsequence[anchor:anchor_end]

        for hit in search_exact(ngram, sequence, window_start, window_end):
            # right-hand expansion
            right_to = hit - anchor + subseq_len + max_l_dist
            dist_right, right_size = _expand(
                suffix, sequence[hit + ngram_len:right_to], max_l_dist)
            if dist_right is None:
                continue

            # left-hand expansion, limited to the remaining budget
            left_budget = max_l_dist - dist_right
            left_from = max(0, hit - anchor - left_budget)
            dist_left, left_size = _expand(
                prefix_reversed, sequence[left_from:hit][::-1], left_budget)
            if dist_left is None:
                continue

            total_dist = dist_left + dist_right
            assert total_dist <= max_l_dist

            yield Match(
                start=hit - left_size,
                end=hit + ngram_len + right_size,
                dist=total_dist,
            )
def find_near_matches_levenshtein_linear_programming(subsequence, sequence, max_l_dist):
    """Yield near-matches of subsequence in sequence within max_l_dist.

    The sequence is scanned once, left to right, while a list of partial
    match candidates is maintained.  Each candidate records where it
    started (``start``), which subsequence item it expects next
    (``subseq_index``) and the distance accumulated so far (``dist``).

    Yielded matches are built by ``make_match`` and include the matched
    slice of ``sequence``.  The same region may be yielded more than once,
    with different distances; no de-duplication is done here.

    Raises ValueError (when iteration starts) if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    subseq_len = len(subsequence)

    def make_match(start, end, dist):
        # build a Match which also carries the matched part of the sequence
        return Match(start, end, dist, matched=sequence[start:end])

    # if the whole subsequence may be skipped, an empty match with distance
    # subseq_len is yielded at every position and nothing else is searched
    if max_l_dist >= subseq_len:
        for index in xrange(len(sequence) + 1):
            yield make_match(index, index, subseq_len)
        return

    # optimization: prepare some often used things in advance
    # NOTE(review): presumably maps each char to its first index in the
    # subsequence (limited by max_l_dist) -- confirm against the helper
    char2first_subseq_index = make_char2first_subseq_index(subsequence, max_l_dist)

    candidates = []
    for index, char in enumerate(sequence):
        new_candidates = []

        # possibly start a new candidate at this sequence char; the looked-up
        # index is used as the new candidate's initial distance
        idx_in_subseq = char2first_subseq_index.get(char, None)
        if idx_in_subseq is not None:
            if idx_in_subseq + 1 == subseq_len:
                yield make_match(index, index + 1, idx_in_subseq)
            else:
                new_candidates.append(Candidate(index, idx_in_subseq + 1, idx_in_subseq))

        for cand in candidates:
            # if this sequence char is the candidate's next expected char
            if subsequence[cand.subseq_index] == char:
                # if reached the end of the subsequence, return a match
                if cand.subseq_index + 1 == subseq_len:
                    yield make_match(cand.start, index + 1, cand.dist)
                # otherwise, update the candidate's subseq_index and keep it
                else:
                    new_candidates.append(cand._replace(
                        subseq_index=cand.subseq_index + 1,
                    ))

            # if this sequence char is *not* the candidate's next expected char
            else:
                # we can try skipping a sequence or sub-sequence char (or both),
                # unless this candidate has already skipped the maximum allowed
                # number of characters
                if cand.dist == max_l_dist:
                    continue

                # add a candidate skipping a sequence char
                new_candidates.append(cand._replace(dist=cand.dist + 1))

                if index + 1 < len(sequence) and cand.subseq_index + 1 < subseq_len:
                    # add a candidate skipping both a sequence char and a
                    # subsequence char
                    new_candidates.append(cand._replace(
                        dist=cand.dist + 1,
                        subseq_index=cand.subseq_index + 1,
                    ))

                # try skipping subsequence chars
                for n_skipped in xrange(1, max_l_dist - cand.dist + 1):
                    # if skipping n_skipped sub-sequence chars reaches the end
                    # of the sub-sequence, yield a match
                    if cand.subseq_index + n_skipped == subseq_len:
                        yield make_match(cand.start, index + 1, cand.dist + n_skipped)
                        break
                    # otherwise, if skipping n_skipped sub-sequence chars
                    # reaches a sub-sequence char identical to this sequence
                    # char, add a candidate skipping n_skipped sub-sequence
                    # chars
                    elif subsequence[cand.subseq_index + n_skipped] == char:
                        # if this is the last char of the sub-sequence, yield
                        # a match
                        if cand.subseq_index + n_skipped + 1 == subseq_len:
                            yield make_match(cand.start, index + 1, cand.dist + n_skipped)
                        # otherwise add a candidate skipping n_skipped
                        # subsequence chars
                        else:
                            new_candidates.append(cand._replace(
                                dist=cand.dist + n_skipped,
                                subseq_index=cand.subseq_index + 1 + n_skipped,
                            ))
                        break
                # note: if the above loop ends without a break, that means that
                # no candidate could be added / yielded by skipping sub-sequence
                # chars

        candidates = new_candidates

    # the sequence has ended: a remaining candidate still yields a match if
    # skipping the rest of the subsequence stays within max_l_dist
    for cand in candidates:
        dist = cand.dist + subseq_len - cand.subseq_index
        if dist <= max_l_dist:
            yield make_match(cand.start, len(sequence), dist)
def _find_near_matches_generic_linear_programming(subsequence, sequence, search_params):
    """search for near-matches of subsequence in sequence

    This searches for near-matches, where the nearly-matching parts of the
    sequence must meet the following limitations (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * and the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    The four limits are read from ``search_params.unpacked``, in the order
    (max_substitutions, max_insertions, max_deletions, max_l_dist).

    The sequence is scanned once, left to right.  A new candidate is started
    at every sequence index, and each candidate tracks its own counts of
    substitutions, insertions and deletions as well as its total distance.
    Matches are yielded as soon as they are found, so the same region may
    be yielded more than once; no de-duplication is done here.

    Raises ValueError (when iteration starts) if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    max_substitutions, max_insertions, max_deletions, max_l_dist = search_params.unpacked

    # optimization: prepare some often used things in advance
    subseq_len = len(subsequence)

    candidates = []
    for index, char in enumerate(sequence):
        # start a fresh candidate at this index with all counters at zero
        # NOTE(review): field order of GenericSearchCandidate is not visible
        # here; the first field looks like `start` -- confirm
        candidates.append(GenericSearchCandidate(index, 0, 0, 0, 0, 0))
        new_candidates = []
        for cand in candidates:
            # if this sequence char is the candidate's next expected char
            if char == subsequence[cand.subseq_index]:
                # if reached the end of the subsequence, return a match
                if cand.subseq_index + 1 == subseq_len:
                    yield Match(cand.start, index + 1, cand.l_dist)
                # otherwise, update the candidate's subseq_index and keep it
                else:
                    new_candidates.append(cand._replace(
                        subseq_index=cand.subseq_index + 1,
                    ))

            # if this sequence char is *not* the candidate's next expected char
            else:
                # we can try skipping a sequence or sub-sequence char (or both),
                # unless this candidate has already skipped the maximum allowed
                # number of characters
                if cand.l_dist == max_l_dist:
                    continue

                if cand.n_ins < max_insertions:
                    # add a candidate skipping a sequence char
                    new_candidates.append(cand._replace(
                        n_ins=cand.n_ins + 1,
                        l_dist=cand.l_dist + 1,
                    ))

                if cand.subseq_index + 1 < subseq_len:
                    if cand.n_subs < max_substitutions:
                        # add a candidate skipping both a sequence char and a
                        # subsequence char
                        new_candidates.append(cand._replace(
                            n_subs=cand.n_subs + 1,
                            subseq_index=cand.subseq_index + 1,
                            l_dist=cand.l_dist + 1,
                        ))
                    elif cand.n_dels < max_deletions and cand.n_ins < max_insertions:
                        # add a candidate skipping both a sequence char and a
                        # subsequence char
                        # (no substitutions left: this is recorded as one
                        # insertion plus one deletion, but l_dist grows by 1)
                        new_candidates.append(cand._replace(
                            n_ins=cand.n_ins + 1,
                            n_dels=cand.n_dels + 1,
                            subseq_index=cand.subseq_index + 1,
                            l_dist=cand.l_dist + 1,
                        ))
                else:
                    # cand.subseq_index == _subseq_len - 1
                    if (
                            cand.n_subs < max_substitutions or
                            (
                                cand.n_dels < max_deletions and
                                cand.n_ins < max_insertions
                            )
                    ):
                        yield Match(cand.start, index + 1, cand.l_dist + 1)

                # try skipping subsequence chars
                # (bounded by both the remaining deletions and the remaining
                # total distance)
                for n_skipped in xrange(1, min(max_deletions - cand.n_dels, max_l_dist - cand.l_dist) + 1):
                    # if skipping n_dels sub-sequence chars reaches the end
                    # of the sub-sequence, yield a match
                    if cand.subseq_index + n_skipped == subseq_len:
                        yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
                        break
                    # otherwise, if skipping n_skipped sub-sequence chars
                    # reaches a sub-sequence char identical to this sequence
                    # char ...
                    elif subsequence[cand.subseq_index + n_skipped] == char:
                        # if this is the last char of the sub-sequence, yield
                        # a match
                        if cand.subseq_index + n_skipped + 1 == subseq_len:
                            yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
                        # otherwise add a candidate skipping n_skipped
                        # subsequence chars
                        else:
                            new_candidates.append(cand._replace(
                                n_dels=cand.n_dels + n_skipped,
                                subseq_index=cand.subseq_index + 1 + n_skipped,
                                l_dist=cand.l_dist + n_skipped,
                            ))
                        break
                # note: if the above loop ends without a break, that means that
                # no candidate could be added / yielded by skipping sub-sequence
                # chars

        candidates = new_candidates

    # the sequence has ended: a remaining candidate still yields a match if
    # deleting the rest of the subsequence stays within both limits
    # (`index` is only read when candidates is non-empty, i.e. after at least
    # one iteration of the loop above)
    for cand in candidates:
        # note: index + 1 == length(sequence)
        n_skipped = subseq_len - cand.subseq_index
        if cand.n_dels + n_skipped <= max_deletions and \
           cand.l_dist + n_skipped <= max_l_dist:
            yield Match(cand.start, index + 1, cand.l_dist + n_skipped)
def find_near_matches_levenshtein_linear_programming(subsequence, sequence, max_l_dist):
    """Yield near-matches of subsequence in sequence within max_l_dist.

    The sequence is scanned once, left to right, while a list of partial
    match candidates is maintained.  Each candidate records where it
    started (``start``), which subsequence item it expects next
    (``subseq_index``) and the distance accumulated so far (``dist``).

    Matches are yielded as ``Match(start, end, dist)``.  The same region may
    be yielded more than once, with different distances; no de-duplication
    is done here.

    Raises ValueError (when iteration starts) if the subsequence is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    subseq_len = len(subsequence)

    # if the whole subsequence may be skipped, an empty match with distance
    # subseq_len is yielded at every position and nothing else is searched
    if max_l_dist >= subseq_len:
        for index in xrange(len(sequence) + 1):
            yield Match(index, index, subseq_len)
        return

    # optimization: prepare some often used things in advance
    # NOTE(review): presumably maps each char to its first index in the
    # subsequence (limited by max_l_dist) -- confirm against the helper
    char2first_subseq_index = make_char2first_subseq_index(subsequence, max_l_dist)

    candidates = []
    for index, char in enumerate(sequence):
        new_candidates = []

        # possibly start a new candidate at this sequence char; the looked-up
        # index is used as the new candidate's initial distance
        idx_in_subseq = char2first_subseq_index.get(char, None)
        if idx_in_subseq is not None:
            if idx_in_subseq + 1 == subseq_len:
                yield Match(index, index + 1, idx_in_subseq)
            else:
                new_candidates.append(Candidate(index, idx_in_subseq + 1, idx_in_subseq))

        for cand in candidates:
            # if this sequence char is the candidate's next expected char
            if subsequence[cand.subseq_index] == char:
                # if reached the end of the subsequence, return a match
                if cand.subseq_index + 1 == subseq_len:
                    yield Match(cand.start, index + 1, cand.dist)
                # otherwise, update the candidate's subseq_index and keep it
                else:
                    new_candidates.append(cand._replace(
                        subseq_index=cand.subseq_index + 1,
                    ))

            # if this sequence char is *not* the candidate's next expected char
            else:
                # we can try skipping a sequence or sub-sequence char (or both),
                # unless this candidate has already skipped the maximum allowed
                # number of characters
                if cand.dist == max_l_dist:
                    continue

                # add a candidate skipping a sequence char
                new_candidates.append(cand._replace(dist=cand.dist + 1))

                if index + 1 < len(sequence) and cand.subseq_index + 1 < subseq_len:
                    # add a candidate skipping both a sequence char and a
                    # subsequence char
                    new_candidates.append(cand._replace(
                        dist=cand.dist + 1,
                        subseq_index=cand.subseq_index + 1,
                    ))

                # try skipping subsequence chars
                for n_skipped in xrange(1, max_l_dist - cand.dist + 1):
                    # if skipping n_skipped sub-sequence chars reaches the end
                    # of the sub-sequence, yield a match
                    if cand.subseq_index + n_skipped == subseq_len:
                        yield Match(cand.start, index + 1, cand.dist + n_skipped)
                        break
                    # otherwise, if skipping n_skipped sub-sequence chars
                    # reaches a sub-sequence char identical to this sequence
                    # char, add a candidate skipping n_skipped sub-sequence
                    # chars
                    elif subsequence[cand.subseq_index + n_skipped] == char:
                        # if this is the last char of the sub-sequence, yield
                        # a match
                        if cand.subseq_index + n_skipped + 1 == subseq_len:
                            yield Match(cand.start, index + 1, cand.dist + n_skipped)
                        # otherwise add a candidate skipping n_skipped
                        # subsequence chars
                        else:
                            new_candidates.append(cand._replace(
                                dist=cand.dist + n_skipped,
                                subseq_index=cand.subseq_index + 1 + n_skipped,
                            ))
                        break
                # note: if the above loop ends without a break, that means that
                # no candidate could be added / yielded by skipping sub-sequence
                # chars

        candidates = new_candidates

    # the sequence has ended: a remaining candidate still yields a match if
    # skipping the rest of the subsequence stays within max_l_dist
    for cand in candidates:
        dist = cand.dist + subseq_len - cand.subseq_index
        if dist <= max_l_dist:
            yield Match(cand.start, len(sequence), dist)