def _find_near_matches_substitutions_ngrams(subsequence, sequence, max_substitutions): subseq_len = len(subsequence) seq_len = len(sequence) def make_match(start, end, dist): return Match(start, end, dist, matched=sequence[start:end]) ngram_len = subseq_len // (max_substitutions + 1) if ngram_len == 0: raise ValueError( "The subsequence's length must be greater than max_substitutions!") for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len): ngram_end = ngram_start + ngram_len subseq_before = subsequence[:ngram_start] subseq_after = subsequence[ngram_end:] for index in search_exact( subsequence[ngram_start:ngram_end], sequence, ngram_start, seq_len - (subseq_len - ngram_end), ): n_substitutions = 0 seq_before = sequence[index - ngram_start:index] if subseq_before != seq_before: n_substitutions += count_differences_with_maximum( seq_before, subseq_before, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len] if subseq_after != seq_after: if n_substitutions == max_substitutions: continue n_substitutions += count_differences_with_maximum( seq_after, subseq_after, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue yield make_match( start=index - ngram_start, end=index - ngram_start + subseq_len, dist=n_substitutions, )
def find_near_matches_substitutions_ngrams(subsequence, sequence, max_substitutions):
    """Find near-matches of `subsequence` in `sequence`, allowing only
    substitutions.

    When neither argument is a `text_type` instance, the bytes-like
    implementation `_subs_only_fnm_ngram_byteslike` is tried first.  If it
    raises TypeError, the call falls back to
    `py_find_near_matches_substitutions_ngrams`, which is also used whenever
    either argument is text.

    On the bytes-like path, the returned start indexes are turned into
    `Match` objects (start, end, dist, and the matched slice of `sequence`),
    grouped with `group_matches`, and one match per group is kept via
    `get_best_match_in_group`.

    Returns a list on the bytes-like path; otherwise whatever
    `py_find_near_matches_substitutions_ngrams` returns.
    """
    if not (
        isinstance(subsequence, text_type) or
        isinstance(sequence, text_type)
    ):
        try:
            results = _subs_only_fnm_ngram_byteslike(
                subsequence, sequence, max_substitutions)
        except TypeError:
            # Deliberate best-effort: the bytes-like implementation rejected
            # these argument types, so use the generic implementation below.
            pass
        else:
            subseq_len = len(subsequence)

            def make_match(index):
                end = index + subseq_len
                matched = sequence[index:end]
                # NOTE(review): presumably counts differing positions,
                # giving up once the cap is reached -- confirm against the
                # helper's definition.
                dist = count_differences_with_maximum(
                    matched, subsequence, max_substitutions + 1)
                return Match(index, end, dist, matched=matched)

            matches = [make_match(index) for index in results]
            return [
                get_best_match_in_group(group)
                for group in group_matches(matches)
            ]

    return py_find_near_matches_substitutions_ngrams(
        subsequence, sequence, max_substitutions)
def _find_near_matches_substitutions_ngrams(subsequence, sequence, max_substitutions): subseq_len = len(subsequence) seq_len = len(sequence) ngram_len = subseq_len // (max_substitutions + 1) if ngram_len == 0: raise ValueError( "The subsequence's length must be greater than max_substitutions!" ) for ngram_start in range(0, len(subsequence) - ngram_len + 1, ngram_len): ngram_end = ngram_start + ngram_len subseq_before = subsequence[:ngram_start] subseq_after = subsequence[ngram_end:] for index in search_exact( subsequence[ngram_start:ngram_end], sequence, ngram_start, seq_len - (subseq_len - ngram_end), ): n_substitutions = 0 seq_before = sequence[index - ngram_start:index] if subseq_before != seq_before: n_substitutions += count_differences_with_maximum( seq_before, subseq_before, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue seq_after = sequence[index + ngram_len:index - ngram_start + subseq_len] if subseq_after != seq_after: if n_substitutions == max_substitutions: continue n_substitutions += count_differences_with_maximum( seq_after, subseq_after, max_substitutions - n_substitutions + 1) if n_substitutions > max_substitutions: continue yield Match( start=index - ngram_start, end=index - ngram_start + subseq_len, dist=n_substitutions, )
def search(self, subsequence, sequence, max_subs):
    """Run `fnm_subs_lp_byteslike` and wrap each returned index in a `Match`.

    Every index is taken as the start of a window of ``len(subsequence)``
    items in `sequence`.  The match distance is obtained from
    `count_differences_with_maximum` with a cap of ``max_subs + 1``, and
    the window itself is stored as `matched`.

    Returns the list of `Match` objects in the order the indexes were
    produced.  `self` is not used.
    """
    starts = fnm_subs_lp_byteslike(subsequence, sequence, max_subs)
    width = len(subsequence)

    found = []
    for start in starts:
        stop = start + width
        window = sequence[start:stop]
        distance = count_differences_with_maximum(
            window, subsequence, max_subs + 1)
        found.append(Match(start, stop, distance, matched=window))
    return found
def search(self, subsequence, sequence, max_subs):
    """Run `fnm_subs_lp_byteslike` and wrap each returned index in a `Match`.

    Each index is treated as the start of a window of ``len(subsequence)``
    items in `sequence`.  The distance comes from
    `count_differences_with_maximum` (capped at ``max_subs + 1``) and the
    window is recorded as the match's `matched` value.

    Returns a list of `Match` objects, one per returned index.  `self` is
    not used.
    """
    results = fnm_subs_lp_byteslike(subsequence, sequence, max_subs)
    subseq_len = len(subsequence)

    def make_match(index):
        end = index + subseq_len
        matched = sequence[index:end]
        dist = count_differences_with_maximum(
            matched, subsequence, max_subs + 1)
        return Match(index, end, dist, matched=matched)

    return [make_match(index) for index in results]
def search(self, subsequence, sequence, max_subs):
    """Run `fnm_subs_ngrams_byteslike` and return one `Match` per group.

    Every returned index is taken as the start of a window of
    ``len(subsequence)`` items in `sequence`.  A `Match` is built for it
    with the distance from `count_differences_with_maximum` (cap of
    ``max_subs + 1``) and the window as `matched`.  The matches are then
    grouped by `group_matches`, and `get_best_match_in_group` picks one
    match from each group.

    Returns the list of picked matches.  `self` is not used.
    """
    starts = fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)
    width = len(subsequence)

    candidates = []
    for start in starts:
        stop = start + width
        window = sequence[start:stop]
        distance = count_differences_with_maximum(
            window, subsequence, max_subs + 1)
        candidates.append(Match(start, stop, distance, matched=window))

    best = []
    for group in group_matches(candidates):
        best.append(get_best_match_in_group(group))
    return best
def search(self, subsequence, sequence, max_subs):
    """Run `fnm_subs_ngrams_byteslike` and return one `Match` per group.

    Each returned index is treated as the start of a window of
    ``len(subsequence)`` items in `sequence`.  The distance comes from
    `count_differences_with_maximum` (capped at ``max_subs + 1``) and the
    window is recorded as the match's `matched` value.  Matches are grouped
    with `group_matches`, and `get_best_match_in_group` selects one match
    from every group.

    Returns the list of selected matches.  `self` is not used.
    """
    results = fnm_subs_ngrams_byteslike(subsequence, sequence, max_subs)
    subseq_len = len(subsequence)

    def make_match(index):
        end = index + subseq_len
        matched = sequence[index:end]
        dist = count_differences_with_maximum(
            matched, subsequence, max_subs + 1)
        return Match(index, end, dist, matched=matched)

    matches = [make_match(index) for index in results]
    return [
        get_best_match_in_group(group)
        for group in group_matches(matches)
    ]
def count_diffs(self, seq1, seq2, max_diffs):
    """Forward the arguments unchanged to `count_differences_with_maximum`
    and return its result.  `self` is not used.
    """
    n_diffs = count_differences_with_maximum(seq1, seq2, max_diffs)
    return n_diffs