예제 #1
0
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    """Yield substitution-only near-matches of subsequence in sequence.

    The subsequence is split into consecutive n-grams of length
    ``len(subsequence) // (max_substitutions + 1)``.  Each exact
    occurrence of an n-gram reported by ``search_exact`` anchors one
    candidate window of ``len(subsequence)`` items.  The parts of that
    window before and after the n-gram are compared against the matching
    parts of the subsequence, and the window is yielded as a ``Match``
    (with ``matched`` set to the window's slice of ``sequence``) when no
    more than ``max_substitutions`` differences were counted.

    No de-duplication happens here: a window anchored by several n-grams
    is yielded once per anchoring n-gram.

    This is a generator, so the ``ValueError`` for an n-gram length of
    zero is raised when iteration starts, not when the function is called.
    """
    pattern_len = len(subsequence)
    text_len = len(sequence)

    chunk_len = pattern_len // (max_substitutions + 1)
    if not chunk_len:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!")

    for chunk_start in range(0, pattern_len - chunk_len + 1, chunk_len):
        chunk_end = chunk_start + chunk_len
        chunk = subsequence[chunk_start:chunk_end]
        pattern_head = subsequence[:chunk_start]
        pattern_tail = subsequence[chunk_end:]

        # Bounds handed to search_exact: presumably start/end indexes that
        # leave room for the rest of the subsequence on both sides of a
        # hit -- confirm against search_exact.
        search_from = chunk_start
        search_until = text_len - (pattern_len - chunk_end)

        for hit in search_exact(chunk, sequence, search_from, search_until):
            window_start = hit - chunk_start
            window_end = window_start + pattern_len

            mismatches = 0

            # Compare what precedes the n-gram inside the window.
            window_head = sequence[window_start:hit]
            if pattern_head != window_head:
                # Third argument is the remaining budget plus one --
                # presumably the count at which the helper may stop.
                mismatches += count_differences_with_maximum(
                    window_head, pattern_head,
                    max_substitutions - mismatches + 1)
                if mismatches > max_substitutions:
                    continue

            # Compare what follows the n-gram inside the window.
            window_tail = sequence[hit + chunk_len:window_end]
            if pattern_tail != window_tail:
                # The tail differs, so with the budget already used up
                # this window cannot qualify.
                if mismatches == max_substitutions:
                    continue
                mismatches += count_differences_with_maximum(
                    window_tail, pattern_tail,
                    max_substitutions - mismatches + 1)
                if mismatches > max_substitutions:
                    continue

            yield Match(
                window_start,
                window_end,
                mismatches,
                matched=sequence[window_start:window_end],
            )
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        """Find substitution-only near-matches, preferring the bytes-like helper.

        When neither argument is an instance of ``text_type``, the start
        indexes returned by ``_subs_only_fnm_ngram_byteslike`` are turned
        into ``Match`` objects, grouped with ``group_matches`` and reduced
        to one match per group with ``get_best_match_in_group``.

        If either argument is text, or the helper call raises
        ``TypeError``, the work is delegated to
        ``py_find_near_matches_substitutions_ngrams`` instead.
        """
        involves_text = (
            isinstance(subsequence, text_type) or
            isinstance(sequence, text_type)
        )
        if not involves_text:
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # Only the helper call is guarded; a TypeError here means
                # we fall through to the delegate call at the bottom.
                pass
            else:
                width = len(subsequence)
                candidates = []
                for start in start_indexes:
                    stop = start + width
                    n_diffs = count_differences_with_maximum(
                        sequence[start:stop],
                        subsequence,
                        max_substitutions + 1,
                    )
                    candidates.append(Match(start, stop, n_diffs))

                best_per_group = []
                for group in group_matches(candidates):
                    best_per_group.append(get_best_match_in_group(group))
                return best_per_group

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)
예제 #3
0
def _find_near_matches_substitutions_ngrams(subsequence, sequence,
                                            max_substitutions):
    """Generate ``Match`` objects for substitution-only near-matches.

    The subsequence is cut into back-to-back n-grams whose length is
    ``len(subsequence) // (max_substitutions + 1)``.  For every exact hit
    of an n-gram returned by ``search_exact``, the surrounding window of
    ``len(subsequence)`` items is checked: the differences before and
    after the n-gram are counted with ``count_differences_with_maximum``
    and the window is yielded if the total does not exceed
    ``max_substitutions``.

    Windows reachable through more than one n-gram are yielded once per
    n-gram; nothing is de-duplicated here.

    Because this is a generator, the ``ValueError`` for a zero n-gram
    length surfaces on first iteration rather than at call time.
    """
    needle_len = len(subsequence)
    haystack_len = len(sequence)

    piece_len = needle_len // (max_substitutions + 1)
    if not piece_len:
        raise ValueError(
            "The subsequence's length must be greater than max_substitutions!"
        )

    for piece_start in range(0, needle_len - piece_len + 1, piece_len):
        piece_end = piece_start + piece_len
        piece = subsequence[piece_start:piece_end]
        needle_prefix = subsequence[:piece_start]
        needle_suffix = subsequence[piece_end:]

        # Search bounds passed through to search_exact; presumably they
        # keep every hit far enough from both ends of the sequence for a
        # full-length window -- confirm against search_exact.
        lower_bound = piece_start
        upper_bound = haystack_len - (needle_len - piece_end)

        for position in search_exact(piece, sequence,
                                     lower_bound, upper_bound):
            match_start = position - piece_start
            match_end = match_start + needle_len

            used = 0

            found_prefix = sequence[match_start:position]
            if needle_prefix != found_prefix:
                used += count_differences_with_maximum(
                    found_prefix, needle_prefix,
                    max_substitutions - used + 1)
                if used > max_substitutions:
                    continue

            found_suffix = sequence[position + piece_len:match_end]
            if needle_suffix != found_suffix:
                # A differing suffix needs at least one more substitution,
                # which is unavailable once the budget is exhausted.
                if used == max_substitutions:
                    continue
                used += count_differences_with_maximum(
                    found_suffix, needle_suffix,
                    max_substitutions - used + 1)
                if used > max_substitutions:
                    continue

            yield Match(
                start=match_start,
                end=match_end,
                dist=used,
            )
예제 #4
0
 def search(self, subsequence, sequence, max_subs):
     """Return a ``Match`` for each index from ``fnm_subs_lp_byteslike``.

     Each match spans ``len(subsequence)`` items starting at the returned
     index.  Its distance comes from ``count_differences_with_maximum``
     and ``matched`` holds the corresponding slice of ``sequence``.
     """
     start_indexes = fnm_subs_lp_byteslike(subsequence, sequence, max_subs)
     width = len(subsequence)

     found = []
     for start in start_indexes:
         stop = start + width
         n_diffs = count_differences_with_maximum(
             sequence[start:stop],
             subsequence,
             max_subs + 1,
         )
         found.append(
             Match(start, stop, n_diffs, matched=sequence[start:stop]))
     return found
 def search(self, subsequence, sequence, max_subs):
     """Build a list of ``Match`` objects from ``fnm_subs_lp_byteslike``.

     Every index the helper returns becomes a match covering
     ``len(subsequence)`` items, with its distance computed by
     ``count_differences_with_maximum``.
     """
     hits = fnm_subs_lp_byteslike(subsequence, sequence, max_subs)
     span = len(subsequence)

     def as_match(begin):
         # Turn one start index into a Match over the aligned window.
         finish = begin + span
         distance = count_differences_with_maximum(
             sequence[begin:finish],
             subsequence,
             max_subs + 1,
         )
         return Match(begin, finish, distance)

     return [as_match(begin) for begin in hits]
예제 #6
0
 def search(self, subsequence, sequence, max_subs):
     """Return the best match of each group found by the n-grams helper.

     Indexes from ``fnm_subs_ngrams_byteslike`` are converted to ``Match``
     objects (distance via ``count_differences_with_maximum``, ``matched``
     set to the window's slice of ``sequence``).  The matches are then
     grouped with ``group_matches`` and each group is reduced with
     ``get_best_match_in_group``.
     """
     start_indexes = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                               max_subs)
     width = len(subsequence)

     candidates = []
     for start in start_indexes:
         stop = start + width
         n_diffs = count_differences_with_maximum(
             sequence[start:stop],
             subsequence,
             max_subs + 1,
         )
         candidates.append(
             Match(start, stop, n_diffs, matched=sequence[start:stop]))

     best_per_group = []
     for group in group_matches(candidates):
         best_per_group.append(get_best_match_in_group(group))
     return best_per_group
 def search(self, subsequence, sequence, max_subs):
     """Search with the n-grams helper and keep one match per group.

     Each index returned by ``fnm_subs_ngrams_byteslike`` becomes a
     ``Match`` spanning ``len(subsequence)`` items, with its distance
     computed by ``count_differences_with_maximum``.  ``group_matches``
     partitions those matches and ``get_best_match_in_group`` selects the
     one returned for each group.
     """
     hits = fnm_subs_ngrams_byteslike(subsequence, sequence,
                                      max_subs)
     span = len(subsequence)

     def as_match(begin):
         # Turn one start index into a Match over the aligned window.
         finish = begin + span
         distance = count_differences_with_maximum(
             sequence[begin:finish],
             subsequence,
             max_subs + 1,
         )
         return Match(begin, finish, distance)

     all_matches = [as_match(begin) for begin in hits]
     return list(map(get_best_match_in_group, group_matches(all_matches)))
예제 #8
0
 def count_diffs(self, seq1, seq2, max_diffs):
     """Return ``count_differences_with_maximum(seq1, seq2, max_diffs)``."""
     n_diffs = count_differences_with_maximum(seq1, seq2, max_diffs)
     return n_diffs
예제 #9
0
 def count_diffs(self, seq1, seq2, max_diffs):
     """Delegate to ``count_differences_with_maximum`` unchanged.

     The three arguments are forwarded in order and the helper's result
     is returned as-is.
     """
     result = count_differences_with_maximum(seq1, seq2, max_diffs)
     return result