Exemplo n.º 1
0
 def test_list_of_words_one_extra(self):
     # The text holds one extra word ("the") inside the span that otherwise
     # equals the pattern.  Only the (0, 1, 0, 1), (1, 1, 1, 1) and
     # (2, 2, 2, 2) limit tuples are expected to report that span.
     pattern_words = "jumped over lazy dog".split()
     text_words = "the big brown fox jumped over the lazy dog".split()

     def with_extra_word():
         # Build a fresh expectation list for every table row.
         return [
             Match(start=4,
                   end=9,
                   dist=1,
                   matched="jumped over the lazy dog".split()),
         ]

     cases = [
         ((0, 0, 0, 0), []),
         ((1, 0, 0, 1), []),
         ((0, 1, 0, 1), with_extra_word()),
         ((0, 0, 1, 1), []),
         ((1, 1, 1, 1), with_extra_word()),
         ((2, 2, 2, 2), with_extra_word()),
     ]
     for limits, expected in cases:
         found = self.search(pattern_words, text_words, *limits)
         self.expectedOutcomes(found, expected)
Exemplo n.º 2
0
 def test_missing_second_item_complex(self):
     # With every limit set to 1, each reported match must be one of the
     # three acceptable alignments listed below (not all need be reported).
     found = set(self.search(b('bde'), b('abcdefg'), 1, 1, 1, 1))
     acceptable = [
         Match(start=1, end=5, dist=1, matched=b('bcde')),
         Match(start=2, end=5, dist=1, matched=b('cde')),
         Match(start=3, end=5, dist=1, matched=b('de')),
     ]
     self.assertTrue(found.issubset(acceptable))
Exemplo n.º 3
0
 def test_double_first_item_two_results(self):
     # The text repeats the pattern's first item ('d'); both the span that
     # includes the extra 'd' and the exact span are expected, in order.
     found = self.search(b('def'), b('abcddefg'), 0, 1, 0)
     expected = [
         Match(start=3, end=7, dist=1, matched=b('ddef')),
         Match(start=4, end=7, dist=0, matched=b('def')),
     ]
     self.assertEqual(found, expected)
Exemplo n.º 4
0
 def test_double_first_item_two_results(self):
     # The text repeats the pattern's first item ('d'); both the span that
     # includes the extra 'd' and the exact span are expected outcomes.
     found = self.search(b('def'), b('abcddefg'), 0, 1, 0, 1)
     expected = [
         Match(start=3, end=7, dist=1, matched=b('ddef')),
         Match(start=4, end=7, dist=0, matched=b('def')),
     ]
     self.expectedOutcomes(found, expected)
Exemplo n.º 5
0
 def test_only_deletions(self):
     # Only the third limit is non-zero (5) and the last limit is None;
     # the expected spans all end at index 4 and shrink as dist grows.
     found = self.search(b('TESTabc'), b('TEST123'), 0, 0, 5, None)
     expected = [
         Match(start=0, end=4, dist=3, matched=b('TEST')),
         Match(start=1, end=4, dist=4, matched=b('EST')),
         Match(start=2, end=4, dist=5, matched=b('ST')),
     ]
     self.expectedOutcomes(found, expected)
Exemplo n.º 6
0
 def test_separate(self):
     # Three matches with disjoint spans: each is expected to end up alone
     # in its own group, in the original order.
     spans = [(19, 29, 1), (42, 52, 1), (99, 109, 0)]
     matches = [
         Match(start=first, end=last, dist=distance, matched='x'*10)
         for first, last, distance in spans
     ]
     groups = group_matches(matches)
     singletons = [{match} for match in matches]
     self.assertEqual(groups, singletons)
Exemplo n.º 7
0
    def test_two_identical(self):
        # The pattern occurs twice, first back-to-back and then separated
        # by one item; both exact occurrences are expected each time.
        cases = [
            (b('abcabc'),
             [Match(start=0, end=3, dist=0), Match(start=3, end=6, dist=0)]),
            (b('abcXabc'),
             [Match(start=0, end=3, dist=0), Match(start=4, end=7, dist=0)]),
        ]
        for text, expected in cases:
            found = self.search(b('abc'), text, max_subs=1)
            self.expectedOutcomes(found, expected)
Exemplo n.º 8
0
    def test_null_bytes(self):
        # A NUL item in the text, and then in the pattern itself, must not
        # prevent the exact occurrence from being found.
        cases = [
            (b('abc'), b('xx\0abcxx'), 3),
            (b('a\0b'), b('xxa\0bcxx'), 2),
        ]
        for pattern, text, start in cases:
            found = self.search(pattern, text, 0, 0, 0, 0)
            expected = [
                Match(start=start, end=start + 3, dist=0, matched=pattern),
            ]
            self.assertEqual(found, expected)
Exemplo n.º 9
0
 def test_separate_with_duplicate(self):
     # Passing the second match twice must not change the grouping: one
     # singleton group per distinct match is still expected.
     spans = [(19, 29, 1), (42, 52, 1), (99, 109, 0)]
     matches = [
         Match(start=first, end=last, dist=distance)
         for first, last, distance in spans
     ]
     groups = group_matches(matches + [matches[1]])
     singletons = [{match} for match in matches]
     self.assertEqual(groups, singletons)
Exemplo n.º 10
0
    def test_missing_second_to_last_item(self):
        # The same single match is expected whether the maximum distance
        # is 1 or 2.
        for limit in (1, 2):
            found = self.search('bce', 'abcdefg', max_l_dist=limit)
            self.assertEqual(found, [Match(start=1, end=5, dist=1)])
Exemplo n.º 11
0
 def test_max_substitutions_gte_subseq_len(self):
     # Limits that are at least as large as the pattern length.

     # One-item pattern: every position of the text is an expected outcome.
     for limit in (1, 2, 5):
         found = self.search(b('b'), b('abc'), limit)
         expected = [Match(0, 1, 1), Match(1, 2, 0), Match(2, 3, 1)]
         self.expectedOutcomes(found, expected)

     # Text identical to the pattern: only the exact match is expected.
     for surplus in (0, 1, 7):
         limit = len('PATTERN') + surplus
         found = self.search(b('PATTERN'), b('PATTERN'), limit)
         expected = [Match(0, len('PATTERN'), 0)]
         self.expectedOutcomes(found, expected)
    def test_double_first_item(self):
        # The text repeats the pattern's first item; raising max_subs from
        # 1 to 2 adds the span starting one position earlier.
        cases = [
            (1, [Match(start=4, end=7, dist=0)]),
            (2, [Match(start=3, end=6, dist=2),
                 Match(start=4, end=7, dist=0)]),
        ]
        for limit, expected in cases:
            found = self.search(b('def'), b('abcddefg'), max_subs=limit)
            self.expectedOutcomes(found, expected)
Exemplo n.º 13
0
 def test_list_of_words(self):
     # Lists of words are searched like any other sequence; the pattern
     # differs from the text span in one word ("a" vs. "the").
     pattern_words = "over a lazy dog".split()
     text_words = "the big brown fox jumped over the lazy dog".split()
     cases = [
         (0, []),
         (1, [Match(start=5, end=9, dist=1)]),
         (2, [Match(start=5, end=9, dist=1)]),
     ]
     for limit, expected in cases:
         found = self.search(pattern_words, text_words, limit)
         self.assertEqual(found, expected)
Exemplo n.º 14
0
    def test_protein_search1(self):
        # Regression scenario; background:
        # * BioPython archives from March 14th, 2014
        #   http://lists.open-bio.org/pipermail/biopython/2014-March/009030.html
        # * https://github.com/taleinat/fuzzysearch/issues/3
        #
        # The text holds two exact occurrences of the pattern plus one
        # span (starting at 19) that differs from it in a single item.
        text = b(''.join('''\
            XXXXXXXXXXXXXXXXXXXGGGTTVTTSSAAAAAAAAAAAAAGGGTTLTTSSAAAAAAAAAAAA
            AAAAAAAAAABBBBBBBBBBBBBBBBBBBBBBBBBGGGTTLTTSS
        '''.split()))
        pattern = b("GGGTTLTTSS")

        cases = [
            (0, [Match(start=42, end=52, dist=0),
                 Match(start=99, end=109, dist=0)]),
            (1, [Match(start=19, end=29, dist=1),
                 Match(start=42, end=52, dist=0),
                 Match(start=99, end=109, dist=0)]),
            (2, [Match(start=19, end=29, dist=1),
                 Match(start=42, end=52, dist=0),
                 Match(start=99, end=109, dist=0)]),
        ]
        for limit, expected in cases:
            found = self.search(pattern, text, max_subs=limit)
            self.expectedOutcomes(found, expected)
 def test_non_string_sequences(self):
     # The same scenarios are run with list and with tuple inputs.
     # Each row: text items, limit tuple, expected dist (None = no match).
     scenarios = [
         ([1, 2, 3], (0, 0, 0, 0), 0),
         ([1, 2, 3], (1, 1, 1, 1), 0),
         ([1, 2, 4], (0, 0, 0, 0), None),
         ([1, 2, 4], (1, 1, 1, 1), 1),
         ([1, 2, 4], (0, 0, 1, 1), 1),
     ]
     for klass in (list, tuple):
         with self.subTest(klass.__name__):
             for text_items, limits, distance in scenarios:
                 found = self.search(klass([1, 2, 3]), klass(text_items),
                                     *limits)
                 if distance is None:
                     expected = []
                 else:
                     expected = [Match(start=0, end=3, dist=distance)]
                 self.expectedOutcomes(found, expected)
 def test_list_of_words_one_substituted(self):
     # The pattern differs from the text span in one word ("my" vs. "the").
     pattern_words = "jumped over my lazy dog".split()
     text_words = "the big brown fox jumped over the lazy dog".split()

     def one_word_off():
         # Build a fresh expectation list for every table row.
         return [Match(start=4, end=9, dist=1)]

     cases = [
         ((0, 0, 0, 0), []),
         ((1, 0, 0, 1), one_word_off()),
         ((0, 1, 0, 1), []),
         ((0, 0, 1, 1), []),
         # substitution = insertion + deletion; dist = 1 !!
         ((0, 1, 1, 1), one_word_off()),
         ((1, 1, 1, 1), one_word_off()),
         ((2, 2, 2, 2), one_word_off()),
     ]
     for limits, expected in cases:
         found = self.search(pattern_words, text_words, *limits)
         self.expectedOutcomes(found, expected)
Exemplo n.º 17
0
 def test_unicode_substring(self):
     # Non-ASCII text: the two-item pattern occurs exactly once, at
     # positions 1-2 of the four-item text.
     needle = u('\u03A3\u0393')
     haystack = u('\u03A0\u03A3\u0393\u0394')
     found = self.search(needle, haystack, max_subs=0)
     self.expectedOutcomes(found, [Match(1, 3, 0)])
Exemplo n.º 18
0
    def test_single_substitution_in_long_text(self):
        # Search a long text for a pattern that is expected to appear
        # exactly once, with a single differing item, at text[541:548].
        substring = b('PATTERN')
        # The fixture is written as an indented multi-line literal; each
        # line is stripped and the pieces are joined into one long text.
        text = b(''.join([
            x.strip() for x in '''\
            FySijRLMtLLWkMnWxTbzIWuxOUbfAahWYKUlOZyhoQhfExJPOSwXxBLrlqdoUwpRW
            FEtHFiepnOTbkttuagADQaUTvkvKzvqaFaMnAPfolPpmXitKLDQhAqDOJwFzdcKmk
            cfVStxZGDUbrHjrDwVVRihbklyfqLJjrzGuhVGDzgSpCHXvaGPHebbcUAnAgfqqpA
            uMOowtptcoQUeAbdqJAmieLDxCrOPivbSwmriQwfFCDTXbswFqClZPnSkDkCyvPCi
            bmAjVGnuVsrZlPypglXlVVQKzMpQuWQynOLGDqwrAnsvYTcArkEhFpEgahWVQGOvv
            CTvbYZRVqqPCDRsyWeTVgANxZIyVAtENnndbsHzpEcPUfqCBUroIGRNEIMHYIZANy
            LeeVKEwihbvWZVOWPeAlmNKnhhoEPIcpDJDzPOYHSltxhSsZeeWMqtAnuSoFOIrqB
            EPUFIlKkpamljHylnTIWqaESoWbYESVPEeZtlAzpInuwFaNIYUvzpJNIlPtuOjUuT
            efaGnOXvQeHdaRPrdHCepPATXERNDdnkzuLHQcVWKpgHhGifBySAkWkthrzfZDHDU
            HJxjpLXseKuldLRftyctGvVKyrRTUCRAakjwTSWivGdksOZabnkBoRtMstlNwXcwg
            UCFLaWFxjqjasOfNeThrbubVGtyYRROYUOTMUmeSdJcBKxVXiaWDZoHyKtQRXwpVO
            pEmlpdzKWkFpDtHHdImhDJIXwxzjwyNLaTgPLHmcyhJGqncCblxALMdPEDaRtGFMg
            BskUxPGATTLKMFeIjgFJpudyMWlASyFSiaDWrOCgRfwjfpMYfuNQIqzvZbguWsnaq
            tRaXcxavobetBbbfMDjstQLjoJLwiajVRKhFVspIdgrmTMEBbjtpMnSpTkmFcRBZZ
            GUOWnesGgZeKkIQhlxlRPTtjUbbpaPlmxeiBdUKHHApgvEybUwWwXCoXFsauNiINm
            AGATFdcaHzgoRpbBFhKdJkLMF'''.splitlines()
        ]))
        # The matched content is taken from the text itself, so the
        # expectation stays in sync with the fixture above.
        expected_match = Match(start=541,
                               end=548,
                               dist=1,
                               matched=text[541:548])

        # Only the first limit and the total limit are non-zero.
        self.assertEqual(
            self.search(substring, text, 1, 0, 0, 1),
            [expected_match],
        )

        # All four limits set to 1 must give the same single match.
        self.assertEqual(
            self.search(substring, text, 1, 1, 1, 1),
            [expected_match],
        )
Exemplo n.º 19
0
def find_near_matches_levenshtein(subsequence, sequence, max_l_dist):
    """Return near-matches of ``subsequence`` found in ``sequence``.

    The search strategy is picked from the arguments:

    * ``max_l_dist == 0``: exact search via ``search_exact``;
    * ``len(subsequence) // (max_l_dist + 1) >= 3``: the n-gram based
      Levenshtein search;
    * otherwise: the linear-programming Levenshtein search, keeping the
      best match of every group of matches, sorted.

    Returns a list of fuzzysearch.Match objects describing the matching
    parts of the sequence.

    Raises ValueError if ``subsequence`` is empty or ``max_l_dist`` is
    negative.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')
    if max_l_dist < 0:
        raise ValueError('Maximum Levenshtein distance must be >= 0!')

    # No edits allowed: every exact occurrence is a match with distance 0.
    if max_l_dist == 0:
        exact_matches = []
        for start in search_exact(subsequence, sequence):
            exact_matches.append(Match(start, start + len(subsequence), 0))
        return exact_matches

    if len(subsequence) // (max_l_dist + 1) >= 3:
        return find_near_matches_levenshtein_ngrams(
            subsequence, sequence, max_l_dist)

    candidates = find_near_matches_levenshtein_linear_programming(
        subsequence, sequence, max_l_dist)
    # Keep a single representative (the best one) per group of matches.
    return sorted(
        get_best_match_in_group(group)
        for group in group_matches(candidates)
    )
Exemplo n.º 20
0
def find_near_matches_substitutions(subsequence, sequence, max_substitutions):
    """Return near-matches of ``subsequence`` found in ``sequence``.

    The arguments are first validated by ``_check_arguments``; then a
    search strategy is picked from them:

    * ``max_substitutions == 0``: exact search via ``search_exact``;
    * ``len(subsequence) // (max_substitutions + 1) >= 3``: the n-gram
      based search;
    * otherwise: the ``find_near_matches_substitutions_lp`` search.

    Returns a list of fuzzysearch.Match objects describing the matching
    parts of the sequence.
    """
    _check_arguments(subsequence, sequence, max_substitutions)

    # No substitutions allowed: only exact occurrences can match.
    if max_substitutions == 0:
        return [
            Match(start, start + len(subsequence), 0)
            for start in search_exact(subsequence, sequence)
        ]

    ngram_len = len(subsequence) // (max_substitutions + 1)
    if ngram_len >= 3:
        return find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions,
        )

    return find_near_matches_substitutions_lp(
        subsequence, sequence, max_substitutions,
    )
Exemplo n.º 21
0
    def find_near_matches_substitutions_ngrams(subsequence, sequence,
                                               max_substitutions):
        """Find substitution-only near-matches, preferring the bytes-like helper.

        When neither argument is a ``text_type`` instance, the search is
        first attempted with ``_subs_only_fnm_ngram_byteslike``.  If that
        call raises TypeError, or if either argument is text, the search is
        delegated to ``py_find_near_matches_substitutions_ngrams``.
        """
        has_text_input = any(
            isinstance(arg, text_type) for arg in (subsequence, sequence)
        )
        if not has_text_input:
            try:
                start_indexes = _subs_only_fnm_ngram_byteslike(
                    subsequence, sequence, max_substitutions)
            except TypeError:
                # Deliberate: fall through to the generic implementation.
                pass
            else:
                # The helper's results are used as start indexes; the
                # distance of each candidate is computed here.
                candidates = []
                for start in start_indexes:
                    end = start + len(subsequence)
                    n_differences = count_differences_with_maximum(
                        sequence[start:end],
                        subsequence,
                        max_substitutions + 1,
                    )
                    candidates.append(Match(start, end, n_differences))
                return [
                    get_best_match_in_group(group)
                    for group in group_matches(candidates)
                ]

        return py_find_near_matches_substitutions_ngrams(
            subsequence, sequence, max_substitutions)
Exemplo n.º 22
0
def find_near_matches_generic(subsequence, sequence, search_params):
    """Search for near-matches of subsequence in sequence.

    The nearly-matching parts of the sequence must respect the limits
    carried by ``search_params`` (relative to the subsequence):

    * the maximum allowed number of character substitutions
    * the maximum allowed number of new characters inserted
    * the maximum allowed number of character deletions
    * the total number of substitutions, insertions and deletions

    Only ``search_params.max_l_dist`` is read here, to pick the search
    method; the whole ``search_params`` object is handed on to it.

    Raises ValueError if ``subsequence`` is empty.
    """
    if not subsequence:
        raise ValueError('Given subsequence is empty!')

    # Limits so strict that only exact matches qualify: use search_exact().
    if search_params.max_l_dist == 0:
        return [
            Match(start, start + len(subsequence), 0)
            for start in search_exact(subsequence, sequence)
        ]

    # An n-gram length of at least 3 makes the n-gram method applicable.
    ngram_len = len(subsequence) // (search_params.max_l_dist + 1)
    if ngram_len >= 3:
        return find_near_matches_generic_ngrams(
            subsequence, sequence, search_params)

    # Otherwise use the linear programming search method.
    return find_near_matches_generic_linear_programming(
        subsequence, sequence, search_params)
Exemplo n.º 23
0
    def test_two_extra(self):
        # The text has two extra items inside the pattern.  Depending on
        # the limit tuple, the 7-item span, the 5-item span or both are
        # expected, each with dist 2.
        text = '--abc--de--'
        needle = 'abcde'
        cases = [
            ((0, 2, 2), [Match(start=2, end=9, dist=2)]),
            ((2, 0, 2), [Match(start=2, end=7, dist=2)]),
            ((2, 2, 2), [Match(start=2, end=7, dist=2),
                         Match(start=2, end=9, dist=2)]),
        ]
        for limits, expected in cases:
            found = fnm_nodels_ngrams(needle, text, *limits)
            self.assertEqual(found, expected)
Exemplo n.º 24
0
 def test_cases(self):
     # Data-driven: every entry of self.test_cases_data provides a
     # substring, a text and (max_l_dist, expected match tuples) pairs.
     # Each entry runs in its own sub-test, labelled with its name.
     for name, (substring, text, expectations) in \
             self.test_cases_data.items():
         with self.subTest(name=name):
             for limit, match_tuples in expectations:
                 found = self.search(substring, text, max_l_dist=limit)
                 expected = [Match(*fields) for fields in match_tuples]
                 self.assertEqual(found, expected)
Exemplo n.º 25
0
    def test_short_substring(self):
        # A two-item pattern searched with all limits at zero: exactly the
        # one exact occurrence is expected.
        needle = b('XY')
        haystack = b('abcdefXYghij')
        found = self.search(needle, haystack, 0, 0, 0, 0)
        self.assertEqual(
            found,
            [Match(start=6, end=8, dist=0, matched=needle)],
        )
Exemplo n.º 26
0
    def test_substring(self):
        # The pattern is embedded once in filler text; with all limits at
        # zero exactly that occurrence is expected.
        needle = b('PATTERN')
        haystack = b('aaaaaaaaaaPATTERNaaaaaaaaa')
        found = self.search(needle, haystack, 0, 0, 0, 0)
        self.assertEqual(
            found,
            [Match(start=10, end=17, dist=0, matched=needle)],
        )
Exemplo n.º 27
0
def _find_near_matches_substitutions_lp(subsequence, sequence,
                                        max_substitutions):
    """Yield substitution-only near-matches of subsequence in sequence.

    Generator.  For every window of ``len(subsequence)`` consecutive items
    of ``sequence`` whose number of positions differing from
    ``subsequence`` is at most ``max_substitutions``, a Match is yielded
    with the window's start, its end (exclusive) and that number as dist.
    The sequence is iterated over exactly once.
    """
    pattern_len = len(subsequence)
    last_pattern_index = pattern_len - 1

    # Map every item of the pattern to the positions where it occurs.
    positions_by_item = defaultdict(list)
    for position, item in enumerate(subsequence):
        positions_by_item[item].append(position)

    # A single iterator is shared by the warm-up loop and the main loop
    # below, so the main loop resumes where the warm-up stopped.
    indexed_items = enumerate(sequence)

    # match_counts is a ring holding one counter per alignment currently
    # in progress: entry k counts the matching items of the alignment that
    # started k items before the current one.

    # Warm-up over the first N-1 items: counters are updated, but no
    # window is complete yet, so nothing can be yielded.
    match_counts = deque([0], maxlen=pattern_len)
    for seq_index, item in islice(indexed_items, last_pattern_index):
        for position in positions_by_item[item]:
            # Skip alignments that would have started before the sequence.
            if position <= seq_index:
                match_counts[position] += 1
        match_counts.appendleft(0)

    # From the N-th item on, the counters are updated as above and the
    # alignment which began N-1 items earlier is complete and is judged.
    for seq_index, item in indexed_items:
        for position in positions_by_item[item]:
            match_counts[position] += 1

        # Rotating brings the completed alignment's counter to the front.
        match_counts.rotate(1)
        mismatches = pattern_len - match_counts[0]
        # The front slot is reused for the alignment starting next.
        match_counts[0] = 0

        if mismatches > max_substitutions:
            continue
        yield Match(
            start=seq_index - last_pattern_index,
            end=seq_index + 1,
            dist=mismatches,
        )
Exemplo n.º 28
0
    def test_missing_second_item(self):
        # The text has an extra item between the pattern's first and
        # second items; each limit tuple is expected to give a different
        # (or no) alignment.
        cases = [
            ((0, 1, 0, 1),
             [Match(start=1, end=5, dist=1, matched=b('bcde'))]),
            ((0, 0, 0, 0),
             []),
            ((1, 0, 0, 1),
             [Match(start=2, end=5, dist=1, matched=b('cde'))]),
            ((0, 0, 1, 1),
             [Match(start=3, end=5, dist=1, matched=b('de'))]),
        ]
        for limits, expected in cases:
            found = self.search(b('bde'), b('abcdefg'), *limits)
            self.assertEqual(found, expected)
Exemplo n.º 29
0
 def test_one_sub_one_ins(self):
     # Of the five limit tuples tried, only (1, 1, 2) is expected to
     # report the match; the other four must find nothing.
     text = 'abcdefghij'
     needle = 'bceXghi'
     for limits in [(0, 0, 0), (0, 1, 2), (1, 0, 2), (1, 1, 1)]:
         self.assertEqual(fnm_nodels_ngrams(needle, text, *limits), [])

     found = fnm_nodels_ngrams(needle, text, 1, 1, 2)
     self.assertEqual(found, [Match(start=1, end=9, dist=2)])
Exemplo n.º 30
0
 def search(self, subsequence, sequence, max_subs):
     # Adapter around fnm_subs_lp_byteslike: its results are used as start
     # indexes and turned into Match objects, with the distance computed
     # by count_differences_with_maximum and the matched slice attached.
     start_indexes = fnm_subs_lp_byteslike(subsequence, sequence, max_subs)
     found = []
     for start in start_indexes:
         end = start + len(subsequence)
         window = sequence[start:end]
         distance = count_differences_with_maximum(
             window,
             subsequence,
             max_subs + 1,
         )
         found.append(Match(start, end, distance, matched=window))
     return found