def build_sp(P):
    """Build a table from the preprocessing values of P.

    For every start position ``j >= 1`` with preprocessing value ``Z[j]``,
    the entry at index ``j + Z[j] - 1`` (the last index covered by that
    match) is set to ``Z[j]``.  Positions are visited from right to left,
    so when several starts map to the same index the leftmost start wins.
    Entries that are never written stay 0.

    NOTE(review): assumes ``fundamental_preprocess`` returns one integer per
    character of P (Z-array style) -- confirm against its definition.
    """
    z_values = fundamental_preprocess(P)
    table = [0] * len(P)
    for start in reversed(range(1, len(P))):
        match_len = z_values[start]
        # Index of the last character covered by the match starting at `start`.
        table[start + match_len - 1] = match_len
    return table
def full_shift_table(S):
    """Build the table F used for the fallback good-suffix shift.

    Walking the preprocessing values of S from the last index towards the
    first, a value counts only when it equals the number of characters
    remaining from that index to the end of S (i.e. the match reaches the
    end of the string).  ``F[i]`` holds the largest such value seen at
    index ``i`` or to its right, or 0 if there was none.

    NOTE(review): assumes ``fundamental_preprocess`` returns one integer per
    character of S (Z-array style) -- confirm against its definition.
    """
    shifts = [0] * len(S)
    z_values = fundamental_preprocess(S)
    best = 0
    # `tail_len` is the length of the suffix of S starting at the current index.
    for tail_len, z_value in enumerate(reversed(z_values), start=1):
        if z_value == tail_len and z_value > best:
            best = z_value
        shifts[-tail_len] = best
    return shifts
def good_suffix_table(S):
    """Build the table L used for the good-suffix shift.

    The preprocessing values of the reversed string are computed and then
    reversed back, giving one value per end position ``j`` of S.  For every
    ``j`` before the last index whose value is non-zero, the entry at
    ``len(S) - value`` is set to ``j``.  Because ``j`` increases, the
    largest qualifying ``j`` is the one that remains.  Entries that are
    never written stay -1.

    NOTE(review): assumes ``fundamental_preprocess`` returns a fresh list
    with one integer per character (Z-array style) -- confirm.
    """
    size = len(S)
    table = [-1] * size
    suffix_lengths = fundamental_preprocess(S[::-1])  # S[::-1] reverses S
    suffix_lengths.reverse()
    for end in range(size - 1):
        start = size - suffix_lengths[end]
        if start == size:
            # Zero-length match: nothing to record for this end position.
            continue
        table[start] = end
    return table
def test_no_prefix_matches(self):
    """All-distinct characters: index 0 reports the full length, the rest 0."""
    text = "abcdefgh"
    actual = fundamental_preprocess(text)
    self.assertEqual([8, 0, 0, 0, 0, 0, 0, 0], actual)
def test_single_char(self):
    """A one-character string yields the single entry 1."""
    actual = fundamental_preprocess("a")
    self.assertEqual([1], actual)
def test_empty_string(self):
    """The empty string yields an empty result."""
    actual = fundamental_preprocess("")
    self.assertEqual([], actual)
def test_character_repeated(self):
    """A run of one repeated character yields values counting down to 1."""
    text = "aaaaaaaa"
    actual = fundamental_preprocess(text)
    self.assertEqual([8, 7, 6, 5, 4, 3, 2, 1], actual)
def test_overlapping_prefix_match(self):
    """Prefix matches that overlap each other are each reported in full."""
    text = "aabaabaaba"
    actual = fundamental_preprocess(text)
    self.assertEqual([10, 1, 0, 7, 1, 0, 4, 1, 0, 1], actual)
def test_multiple_prefix_match(self):
    """Several separate, non-overlapping prefix matches are all reported."""
    text = "aabaacaab"
    actual = fundamental_preprocess(text)
    self.assertEqual([9, 1, 0, 2, 1, 0, 3, 1, 0], actual)
def test_single_prefix_match(self):
    """A single later occurrence of the prefix is reported at its start."""
    text = "abab"
    actual = fundamental_preprocess(text)
    self.assertEqual([4, 0, 2, 0], actual)
def string_search(P, T):
    """Return the start index of every occurrence of pattern P in text T.

    Boyer-Moore search with Apostolico-Giancarlo style bookkeeping.  ``M[j]``
    records how many characters were known to match when the end of P was
    aligned with ``T[j]`` (-1 while unknown).  ``N[i]`` is the length of the
    longest suffix of ``P[:i + 1]`` that is also a suffix of P (this assumes
    ``fundamental_preprocess`` returns the Z-array of its argument).
    Comparing the two lets a phase skip characters that an earlier phase
    already examined.

    Args:
        P: Pattern to search for.
        T: Text to search in.

    Returns:
        A list of 0-based start indices of the occurrences of P in T, in
        increasing order.  The list is empty when P is empty or longer
        than T.
    """
    if len(P) == 0 or len(T) < len(P):
        return []

    matches = []

    # Preprocessing
    # N must describe the pattern: it is indexed by the pattern position i
    # below, so it is built from P reversed (P[::-1]) and then flipped back.
    N = fundamental_preprocess(P[::-1])
    N.reverse()
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)
    M = [-1 for c in T]

    k = len(P) - 1      # Represents alignment of end of P relative to T
    i = len(P) - 1      # Character to compare in P
    h = k               # Character to compare in T
    match = False       # Indicates whether an exact match has been found in this phase
    mismatch = False    # Indicates whether a mismatch has occurred

    while k < len(T):
        if M[h] == -1 or M[h] == 0 or N[i] == 0:
            # Phase case 1: nothing usable is known here, compare explicitly.
            if T[h] == P[i]:
                if i == 0:
                    # Case 1a: the whole pattern matched.
                    match = True
                    mismatch = False
                else:
                    # Case 1b: keep comparing towards the left.
                    i -= 1
                    h -= 1
                    match = False
                    mismatch = False
            else:
                # Case 1c: explicit mismatch at P[i].
                match = False
                mismatch = True
        elif (M[h] < N[i] and M[h] != -1) or (M[h] == N[i] and 0 < N[i] < i + 1):
            # Cases 2 & 5: the next M[h] characters are known to match, so
            # skip over them and continue the phase.
            i -= M[h]
            h -= M[h]
            match = False
            mismatch = False
        elif M[h] >= N[i] and N[i] == i + 1 > 0:
            # Phase case 3: the rest of P (P[:i + 1]) is a suffix of P that is
            # already known to match T, so this alignment is an occurrence.
            match = True
            mismatch = False
        elif M[h] > N[i] and N[i] < i + 1:
            # Phase case 4: N[i] characters match, then a mismatch is implied.
            i -= N[i]
            h -= N[i]
            match = False
            mismatch = True

        if match:
            matches.append(k - len(P) + 1)
            M[k] = k - h
            k += len(P) - F[1] if len(P) > 1 else 1
            i = len(P) - 1
            h = k
            match = False
            mismatch = False
        elif mismatch:
            char_shift = i - R[alphabet_index(T[h])][i]
            if i + 1 == len(P):
                # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i + 1] == -1:
                # Matched suffix does not appear anywhere in P
                suffix_shift = len(P) - F[i + 1]
            else:
                # Matched suffix appears in P, ending at 0-based index
                # L[i + 1]; move that copy under the end of P (index
                # len(P) - 1).  Using len(P) - L[i + 1] overshoots by one
                # and can skip an occurrence.
                suffix_shift = len(P) - 1 - L[i + 1]
            M[k] = k - h
            k += max(char_shift, suffix_shift)
            i = len(P) - 1
            h = k
            match = False
            mismatch = False

    return matches