def string_search(P, T):
    """Return the start index of every occurrence of pattern P in text T.

    Boyer-Moore search: the pattern is compared right to left against the
    text and, after each phase, shifted by the larger of the bad-character
    and good-suffix shifts. Galil's rule is used to avoid re-comparing text
    that is already known to match a prefix of P.

    Args:
        P: the pattern to look for.
        T: the text to search in.

    Returns:
        A list of 0-based start positions of P in T, in increasing order.
        The list is empty when P is empty or longer than T.
    """
    matches = []
    if len(P) == 0 or len(T) < len(P):
        return matches

    # Preprocessing
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)

    m = len(P)
    k = m - 1           # Alignment of the end of P relative to T
    previous_k = -1     # Text index up to which a prefix of P is already known to match (Galil's rule)
    while k < len(T):
        i = m - 1       # Character to compare in P
        h = k           # Character to compare in T
        while i >= 0 and h > previous_k and P[i] == T[h]:   # Match right to left
            i -= 1
            h -= 1
        if i == -1 or h == previous_k:  # Match has been found (Galil's rule)
            matches.append(k - m + 1)
            # After shifting by m - F[1] the overlap T[..k] is, by construction
            # of that shift, a prefix of P, so it never needs re-comparing.
            previous_k = k
            k += m - F[1] if m > 1 else 1
        else:   # No match, shift by max of bad character and good suffix rules
            char_shift = i - R[alphabet_index(T[h])][i]
            if i + 1 == m:              # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i + 1] == -1:        # Matched suffix does not appear anywhere in P
                suffix_shift = m - F[i + 1]
            else:                       # Matched suffix appears in P
                # NOTE(review): if good_suffix_table stores 0-based end
                # positions, this should be m - 1 - L[i + 1]; confirm against
                # that helper, which is not visible from here.
                suffix_shift = m - L[i + 1]
            shift = max(char_shift, suffix_shift)

            # Galil's rule is only sound when the part of the new alignment
            # that overlaps the text matched in this phase is a prefix of P.
            # A large bad-character shift can satisfy shift >= i + 1 without
            # that being true (P="xaba", T="zzcaaba" would report index 3),
            # so verify it: the overlap is P[shift:], and it is a prefix of P
            # exactly when F[shift] == m - shift.
            # Assumes F[j] is the length of the longest suffix of P[j:] that
            # is also a prefix of P, which the F-based shifts above already
            # rely on.
            galil_applies = i + 1 <= shift < m and F[shift] == m - shift
            # Reset otherwise: knowledge from an older phase is not valid for
            # the next alignment.
            previous_k = k if galil_applies else -1
            k += shift
    return matches
def bad_character_table(S):
    """Build the bad-character lookup table for S.

    The result has 26 rows, one per alphabet index. For non-empty S each row
    has len(S) + 1 entries: entry 0 is -1, and entry i + 1 is the index of the
    last occurrence of that row's character within S[0..i], or -1 if it has
    not occurred yet. For empty S every row is an empty list.
    """
    if len(S) == 0:
        return [[] for _ in range(26)]

    table = [[-1] for _ in range(26)]
    last_seen = [-1] * 26   # Most recent position of each character so far
    for position, char in enumerate(S):
        last_seen[alphabet_index(char)] = position
        # Snapshot the current "last seen" state into every row.
        for row, latest in zip(table, last_seen):
            row.append(latest)
    return table
def test_single_char_string(self):
    """A one-letter pattern yields [-1, -1] rows, except [-1, 0] for that letter."""
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        expected = [[-1, -1] for _ in range(26)]
        # Only the row of the pattern's own letter records position 0.
        row = alphabet_index(letter)
        expected[row][1] = 0
        actual = bad_character_table(letter)
        self.assertEqual(expected, actual)
def string_search(P, T):
    """Return the start index of every occurrence of pattern P in text T.

    Boyer-Moore search with the Apostolico-Giancarlo refinement: M[j] records
    how many characters were found to match in the phase whose alignment
    ended at text position j, and N[i] describes how much of P ending at
    position i agrees with the end of P. Comparing the two lets a phase skip
    over, or decide without comparing, text that an earlier phase examined.

    Args:
        P: the pattern to look for.
        T: the text to search in.

    Returns:
        A list of 0-based start positions of P in T, in increasing order.
        The list is empty when P is empty or longer than T.
    """
    if len(P) == 0 or len(T) < len(P):
        return []

    matches = []

    # Preprocessing
    # N is indexed by pattern position below, so it must be derived from the
    # pattern (the text was used here before, which yields unrelated values).
    # Presumably N[i] is the length of the longest suffix of P[:i+1] that is
    # also a suffix of P -- confirm against fundamental_preprocess.
    N = fundamental_preprocess(P[::-1])  # S[::-1] reverses S
    N.reverse()
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)
    M = [-1 for c in T]     # -1 means "no phase has ended at this position"

    k = len(P) - 1      # Alignment of the end of P relative to T
    i = len(P) - 1      # Character to compare in P
    h = k               # Character to compare in T
    while k < len(T):
        match = False       # An occurrence was established in this step
        mismatch = False    # A mismatch was established in this step

        if M[h] == -1 or M[h] == 0 or N[i] == 0:    # Phase case 1: compare directly
            if T[h] == P[i]:
                if i == 0:      # Case 1a: whole pattern matched
                    match = True
                else:           # Case 1b: keep comparing leftwards
                    i -= 1
                    h -= 1
            else:               # Case 1c: explicit mismatch
                mismatch = True
        elif (M[h] < N[i] and M[h] != -1) or (M[h] == N[i] and 0 < N[i] < i+1):  # Case 2 & 5
            # The next M[h] characters are known to match; jump over them.
            skip = M[h]
            i -= skip
            h -= skip
        elif M[h] >= N[i] and N[i] == i+1 > 0:  # Phase case 3: rest of P is known to match
            match = True
        elif M[h] > N[i] and N[i] < i+1:        # Phase case 4: mismatch known without comparing
            # Read N[i] once: both indices must move by the same amount so
            # that i and h land on the mismatching pair.
            skip = N[i]
            i -= skip
            h -= skip
            mismatch = True

        if match:
            matches.append(k - len(P) + 1)
            M[k] = k - h
            k += len(P)-F[1] if len(P) > 1 else 1
            i = len(P) - 1
            h = k
        elif mismatch:
            char_shift = i - R[alphabet_index(T[h])][i]
            if i+1 == len(P):       # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i+1] == -1:      # Matched suffix does not appear anywhere in P
                suffix_shift = len(P) - F[i+1]
            else:                   # Matched suffix appears in P
                # NOTE(review): if good_suffix_table stores 0-based end
                # positions, this should be len(P) - 1 - L[i+1]; confirm
                # against that helper, which is not visible from here.
                suffix_shift = len(P) - L[i+1]
            M[k] = k - h
            k += max(char_shift, suffix_shift)
            i = len(P) - 1
            h = k
    return matches
def test_uppercase(self):
    """Upper-case letters map to the same 0-25 index as their position in self.alpha."""
    for expected_index in range(26):
        letter = self.alpha[expected_index].upper()
        self.assertEqual(expected_index, alphabet_index(letter))
def test_lowercase(self):
    """Each of the 26 entries of self.alpha maps to its own position."""
    for expected_index in range(26):
        letter = self.alpha[expected_index]
        self.assertEqual(expected_index, alphabet_index(letter))