Пример #1
0
def build_sp(P):
    """Scatter the Z-values of ``P`` onto the positions where their matches end.

    For every start position ``j`` (from the last one down to 1) the value
    ``Z[j]`` returned by ``fundamental_preprocess`` is written at index
    ``j + Z[j] - 1``.  Positions are visited in descending order, so when
    several starts map to the same index the smallest start is written last
    and its value is the one that remains.

    Returns a list with one integer per element of ``P``; slots that are
    never written keep the value 0.
    """
    z_values = fundamental_preprocess(P)
    table = [0] * len(P)
    for start in reversed(range(1, len(P))):
        span = z_values[start]
        # A zero span lands on start - 1 and simply stores 0 there.
        table[start + span - 1] = span
    return table
Пример #2
0
def full_shift_table(S):
    """Build the table ``F`` from the Z-values of ``S``.

    The Z-values are scanned from the last position towards the first.  A
    Z-value counts only when it equals the distance from its position to the
    end of ``S`` (the match starting there runs to the end of the string).
    ``F[p]`` holds the largest such value seen at position ``p`` or later,
    or 0 if there is none.

    Returns a list with one integer per element of ``S``.
    """
    z_values = fundamental_preprocess(S)
    table = [0] * len(S)
    best = 0
    # ``distance`` is how many characters lie between the position and the
    # end of S, counting the position itself.
    for distance, z in enumerate(reversed(z_values), start=1):
        if z == distance and z > best:
            best = z
        table[-distance] = best
    return table
Пример #3
0
def good_suffix_table(S):
    """Build the table ``L`` from the Z-values of the reversed string.

    ``fundamental_preprocess`` is run on ``S`` reversed and the result is
    reversed again, giving one value per position of ``S``.  For each
    position ``j`` except the last, a non-zero value ``v`` records ``j`` at
    index ``len(S) - v``.  Positions are visited in ascending order, so the
    largest ``j`` for a given index is the one that remains.

    Returns a list with one integer per element of ``S``; entries that are
    never written stay at -1.
    """
    size = len(S)
    table = [-1] * size
    suffix_lengths = fundamental_preprocess(S[::-1])  # S[::-1] reverses S
    suffix_lengths.reverse()
    for j in range(size - 1):
        matched = suffix_lengths[j]
        if matched != 0:
            table[size - matched] = j
    return table
Пример #4
0
 def test_no_prefix_matches(self):
     """All-distinct characters: only position 0 has a non-zero value."""
     actual = fundamental_preprocess("abcdefgh")
     self.assertEqual([8, 0, 0, 0, 0, 0, 0, 0], actual)
Пример #5
0
 def test_single_char(self):
     """A one-character string yields a single value of 1."""
     actual = fundamental_preprocess("a")
     self.assertEqual([1], actual)
Пример #6
0
 def test_empty_string(self):
     """The empty string yields an empty list."""
     actual = fundamental_preprocess("")
     self.assertEqual([], actual)
Пример #7
0
 def test_character_repeated(self):
     """A run of one repeated character gives values counting down to 1."""
     actual = fundamental_preprocess("aaaaaaaa")
     self.assertEqual([8, 7, 6, 5, 4, 3, 2, 1], actual)
Пример #8
0
 def test_overlapping_prefix_match(self):
     """Prefix matches that overlap one another are each reported in full."""
     actual = fundamental_preprocess("aabaabaaba")
     self.assertEqual([10, 1, 0, 7, 1, 0, 4, 1, 0, 1], actual)
Пример #9
0
 def test_multiple_prefix_match(self):
     """Several separate prefix matches of different lengths are reported."""
     actual = fundamental_preprocess("aabaacaab")
     self.assertEqual([9, 1, 0, 2, 1, 0, 3, 1, 0], actual)
Пример #10
0
 def test_single_prefix_match(self):
     """One interior prefix match is reported at its start position."""
     actual = fundamental_preprocess("abab")
     self.assertEqual([4, 0, 2, 0], actual)
Пример #11
0
def string_search(P, T):
    """Return the start index of every occurrence of ``P`` in ``T``.

    The end of ``P`` is aligned with ``T[k]`` and characters are compared
    right to left, following the Apostolico-Giancarlo variant of
    Boyer-Moore.  Two tables let a phase skip characters whose outcome is
    already known:

    * ``M[h]`` - number of characters ending at ``T[h]`` that an earlier
      phase found equal to a suffix of ``P`` (-1 while unknown).
    * ``N[i]`` - length of the longest suffix of ``P[:i+1]`` that is also a
      suffix of ``P``.

    ``N`` has to be computed from the pattern: it is indexed by the pattern
    position ``i`` and compared against ``i + 1``.  Building it from the
    text makes the skip/accept decisions below unsound and can report
    occurrences that do not exist.

    Args:
        P: the pattern to look for.
        T: the text to search in.

    Returns:
        A list of 0-based start indices in the order they are found.  The
        list is empty when ``P`` is empty or longer than ``T``.
    """
    if len(P) == 0 or len(T) < len(P):
        return []

    matches = []

    # Preprocessing
    N = fundamental_preprocess(P[::-1])  # P[::-1] reverses P
    N.reverse()
    R = bad_character_table(P)
    L = good_suffix_table(P)
    F = full_shift_table(P)
    M = [-1 for c in T]

    k = len(P) - 1      # Represents alignment of end of P relative to T
    i = len(P) - 1      # Character to compare in P
    h = k               # Character to compare in T
    match = False       # Indicates whether an exact match has been found in this phase
    mismatch = False    # Indicates whether a mismatch has occurred

    while k < len(T):
        if M[h] == -1 or M[h] == 0 or N[i] == 0:    # Phase case 1
            # Nothing useful is known here: compare the characters directly.
            if T[h] == P[i]:
                if i == 0:  # Case 1a: the whole pattern has been matched
                    match = True
                    mismatch = False
                else:       # Case 1b: keep moving left
                    i -= 1
                    h -= 1
                    match = False
                    mismatch = False
            else:           # Case 1c
                match = False
                mismatch = True
        elif (M[h] < N[i] and M[h] != -1) or (M[h] == N[i] and 0 < N[i] < i+1): # Case 2 & 5
            # The M[h] characters ending at T[h] are known to agree with P:
            # jump over them without comparing.
            i -= M[h]
            h -= M[h]
            match = False
            mismatch = False
        elif M[h] >= N[i] and N[i] == i+1 > 0:  # Phase case 3
            # The remaining prefix P[:i+1] is itself a suffix of P and the
            # text is known to match at least that much: occurrence found.
            match = True
            mismatch = False
        elif M[h] > N[i] and N[i] < i+1:    # Phase case 4
            # Exactly N[i] characters agree and the next one is known to
            # differ: step to that position and treat it as the mismatch.
            i -= N[i]
            h -= N[i]
            match = False
            mismatch = True
        if match:
            matches.append(k - len(P) + 1)
            M[k] = k - h
            k += len(P)-F[1] if len(P) > 1 else 1
            i = len(P) - 1
            h = k
            match = False
            mismatch = False
        if mismatch:
            char_shift = i - R[alphabet_index(T[h])][i]
            if i+1 == len(P):   # Mismatch happened on first attempt
                suffix_shift = 1
            elif L[i+1] == -1:   # Matched suffix does not appear anywhere in P
                suffix_shift = len(P) - F[i+1]
            else:               # Matched suffix appears in P
                suffix_shift = len(P) - L[i+1]
            M[k] = k - h
            k += max(char_shift, suffix_shift)
            i = len(P) - 1
            h = k
            match = False
            mismatch = False
    return matches