Пример #1
0
def search_editdist_zalg(text, pat):
    """Scan ``text`` for windows that match ``pat`` exactly or almost exactly.

    Two arrays are requested from ``calculateZ`` (defined outside this block;
    presumably the classic Z-array -- confirm): one for ``pat + "$" + text``
    and one for the same layout with both parts reversed.  For every start
    index in the forward string, the forward value at that index is added to
    a reverse-array value read at the mirrored end of a window whose length
    is ``len(pat)``, ``len(pat) + 1`` or ``len(pat) - 1``.

    Every recorded hit is announced with two ``print`` calls and stored as
    ``[start - len(pat), distance]``; ``distance`` is 0 for the "match" case
    and 1 for the "sub", "insert" and "delete" cases.

    Returns the list of hits in scan order (at most one hit per start).
    """
    m = len(pat)
    n = len(text)
    forward = pat + "$" + text
    total = len(forward)
    fwd_z = calculateZ(forward)
    # Same construction on the reversed pattern and reversed text, so that a
    # value in rev_z measures agreement with the *end* of the pattern.
    rev_z = calculateZ(pat[::-1] + "$" + text[::-1])

    hits = []
    for start in range(m + 1, total - m + 2):
        # Index in the reversed string of the last character of the window
        # of length m that begins at `start` in the forward string.
        mirror = n + m + 2 - start
        head = fwd_z[start]
        same_len = head + rev_z[mirror]        # window of length m
        longer = head + rev_z[mirror - 1]      # window of length m + 1
        shorter = head + rev_z[mirror + 1]     # window of length m - 1

        # The branches are tried in priority order and only the first one
        # that applies is reported for this start.
        if same_len == 2 * m:
            # Both halves cover the whole pattern.
            label, score, distance = "match", same_len, 0
        elif same_len == m - 1:
            # Prefix and suffix agreement leave one position uncovered.
            label, score, distance = "sub", same_len, 1
        elif shorter == m - 1:
            # The window is one character shorter than the pattern.
            label, score, distance = "insert", shorter, 1
        elif longer == m and fwd_z[start + 1] + rev_z[mirror - 1] != 2 * m:
            # The window is one character longer than the pattern.  The extra
            # test drops the hit when the sums for the window starting at
            # start + 1 already amount to a full match, to avoid reporting it
            # a second time.
            label, score, distance = "delete", longer, 1
        else:
            continue

        print(label, start, score)
        print(forward[start:start + 5])
        hits.append([start - m, distance])
    return hits
def goodsufix(pat):
    """Build a table of ``len(pat)`` entries from the Z-values of the reversed pattern.

    ``calculateZ`` (defined outside this block) is applied to ``pat[::-1]``
    and its output is reversed, so entry ``p`` of the working array belongs
    to position ``p`` of the original pattern.  For each ``p`` the slot
    ``len(pat) - z[p]`` is set to ``p``; slots never written keep ``-1`` and
    a later ``p`` overwrites an earlier one that lands on the same slot.

    Returns the table without its extra trailing slot.
    """
    m = len(pat)
    z_suffix = list(calculateZ(pat[::-1]))
    z_suffix.reverse()

    # One extra slot at index m absorbs writes coming from zero Z-values.
    table = [-1] * (m + 1)
    for p in range(m):
        table[m - z_suffix[p]] = p

    # Discard the overflow slot before handing the table back.
    del table[-1]
    return table
Пример #3
0
def computeSP(pat):
    """Derive a per-position table from the Z-values of ``pat``.

    ``calculateZ`` is defined outside this block.  Positions are visited from
    right to left; for position ``j`` with Z-value ``z[j]`` the slot
    ``j + z[j] - 1`` receives ``z[j]``.  Because smaller ``j`` are handled
    later, a slot reached from several positions keeps the value written by
    the leftmost one.

    Returns a list of ``len(pat)`` integers, zero where nothing was written.
    """
    m = len(pat)
    z = calculateZ(pat)
    table = [0] * m
    for j in reversed(range(m)):
        width = z[j]
        end = j + width - 1
        if end == -1:
            # Writing to index -1 would silently hit the last slot; stop instead.
            break
        table[end] = width
    return table
Пример #4
0
def edit_distance(text, pat):
    """Report starts in ``text`` where ``pat`` matches exactly or nearly so.

    ``calculateZ`` (defined outside this block; presumably the classic
    Z-array -- confirm) is evaluated on ``pat + "$" + text`` and on the same
    layout built from the reversed pattern and reversed text.  For each start
    the forward value is combined with reverse values read at three adjacent
    mirrored indices, and the first applicable case is reported.

    Hits are printed and collected as ``[start - len(pat), distance]`` with
    ``distance`` 0 for "match" and 1 for "sub", "insert" and "delete".

    Returns the collected hits in scan order.
    """
    joined = pat + "$" + text
    prefix_z = calculateZ(joined)
    suffix_z = calculateZ(pat[::-1] + "$" + text[::-1])
    total = len(joined)
    m = len(pat)

    found = []
    for pos in range(m + 1, total - m + 2):
        # The largest mirrored index used below is total - pos + 2; skip the
        # start when that index would exceed the last valid index.
        if total - pos + 2 > total - 1:
            continue

        lead = prefix_z[pos]
        same_len = lead + suffix_z[total - pos + 1]
        longer = lead + suffix_z[total - pos]
        shorter = lead + suffix_z[total - pos + 2]

        if lead == m:
            # The forward value alone already covers the whole pattern.
            print(joined[pos:pos + 5])
            print("match", pos)
            found.append([pos - m, 0])
        elif same_len == m - 1 and pos <= total - m:
            # The bound on pos rejects starts with fewer than m characters
            # of text remaining.
            print(joined[pos:pos + 5])
            print("sub", pos, same_len)
            found.append([pos - m, 1])
        elif shorter == m - 1 and pos <= total - m + 1:
            print(joined[pos:pos + 5])
            print("insert", pos, shorter)
            found.append([pos - m, 1])
        elif longer == m and pos <= total - m - 1:
            # The slice is printed even when the hit itself is suppressed.
            print(joined[pos:pos + 5])
            # A full match starting at pos + 1 makes this hit redundant.
            if prefix_z[pos + 1] != m:
                print("delete", pos, longer)
                found.append([pos - m, 1])
    return found
def matchedprefix(pat):
    """Build a right-to-left table from the Z-values of ``pat``.

    ``calculateZ`` is defined outside this block.  Walking positions from the
    last to the first: when ``i + z[i] - 1`` equals the last index, the slot
    takes ``z[i]``; otherwise the last position takes 0 and every other
    position copies the value of the slot to its right.  Slot 0 is finally
    overwritten with ``len(pat) - 1``.

    Returns the table (a list of ``len(pat)`` integers).  The final write to
    slot 0 raises ``IndexError`` when ``pat`` is empty.
    """
    m = len(pat)
    table = [-1] * m
    z = calculateZ(pat)
    last = m - 1
    for i in reversed(range(m)):
        if i + z[i] - 1 == last:
            table[i] = z[i]
        else:
            # Last position without a match starts at 0; any other position
            # inherits from its right-hand neighbour.
            table[i] = 0 if i == last else table[i + 1]
    table[0] = m - 1
    return table
Пример #6
0
def computeSPx(pat):
    """Build a per-position, per-character table from the Z-values of ``pat``.

    ``calculateZ`` and ``CHARACTER`` are defined outside this block; the
    table has ``len(pat)`` rows of ``CHARACTER`` zeros (an earlier comment
    mentions 75 columns -- presumably ``CHARACTER == 75``, confirm).

    Positions are visited from right to left.  For position ``j`` the row
    ``j + z[j] - 1`` is updated in the column ``ord(pat[z[j]]) - ord('0')``,
    i.e. the column of the character that follows the first ``z[j]``
    characters of the pattern, and the stored value is ``z[j]``.  Since
    smaller ``j`` are processed later, the leftmost position wins when
    several positions address the same cell.

    Returns the table as a list of lists.
    """
    m = len(pat)
    z = calculateZ(pat)
    table = [[0] * CHARACTER for _ in range(m)]
    base = ord('0')
    for j in reversed(range(m)):
        width = z[j]
        row = j + width - 1
        if row == -1:
            # Index -1 would address the last row by accident; stop here.
            break
        column = ord(pat[width]) - base
        table[row][column] = width
    return table