def change(m):
    """Return a compact "...diff..." summary of how a mutant differs from its original.

    Parameters
    ----------
    m : tuple
        ``(mfile, sourcefile, pos, orig, mutant)``; only ``orig`` and
        ``mutant`` are used here.

    Returns
    -------
    str
        ``mutant`` minus its trailing character when the two strings share
        more than 4 matching blocks (too fragmented to summarize), otherwise
        a string of the form ``"<orig-diff>==><mutant-diff>"`` in which runs
        of characters common to both strings are collapsed to ``"..."``.
    """
    (mfile, sourcefile, pos, orig, mutant) = m
    eops = Levenshtein.editops(orig, mutant)
    blocks = Levenshtein.matching_blocks(eops, orig, mutant)
    if len(blocks) > 4:
        # Too many separate matching regions: a "..." summary would be noise.
        return mutant[:-1]
    # Concatenation of every substring of `orig` that also appears, in order,
    # in `mutant` (the shared material).
    keep = ''.join([orig[x[0]:x[0] + x[2]] for x in blocks])
    notKeep = ""
    pos = 0
    wasDot = False
    for c in range(0, len(orig)):
        # BUG FIX: guard `pos` before indexing `keep`, exactly as the mutant
        # loop below already does.  Without it, characters of `orig` that lie
        # after the last shared character raised IndexError (e.g.
        # orig="aba", mutant="ab" -> keep="ab", then keep[2] blew up).
        if pos < len(keep) and orig[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += orig[c]
            wasDot = False
    notKeep += "==>"
    pos = 0
    wasDot = False
    for c in range(0, len(mutant)):
        if (pos < len(keep)) and mutant[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += mutant[c]
            wasDot = False
    return notKeep
def distance_simliarity(entity_list):
    """Yield pairwise similarity metrics for every 2-combination of entities.

    For each unordered pair drawn from ``entity_list``, yields a tuple of
    ``((name1, name2), levenshtein_distance, jaro_winkler_score,
    edit_operations, matching_blocks)``.
    """
    for pair in combinations(entity_list, 2):
        first, second = pair
        ops = lvst.editops(first, second)
        yield (
            pair,
            lvst.distance(first, second),
            lvst.jaro_winkler(first, second),
            ops,
            lvst.matching_blocks(ops, first, second),
        )
def main():
    """Demo: print distance, edit operations and matching blocks for two sample sequences."""
    ref = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    qry = 'TTTNCTGATGGTCCATGTCTGTTACTC'
    print(l.distance(ref, qry))
    # Compute the edit operations once and reuse them for matching_blocks.
    ops = l.editops(ref, qry)
    print(ops)
    print(l.matching_blocks(ops, ref, qry))
def new_change_in_line(a, b):
    """Return the characters of ``b`` that are NOT part of the in-order
    material it shares with ``a`` (per Levenshtein matching blocks)."""
    blocks = levenshtein.matching_blocks(levenshtein.editops(a, b), a, b)
    # All shared material, in order, taken from `a`.
    shared = ''.join(a[start_a:start_a + size] for start_a, _, size in blocks)
    leftovers = []
    cursor = 0
    for ch in b:
        if cursor < len(shared) and ch == shared[cursor]:
            # Character accounted for by the shared material; skip it.
            cursor += 1
        else:
            leftovers.append(ch)
    return ''.join(leftovers)
def compile_channels(self):
    """
    Compiles the list of channels found. This will attempt to group
    channels by edit distance.

    Each channel in ``self.channels`` either joins the closest existing
    group (when within ``edit_thresh`` Levenshtein distance of its name)
    or founds a new group named after itself with the train_/valid_/test_
    prefix stripped.  Finally ``self.d["logs"]`` is (re)built as
    {group: {channel: []}}.

    NOTE(review): uses ``dict.iteritems`` — this is Python 2 code.
    NOTE(review): a near-identical method with edit_thresh=8 exists
    elsewhere in this file — confirm which one is current.
    """
    # Prefixes stripped from a channel name when it founds a new group.
    group_name_omits = ["train_", "valid_", "test_"]
    edit_thresh = 6
    for channel in self.channels:
        # Edit distance from this channel to every existing group name.
        edit_distances = dict((c, Levenshtein.distance(channel, c))
                              for c in self.channel_groups.keys())
        if len(edit_distances) == 0:
            # No groups yet: this channel founds the first one.
            group_name = channel
            for omit in group_name_omits:
                group_name = group_name.replace(omit, "")
            self.channel_groups[group_name] = [channel]
        else:
            # Find the closest existing group (ties go to the last seen).
            group = None
            min_ed = len(channel)
            for c, d in edit_distances.iteritems():
                if d <= min_ed:
                    min_ed = d
                    group = c
            if min_ed > edit_thresh or group is None:
                # Too far from everything: found a new group.
                group_name = channel
                for omit in group_name_omits:
                    group_name = group_name.replace(omit, "")
                self.channel_groups[group_name] = [channel]
            else:
                # Now we reduce the group to the minimum shared string
                # mb = matching blocks (see Levenshtein docs).
                mb =\
                    Levenshtein.matching_blocks(
                        Levenshtein.editops(channel, group), channel, group)
                new_group = "".join([group[x[1]:x[1]+x[2]] for x in mb])
                if new_group != group:
                    # Rename the group key to the reduced shared string.
                    self.channel_groups[new_group] =\
                        copy.deepcopy(self.channel_groups[group])
                    self.channel_groups.pop(group)
                self.channel_groups[new_group].append(channel)
    # Rebuild the per-group log containers.
    for group, channels in self.channel_groups.iteritems():
        self.d["logs"][group] = {}
        for channel in channels:
            self.d["logs"][group][channel] = []
    self.logger.info("Channels: %r" % self.d["logs"].keys())
def compile_channels(self):
    """
    Compiles the list of channels found. This will attempt to group
    channels by edit distance.

    Each channel in ``self.channels`` either joins the closest existing
    group (when within ``edit_thresh`` Levenshtein distance of its name)
    or founds a new group named after itself with the train_/valid_/test_
    prefix stripped.  Finally ``self.d["logs"]`` is (re)built as
    {group: {channel: []}}.

    NOTE(review): uses ``dict.iteritems`` and a ``print`` statement —
    this is Python 2 code.
    NOTE(review): a near-identical method with edit_thresh=6 exists
    elsewhere in this file — confirm which one is current.
    """
    # Prefixes stripped from a channel name when it founds a new group.
    group_name_omits = ["train_", "valid_", "test_"]
    edit_thresh = 8
    for channel in self.channels:
        # Edit distance from this channel to every existing group name.
        edit_distances = dict((c, Levenshtein.distance(channel, c))
                              for c in self.channel_groups.keys())
        if len(edit_distances) == 0:
            # No groups yet: this channel founds the first one.
            group_name = channel
            for omit in group_name_omits:
                group_name = group_name.replace(omit, "")
            self.channel_groups[group_name] = [channel]
        else:
            # Find the closest existing group (ties go to the last seen).
            group = None
            min_ed = len(channel)
            for c, d in edit_distances.iteritems():
                if d <= min_ed:
                    min_ed = d
                    group = c
            if min_ed > edit_thresh or group is None:
                # Too far from everything: found a new group.
                group_name = channel
                for omit in group_name_omits:
                    group_name = group_name.replace(omit, "")
                self.channel_groups[group_name] = [channel]
            else:
                # Now we reduce the group to the minimum shared string
                # mb = matching blocks (see Levenshtein docs).
                mb =\
                    Levenshtein.matching_blocks(
                        Levenshtein.editops(channel, group), channel, group)
                new_group = "".join([group[x[1]:x[1] + x[2]] for x in mb])
                if new_group != group:
                    # Rename the group key to the reduced shared string.
                    self.channel_groups[new_group] =\
                        copy.deepcopy(self.channel_groups[group])
                    self.channel_groups.pop(group)
                self.channel_groups[new_group].append(channel)
    # Rebuild the per-group log containers.
    for group, channels in self.channel_groups.iteritems():
        self.d["logs"][group] = {}
        for channel in channels:
            self.d["logs"][group][channel] = []
    print self.d["logs"]
def get_parts(string1, string2):
    """Return the substrings common to both inputs with positional context.

    Each element of the returned list is
    ``(substring, distance_in_string1, distance_in_string2)`` where the
    distances come from ``get_index_distance`` relative to each string's
    length.  Returns ``[]`` when the strings share nothing.
    """
    len1 = len(string1)
    len2 = len(string2)
    ops = lev.editops(string1, string2)
    # matching_blocks accepts the string lengths in place of the strings.
    blocks = lev.matching_blocks(ops, len1, len2)
    # there is always one zero-length 'matching block' at the end
    if len(blocks) <= 1:
        return []
    distance1 = functools.partial(get_index_distance, length=len1)
    distance2 = functools.partial(get_index_distance, length=len2)
    # For every non-empty block, keep the substring together with where it
    # sits in each of the two source strings.
    return [
        (string1[start1:start1 + size], distance1(start1), distance2(start2))
        for start1, start2, size in blocks
        if size
    ]
def wers2(originals, results):
    """Compute per-sample WERs plus edit-op and per-tone retention statistics.

    Parameters
    ----------
    originals, results : sequence of str
        Reference and hypothesis strings of equal length; each character is
        expected to be a tone label in '0'..'4'.

    Returns
    -------
    tuple
        ``(rates, mean_rate, ops, t_each_tone)``: the per-pair WER list, its
        average, counts of insert/delete/replace edit operations, and the
        fraction of each tone's occurrences that survive in the results.

    Raises
    ------
    ValueError
        If ``originals`` is empty.
    AssertionError
        If the two sequences differ in length.
    """
    ops = {'insert': 0, 'delete': 0, 'replace': 0}
    t_each_tone = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
    total_each_tone = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
    count = len(originals)
    if count == 0:
        # BUG FIX: the original did `try: assert ... except: raise ("...")`,
        # which raised a bare string — itself a TypeError in Python 3.
        print(originals)
        raise ValueError("ERROR assert count>0 - looks like data is missing")
    rates = []
    mean = 0.0
    assert count == len(results)
    for i in range(count):
        rate = wer(originals[i], results[i])
        mean = mean + rate
        rates.append(rate)
        ops_list = Levenshtein.editops(originals[i], results[i])
        for op in ops_list:
            ops[op[0]] += 1
        # Find the part common to the reference and the hypothesis.
        mb = Levenshtein.matching_blocks(ops_list, originals[i], results[i])
        same = ''.join([originals[i][x[0]:x[0] + x[2]] for x in mb])
        for s in same:
            t_each_tone[s] += 1
        for o in originals[i]:
            total_each_tone[o] += 1
    for i in total_each_tone:
        # Guard against tones that never occur in the references; the
        # previous unconditional division raised ZeroDivisionError for them
        # (this restores the guard left commented out by the author).
        if total_each_tone[i] != 0:
            t_each_tone[i] = t_each_tone[i] / total_each_tone[i]
        else:
            t_each_tone[i] = 0
    return rates, mean / float(count), ops, t_each_tone
def _match_block(self, ref_seq, sample_seq):
    """Return the Levenshtein matching blocks between ``ref_seq`` and ``sample_seq``."""
    edit_ops = lev.editops(ref_seq, sample_seq)
    return lev.matching_blocks(edit_ops, ref_seq, sample_seq)
# Sample thread-name strings used by the comparison experiments below.
str5 = "KaraRecorder.RecordThread-1547039321216"
str6 = "KaraRecorder.ScheduleThread-1547039321061"
str7 = "KaraRecorder.EvaluateThread-1547039323866"
str8 = "KaraM4aPlayer-PlayThread-1547039321284"
# doCompare(str5,str5)
# doCompare(str5,str1)
# doCompare(str5,str6)
# doCompare(str5,str7)
# doCompare(str5,str8)
# This computes something like a common string; any of the following
# outcomes can occur:
# print(Levenshtein.median([str1,str2]))  # thread-1 — the value of one of the input strings
# print(Levenshtein.median([str1,str2,str3,str4]))  # hhiead — a newly fabricated value
# print(Levenshtein.median([str1,str2,str3,str4,str5,str6,str7,str8]))  # KaraRea — the common part of the most similar subset of strings
# This method does not seem meaningful; the result is always thread-1:
# print(Levenshtein.setmedian([str1,str2]))
# print(Levenshtein.setmedian([str1,str2,str3,str4]))
# print(Levenshtein.setmedian([str1,str2,str3,str4,str5,str6,str7,str8]))
# print(Levenshtein.quickmedian([str1,str2]))  # thread-1
# print(Levenshtein.quickmedian([str1,str2,str3,str4]))  # hippy-1
# print(Levenshtein.quickmedian([str1,str2,str3,str4,str5,str6,str7,str8]))  # KrReodr.reaeTrad14733226
# NOTE(review): Levenshtein.matching_blocks expects
# (edit_operations, source, destination) — calling it with a single list of
# strings raises TypeError.  These three calls look copy-pasted from the
# median experiments above; confirm the intended call before running.
print(Levenshtein.matching_blocks([str1, str2]))
print(Levenshtein.matching_blocks([str1, str2, str3, str4]))
print(
    Levenshtein.matching_blocks(
        [str1, str2, str3, str4, str5, str6, str7, str8]))