def change(m):
    """Return a compact "...diff..." summary of how a mutant differs from its original.

    Parameters
    ----------
    m : tuple
        ``(mfile, sourcefile, pos, orig, mutant)``; only ``orig`` and
        ``mutant`` are used here.

    Returns
    -------
    str
        ``mutant`` minus its trailing character when the two strings share
        more than 4 matching blocks (too fragmented to summarize), otherwise
        a string of the form ``"<orig-diff>==><mutant-diff>"`` in which runs
        of characters common to both strings are collapsed to ``"..."``.
    """
    (mfile, sourcefile, pos, orig, mutant) = m
    eops = Levenshtein.editops(orig, mutant)
    blocks = Levenshtein.matching_blocks(eops, orig, mutant)
    if len(blocks) > 4:
        # Too many separate matching regions: a "..." summary would be noise.
        return mutant[:-1]
    # Concatenation of every substring of `orig` that also appears, in order,
    # in `mutant` (the shared material).
    keep = ''.join([orig[x[0]:x[0] + x[2]] for x in blocks])
    notKeep = ""
    pos = 0
    wasDot = False
    for c in range(0, len(orig)):
        # BUG FIX: guard `pos` before indexing `keep`, exactly as the mutant
        # loop below already does.  Without it, characters of `orig` that lie
        # after the last shared character raised IndexError (e.g.
        # orig="aba", mutant="ab" -> keep="ab", then keep[2] blew up).
        if pos < len(keep) and orig[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += orig[c]
            wasDot = False
    notKeep += "==>"
    pos = 0
    wasDot = False
    for c in range(0, len(mutant)):
        if (pos < len(keep)) and mutant[c] == keep[pos]:
            pos += 1
            if not wasDot:
                notKeep += "..."
                wasDot = True
        else:
            notKeep += mutant[c]
            wasDot = False
    return notKeep
def distance_simliarity(entity_list):
    """Yield pairwise similarity metrics for every 2-combination of entities.

    For each unordered pair drawn from ``entity_list``, yields a tuple of
    ``((name1, name2), levenshtein_distance, jaro_winkler_score,
    edit_operations, matching_blocks)``.
    """
    for pair in combinations(entity_list, 2):
        first, second = pair
        ops = lvst.editops(first, second)
        yield (
            pair,
            lvst.distance(first, second),
            lvst.jaro_winkler(first, second),
            ops,
            lvst.matching_blocks(ops, first, second),
        )
def main():
    """Demo: print distance, edit operations and matching blocks for two sample sequences."""
    ref = 'AATGTGTGTCTGCTGGAAGCTCCTATTCTTCCGCCATTTTCCAGTCCTCCAGAAGTTTCCTGATGGTCCATGTCTGAATTAGACACCCCTCTTCTTTGTTCCAGTTGCACCTGTAATTCTTCAGCATAGTACTTCTTAAACTGTTTTTAA'
    qry = 'TTTNCTGATGGTCCATGTCTGTTACTC'
    print(l.distance(ref, qry))
    # Compute the edit operations once and reuse them for matching_blocks.
    ops = l.editops(ref, qry)
    print(ops)
    print(l.matching_blocks(ops, ref, qry))
def new_change_in_line(a, b):
    """Return the characters of ``b`` that are NOT part of the in-order
    material it shares with ``a`` (per Levenshtein matching blocks)."""
    blocks = levenshtein.matching_blocks(levenshtein.editops(a, b), a, b)
    # All shared material, in order, taken from `a`.
    shared = ''.join(a[start_a:start_a + size] for start_a, _, size in blocks)
    leftovers = []
    cursor = 0
    for ch in b:
        if cursor < len(shared) and ch == shared[cursor]:
            # Character accounted for by the shared material; skip it.
            cursor += 1
        else:
            leftovers.append(ch)
    return ''.join(leftovers)
def compile_channels(self):
    """
    Compiles the list of channels found. This will attempt to group
    channels by edit distance.

    Each channel in ``self.channels`` either joins the closest existing
    group (when within ``edit_thresh`` Levenshtein distance of its name)
    or founds a new group named after itself with the train_/valid_/test_
    prefix stripped.  Finally ``self.d["logs"]`` is (re)built as
    {group: {channel: []}}.

    NOTE(review): uses ``dict.iteritems`` — this is Python 2 code.
    NOTE(review): a near-identical method with edit_thresh=8 exists
    elsewhere in this file — confirm which one is current.
    """
    # Prefixes stripped from a channel name when it founds a new group.
    group_name_omits = ["train_", "valid_", "test_"]
    edit_thresh = 6
    for channel in self.channels:
        # Edit distance from this channel to every existing group name.
        edit_distances = dict((c, Levenshtein.distance(channel, c))
                              for c in self.channel_groups.keys())
        if len(edit_distances) == 0:
            # No groups yet: this channel founds the first one.
            group_name = channel
            for omit in group_name_omits:
                group_name = group_name.replace(omit, "")
            self.channel_groups[group_name] = [channel]
        else:
            # Find the closest existing group (ties go to the last seen).
            group = None
            min_ed = len(channel)
            for c, d in edit_distances.iteritems():
                if d <= min_ed:
                    min_ed = d
                    group = c
            if min_ed > edit_thresh or group is None:
                # Too far from everything: found a new group.
                group_name = channel
                for omit in group_name_omits:
                    group_name = group_name.replace(omit, "")
                self.channel_groups[group_name] = [channel]
            else:
                # Now we reduce the group to the minimum shared string
                # mb = matching blocks (see Levenshtein docs).
                mb =\
                    Levenshtein.matching_blocks(
                        Levenshtein.editops(channel, group), channel, group)
                new_group = "".join([group[x[1]:x[1]+x[2]] for x in mb])
                if new_group != group:
                    # Rename the group key to the reduced shared string.
                    self.channel_groups[new_group] =\
                        copy.deepcopy(self.channel_groups[group])
                    self.channel_groups.pop(group)
                self.channel_groups[new_group].append(channel)
    # Rebuild the per-group log containers.
    for group, channels in self.channel_groups.iteritems():
        self.d["logs"][group] = {}
        for channel in channels:
            self.d["logs"][group][channel] = []
    self.logger.info("Channels: %r" % self.d["logs"].keys())
def compile_channels(self):
    """
    Compiles the list of channels found. This will attempt to group
    channels by edit distance.

    Each channel in ``self.channels`` either joins the closest existing
    group (when within ``edit_thresh`` Levenshtein distance of its name)
    or founds a new group named after itself with the train_/valid_/test_
    prefix stripped.  Finally ``self.d["logs"]`` is (re)built as
    {group: {channel: []}}.

    NOTE(review): uses ``dict.iteritems`` and a ``print`` statement —
    this is Python 2 code.
    NOTE(review): a near-identical method with edit_thresh=6 exists
    elsewhere in this file — confirm which one is current.
    """
    # Prefixes stripped from a channel name when it founds a new group.
    group_name_omits = ["train_", "valid_", "test_"]
    edit_thresh = 8
    for channel in self.channels:
        # Edit distance from this channel to every existing group name.
        edit_distances = dict((c, Levenshtein.distance(channel, c))
                              for c in self.channel_groups.keys())
        if len(edit_distances) == 0:
            # No groups yet: this channel founds the first one.
            group_name = channel
            for omit in group_name_omits:
                group_name = group_name.replace(omit, "")
            self.channel_groups[group_name] = [channel]
        else:
            # Find the closest existing group (ties go to the last seen).
            group = None
            min_ed = len(channel)
            for c, d in edit_distances.iteritems():
                if d <= min_ed:
                    min_ed = d
                    group = c
            if min_ed > edit_thresh or group is None:
                # Too far from everything: found a new group.
                group_name = channel
                for omit in group_name_omits:
                    group_name = group_name.replace(omit, "")
                self.channel_groups[group_name] = [channel]
            else:
                # Now we reduce the group to the minimum shared string
                # mb = matching blocks (see Levenshtein docs).
                mb =\
                    Levenshtein.matching_blocks(
                        Levenshtein.editops(channel, group), channel, group)
                new_group = "".join([group[x[1]:x[1] + x[2]] for x in mb])
                if new_group != group:
                    # Rename the group key to the reduced shared string.
                    self.channel_groups[new_group] =\
                        copy.deepcopy(self.channel_groups[group])
                    self.channel_groups.pop(group)
                self.channel_groups[new_group].append(channel)
    # Rebuild the per-group log containers.
    for group, channels in self.channel_groups.iteritems():
        self.d["logs"][group] = {}
        for channel in channels:
            self.d["logs"][group][channel] = []
    print self.d["logs"]
def get_parts(string1, string2):
    """Return the substrings common to both inputs with positional context.

    Each element of the returned list is
    ``(substring, distance_in_string1, distance_in_string2)`` where the
    distances come from ``get_index_distance`` relative to each string's
    length.  Returns ``[]`` when the strings share nothing.
    """
    len1 = len(string1)
    len2 = len(string2)
    ops = lev.editops(string1, string2)
    # matching_blocks accepts the string lengths in place of the strings.
    blocks = lev.matching_blocks(ops, len1, len2)
    # there is always one zero-length 'matching block' at the end
    if len(blocks) <= 1:
        return []
    distance1 = functools.partial(get_index_distance, length=len1)
    distance2 = functools.partial(get_index_distance, length=len2)
    # For every non-empty block, keep the substring together with where it
    # sits in each of the two source strings.
    return [
        (string1[start1:start1 + size], distance1(start1), distance2(start2))
        for start1, start2, size in blocks
        if size
    ]
def wers2(originals, results):
    """Compute per-sample WERs plus edit-op and per-tone retention statistics.

    Parameters
    ----------
    originals, results : sequence of str
        Reference and hypothesis strings of equal length; each character is
        expected to be a tone label in '0'..'4'.

    Returns
    -------
    tuple
        ``(rates, mean_rate, ops, t_each_tone)``: the per-pair WER list, its
        average, counts of insert/delete/replace edit operations, and the
        fraction of each tone's occurrences that survive in the results.

    Raises
    ------
    ValueError
        If ``originals`` is empty.
    AssertionError
        If the two sequences differ in length.
    """
    ops = {'insert': 0, 'delete': 0, 'replace': 0}
    t_each_tone = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
    total_each_tone = {'0': 0, '1': 0, '2': 0, '3': 0, '4': 0}
    count = len(originals)
    if count == 0:
        # BUG FIX: the original did `try: assert ... except: raise ("...")`,
        # which raised a bare string — itself a TypeError in Python 3.
        print(originals)
        raise ValueError("ERROR assert count>0 - looks like data is missing")
    rates = []
    mean = 0.0
    assert count == len(results)
    for i in range(count):
        rate = wer(originals[i], results[i])
        mean = mean + rate
        rates.append(rate)
        ops_list = Levenshtein.editops(originals[i], results[i])
        for op in ops_list:
            ops[op[0]] += 1
        # Find the part common to the reference and the hypothesis.
        mb = Levenshtein.matching_blocks(ops_list, originals[i], results[i])
        same = ''.join([originals[i][x[0]:x[0] + x[2]] for x in mb])
        for s in same:
            t_each_tone[s] += 1
        for o in originals[i]:
            total_each_tone[o] += 1
    for i in total_each_tone:
        # Guard against tones that never occur in the references; the
        # previous unconditional division raised ZeroDivisionError for them
        # (this restores the guard left commented out by the author).
        if total_each_tone[i] != 0:
            t_each_tone[i] = t_each_tone[i] / total_each_tone[i]
        else:
            t_each_tone[i] = 0
    return rates, mean / float(count), ops, t_each_tone
def _match_block(self, ref_seq, sample_seq):
    """Return the Levenshtein matching blocks between ``ref_seq`` and ``sample_seq``."""
    edit_ops = lev.editops(ref_seq, sample_seq)
    return lev.matching_blocks(edit_ops, ref_seq, sample_seq)
# Sample thread-name strings used by the comparison experiments below.
str5 = "KaraRecorder.RecordThread-1547039321216"
str6 = "KaraRecorder.ScheduleThread-1547039321061"
str7 = "KaraRecorder.EvaluateThread-1547039323866"
str8 = "KaraM4aPlayer-PlayThread-1547039321284"
# doCompare(str5,str5)
# doCompare(str5,str1)
# doCompare(str5,str6)
# doCompare(str5,str7)
# doCompare(str5,str8)
# This computes something like a common string; any of the following
# outcomes can occur:
# print(Levenshtein.median([str1,str2]))  # thread-1 — the value of one of the input strings
# print(Levenshtein.median([str1,str2,str3,str4]))  # hhiead — a newly fabricated value
# print(Levenshtein.median([str1,str2,str3,str4,str5,str6,str7,str8]))  # KaraRea — the common part of the most similar subset of strings
# This method does not seem meaningful; the result is always thread-1:
# print(Levenshtein.setmedian([str1,str2]))
# print(Levenshtein.setmedian([str1,str2,str3,str4]))
# print(Levenshtein.setmedian([str1,str2,str3,str4,str5,str6,str7,str8]))
# print(Levenshtein.quickmedian([str1,str2]))  # thread-1
# print(Levenshtein.quickmedian([str1,str2,str3,str4]))  # hippy-1
# print(Levenshtein.quickmedian([str1,str2,str3,str4,str5,str6,str7,str8]))  # KrReodr.reaeTrad14733226
# NOTE(review): Levenshtein.matching_blocks expects
# (edit_operations, source, destination) — calling it with a single list of
# strings raises TypeError.  These three calls look copy-pasted from the
# median experiments above; confirm the intended call before running.
print(Levenshtein.matching_blocks([str1, str2]))
print(Levenshtein.matching_blocks([str1, str2, str3, str4]))
print(
    Levenshtein.matching_blocks(
        [str1, str2, str3, str4, str5, str6, str7, str8]))