def issameword(word1, word2):
    WordDistance.generate_weight_table()
    dist = lev(word1, word2, substitute_costs=WordDistance.substitute_costs)
    # print(word1, word2, dist)
    return dist < 0.3
def check_word(corpus, term):
    '''
    Checks whether the word exists in the dictionary; if so, returns it
    unchanged. Otherwise computes the weighted Levenshtein distance between
    the term and every dictionary word at least as long as the term, and
    returns the candidate with the smallest weight.
    '''
    if corpus == 'uottawa':
        df = pd.read_csv('./uottawa_dictionary.csv', index_col=0)
    else:
        df = pd.read_csv('./reuters_dictionary.csv', index_col=0)
    res = {}
    insert_costs = np.ones(128, dtype=np.float64)
    delete_costs = np.ones(128, dtype=np.float64)
    substitute_costs = np.ones((128, 128), dtype=np.float64)
    for i in range(df.shape[0]):
        if term == df.iat[i, 0]:
            return df.iat[i, 0]
        if len(str(df.iat[i, 0])) >= len(term):
            weight = lev(term, str(df.iat[i, 0]),
                         insert_costs, delete_costs, substitute_costs)
            res[df.iat[i, 0]] = weight
    if not res:  # guard against an empty candidate set
        return term
    sortedWeights = {k: v for k, v in sorted(res.items(), key=lambda item: item[1])}
    print("new spelling: ", next(iter(sortedWeights)))
    return next(iter(sortedWeights))
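# Hedged usage sketch (assumes './uottawa_dictionary.csv' exists with the
# vocabulary in its first column):
# check_word('uottawa', 'computr')  # -> nearest dictionary spelling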
def edit_distance_normalized_cost(word, target):
    # insert_costs, delete_costs, substitute_costs and the smoothing
    # constant alpha are defined at module level.
    cost = lev(word, target, insert_costs=insert_costs,
               delete_costs=delete_costs, substitute_costs=substitute_costs)
    return (cost + alpha) / len(target)
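# Worked example (assuming unit edit costs and alpha = 1, neither of which
# is defined in this snippet): lev('cat', 'cart') == 1.0, so
# edit_distance_normalized_cost('cat', 'cart') == (1.0 + 1) / 4 == 0.5.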
def match(self, string1, string2):
    # Testing
    return lev(string1.lower(), string2.lower(),
               substitute_costs=self.substitute_costs,
               delete_costs=self.delete_costs,
               insert_costs=self.insert_costs)
def calculate(self, wrong_word, words_dict):
    change_costs = self.get_change_costs()
    insert_costs = self.get_insert_cost()
    delete_costs = self.get_delete_cost()
    # https://weighted-levenshtein.readthedocs.io/en/master/
    lev_dict = {
        cnd: lev(wrong_word, cnd, insert_costs=insert_costs,
                 delete_costs=delete_costs, substitute_costs=change_costs)
        for cnd in words_dict
    }
    # Dictionary -> word : Levenshtein metric
    top_rated = sorted(lev_dict.items(), key=lambda kv: kv[1])[:300]
    ranking = [x[0] for x in top_rated]
    max_levenshtein = max(top_rated, key=lambda vector: vector[1])[1]
    # Sorted probability -> word : p
    Pwc = [(x[0], 1 - x[1] / max_levenshtein) for x in top_rated]
    dict_count = [(x, words_dict[x] + 1) for x in ranking]
    max_count = max(dict_count, key=lambda v: v[1])[1]
    Pc = [(x[0], x[1] / max_count) for x in dict_count]
    Pcw_probability = self.calculate_probability(Pwc=Pwc, Pc=Pc)
    return Pcw_probability
def lev_similarity(aa: str, bb: str) -> float:
    """
    Get a Levenshtein similarity score.

    :param aa: first string
    :param bb: second string
    :return: The similarity of the two strings (0=bad, 1=match):
        1 - lev(aa, bb) / max(len(aa), len(bb))
    """
    # Since weighted_levenshtein can't handle unicode,
    # convert to ASCII first:
    def convert_to_ascii(text: str, label: str) -> str:
        try:
            # encode() returns bytes, so decode back to str
            return text.encode('ascii', 'ignore').decode('ascii')
        except Exception as ex:
            raise Exception(f'Could not encode {label}: {text}') from ex

    aa = convert_to_ascii(aa, 'aa')
    bb = convert_to_ascii(bb, 'bb')
    # TODO: consider penalizing whitespace alterations less
    return 1.0 - lev(aa, bb) / max(len(aa), len(bb))
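# Usage sketch: with default unit costs, lev('kitten', 'sitting') == 3.0,
# so lev_similarity('kitten', 'sitting') == 1 - 3/7 ≈ 0.571.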
def test_lev(self):
    self.assertEqual(lev('1234', '1234'), 0.0)
    self.assertEqual(lev('', '1234'), 4.0)
    self.assertEqual(lev('1234', ''), 4.0)
    self.assertEqual(lev('', ''), 0.0)
    self.assertEqual(lev('1234', '12'), 2.0)
    self.assertEqual(lev('1234', '14'), 2.0)
    self.assertEqual(lev('1111', '1'), 3.0)
def edit_distance(self):
    if c_levenshtein:
        return min(
            self.max_threshold,
            int(lev(self.word1, self.word2,
                    insert_costs=insertion_costs,
                    delete_costs=deletion_costs,
                    substitute_costs=substitution_costs)))
    else:
        return self.levenshtein_distance()
def EditDistance(str1, str2):
    alphabet = [
        u'\u02c0', u'b', u'g', u'd', u'h', u'w', u'z', u'\u1e25', u'\u1e6d',
        u'y', u'k', u'k', u'l', u'm', u'm', u'n', u'n', u's', u'\u02c1',
        u'p', u'p', u'\u00e7', u'\u00e7', u'q', u'r', u'\u0161', u't'
    ] + list(u'euioa*-') + list(u'qwertyuiopasdfghjklzxcvbnm')
    alphabet = list(set(alphabet))
    # Map every symbol into the printable ASCII range (from 32 up) so that
    # weighted_levenshtein, which only handles ASCII, can process the strings.
    int2char = {i + 32: ch for i, ch in enumerate(alphabet)}
    char2int = {char: ind for ind, char in int2char.items()}
    str1_ = ''.join([chr(char2int[x]) for x in str1])
    str2_ = ''.join([chr(char2int[x]) for x in str2])
    return lev(str1_, str2_)
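# Usage sketch: both inputs must consist only of symbols from the alphabet
# above. One substitution costs 1.0 regardless of how the arbitrary set()
# ordering maps symbols to ASCII, since the mapping is consistent within a
# single call:
# EditDistance(u'\u0161alom', u'salom')  # -> 1.0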
def main():
    """Run Weighted Levenshtein on two inputted words."""
    df = pd.read_pickle(
        get_absolute_path('confusion_matrix') + '/confusion_matrix_base.pkl')  # use the base pickle
    df = df.drop('other', axis=1)  # drop the 'other' column
    df = normalise(df)
    substitution_costs = get_subsitution_costs(df)
    # get the distance of truth from read:
    truth = sys.argv[1]
    read = sys.argv[2]
    print(wl.lev(read, truth, substitute_costs=substitution_costs))
def avg_edit2(assembled_f, labels_f, seqnum):
    avg = 0
    # Arrays of costs; the edit cost of each operation can be controlled per
    # character. Example: insert_costs[ord('D')] = 1.5 makes inserting the
    # character 'D' cost 1.5 (instead of 1).
    insert_costs = np.ones(128, dtype=np.float64) * 2
    delete_costs = np.ones(128, dtype=np.float64) * 2
    # Substitution costs can be specified independently in both directions,
    # i.e. a->b can have a different cost from b->a. Example:
    # substitute_costs[ord('H'), ord('B')] = 1.25 makes substituting 'H'
    # with 'B' cost 1.25.
    subs_costs = np.ones((128, 128), dtype=np.float64) * 1
    for (ass, lab) in zip(assembled_f, labels_f):
        avg += lev(ass, lab, insert_costs=insert_costs,
                   delete_costs=delete_costs,
                   substitute_costs=subs_costs) / len(lab)
    avg = avg / seqnum
    return avg
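# A minimal runnable sketch of the per-character cost control described in
# the comments above; the sample characters and costs follow the library's
# documented examples.
import numpy as np
from weighted_levenshtein import lev

example_insert_costs = np.ones(128, dtype=np.float64)
example_insert_costs[ord('D')] = 1.5  # inserting 'D' now costs 1.5
example_sub_costs = np.ones((128, 128), dtype=np.float64)
example_sub_costs[ord('H'), ord('B')] = 1.25  # substituting 'H' with 'B' costs 1.25

print(lev('BANANAS', 'BANDANAS', insert_costs=example_insert_costs))  # 1.5
print(lev('HANANA', 'BANANA', substitute_costs=example_sub_costs))    # 1.25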
def calculate_levenshtein(gold_transcripts, silver_transcripts):
    average_accuracy = 0
    for key, value in tqdm.tqdm(gold_transcripts.items()):
        average_word_length = 0
        for token in value:
            average_word_length += len(token)
        average_word_length /= len(value)
        if key in silver_transcripts:
            distance = lev(" ".join(value), " ".join(silver_transcripts[key]))
            distance /= average_word_length
            error_rate = distance / len(value)
            accuracy = 1 - error_rate
            average_accuracy += accuracy
            print(key + " : " + str(accuracy))
    print("\n" + "Average Transcription Accuracy : " +
          str(average_accuracy / len(gold_transcripts)))
def distance(self, text1, text2):
    # filter text; unfortunately weighted_levenshtein doesn't support
    # unicode, so diacritics are stripped and effectively have a cost of 0
    text1 = text1.lower()
    text1 = text1.translate(diacritics)
    text1 = ''.join(filter(self.onlyascii, text1))
    text2 = text2.lower()
    text2 = text2.translate(diacritics)
    text2 = ''.join(filter(self.onlyascii, text2))
    return lev(text1, text2, insert_costs=insert_costs,
               delete_costs=delete_costs, substitute_costs=substitute_costs)
def bayes(wrong_word, words_dict, polish_chars_dict):
    insert_costs, delete_costs, substitute_costs = init_weighted_lev_dicts()
    polish_mistakes_dict = {
        'r': ['Z', 's'],
        'c': ['h'],
        'z': ['Z', 'X', 'r'],
        'n': ['ń'],
        's': ['S'],
        'l': ['L'],
        'o': ['O', 'u'],
        'u': ['O'],
        'h': ['c']
    }
    Pwc_dict = {
        cnd: lev(wrong_word, cnd, insert_costs=insert_costs,
                 delete_costs=delete_costs, substitute_costs=substitute_costs)
        for cnd in words_dict
        if cnd.startswith(find_by_first_chars(wrong_word, polish_mistakes_dict))
    }
    top_rated = sorted(Pwc_dict.items(), key=lambda kv: kv[1])[:200]
    top_rated_words = [x[0] for x in top_rated]
    top_rated_words_max = max(top_rated, key=lambda v: v[1])[1]
    Pwc = [(x[0], 1 - x[1] / top_rated_words_max) for x in top_rated]  # P(w|c)
    occurences_in_dict = [(x, words_dict[x] + 1) for x in top_rated_words]
    occurences_in_dict_max = max(occurences_in_dict, key=lambda v: v[1])[1]
    Pc = [(x[0], x[1] / occurences_in_dict_max) for x in occurences_in_dict]  # P(c)
    Pcw = []  # P(c|w)
    for p in range(len(Pwc)):
        Pcw.append((
            handle_polish_word({v: k for k, v in polish_chars_dict.items()},
                               Pwc[p][0]),
            0.8 * Pwc[p][1] + 0.2 * Pc[p][1]))
    return [x[0] for x in sorted(Pcw, key=lambda v: v[1], reverse=True)[:5]]
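# Note on the scoring above: a textbook noisy-channel speller ranks
# candidates by P(c|w) ∝ P(w|c) * P(c); the linear blend
# 0.8 * P(w|c) + 0.2 * P(c) used here is a heuristic stand-in that weights
# the edit-distance channel model more heavily than the frequency prior.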
def check_word_against_term(input_word, given_term):
    # Replace characters outside the ASCII range, which weighted_levenshtein
    # cannot handle, with '?' before computing the distance.
    new_str = ""
    for ch in input_word:
        if ord(ch) < 128:
            new_str += ch
        else:
            new_str += "?"
    return lev(new_str, given_term, substitute_costs=substitute_costs) < 2

# Tests (can be commented out after implementing)
# print("DISPO and DISPO " + str(lev('DISPO', 'DISPO', substitute_costs=substitute_costs)))
# print("DISPO and DAMN " + str(lev('DISPO', 'DAMN', substitute_costs=substitute_costs)))
# print("DISPO and D1SPO " + str(lev('DISPO', 'D1SPO', substitute_costs=substitute_costs)))
# print("DISPO and D15PO " + str(lev('DISPO', 'D15PO', substitute_costs=substitute_costs)))
# print("DISPO and D15P0 " + str(lev('DISPO', 'D15P0', substitute_costs=substitute_costs)))
# print("NAM 001 and NAM OO1 " + str(lev('NAM 001', 'NAM OO1', substitute_costs=substitute_costs)))
# print("COURT and C0URT " + str(lev('COURT', 'C0URT', substitute_costs=substitute_costs)))
# print(check_word_similarity("D15P0"))
# print(check_word_similarity("FELONY"))
def handle(self, *args, **options):
    # websites = Website.objects.raw(
    #     "select w.id, w.domain, count(distinct wc.id) from website_contacts wc "
    #     "JOIN website_locations wl on wc.website_id = wl.website_id "
    #     "JOIN category_websites cw on wc.website_id = cw.website_id "
    #     "JOIN websites w on wc.website_id = w.id "
    #     "left join ( "
    #     "select distinct website_id from website_contacts wc "
    #     "JOIN ( "
    #     "select id from lawyer_dir_part1 "
    #     "union "
    #     "select id from lawyer_dir_part2 "
    #     "union "
    #     "select id from lawyer_dir_part3 "
    #     ") doo ON doo.id = wc.id "
    #     ") doo ON doo.website_id = w.id "
    #     "where country_code = 'us' and region_code in ('ca','tx','fl','ny','il') "
    #     "and category_id IN (10368) and doo.website_id is null and ((64 & wc.score) or (32 & wc.score)) "
    #     "group by wc.website_id "
    #     "order by count(distinct wc.id) desc"
    # )
    websites = Website.objects.raw(
        "select w.id, w.domain, count(distinct wc.id) from websites w "
        "JOIN website_locations wl on w.id = wl.website_id "
        "JOIN category_websites cw on w.id = cw.website_id "
        "JOIN website_contacts wc on w.id = wc.website_id "
        "where country_code = 'us' and category_id IN (10368) and ((64 & wc.score) or (32 & wc.score)) "
        "group by w.id "
        "having count(distinct wc.id) < 20 "
        "order by count(distinct wc.id) desc")
    progress_bar = tqdm(desc="Processing", total=len(websites))
    for website in websites:
        progress_bar.update(1)
        filter_contact = Q(
            Q(score=WebsiteContact.score.has_matching_email) |
            Q(score=WebsiteContact.score.has_unique_email) |
            Q(score=WebsiteContact.score.has_unique_phone))
        website_contacts = WebsiteContact.objects. \
            filter(website_id=website.id). \
            filter(filter_contact)
        if len(website_contacts) < 2:
            continue
        query = Q()
        for website_contact in website_contacts:
            if website_contact.first_name and website_contact.last_name:
                first_and_last_name_query = Q(
                    first_name=website_contact.first_name,
                    last_name=website_contact.last_name,
                    organization_key__isnull=False,
                )
                query.add(first_and_last_name_query, Q.OR)
        if len(query):
            organization_keys = DirectoryContact.objects.values_list('organization_key'). \
                annotate(dcount=Count('name', distinct=True)). \
                filter(query).exclude(organization_key__isnull=True).order_by('-dcount')[:3]
            delete_costs = np.zeros(128, dtype=np.float64)
            print("website_id: %s, domain: %s, website_contacts: %s" %
                  (website.id, website.domain, len(website_contacts)))
            for organization_key in organization_keys:
                if organization_key[1] / len(website_contacts) < 0.03:
                    break
                website_director = None
                head = website.domain.partition('.')[0]
                lev_cost = 10
                try:
                    if len(head) > len(organization_key[0]):
                        lev_cost = lev(head, organization_key[0], delete_costs=delete_costs)
                    else:
                        lev_cost = lev(organization_key[0], head, delete_costs=delete_costs)
                except Exception:
                    pass
                if organization_key[1] > 5 and len(website_contacts) > 5 and \
                        (organization_key[1] / len(website_contacts)) > 0.1:
                    website_director = WebsiteDirector(
                        organization_key=organization_key[0], website_id=website.id)
                    print("organization_key: %s, matching_contacts: %s, lev: %s, type: %s" %
                          (organization_key[0], organization_key[1], lev_cost, 1))
                elif (organization_key[1] / len(website_contacts)) > 0.3 and lev_cost <= 2:
                    website_director = WebsiteDirector(
                        organization_key=organization_key[0], website_id=website.id)
                    print("organization_key: %s, matching_contacts: %s, lev: %s, type: %s" %
                          (organization_key[0], organization_key[1], lev_cost, 2))
                if website_director:
                    WebsiteDirector.objects.bulk_create(
                        [website_director], ignore_conflicts=True)
        # break
    progress_bar.close()
substitute_costs[i] = np.array([3] * 128)

output = []
correct_response = 0
attempted_response = 0
for i in range(0, len(misspell)):
    temp_dis = 6
    temp_word = ''
    count = 0
    for j in range(0, len(dictionary)):
        # Skip candidates whose length differs too much to be a match
        # (the original incremented j here, which has no effect in a for loop).
        if abs(len(misspell[i]) - len(dictionary[j])) > 5:
            continue
        # Compute the weighted distance once instead of up to three times.
        dist = weighted_levenshtein.lev(
            misspell[i], dictionary[j],
            delete_costs=delete_costs,
            substitute_costs=substitute_costs)
        if dist < temp_dis:
            temp_dis = dist
            temp_word = str(dictionary[j])
            count = 1
        elif dist == temp_dis:
            temp_word = temp_word + ' ' + str(dictionary[j])
def missingparameterstext(psflist, parameterlist):
    # Pass a directory that has psfs in it and a list of parameter files that
    # contain analogous parameters, and return a string containing all the
    # parameters that need to be added for simulation.
    params = CharmmParameterSet()
    for p in parameterlist:
        params.read_parameter_file(p)
    returntext = "! This file was written by analogy from the following input parameter files: {}" \
                 "\n\n".format(str(parameterlist)[1:-1])
    missingset = [set(), set(), set(), set()]
    for psf in psflist:
        mol = pmd.load_file(psf)
        newmissing = mol.findmissingparameters(params)
        for i in range(4):
            for el in newmissing[i]:
                missingset[i].add(el)
    if len(missingset[0]):
        print("The following atom types are missing nonbonded terms from the input parameter "
              "files. This is probably an input problem, so we are exiting.")
        print(missingset[0])
        exit()
    # Eliminate duplicate dihedrals.
    for i in range(1, 4):
        missingset[i] = removeduplicates(missingset[i])
    # Write bonds section
    returntext += "BONDS\n"
    for missingbondtype in missingset[1]:
        mindistance = 1000
        typekey = "-".join(missingbondtype)
        for k in params.bond_types:
            compkey = "-".join(k)
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance:
                replacementtype = k
                mindistance = ed
        returntext += "%-8s %-8s %.3f %.4f ! From %-8s %-8s\n" % (
            missingbondtype[0], missingbondtype[1],
            params.bond_types[replacementtype].k,
            params.bond_types[replacementtype].req,
            replacementtype[0], replacementtype[1])
    # Write angles section
    returntext += "\nANGLES\n"
    for missingangletype in missingset[2]:
        mindistance = 1000
        typekey = "-".join(missingangletype)
        for k in params.angle_types:
            compkey = "-".join(k)
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance:
                replacementtype = k
                mindistance = ed
        if params.urey_bradley_types[replacementtype].k == 0:
            returntext += "%-8s %-8s %-8s %.3f %.4f ! From %-8s %-8s %-8s\n" % (
                missingangletype[0], missingangletype[1], missingangletype[2],
                params.angle_types[replacementtype].k,
                params.angle_types[replacementtype].theteq,
                replacementtype[0], replacementtype[1], replacementtype[2])
        else:
            returntext += "%-8s %-8s %-8s %.3f %.4f %.2f %.4f ! From %-8s %-8s %-8s\n" % (
                missingangletype[0], missingangletype[1], missingangletype[2],
                params.angle_types[replacementtype].k,
                params.angle_types[replacementtype].theteq,
                params.urey_bradley_types[replacementtype].k,
                params.urey_bradley_types[replacementtype].req,
                replacementtype[0], replacementtype[1], replacementtype[2])
    # Write dihedrals section
    returntext += "\nDIHEDRALS\n"
    for missingangletype in missingset[3]:
        mindistance = 1000
        typekey = "-".join(missingangletype)
        for k in params.dihedral_types:
            # Dihedral types match in either direction, so compare both orders.
            compkey = "-".join(k)
            compkey2 = "-".join(k[::-1])
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            ed2 = lev(typekey, compkey2, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance or ed2 < mindistance:
                replacementtype = k
                mindistance = min(ed, ed2)  # take the better of the two orders
        for prm in params.dihedral_types[replacementtype]:
            returntext += "%-8s %-8s %-8s %-8s %.4f %d %5.1f ! From %-8s %-8s %-8s %-8s\n" % (
                missingangletype[0], missingangletype[1],
                missingangletype[2], missingangletype[3],
                prm.phi_k, prm.per, prm.phase,
                replacementtype[0], replacementtype[1],
                replacementtype[2], replacementtype[3])
    return returntext + "\n"
def _lev(self, x, y):
    return lev(x, y, self.iw, self.dw, self.sw)
import csv
import configparser

import numpy as np
import weighted_levenshtein

config = configparser.ConfigParser()
config.read('config.ini')  # path to the config file is assumed here
LEV_WEIGHTS = config['DEFAULT']['LevenshteinWeights']
LEV_THRESHOLD = 0.2

# Build the substitution cost table from the weights CSV.
substitute_costs = np.ones((128, 128), dtype=np.float64)
with open(LEV_WEIGHTS, 'r') as readFile:
    csvreader = csv.reader(readFile)
    lines = list(csvreader)[1:]  # skip the header row
for l in lines:
    substitute_costs[ord(l[0]), ord(l[1])] = float(l[2])


def lev_distance(a, b, weights=substitute_costs):
    wlev_dist = weighted_levenshtein.lev(a, b, substitute_costs=weights)
    return wlev_dist
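# Usage sketch, assuming the weights file named in the config has a header
# row followed by rows of the form <char1>,<char2>,<cost>, e.g. "a,e,0.4":
# print(lev_distance('color', 'colour'))  # 1.0 (one insertion at default cost)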
def __call__(self, input1, input2):
    return lev(input1, input2,
               substitute_costs=self.substitute_costs,
               insert_costs=self.insertion_costs,
               delete_costs=self.deletion_costs)
def check_word_similarity(word):
    for term in term_list:
        if lev(term, word, substitute_costs=substitute_costs) < len(term):
            return term
    return None
    elif hamming == hamming_result["Hamming distance"]:
        hamming_result["match_positions"][start] = ref_sub
    return hamming_result


letter_to_letter_matches_uppercase = seqdistance.make_letter_to_letter_matches(
    genealloy.ambiguity_code_to_nt_set)
letter_to_letter_matches = seqdistance.make_dict_both_case(
    letter_to_letter_matches_uppercase)
nt_substitute_costs = seqdistance.make_penalty_table(letter_to_letter_matches)

seq = "ATGGATCGGCGGGCG"
#      ||||||||||| ||   (alignment of seq against the matching region of ref)
ref = "ggGGGCATGGATCGGCGAACGAGSCtgATAAGGTGCTAGCTAAAAAAAAAA"
lev(seq, ref, substitute_costs=nt_substitute_costs)
# 36.0

# Hamming distance with positions and sequences:
find_shortest_hamming(seq, ref, substitute_costs=nt_substitute_costs)
# {'Hamming distance': 2.0, 'match_positions': {6: 'ATGGATCGGCGAACG'}}

########################################################################
# Calculate distance for complement sequences
from Examples.EpiJinn import epijinn

seq = "AAAAAAAAAACCC"
ref = "GGGTTTTTTTTTT"
print(lev(seq, ref, substitute_costs=nt_substitute_costs))
# 13.0
seq_rc = epijinn.Methylase.reverse_complement(seq)
print(lev(seq_rc, ref, substitute_costs=nt_substitute_costs))
def merge_levenshtein(pool, buffers, cols, lens, matrix, *flags):
    """ pool contains offsets of buffers that are to be aligned """
    # print("merge_levenshtein(", pool, "buffers", cols, lens, matrix, flags, ")")
    if len(buffers) != 2:
        raise Exception("no support for aligning more than two files, yet")

    result = []
    if len(pool[0]) == 0:
        for y in pool[1]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2", val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:  # we skip empty lines
                    newrow = ["?"] * cols[0] + [
                        "*" + buffers[1][y][cols[1]] + "*"
                    ] + ["?"] * (lens[0] - cols[0] - 1) + buffers[1][y]
                    result.append(newrow)
        return result
    if len(pool[1]) == 0:
        for x in pool[0]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
        return result

    if matrix is None:
        matrix = []
        for x in pool[0]:
            matrix.append([])
            src = buffers[0][x][0]
            try:
                src = buffers[0][x][cols[0]]
            except Exception:
                pass
            for y in pool[1]:
                tgt = buffers[1][y][0]
                try:
                    tgt = buffers[1][y][cols[1]]
                except Exception:
                    pass
                src = norm(src)
                tgt = norm(tgt)
                if max(len(src), len(tgt)) == 0:
                    matrix[-1].append(1.0)
                else:
                    # print(src, tgt, lev(src, tgt))
                    # Levenshtein similarity!
                    matrix[-1].append(1.0 - lev(src, tgt) / max(len(src), len(tgt)))

    max_x = 0
    max_y = 0
    min_dist = 0
    max_sim = 0
    try:
        # secondary criterion for equal similarity
        min_dist = abs(max_x / len(pool[0]) - max_y / len(pool[1]))
        max_sim = matrix[0][0]
    except Exception:
        pass
    # print("matrix:", matrix)
    for x in range(len(pool[0])):
        for y in range(len(pool[1])):
            sim = matrix[x][y]
            dist = abs(x / len(pool[0]) - y / len(pool[1]))
            # print(sim, max_sim, dist, min_dist)
            if sim > max_sim or (sim == max_sim and min_dist > dist):
                min_dist = dist
                max_sim = sim
                max_x = x
                max_y = y

    result = []
    # "i" and "r" before alignment
    if max_x == 0:  # "i"
        for y in pool[1][0:max_y]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2", val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:
                    result.append(["?"] * cols[0] +
                                  ["*" + buffers[1][y][cols[1]] + "*"] +
                                  ["?"] * (lens[0] - cols[0] - 1) + buffers[1][y])
    elif max_y == 0:  # "r"
        for x in pool[0][0:max_x]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
    else:
        sub_pool = [pool[0][0:max_x], pool[1][0:max_y]]
        # print(pool, max_x, max_y, "=>", sub_pool)
        result = merge_levenshtein(sub_pool, buffers, cols, lens, matrix, *flags)

    # max alignment
    if len(pool[0]) > 0 and len(pool[1]) > 0:
        result.append(buffers[0][pool[0][max_x]] + buffers[1][pool[1][max_y]])

    # align final elements
    if max_x == len(pool[0]) - 1:  # "i"
        for y in pool[1][max_y + 1:]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2", val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:
                    result.append(["?"] * cols[0] +
                                  ["*" + buffers[1][y][cols[1]] + "*"] +
                                  ["?"] * (lens[0] - cols[0] - 1) + buffers[1][y])
    elif max_y == len(pool[1]) - 1:  # "r"
        for x in pool[0][max_x + 1:]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
    else:  # recursion for non-final elements
        sub_pool = [pool[0][max_x + 1:], pool[1][max_y + 1:]]
        result = result + merge_levenshtein(sub_pool, buffers, cols, lens, matrix, *flags)
    return result