def calNameScore(profile1, profile2):
    name1 = profile1["name"]
    name2 = profile2["name"]
    lang1 = profile1["nameLang"]
    lang2 = profile2["nameLang"]
    if lang1 == lang2:
        if lang1 in langs_not_western:
            return calNotWesternName(name1, name2)
        else:
            # return name_tools.match(name1, name2)
            return calNotWesternName(name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            # Avoid occasional mistranslations, so name_tools is not used here;
            # e.g. "chen shih ying" (陳世穎) sometimes comes back as "chen shiying".
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
            except:
                pass
            try:
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)

def flag_na_n_ad(InputKamus, InputFTS):
    InputKamus = str(InputKamus)
    InputFTS = str(InputFTS)
    digits_input = ' '.join(re.findall(r'\d+', InputKamus))
    digits_fts = ' '.join(re.findall(r'\d+', InputFTS))
    no_int_kamus = ' '.join([
        x for x in InputKamus.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    no_int_fts = ' '.join([
        x for x in InputFTS.split()
        if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())
    ])
    # try:
    if digits_input or digits_fts:
        nilai_digits = normalized_damerau_levenshtein_distance(
            digits_fts, digits_input)
        # print "number", nilai_digits
        # print 'DIGITS', nilai_digits
        if nilai_digits >= 0.5:
            return 'CONFIDENT'
        elif nilai_digits <= 0.4:
            return 'CONFIDENT'
    if no_int_kamus or no_int_fts:
        nilai_kalimat = normalized_damerau_levenshtein_distance(
            no_int_kamus, no_int_fts)
        # print "word", nilai_kalimat
        # print 'KATA', nilai_kalimat
        if nilai_kalimat >= 0.5:
            return 'CONFIDENT'
        elif nilai_kalimat <= 0.4:
            return 'CONFIDENT'

def get_similarities(Features, url_input):
    """
    similarity metrics include: Levenshtein, jaro, damerau levenshtein,
    normalized_damerau_levenshtein, and hamming distance
    :param Features: input dictionary to add things to
    :param url_input
    :return: Features: after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' + n] = dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' + n] = dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' + n] = dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' + n] = dl.normalized_damerau_levenshtein_distance(Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            Features['fqdn_hamming_distance_' + n] = hamming(Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features

def get_str_similarity(source, target, label):
    try:
        source = source.lower()
        target = target.lower()
        long_sub = combine_google_fsq_for_gt.longest_substring(source, target)
        similarities = {
            label + "rosimilarity":
                combine_google_fsq_for_gt.get_ro_similarity(source, target),
            label + "dlevensimilarity":
                1 - normalized_damerau_levenshtein_distance(source, target),
            label + "levensimilarity":
                Levenshtein.ratio(source, target),
            label + "phoneticsimilarity":
                combine_google_fsq_for_gt.get_levenshtein_phonetic_similarity(
                    source, target),
        }
        if len(long_sub):
            similarities[label + "lenlongsubstring"] = len(long_sub) / len(source)
        else:
            similarities[label + "lenlongsubstring"] = 0
    except:
        similarities = {
            label + "rosimilarity": None,
            label + "dlevensimilarity": None,
            label + "levensimilarity": None,
            label + "phoneticsimilarity": None,
            label + "lenlongsubstring": None,
        }
    res = {k: v for k, v in similarities.items()}
    return res

def rank(self, src, tgt):
    ''' Returns the rank of the source and target paths. '''
    p = len(set(tgt) - set(src))
    a = normalized_damerau_levenshtein_distance(str(src), str(tgt)) + p
    b = max(len(src), len(tgt)) + p
    candidateScore = 1 - (a / b)
    return candidateScore

def rank(self, src, tgt):
    ''' Returns the rank of the source and target paths. '''
    p = len(set(tgt) - set(src))
    a = normalized_damerau_levenshtein_distance(unicode(src), unicode(tgt)) + p
    b = max(len(src), len(tgt)) + p
    candidateScore = 1 - (a / b)
    return candidateScore

def norm_dld(l1, l2):
    ascii_start = 0
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start + i) for i in range(len(l1))))
    s1_upd = list(s1)
    for i in range(len(l1)):
        for j in range(i + 1, len(l1)):
            if trip_match(l1[i], l1[j]):
                s1_upd[j] = s1[i]
    s1_upd = ''.join(s1_upd)
    s2 = ''
    next_char = ascii_start + len(s1)
    for j in range(len(l2)):
        found = None
        # next_char = chr(ascii_start+len(s1)+j)
        for k in range(len(l1)):
            if trip_match(l2[j], l1[k]):
                found = s1_upd[k]
                # next_char = s1[k]
                break
        if found is None:
            s2 += chr(next_char)
            next_char += 1
            # assert next_char <= 128
        else:
            s2 += found
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0 - normalized_damerau_levenshtein_distance(s1_upd, s2)

def compare_DL(filename1, filename2):
    # Sanity check
    if not os.path.isfile(filename1):
        print('\nERROR: First source file ' + filename1 + ' was not found.\n')
        return (-3)
    if not os.path.isfile(filename2):
        print('\nERROR: Second source file ' + filename2 + ' was not found.\n')
        return (-4)

    # Read the content of the first file
    text1 = ""
    f1 = None
    with open(filename1) as f1:
        lines1 = [line.rstrip('\n') for line in f1]
    for line in lines1:
        text1 += line + ' '
    text1 = text1[:-1]

    # Read the content of the second file
    text2 = ""
    f2 = None
    with open(filename2) as f2:
        lines2 = [line.rstrip('\n') for line in f2]
    for line in lines2:
        text2 += line + ' '
    text2 = text2[:-1]

    sim = 1.0 - normalized_damerau_levenshtein_distance(text1, text2)
    return (sim)

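# A minimal usage sketch for compare_DL; the file paths below are hypothetical
# placeholders. The function returns a similarity in [0, 1], or a negative
# error code if either file is missing.
sim = compare_DL('submission_a.py', 'submission_b.py')
if sim >= 0:
    print('similarity: %.3f' % sim)
else:
    print('could not compare files, error code %d' % sim)
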
def winner(v_):
    n = v_.size
    sim = [0.0] * n
    if n == 1:
        return 0, 1.0
    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            if s == 1.0:
                # Two identical values
                return i, 1.0
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1
    # Search maximum and save the index to return it
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1
    return i_max, (sim_max / (n - 1))

def winner(v_):
    n = len(v_)
    sim = [0.0] * n
    if n == 1:
        return -1, 1.0
    i = 0
    while (i < n):
        j = i + 1
        while (j < n):
            if v_[i] == v_[j]:
                return (i, 1.0)
            s = 1.0 - normalized_damerau_levenshtein_distance(v_[i], v_[j])
            sim[i] = sim[i] + s
            sim[j] = sim[j] + s
            j = j + 1
        i = i + 1
    # Search maximum and return the index
    sim_max = sim[0]
    i_max = 0
    i = 1
    while (i < n):
        if (sim_max < sim[i]):
            i_max = i
            sim_max = sim[i]
        i = i + 1
    return i_max, (sim_max / (n - 1))

def calc(self, search_word: str):
    word1_string = mojimoji.han_to_zen(search_word.lower())
    word2_string = mojimoji.han_to_zen(self.keyword.lower())
    self.distance = Decimal("1.0") - Decimal(
        str(
            normalized_damerau_levenshtein_distance(
                word1_string, word2_string)))
    return self

def calDisplayNameScore(profile1, profile2):
    name1 = profile1["displayName"]
    name2 = profile2["displayName"]
    lang1 = profile1["displayNameLang"]
    lang2 = profile2["displayNameLang"]
    if lang1 == lang2:
        return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
    else:
        if len(name1) > 3 and len(name2) > 3:
            try:
                if lang1 != "en":
                    name1 = str(TextBlob(name1).translate(to="en"))
                if lang2 != "en":
                    name2 = str(TextBlob(name2).translate(to="en"))
            except:
                pass
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)
        else:
            return 1 - pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(name1, name2)

def is_similar(source: str, target: str, threshold: float) -> bool:
    """
    Uses an existing module, see https://github.com/gfairchild/pyxDamerauLevenshtein

    Args:
        source: the user's query string
        target: the string from the database
        threshold: a value between 0 and 1 giving the maximum tolerated
            dissimilarity between the two strings
    """
    distance = normalized_damerau_levenshtein_distance(source, target)
    return distance < threshold

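# A minimal usage sketch for is_similar; the query string, database value, and
# threshold are made-up examples. "resturant" vs "restaurant" is one missing
# letter, a normalized distance of about 0.1, which falls under the 0.3 threshold.
print(is_similar("resturant", "restaurant", threshold=0.3))  # True
print(is_similar("pizzeria", "restaurant", threshold=0.3))   # False
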
def suffix_measure(x, y, x2, res):
    for k in range(len(y)):
        b = y[k]
        h = b != 1
        b = b[h].clone()
        a = x2[:, x.shape[1]:][k]
        a = a[h].clone()
        a_res = a.cpu().numpy()
        b_res = b.cpu().numpy()
        res.append(
            1 - normalized_damerau_levenshtein_distance(list(a_res), list(b_res)))

def test_normalized_damerau_levenshtein_distance(self):
    assert normalized_damerau_levenshtein_distance('smtih', 'smith') == 0.20000000298023224
    assert normalized_damerau_levenshtein_distance('snapple', 'apple') == 0.2857142984867096
    assert normalized_damerau_levenshtein_distance('testing', 'testtn') == 0.2857142984867096
    assert normalized_damerau_levenshtein_distance('saturday', 'sunday') == 0.375
    assert normalized_damerau_levenshtein_distance('Saturday', 'saturday') == 0.125
    assert normalized_damerau_levenshtein_distance('orange', 'pumpkin') == 1.0
    assert normalized_damerau_levenshtein_distance('gifts', 'profit') == 0.8333333134651184
    assert normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 0.125

def calculatePathDistance(self, pathA, pathB):
    semA, semB = self.createSemesterWords(pathA, pathB)
    for i, s1 in enumerate(semA):
        distRow = np.array([])
        for j, path2 in enumerate(self.paths):
            if j <= i:
                distRow = np.append(distRow, [0])
            else:
                distRow = np.append(distRow, [self.calculatePathDistance(path, path2)])
        if self.distanceMatrix is None:
            self.distanceMatrix = np.array([distRow])
        else:
            self.distanceMatrix = np.vstack((self.distanceMatrix, distRow))
    distance = normalized_damerau_levenshtein_distance(strSemesterA, strSemesterB)
    return distance

def norm_dld(l1, l2):
    ascii_start = 0
    assert len(l1) + len(l2) <= 128
    # make a string for l1
    # all triples are unique...
    s1 = ''.join((chr(ascii_start + i) for i in xrange(len(l1))))
    s2 = ''
    for j in xrange(len(l2)):
        next_char = chr(ascii_start + len(s1) + j)
        for k in xrange(len(l1)):
            if trip_match(l2[j], l1[k]):
                next_char = s1[k]
                break
        s2 += next_char
    # return 1- , since this thing gives 0 to perfect matches etc
    return 1.0 - normalized_damerau_levenshtein_distance(s1, s2)

def string_similarity_ratio(s1, s2):
    """
    A string compare function, using the normalized Damerau-Levenshtein
    distance on cleaned movie titles (rather than difflib's
    Ratcliff-Obershelp matcher; see http://docs.python.org/3.3/library/difflib.html).

    TODO: Levenshtein might be better for this purpose.

    :params s1, s2: Two input strings which will be compared
    :returns: A ratio between 0.0 (not similar at all) and 1.0 (probably
        the same string).
    """
    if s1 and s2:
        return 1 - normalized_damerau_levenshtein_distance(
            _clean_movie_title(s1), _clean_movie_title(s2)
        )

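# A minimal usage sketch for string_similarity_ratio. _clean_movie_title is
# not shown above, so the stub below is a hypothetical stand-in that just
# lower-cases and strips the title; the real helper may do more.
def _clean_movie_title(title):
    return title.strip().lower()

# Titles that are identical after cleaning give 1.0; unrelated titles score
# noticeably lower.
print(string_similarity_ratio("The Matrix ", "the matrix"))   # 1.0
print(string_similarity_ratio("The Matrix", "Blade Runner"))  # much lower
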
def generate_reward(gold_summary, summary, gold_cp, cp, reward_type=1):
    # Bleu score
    # bleu = corpus_bleu([gold_summary], summary)
    cp = list(deepcopy(cp))
    # DLD
    if gold_cp:
        dld = normalized_damerau_levenshtein_distance(list(gold_cp), list(cp))
    else:
        dld = 0.
    boolean = np.zeros(len(cp))
    for pos, element in enumerate(cp):
        if element in gold_cp:
            boolean[pos] = 1
    precision = np.mean(boolean)
    recall = np.sum(boolean) / len(gold_cp)
    return (precision + recall + (1 - dld)) / 3

def compute_distances(reference, values, exp=None):
    if exp is None:
        exp = Config.SIMILARITY_EXPONENT
    sm = difflib.SequenceMatcher()
    sm.set_seq2(reference)
    sm_distances = []
    dl_distances = []
    for val in values:
        sm.set_seq1(val)
        sm_distances.append(sm.ratio())
        dl_distances.append(
            1 - normalized_damerau_levenshtein_distance(reference, val))
    sm_distances = np.array(sm_distances)
    dl_distances = np.array(dl_distances)
    dl_exp = np.power(dl_distances, exp)
    sm_exp = np.power(sm_distances, exp)
    dist_sum = dl_exp + sm_exp
    return np.power(dist_sum, 1 / exp)

def calc_distance_domains(self):
    not_in_top = []
    output = []
    for item in self.data:
        if item not in self.top_domains:
            not_in_top.append(item)
    for item in not_in_top:
        entry = {item: {}}
        flag = False
        for td in self.top_domains:
            dist = normalized_damerau_levenshtein_distance(item, td)
            if 0 < dist < 0.2:
                entry[item][td] = dist
                flag = True
        if flag is True:
            output.append(entry)
    if len(output) > 0:
        return output
    else:
        return None

def processtxn(txn, choices):
    maxscoreJ = 0
    matchstrJ = ""
    maxscoreDL = 0
    matchstrDL = ""
    maxscoreNDL = 0
    matchstrNDL = ""
    for c in choices:
        scoreJ = jaro.jaro_metric(txn, c)
        scoreDL = 1000 - damerau_levenshtein_distance(txn, c)
        scoreNDL = 1 - normalized_damerau_levenshtein_distance(txn, c)
        if scoreJ > maxscoreJ:
            matchstrJ = c
            maxscoreJ = scoreJ
        if scoreDL > maxscoreDL:
            matchstrDL = c
            maxscoreDL = scoreDL
        if scoreNDL > maxscoreNDL:
            matchstrNDL = c
            maxscoreNDL = scoreNDL
    return {'jaro': matchstrJ, 'dl': matchstrDL, 'ndl': matchstrNDL}

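# A minimal usage sketch for processtxn; the transaction string and the
# candidate merchant names are made-up examples. Each of the three metrics
# picks its own best match, and they can disagree on noisy input.
txn = "STARBUCKS #1234 SEATTLE"
choices = ["Starbucks Coffee", "Subway Sandwiches", "Shell Gas Station"]
best = processtxn(txn, choices)
print(best['jaro'], best['dl'], best['ndl'])
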
def get_likely_cfg_array_version(glycan_list, distance_threshold=2.0):
    '''Get the most likely CFG glycan array given a list of glycans.

    Uses a scaled Levenshtein distance to compute similarity between glycan
    strings, and returns the array with the minimum sum of scaled Levenshtein
    distances for each pair of glycans in the glycan list and corresponding
    reference array.

    We need to do this because sometimes the array version is not provided,
    and there are slight spelling errors in the provided glycan names. It is
    easier to match to a reference list of glycans for a particular array
    version, with all errors corrected.

    Args:
        glycan_list (list): A list of glycan strings ordered by index.
        distance_threshold (float): A threshold for total scaled Levenshtein
            distance for calling a match.
    Returns:
        CFG glycan list (list), most likely array version (string),
        number of mismatches (int), scaled Levenshtein distance (float)
    '''
    glycan_list = list(glycan_list)
    for i, glycan in enumerate(glycan_list):
        # Handle odd characters in some excel files. Nonbreaking spaces, greek letters etc.
        glycan_list[i] = glycan.replace('–', '-').replace('α', 'a') \
                               .replace('β', 'b').replace('[', '(') \
                               .replace(']', ')').replace(' ', '').replace(u"\u00A0", '')
    likely_array = None
    likely_array_mismatches = None
    scaled_levenshtein_total = 0
    for key, value in cfg_array_versions.items():
        # Take into account glycans which are almost the same.
        array_version = [x.replace(' ', '') for x in value[1]]
        scaled_levenshtein_sum = np.sum([
            normalized_damerau_levenshtein_distance(x, y)
            for x, y in zip_longest(glycan_list, array_version, fillvalue='')
        ])
        non_matches = len([x for x in zip(glycan_list, array_version) if x[0] != x[1]])
        # if not likely_array or non_matches < likely_array_mismatches:
        if not likely_array or scaled_levenshtein_sum < scaled_levenshtein_total:
            likely_array = key
            likely_array_mismatches = non_matches
            scaled_levenshtein_total = scaled_levenshtein_sum
    if scaled_levenshtein_total > distance_threshold:
        raise ValueError("Glycan list does not match to known array versions.")
    return list(cfg_array_versions[likely_array][1]), likely_array, likely_array_mismatches, scaled_levenshtein_total

def get_str_similarity(source, target):
    source = source.lower()
    target = target.lower()
    long_sub = longest_substring(source, target)
    # "long_substring": longest_substring(source, target),
    similarities = {
        "ro_similarity": get_ro_similarity(source, target),
        "dleven_similarity": 1 - normalized_damerau_levenshtein_distance(source, target),
        "leven_similarity": Levenshtein.ratio(source, target),
        "phonetic_similarity": get_levenshtein_phonetic_similarity(source, target),
    }
    if len(long_sub):
        similarities["len_long_substring"] = len(long_sub) / len(source)
    else:
        similarities["len_long_substring"] = 0
    res = {k: v for k, v in similarities.items() if v is not None}
    avg_sim = sum(res.values()) / len(res.values())
    return avg_sim

def calculatePathDistance(self, pathA, pathB):
    courseNames = abstract.extractAllCourseNames([pathA, pathB])
    idDict = dict()
    for i, n in enumerate(courseNames):
        idDict[n] = i
    semesterA, semesterB = [], []
    for sem in pathA.semester:
        tempArr = []
        for c in sem:
            tempArr.append(chr(idDict[c.name]))
        tempArr.sort()
        semesterA.append(''.join(tempArr))
    for sem in pathB.semester:
        tempArr = []
        for c in sem:
            tempArr.append(chr(idDict[c.name]))
        tempArr.sort()
        semesterB.append(''.join(tempArr))
    strSemesterA = ''.join(semesterA)
    strSemesterB = ''.join(semesterB)
    distance = normalized_damerau_levenshtein_distance(strSemesterA, strSemesterB)
    return distance

def getCandidateBaseFeature(self, candidate, num_candidates, max_prior):
    # base feature_num
    features = []
    m_label = candidate.getMentionText()
    # number of candidates
    features.append(num_candidates)
    # max_prior
    features.append(max_prior)
    # string similarity features
    if self._has_str_sim:
        c_label = candidate.label
        if self._lowercase:
            c_label = c_label.lower()
        # edit_distance
        features.append(normalized_damerau_levenshtein_distance(c_label, m_label))
        # is equal
        features.append(1 if c_label == m_label else 0)
        # mlabel contains clabel
        features.append(1 if c_label in m_label else 0)
        # clabel contains mlabel
        features.append(1 if m_label in c_label else 0)
        # mlabel starts with clabel
        features.append(1 if m_label.startswith(c_label) else 0)
        # clabel starts with mlabel
        features.append(1 if c_label.startswith(m_label) else 0)
        # mlabel ends with clabel
        features.append(1 if m_label.endswith(c_label) else 0)
        # clabel ends with mlabel
        features.append(1 if c_label.endswith(m_label) else 0)
    # prior
    if self._has_prior:
        # entity prior
        features.append(candidate.getEntityMentionPrior())
    return features

for j in range(FLAGS.batch_size):
    try:
        eocIndex = h[j].tolist().index(endOfCaseId) + 1
    except ValueError:
        eocIndex = None
    suffixes_predicted[j] = h[j].tolist()[:eocIndex]

suffixes_predicted_alpha = [
    ''.join([num2alpha[element] for element in suffix])
    for suffix in suffixes_predicted
]
suffixes_alpha = [
    ''.join([num2alpha[element] for element in suffix])
    for suffix in suffixes[0]
]
for j in range(FLAGS.batch_size):
    outputFile.write("{0}\n".format(suffixes_predicted_alpha[j]))
    outputFile.write("{0}\n".format(suffixes_alpha[j]))
    distance = dl.normalized_damerau_levenshtein_distance(
        suffixes_predicted_alpha[j], suffixes_alpha[j])
    outputFile.write("{0}\n".format(distance))
    sum_distance += distance
outputFile.flush()
os.fsync(outputFile.fileno())
print("Batch {} of {} ".format(batchNum, numBatches))

outputFile.write("average edit_distance: {0}\n".format(
    sum_distance / (FLAGS.batch_size * numBatches)))
outputFile.close()
resultFile.close()

import requests
import json
import pyxdameraulevenshtein

matches = json.loads(requests.get('http://worldcup.kimonolabs.com/api/matches?sort=startTime&fields=homeScore,awayScore,startTime,awayTeamId,homeTeamId,id&apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
print matches

gabriel_teams = list(set('Brazil,Mexico,Spain,Chile,Colombia,Cote Divoire,Uruguay,England,Switzerland,France,Argentina,Iran,Germany,Ghana,Belgium,Russia,Brazil,Cameroon,Spain,Australia,Colombia,Japan,Uruguay,Italy,Switzerland,Honduras,Argentina,Nigeria,Germany,USA,Belgium,Korea Republic,Cameroon,Croatia,Australia,Netherlands,Japan,Greece,Italy,Costa Rica,Honduras,Ecuador,Nigeria,Bosnia-Herzegovina,USA,Portugal,Korea Republic,Algeria'.split(',')))

teams = json.loads(requests.get('http://worldcup.kimonolabs.com/api/teams?apikey=72519cb45986ce5ffd15020a5e4b1a70').content)
names = list(set([team['name'].encode('utf-8') for team in teams]))
print names

matches = [teams[min([(i, pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(gteam, team)) for i, team in enumerate(names)], key=lambda x: x[1])[0]]['id'] for gteam in gabriel_teams]

for a, b in zip(gabriel_teams, matches):
    try:
        print a, unicode(b)
    except:
        pass

def extract(self, source, paraphrase, position):
    # Levenshtein distance
    return normalized_damerau_levenshtein_distance(tokenize(source),
                                                   tokenize(paraphrase))

def test_normalized_damerau_levenshtein_distance(self):
    assert normalized_damerau_levenshtein_distance('smtih', 'smith') == 0.20000000298023224
    assert normalized_damerau_levenshtein_distance('', '') == 0
    assert normalized_damerau_levenshtein_distance('snapple', 'apple') == 0.2857142984867096
    assert normalized_damerau_levenshtein_distance('testing', 'testtn') == 0.2857142984867096
    assert normalized_damerau_levenshtein_distance('saturday', 'sunday') == 0.375
    assert normalized_damerau_levenshtein_distance('Saturday', 'saturday') == 0.125
    assert normalized_damerau_levenshtein_distance('orange', 'pumpkin') == 1.0
    assert normalized_damerau_levenshtein_distance('gifts', 'profit') == 0.8333333134651184
    assert normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 0.125
    assert np.isclose(normalized_damerau_levenshtein_distance([1, 2, 3], [1, 3, 2]), 1.0 / 3.0)
    assert normalized_damerau_levenshtein_distance([], []) == 0.0
    assert np.isclose(normalized_damerau_levenshtein_distance(list(range(10)), list(range(1, 11))), 0.2)
    assert normalized_damerau_levenshtein_distance([1, 2, 3, 4, 5, 6], [7, 8, 9, 7, 10, 11, 4]) == 1.0

def text_edit_ratio(doc, method=u'text_edit_ratio', ground_truth=None,
                    xml_in=True, gt_format='tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the Damerau-Levenshtein distance. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of
                                ground truths to choose from. When more than
                                one is given, the file sharing the longest
                                prefix with the input document is chosen.
        xml_in (bool): Switch to treat input as a TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid
                             values are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the
                       output will be added to the tracking arguments and the
                       input document will be returned as the result of the
                       task. Use this to insert a statistical measure into a
                       chain without affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    edist = 1.0 - normalized_damerau_levenshtein_distance(text, gt)
    logger.debug('Edit distance: {}'.format(damerau_levenshtein_distance(text, gt)))
    logger.debug('Accuracy: {}'.format(edist))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': ground_truth, 'doc': doc}

def create_json_text_similarity(type='train'):
    if type == 'train':
        input_path = "../modified_data/ItemPairs_train_with_additional_pairs_fixed.csv"
        # input_path = "../input/ItemPairs_train.csv"
        # out_path = "../modified_data/json_text_sim_params_train.csv"
        out_path = "../orig_features/train_json_sim_param.csv"
    else:
        input_path = "../input/ItemPairs_test.csv"
        # out_path = "../modified_data/json_text_sim_params_test.csv"
        out_path = "../orig_features/test_json_sim_param.csv"

    print('Get table...')
    table = get_filled_table(input_path, type)
    only_cats = ['Опыт работы', 'Образование', 'Адрес', 'Забронированные даты',
                 'Модель', 'Знание языков', 'Отчёт Автокод', 'Кадастровый номер',
                 'Номер свидетельства ТС', 'VIN-номер', 'Корпус',
                 'Ссылка на документацию', 'Корпус / очередь', 'Страна',
                 'Название новостройки', 'Кадастровый номер участка',
                 'Адрес компании']
    vectorizer = prepareVectorizer()

    print('Write table in CSV ...')
    out = open(out_path, "w", encoding='UTF-8')
    out.write('itemID_1,itemID_2')
    # print header
    for key in only_cats:
        nm = get_param_name(key)
        out.write(',' + nm + '_dam_lev_norm')
    out.write(',address_tdidf')
    out.write('\n')

    for i, row in table.iterrows():
        out.write(str(row['itemID_1']))
        out.write(',')
        out.write(str(row['itemID_2']))

        if row['attrsJSON_1'] == -1:
            data1 = dict()
        else:
            data1 = json.loads(str(row['attrsJSON_1']))
        if row['attrsJSON_2'] == -1:
            data2 = dict()
        else:
            data2 = json.loads(str(row['attrsJSON_2']))

        for key in only_cats:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = normalized_damerau_levenshtein_distance(str1, str2)
                out.write(',' + str(val))

        # For the address field ('Адрес') use tf-idf cosine similarity
        for key in ['Адрес']:
            if key not in data1 and key not in data2:
                out.write(',-1')
            else:
                str1 = ''
                str2 = ''
                if key in data1:
                    str1 = data1[key]
                if key in data2:
                    str2 = data2[key]
                val = cosine_sim(str1, str2, vectorizer)
                out.write(',' + str(val))

        out.write('\n')
    out.close()

def damerau_levenshtein(str1, str2):
    aux = pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(str1, str2)
    return 1.0 - aux

    last_id = 0
    for match in pattern.findall(input):
        id = ids.get(match)
        if id is None:
            ids[match] = last_id
            last_id += 1
            id = last_id - 1
        result.append(id)
    return result


def tokenize_file(fname):
    with open(fname) as f:
        return tokenize(f.read())


def similarity(one_tokens, other_tokens):
    distance = normalized_damerau_levenshtein_distance(one_tokens, other_tokens)
    return distance


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: %s [FILE1] [FILE2])" % sys.argv[0])
        exit(1)
    other = sys.argv[2]
    one = sys.argv[1]
    other_tokens = tokenize_file(other)
    one_tokens = tokenize_file(one)
    result = similarity(one_tokens, other_tokens)
    print(result)

def descr_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['description_1'],
                                                   row['description_2'])

def title_damerau_levenshtein_norm(row):
    return normalized_damerau_levenshtein_distance(row['title_1'],
                                                   row['title_2'])

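# A minimal sketch of how these two row-level helpers might be applied to a
# pandas DataFrame of item pairs; the frame and its values are made up, and
# descr_damerau_levenshtein_norm works the same way on the
# 'description_1' / 'description_2' columns.
import pandas as pd

pairs = pd.DataFrame({
    'title_1': ['iPhone 6 16GB', 'Sofa bed'],
    'title_2': ['iphone 6 16 gb', 'Dining table'],
})
# axis=1 passes each row to the helper, yielding one distance per pair.
pairs['title_dl_norm'] = pairs.apply(title_damerau_levenshtein_norm, axis=1)
print(pairs)
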
# Read the content of the text file, converting to unicode
f = codecs.open(args.input, encoding='utf-8', mode='r')
data = f.read().replace('\n', ' ').lower()
f.close()

# Eliminate special characters
pattern = re.compile('[\W_]+')
data_lower = pattern.sub(' ', data)

firstname = ''
n = 0
for secondname in data_lower.split():
    if (len(firstname) > 4) and (len(secondname) > 4):  # two long words
        for idx, entrada in df_dict.iterrows():
            sim1 = 1.0 - normalized_damerau_levenshtein_distance(
                firstname, entrada['first'])
            if sim1 > threshold:
                sim2 = 1.0 - normalized_damerau_levenshtein_distance(
                    secondname, entrada['second'])
                if sim2 > threshold:
                    print(args.input, firstname, secondname,
                          entrada['first'], entrada['second'],
                          str((sim1 + sim2) / 2))
    firstname = secondname

end = datetime.datetime.now()
diff = end - start
print(args.input, str(diff.total_seconds()))

def stringDistance(string1, string2):
    return pyxdameraulevenshtein.normalized_damerau_levenshtein_distance(string1, string2)

def get_damerau_levenshtein_distance(str1, str2, normalized=False):
    if normalized:
        dis = normalized_damerau_levenshtein_distance(str1, str2)
    else:
        dis = damerau_levenshtein_distance(str1, str2)
    return dis

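# A minimal usage sketch for get_damerau_levenshtein_distance; the word pair is
# taken from the example script below. The raw distance counts edits, while the
# normalized form divides by the longer string's length, giving a value in [0, 1].
print(get_damerau_levenshtein_distance('smtih', 'smith'))                   # 1
print(get_damerau_levenshtein_distance('smtih', 'smith', normalized=True))  # ~0.2
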
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
import string
import timeit

print('#edit distances (low edit distance means words are similar):')
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('smtih', 'smith', damerau_levenshtein_distance('smtih', 'smith')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('snapple', 'apple', damerau_levenshtein_distance('snapple', 'apple')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('testing', 'testtn', damerau_levenshtein_distance('testing', 'testtn')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('saturday', 'sunday', damerau_levenshtein_distance('saturday', 'sunday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('Saturday', 'saturday', damerau_levenshtein_distance('Saturday', 'saturday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('orange', 'pumpkin', damerau_levenshtein_distance('orange', 'pumpkin')))
print("damerau_levenshtein_distance('%s', '%s') = %d #unicode example\n" % ('Sjöstedt', 'Sjostedt', damerau_levenshtein_distance('Sjöstedt', 'Sjostedt')))  # unicode example

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt')))  # unicode example

print('#performance testing:')
# random words will be comprised of ascii letters, numbers, and spaces
chars = string.ascii_letters + string.digits + ' '
word1 = ''.join([random.choice(chars) for i in range(30)])  # generate a random string of characters of length 30
word2 = ''.join([random.choice(chars) for i in range(30)])  # and another
print("""timeit.timeit("damerau_levenshtein_distance('%s', '%s')", 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000) = %f seconds""" %
      (word1, word2, timeit.timeit("damerau_levenshtein_distance('%s', '%s')" % (word1, word2), 'from pyxdameraulevenshtein import damerau_levenshtein_distance', number=500000)))