def get_grades_for_course():
    """Return the current grade for the course best matching the query.

    Reads ``access_token`` and ``course`` from the request query string,
    fetches the user's courses (with total scores) and selects the course
    whose name or course code has the smallest Levenshtein distance to the
    query.

    Returns:
        A dict with ``grade``, ``score`` and ``course`` keys on success.
        The upstream ``(response, status)`` pair when the course listing
        does not come back with status 200.
        An ``(error dict, 404)`` pair when the course listing is empty.
    """
    token = request.args['access_token']
    query = request.args['course']
    status, response = canvas_requests.get(
        token, 'courses', params={'include[]': 'total_scores'})
    if status != 200:
        return response, status
    if not response:
        # Previously this fell through to an IndexError.
        return {'error': 'no courses found'}, 404

    def edit_distance(course):
        # Levenshtein is a distance: the smaller value is the better match,
        # and a course matches if either its name or its code is close.
        return min(
            jellyfish.levenshtein_distance(course['name'], query),
            jellyfish.levenshtein_distance(course['course_code'], query))

    course = min(response, key=edit_distance)
    enrollment = course['enrollments'][0]
    return {
        'grade': enrollment['computed_current_grade'],
        'score': enrollment['computed_current_score'],
        'course': course['name']
    }
def levenshtein_ratio(source, target, ignore_case=True):
    """Calculates the levenshtein ratio between two strings.

    The ratio is computed as follows:

        (len(source) + len(target) - distance) / (len(source) + len(target))

    When ``ignore_case`` is true both strings are lowercased and stripped of
    surrounding whitespace first, and the lengths used in the formula are the
    lengths of those normalized strings (the same text the distance is
    computed on).

    This function has been ported from (MIT license):
    https://github.com/texttheater/golang-levenshtein/blob/4041401c6e7f6a2b49815c4aea652e518ca8e92e/levenshtein/levenshtein.go#L115-L130

    :param str source:
    :param str target:
    :param bool ignore_case: normalize case and surrounding whitespace
    :rtype: float
    :return: ratio in [0, 1]; 1.0 when both (normalized) strings are empty
    """
    if ignore_case:
        source = source.lower().strip()
        target = target.lower().strip()
    total_len = len(source) + len(target)
    if total_len == 0:
        # Two empty strings are identical; avoid dividing by zero.
        return 1.0
    distance = jellyfish.levenshtein_distance(source, target)
    return (total_len - distance) / float(total_len)
def distance_filter(df, c, thresh=3, suffix1='_x', suffix2='_y',
                    col1=None, col2=None, nonull=None):
    """Keep the rows of ``df`` whose two columns are within ``thresh`` edits.

    The compared columns are ``col1 + suffix1`` and ``col2 + suffix2`` when
    both ``col1`` and ``col2`` are given, otherwise ``c + suffix1`` and
    ``c + suffix2``. A ``distance`` column is written onto ``df`` itself.

    When ``nonull`` is None, rows where either value is null get the fixed
    distance 10; otherwise every row is passed straight to the Levenshtein
    function.

    Returns the filtered frame (rows with ``distance <= thresh``).
    """
    use_explicit_columns = (col1 is not None) and (col2 is not None)
    left = (col1 if use_explicit_columns else c) + suffix1
    right = (col2 if use_explicit_columns else c) + suffix2

    def plain_distance(row):
        return jf.levenshtein_distance(row[left], row[right])

    def null_tolerant_distance(row):
        if pd.isnull(row[left]) | pd.isnull(row[right]):
            return 10
        return jf.levenshtein_distance(row[left], row[right])

    scorer = plain_distance if nonull is not None else null_tolerant_distance
    df['distance'] = df.apply(scorer, axis=1)
    return df[df.distance <= thresh]
def extract(self, x, y):
    """Return the Levenshtein distance or similarity between ``x`` and ``y``.

    Returns 0 when either value is None. When ``self.similarity`` is true the
    result is ``1 - distance / max_length`` computed on the unicode text of
    both values (1.0 when both texts are empty); otherwise it is the raw edit
    distance.
    """
    if x is None or y is None:
        return 0
    text_x = unicode(x)
    text_y = unicode(y)
    if not self.similarity:
        return levenshtein_distance(text_x, text_y)
    # Normalize by the length of the text that was actually compared.
    longest = max(len(text_x), len(text_y))
    if longest == 0:
        # Two empty strings are identical; avoid dividing by zero.
        return 1.0
    return 1 - float(levenshtein_distance(text_x, text_y)) / longest
def token_set_ratio(old_text, new_text):
    """Score two texts from 0 to 100 using their shared and distinct tokens.

    Both texts are tokenized on runs of word characters and apostrophes.
    Three strings are built (shared tokens, shared + old-only tokens,
    shared + new-only tokens) and the best of the three pairwise normalized
    Levenshtein scores is returned, rounded to one decimal. Returns 0 when
    either text has no tokens.
    """
    token_pattern = r"[\w']+"
    old_tokens = re.findall(token_pattern, old_text)
    new_tokens = re.findall(token_pattern, new_text)
    if not old_tokens or not new_tokens:
        return 0

    old_tokens = sorted(old_tokens)
    new_tokens = sorted(new_tokens)
    shared = get_intersection(old_tokens, new_tokens)
    old_only = get_difference(shared, old_tokens)
    new_only = get_difference(shared, new_tokens)
    shared = sorted(shared)

    shared_text = " ".join(map(str, shared))
    old_joined = " ".join(map(str, shared + sorted(old_only)))
    new_joined = " ".join(map(str, shared + sorted(new_only)))

    def score(first, second, norm):
        return 100 - jellyfish.levenshtein_distance(first, second) / norm * 100

    candidates = (
        score(shared_text, old_joined, len(old_joined)),
        score(shared_text, new_joined, len(new_joined)),
        score(old_joined, new_joined, max(len(old_joined), len(new_joined))),
    )
    return round(max(candidates), 1)
def findToken(data, token, max_distance=2):
    # Fuzzy-search `data` for `token`.
    #
    # Builds one row of Levenshtein distances per window length, in this
    # order: len(token)+1 .. len(token)+max_distance, then len(token), then
    # len(token)-1 .. len(token)-max_distance. Each row is padded so it has
    # len(data) entries. If the smallest entry is <= max_distance the function
    # returns (matched slice, remainder of data); otherwise it falls off the
    # end and returns None. It also returns None when any row is empty.
    result = []
    # Rows for windows longer than the token.
    for j in range(1, max_distance + 1):
        tkl = len(token) + j
        if len(data) >= tkl:
            dl = []
            # NOTE(review): range(len(data) - tkl) never visits the last
            # possible window start (len(data) - tkl) - confirm intended.
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            # Pad the tail with the window length so the row has len(data) entries.
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            # Data shorter than the window: fill the row with len(token).
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)
    # Row for windows exactly as long as the token.
    if len(data) >= len(token):
        dl = []
        for i in range(len(data) - len(token)):
            distance = jf.levenshtein_distance(data[i:i + len(token)], token)
            dl.append(distance)
        for i in range(len(token)):
            dl.append(len(token))
        result.append(dl)
    else:
        dl = []
        for i in range(len(data)):
            dl.append(len(token))
        result.append(dl)
    # Rows for windows shorter than the token.
    for j in range(1, max_distance + 1):
        tkl = len(token) - j
        if len(data) >= tkl:
            dl = []
            for i in range(len(data) - tkl):
                distance = jf.levenshtein_distance(data[i:i + tkl], token)
                dl.append(distance)
            for i in range(tkl):
                dl.append(tkl)
            result.append(dl)
        else:
            dl = []
            for i in range(len(data)):
                dl.append(len(token))
            result.append(dl)
    if len(result) == 0:
        return
    for dl in result:
        if len(dl) == 0:
            return
    eachResult = np.array(result)
    # (row, column) of the smallest distance; column is the start offset in data.
    lowest_i = np.unravel_index(np.argmin(eachResult), eachResult.shape)
    if eachResult[lowest_i[0]][lowest_i[1]] <= max_distance:
        # NOTE(review): this maps row r to a window of
        # len(token) + max_distance - r characters, which matches the row
        # order above only from the exact-length row onward; for the first
        # rows (longer windows, built in increasing length) the lengths look
        # reversed - confirm against callers.
        next_i = lowest_i[1] + len(token) + max_distance - lowest_i[0]
        return data[lowest_i[1]:next_i], data[next_i:]
def get_matrix_distance(words_list, diagonal=True):
    """Return a square matrix of pairwise Levenshtein distances.

    With ``diagonal`` true only the upper triangle (including the diagonal)
    is computed; the remaining cells keep their initial empty-list
    placeholder. With ``diagonal`` false every cell is computed.
    """
    size = len(words_list)
    # Every cell starts as its own empty list placeholder.
    matrix = [[[] for _ in range(size)] for _ in range(size)]
    for row, first in enumerate(words_list):
        for col, second in enumerate(words_list):
            if diagonal and col < row:
                continue  # lower triangle is left unfilled
            matrix[row][col] = levenshtein_distance(first, second)
    return matrix
def get_closest_levenshtein(needle, haystack):
    """Return the item of ``haystack`` with the smallest edit distance to ``needle``.

    The earliest item wins on ties. Returns None for an empty ``haystack``.
    """
    best_item = None
    best_distance = None
    for candidate in haystack:
        candidate_distance = jellyfish.levenshtein_distance(needle, candidate)
        # Strict comparison keeps the first of several equally close items.
        if best_distance is None or candidate_distance < best_distance:
            best_item = candidate
            best_distance = candidate_distance
    return best_item
def bigram_corr(line):
    """Correct words in ``line`` using the known bigrams in ``fdist``.

    For each pair of adjacent words, every known bigram ``(first, second)``
    is tried: when one word of the pair matches the bigram exactly and the
    other is within an edit distance of 4, the other word is replaced by the
    bigram's word at the position where it occurs in the line.

    Returns the corrected line with words joined by single spaces.
    """
    words = line.split()
    # The pairs are taken from the original words; corrections made along the
    # way do not change which pairs are examined.
    for pos, (word1, word2) in enumerate(zip(words[:-1], words[1:])):
        for first, second in fdist:
            if word2 == second and jf.levenshtein_distance(word1, first) < 5:
                # Second word matches: fix the first word of this pair.
                words[pos] = first
            elif word1 == first and jf.levenshtein_distance(word2, second) < 5:
                # First word matches: fix the second word of this pair.
                words[pos + 1] = second
    return " ".join(words)
def levProDistance(str1, str2):
    """Return a symmetric word-level distance between two strings.

    Both strings are split on single spaces. For each word the distance to
    its nearest word in the other string is taken; those distances are
    averaged per string, and the two averages are averaged.
    """
    words1 = str1.split(" ")
    words2 = str2.split(" ")

    def nearest_total(words, others):
        # Sum, over ``words``, of the distance to the closest word in ``others``.
        return sum(
            min([jf.levenshtein_distance(word, other) for other in others])
            for word in words)

    forward = nearest_total(words1, words2)
    backward = nearest_total(words2, words1)
    return ((backward * 1.0 / len(words2)) + (forward * 1.0 / len(words1))) / 2
def get_closest_levenshtein(needle, haystack):
    """Pick the ``haystack`` entry nearest to ``needle`` by Levenshtein distance.

    Ties go to the entry seen first; an empty ``haystack`` yields None.
    """
    scored = (
        (jellyfish.levenshtein_distance(needle, item), item)
        for item in haystack)
    winner = None
    for entry in scored:
        if winner is None or entry[0] < winner[0]:
            winner = entry
    return None if winner is None else winner[1]
def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """Return the Levenshtein distance between two strings of the same kind.

    Both arguments must be ``unicode`` or both must be ``str``; otherwise
    TypeError is raised. With ``normalize_value`` true the distance is passed
    through the instance's private normalization helper before being
    returned; otherwise the raw distance is returned.
    """
    both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
    both_str = isinstance(string_a, str) and isinstance(string_b, str)
    if not (both_unicode or both_str):
        raise TypeError
    distance = jellyfish.levenshtein_distance(string_a, string_b)
    if normalize_value:
        return self.__normalized_value(distance)
    return distance
def jelly():
    """Print several string metrics for a fixed pair of hospital names.

    Prints, in order: Levenshtein distance, Jaro distance,
    Damerau-Levenshtein distance and the fuzzywuzzy ratio.
    """
    import jellyfish
    from fuzzywuzzy import fuzz

    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(jellyfish.levenshtein_distance(a, b))
    print(jellyfish.jaro_distance(a, b))
    print(jellyfish.damerau_levenshtein_distance(a, b))
    print(fuzz.ratio(a, b))
def final(mlf):
    """Select, for each non-empty input word, a corpus word close to it.

    Soundex codes are computed for the input words and for the corpus; every
    (input, corpus) combination is scored by the Levenshtein distance between
    the soundex codes (``sx_dist``) and between the words (``lv_dist``).
    Only rows with ``lv_dist <= 2`` are kept. For each input word the ten rows
    with the largest ``sx_dist`` are taken and the corpus word of the row
    returned by ``idxmax`` over the per-corpus-word counts is selected.

    Prints intermediate results; returns the list of selected corpus words.
    """
    print(mlf)
    sdx_input = call_soundex(mlf)
    sdx_raw = call_soundex(utf_corpus())

    rows = []
    for (i, j), (k, v) in product(sdx_input.items(), sdx_raw.items()):
        # Keys look like "<word>-<suffix>"; only the word part is used.
        word = i.split('-')[0]
        corpus_word = k.split('-')[0]
        rows.append((
            word, j, corpus_word, v,
            jellyfish.levenshtein_distance(j, v),
            jellyfish.levenshtein_distance(word, corpus_word),
        ))
    df = pd.DataFrame(rows, columns=[
        'wrd', 'sx_wrd', 'cpr', 'sx_cpr', 'sx_dist', 'lv_dist',
    ])
    print(df.head(5))

    min_df_lv = df[df['lv_dist'] <= 2]
    selected = []
    words = list(mlf)
    for i in range(0, len(mlf)):
        if len(mlf[i]) > 0:
            x = min_df_lv[min_df_lv['wrd'] == words[i]].sort_values(
                by='sx_dist', ascending=False).head(10)
            s = x.groupby(['cpr'])['wrd'].transform('count')
            # .loc replaces the removed .ix indexer; idxmax returns a label.
            best = x['cpr'].loc[s.idxmax()]
            selected.append(best)
            print(best)
    return selected
def suggest_normalizations(sample, threshold=1.0):
    """
    Attempts to identify spelling mistakes between pairs of terms using the
    Levenshtein distance metric, which is defined as the minimum edit
    distance between two strings.

    For every unordered pair of distinct terms (a, b) with edit distance d,
    the similarity measure is:

        s = (len(a) / d + len(b) / d) // 2.0

    and the pair is suggested when ``s > threshold``. Terms are visited in
    order of decreasing frequency, so ``a`` is the more common term.

    :param sample: a list of terms to use
    :param threshold: the threshold to use
    :return: a list of Suggestion objects, one per candidate pair
    :raises ValueError: if ``sample`` is not a list
    """
    if not isinstance(sample, list):
        raise ValueError("normalize() expects scalar-valued arrays as input (e.g. a = [1, 2, 3])")
    suggestions = []
    counts = collections.Counter(sample).most_common()
    # Visiting each unordered pair once replaces the bookkeeping set of
    # already-seen pairs while producing the pairs in the same order.
    for index, (t1, _) in enumerate(counts):
        for t2, _ in counts[index + 1:]:
            if t1 != t2:
                d = jellyfish.levenshtein_distance(t1, t2)
                similarity = ((len(t1) / d) + (len(t2) / d)) // 2.0
                if similarity > threshold:
                    suggestions.append(Suggestion(a=t1, b=t2, edit_distance=d))
    return suggestions
def get_insee(postcode, name):
    """
    Convert a postcode to an insee code. If no exact match, choose best
    candidate but record it as problematic.

    Returns the insee code, or None when the postcode is unknown (it is then
    added to ``problematicPost``) or has no candidate towns. A town name
    without an exact match is added to ``problematicTown``.
    """
    global problematicTown
    global problematicPost
    # ``in`` replaces dict.has_key(), which does not exist on Python 3.
    if postcode not in post2insee:
        # No match on postcode...
        problematicPost.add(postcode)
        return None
    towns = post2insee[postcode]
    town = name.upper()
    if town in towns:
        # Perfect match!
        return towns[town]
    # No perfect match, look for best candidate (first one wins on ties).
    best = None
    best_score = None
    for candidate in towns:
        score = jellyfish.levenshtein_distance(town, candidate)
        if best_score is None or score < best_score:
            best_score = score
            best = candidate
    problematicTown.add(town)
    if best is not None:
        return towns[best]
    return None
def get_fuzzy_dict_features(w, s, dict_name=u'fuzzy', distance=5):
    """Return a one-entry feature dict when ``w`` is close to a word in ``s``.

    ``w`` is lowercased and compared with each candidate in ``s``; if any
    candidate is at a Levenshtein distance strictly below ``distance`` the
    result is ``{u'wordlist-<dict_name>': 1}``, otherwise an empty dict.
    """
    import jellyfish
    lowered = w.lower()
    # any() stops at the first close candidate instead of scanning them all.
    if any(jellyfish.levenshtein_distance(lowered, cand) < distance
           for cand in s):
        return {u'wordlist-{}'.format(dict_name): 1}
    return dict()
def find_card(carddic, s):
    """Find the key of ``carddic`` closest to ``s`` after character folding.

    Look-alike characters (dashes, variants of o/i/a/e/u, digits that
    resemble letters) are folded to a common character on both sides before
    the Levenshtein distance is computed. Each time a closer key is found the
    folded pair is printed.

    Returns ``[carddic[best_key], best_key, best_distance]``. Raises KeyError
    when no key is closer than the initial bound of 999 (e.g. empty
    ``carddic``), because the lookup then uses the empty string.
    """
    t = {
        8209: 45, 8211: 45,  # convert dash
        48: 111, 79: 111,  # convert zero and uppercase O to small o
        211: 111, 212: 111, 214: 111,  # other chars similar to o
        242: 111, 243: 111, 244: 111, 245: 111, 246: 111,  # other chars similar to o
        959: 111, 1086: 111, 8009: 111, 1054: 111,  # other chars similar to o
        73: 105, 74: 105, 106: 105, 108: 105, 124: 105,  # convert upper i, upper j, small j, small l and pipe symbol to small i
        161: 105, 205: 105, 206: 105, 236: 105, 237: 105, 238: 105, 239: 105, 1575: 105,  # convert other chars to i
        192: 65, 193: 65, 194: 65, 196: 65, 1040: 65, 1044: 65,  # upper A
        200: 69, 201: 69, 202: 69, 1045: 69,  # upper E
        85: 117,  # convert upper U to small u
        218: 117, 220: 117,  # other conversions to small u
        249: 117, 250: 117, 251: 117, 252: 117,  # other conversions to small u
        956: 117, 1094: 117,
        224: 97, 225: 97, 226: 97, 227: 97, 228: 97, 229: 97,  # small a conversion
        232: 101, 233: 101, 234: 101, 235: 101  # small e conversion
    }
    # The folded search string does not change between candidates.
    folded_s = s.translate(t)
    dmin = 999
    smin = ""
    for c in carddic:
        folded_c = c.translate(t)
        d = jellyfish.levenshtein_distance(folded_c, folded_s)
        if dmin > d:
            dmin = d
            smin = c
            print(folded_c + "/" + folded_s)
    return [carddic[smin], smin, dmin]
def find_match_levenshtein(self, token, canonical):
    """Collect dictionary words close to ``token`` and test for ``canonical``.

    Walks ``self.dicts`` with a running bound that starts at 2: a word whose
    distance to ``token`` (after UTF-8 decoding and lowercasing) is at most
    the bound is added, lowercased, to the candidates and tightens the bound
    to its own distance. Words added under an earlier, looser bound are kept.

    Returns ``(candidates, is_match)`` where ``is_match`` tells whether any
    candidate equals ``canonical``.
    """
    bound = 2
    candidates = []
    for entry in self.dicts:
        entry_score = jellyfish.levenshtein_distance(
            token, entry.decode("utf-8").lower())
        if entry_score > bound:
            continue
        bound = entry_score
        candidates.append(entry.lower())
    is_match = any(candidate == canonical for candidate in candidates)
    return candidates, is_match
def checkID_gorinski(movies):
    """Check each movie's title against the title IMDb returns for its id.

    Each item is expected to carry the title at index 0 and the IMDb id as
    its last element. An item counts as a mismatch when the Levenshtein
    distance between its title and the IMDb title is 15 or more.

    Returns ``(percentage_correct_as_str, mismatching_items)``. For empty
    input there is nothing to score, so ``('0.0', [])`` is returned without
    creating an IMDb client.
    """
    movies = list(movies)
    id_mismatch = []
    if not movies:
        # Avoid dividing by zero below.
        return ('0.0', id_mismatch)
    movie_db = imdb.IMDb()
    correct = 0
    incorrect = 0
    for item in movies:
        movie_by_ID = movie_db.get_movie(item[-1])
        if jelly.levenshtein_distance(str(item[0]), str(movie_by_ID)) >= 15:
            id_mismatch.append(item)
            incorrect += 1
        else:
            correct += 1
    return (str(correct / (correct + incorrect) * 100), id_mismatch)
def union_names(anidb_names, absolute_names):
    """Merge two name lists, dropping absolute names that match an anidb name.

    An absolute name matches an anidb name when their Levenshtein distance is
    below 5; each absolute name can be consumed by at most one anidb name.
    Anidb names are processed from last to first.

    Returns all anidb names followed by the absolute names that were not
    matched. When one input is empty the other is returned as is; when both
    are empty an empty list is returned. The inputs are not modified.
    """
    if not anidb_names and not absolute_names:
        return []
    if not anidb_names:
        return absolute_names
    if not absolute_names:
        return anidb_names

    remaining = list(absolute_names)
    matched = []
    for anidb_name in reversed(list(anidb_names)):
        for position, name in enumerate(remaining):
            simi = jellyfish.levenshtein_distance(
                anidb_name, name.encode('utf-8'))
            if simi < 5:
                # Remove the name that matched (not the head of the list) so
                # it cannot be paired with another anidb name.
                del remaining[position]
                matched.append(name)
                break
    # Build a new list so the caller's anidb_names is left untouched.
    return list(anidb_names) + [
        name for name in absolute_names if name not in matched]
def get_levenshtein_agseq():
    '''
    Compute the Levenshtein distance between the antigen sequences of every
    ordered pair of distinct pdbids and write them to a CSV file.

    The antigen sequence of a pdbid is the ``a_sequence`` of its first row in
    the input file. Output columns: pdbid1, pdbid2, agseq1, agseq2, ld.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile)
    print(df.info())
    # Look up the first sequence of each pdbid once instead of re-filtering
    # the frame for every pair.
    first_rows = df.drop_duplicates(subset='pdbid')
    agseq_by_pdbid = dict(zip(first_rows.pdbid, first_rows.a_sequence))
    pdbids = list(df.pdbid.unique())
    data = []
    for i, pdbid in enumerate(pdbids):
        agseq1 = agseq_by_pdbid[pdbid]
        print('computing %s #%s' % (pdbid, i))
        for pdbid2 in pdbids:
            if pdbid2 != pdbid:
                agseq2 = agseq_by_pdbid[pdbid2]
                ld = jellyfish.levenshtein_distance(agseq1, agseq2)
                data.append([pdbid, pdbid2, agseq1, agseq2, ld])
    colnames = ['pdbid1', 'pdbid2', 'agseq1', 'agseq2', 'ld']
    lddf = pd.DataFrame(data, columns=colnames)
    outname = infile[:-4] + '_antigen_full_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
def checkID_agarwal(movies):
    """Sanity-check each movie's IMDb id against its title and script file.

    Each item is expected to carry the title at index 0, the script path at
    index 2 and the IMDb id as its last element. When the Levenshtein
    distance between the item's title and the IMDb title is 10 or more, the
    first 20 lines of the script are searched for the IMDb year and for any
    of the IMDb writers; the item fails only when neither is found.

    Prints a pass/fail message per item. Returns
    ``(percentage_correct_as_str, failing_items)``; for empty input there is
    nothing to score, so ``('0.0', [])`` is returned without creating an
    IMDb client.
    """
    movies = list(movies)
    id_mismatch = []
    if not movies:
        # Avoid dividing by zero below.
        return ('0.0', id_mismatch)
    movie_db = imdb.IMDb()
    correct = 0
    incorrect = 0
    for item in movies:
        movie_by_ID = movie_db.get_movie(item[-1])
        # if levenshtein distance test fails for movie title, continue to
        # check for movie year and writers
        title_ok = jelly.levenshtein_distance(
            str(item[0]), str(movie_by_ID)) < 10
        if not title_ok:
            year = str(movie_by_ID["year"])
            writers = [str(w) for w in movie_by_ID["writer"]]
            with open(item[2]) as fp:
                content = fp.readlines()[:20]
            # True when any writer appears in any of the inspected lines
            # (also well-defined when there are no writers).
            writer_check = any(w in c for w in writers for c in content)
            match_year = [s for s in content if year in s]
            if not match_year and not writer_check:
                print("Sanity check failed: \n Year or writer mismatch found. \n {} {}".format(item[-1], item[0]), "\n")
                incorrect += 1
                id_mismatch.append(item)
                continue
        print("Sanity check passed: \n {} {}".format(item[-1], item[0]), "\n")
        correct += 1
    return (str(correct / (correct + incorrect) * 100), id_mismatch)
def stringLevensteinFraction(s1, s2, recogHash=False):
    """Return a 0..1 similarity between two strings, ignoring spaces.

    With ``recogHash`` true both strings are first passed through
    ``removeHashNSpace``. Spaces are then removed and the result is
    ``1 - distance / max_length``. Two strings that are both empty after
    cleaning are identical and score 1.0.
    """
    if recogHash:
        s1 = removeHashNSpace(s1)
        s2 = removeHashNSpace(s2)
    s1 = s1.replace(" ", "")
    s2 = s2.replace(" ", "")
    longest = max(len(s1), len(s2))
    if longest == 0:
        # Nothing left to compare; avoid dividing by zero.
        return 1.0
    return 1 - jf.levenshtein_distance(s1, s2) / float(longest)
def p2(lines):
    """Return ``common(l1, l2)`` for the first pair of lines one edit apart.

    Pairs are examined in input order; None is returned when no two lines
    have a Levenshtein distance of exactly 1.
    """
    import itertools
    from jellyfish import levenshtein_distance
    # The distance is symmetric and is 0 for a line with itself, so each
    # unordered pair needs to be examined only once. The first hit is the
    # same pair, in the same order, as with the full cartesian product.
    for l1, l2 in itertools.combinations(lines, 2):
        if levenshtein_distance(l1, l2) == 1:
            return common(l1, l2)
def get_levenshtein_epitopeseq():
    '''
    Compute the Levenshtein distance between the epitope sequences of every
    ordered pair of rows with different pdbids and write them to a CSV file.

    Output columns: pdbid1, pdbid2, epitopeseq1, epitopeseq2, ld.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    data = []
    for _, left in df.iterrows():
        for _, right in df.iterrows():
            if right.pdbid == left.pdbid:
                continue  # only compare different structures
            data.append([
                left.pdbid,
                right.pdbid,
                left.epitope,
                right.epitope,
                jellyfish.levenshtein_distance(left.epitope, right.epitope),
            ])
    lddf = pd.DataFrame(
        data,
        columns=['pdbid1', 'pdbid2', 'epitopeseq1', 'epitopeseq2', 'ld'])
    print(lddf.head())
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
def get_levenshtein_segments_epitope():
    '''
    Compute, region by region, the Levenshtein distance between the epitope
    sequences of every ordered pair of rows with different pdbids and write
    them to a CSV file. Rows without an epitope are dropped first.

    Output columns: pdbid1, pdbid2, region, epitope1, epitope2, ld.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/heavy_light_ag_aaseq.csv'
    df = pd.read_csv(infile).iloc[:]
    print(df.info())
    df = df.dropna(subset=['epitope'])
    data = []
    for region in df.region.unique():
        region_df = df[df.region == region]
        print(region)
        print(region_df.shape)
        for _, left in region_df.iterrows():
            print('seq1 %s' % left.epitope)
            seq1 = left.epitope
            for _, right in region_df.iterrows():
                if left.pdbid == right.pdbid:
                    continue  # only compare different structures
                print('seq2 %s' % right.epitope)
                seq2 = right.epitope
                ld = jellyfish.levenshtein_distance(seq1, seq2)
                data.append([left.pdbid, right.pdbid, region, seq1, seq2, ld])
    lddf = pd.DataFrame(
        data,
        columns=['pdbid1', 'pdbid2', 'region', 'epitope1', 'epitope2', 'ld'])
    print(lddf.head())
    # NOTE(review): the output path is built from the same input file and
    # suffix '_antigen_epitope_ld.csv' - confirm no other export is expected
    # to use that path.
    outname = infile[:-4] + '_antigen_epitope_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
def get_levenshtein_segments():
    '''
    Compute, segment by segment, the Levenshtein distance between the segment
    sequences of every ordered pair of rows with different pdbids and write
    them to a CSV file. A running row counter is printed as progress.

    Output columns: pdbid1, pdbid2, segment, seq1, seq2, ld.
    :return: None
    '''
    infile = 'abdb_outfiles_2019/abdb_segment_absequence_full_vgene_imgt_vgene.csv'
    df = pd.read_csv(infile)
    print(df.info())
    data = []
    for segment in df.segment.unique():
        segment_df = df[df.segment == segment]
        print(segment)
        print(segment_df.shape)
        for progress, (_, left) in enumerate(segment_df.iterrows(), 1):
            print(progress)
            seq1 = left.segment_seq
            for _, right in segment_df.iterrows():
                if left.pdbid == right.pdbid:
                    continue  # only compare different structures
                seq2 = right.segment_seq
                ld = jellyfish.levenshtein_distance(seq1, seq2)
                data.append([left.pdbid, right.pdbid, segment, seq1, seq2, ld])
    lddf = pd.DataFrame(
        data, columns=['pdbid1', 'pdbid2', 'segment', 'seq1', 'seq2', 'ld'])
    print(lddf.head())
    outname = infile[:-4] + '_ld.csv'
    print(outname)
    lddf.to_csv(outname, index=False)
def levenshtein_apply(pair):
    """Return the best normalized Levenshtein similarity across two lists.

    ``pair`` unpacks into a source list and a target list. Every
    source/target combination is scored as ``1 - distance / max_length`` and
    the highest score is returned. Returns NaN (after a debug log) when the
    pair contains null values. A combination that raises TypeError
    contributes ``self.missing_value`` if either side is null; otherwise the
    TypeError propagates.
    """
    if _pair_has_any_null(pair):
        LOGGER.debug(
            "Can't compute Levenshtein distance, "
            "the pair contains null values: %s",
            pair,
        )
        return np.nan

    source_list, target_list = pair
    combinations = (
        (source, target) for source in source_list for target in target_list)
    scores = []
    for source, target in combinations:
        try:
            similarity = 1 - jellyfish.levenshtein_distance(
                source, target) / np.max([len(source), len(target)])
            scores.append(similarity)
        except TypeError:
            if not (pd.isnull(source) or pd.isnull(target)):
                raise
            scores.append(self.missing_value)
    return max(scores)
def apply_soundex(misspell, dictionary):
    """Suggest dictionary words for misspelled words using soundex codes.

    For every word in ``misspell``:
      * a word that is in ``dictionary`` or contains '/' is kept as is;
      * otherwise the entry is the list of the five dictionary words whose
        soundex code has the smallest Levenshtein distance to the word's
        soundex code (dictionary order breaks ties).

    Progress is printed on one line. Returns the list of entries, in input
    order.
    """
    count = 0
    result = []
    # Soundex codes of the dictionary, computed on first use and then reused
    # for every misspelled word.
    dictionary_codes = None
    by_distance = operator.itemgetter(1)
    for mis_word in misspell:
        if mis_word in dictionary or '/' in mis_word:
            # Known word, or a word with '/' that is deliberately not
            # predicted (a lazy method).
            result.append(mis_word)
        else:
            if dictionary_codes is None:
                dictionary_codes = [
                    (dict_word, jf.soundex(dict_word))
                    for dict_word in dictionary]
            soundex_mis = jf.soundex(mis_word)
            predict_words = [
                (dict_word, jf.levenshtein_distance(soundex_mis, soundex_dict))
                for dict_word, soundex_dict in dictionary_codes]
            first_five_pred = sorted(predict_words, key=by_distance)[:5]
            result.append([pred[0] for pred in first_five_pred])
        count += 1
        print("Processing: {} / {}".format(count, len(misspell)), end='\r')
    return result
def max_distance(set1, set2):
    """Return the highest normalized Levenshtein similarity between two sets.

    Every element of ``set1`` is compared with every element of ``set2`` as
    ``1 - distance / max_length``; the largest value is returned. Two empty
    strings are identical and score 1.0. Returns 0 when either set is empty.
    """
    if len(set1) == 0 or len(set2) == 0:
        return 0

    def similarity(e1, e2):
        longest = max(len(e1), len(e2))
        if longest == 0:
            # Both elements are empty; avoid dividing by zero.
            return 1.0
        return 1 - jellyfish.levenshtein_distance(e1, e2) / float(longest)

    return max(similarity(e1, e2) for e2 in set2 for e1 in set1)
def getSimilarityRpt(similar, hash, base_tlsh):
    """Build a JSON similarity report from a TLSH lookup response.

    ``similar`` is the JSON text of the lookup response. When its
    ``query_status`` is not "ok" that status is returned unchanged.
    Otherwise each entry in ``data`` is compared with ``base_tlsh`` by
    Levenshtein distance, expressed as a percentage of the length of
    ``base_tlsh``; entries whose sha1 differs from ``hash`` are printed and
    collected.

    Returns the report as an indented JSON string with keys ``sha1_hash``,
    ``tlsh`` and ``data``.
    """
    # hash is a sha1, support other main hashes
    rpt = json.loads(similar)
    base_len = len(base_tlsh)
    if rpt['query_status'] != "ok":
        return rpt['query_status']

    entries = []
    sim = {
        'sha1_hash': hash,
        'tlsh': base_tlsh,
        'data': entries,
    }
    print(str(len(rpt['data']) - 1) + " similar files to ")
    print('Base : ' + hash + ' tlsh: ' + base_tlsh)
    for sub in rpt['data']:
        dist = jellyfish.levenshtein_distance(base_tlsh, sub['tlsh'])
        percent = str(round(100 * ((base_len - dist) / base_len), 2)) + "%"
        if sub['sha1_hash'] == hash:
            continue  # the base file itself is not reported
        print('Sha1 : ' + sub['sha1_hash'] + ' tlsh: ' + sub['tlsh'] + ' Similar at : ' + percent + ' tags: ' + str(sub['tags']))
        entries.append({
            'sha1_hash': sub['sha1_hash'],
            'tlsh': sub['tlsh'],
            'similar': percent,
        })
    return json.dumps(sim, indent=4)
def select_busqueda(origen_datos, id_user, palabra_busq):
    """Select items for a user based on how well their name matches a search word.

    For each item, the name (first column of ``items``) is lowercased and
    split into words; the item's distance is the smallest per-word distance,
    where a word containing the search word counts as distance 0 and any
    other word uses its Levenshtein distance. An item with no words keeps an
    infinite distance. The distances are then handed to ``crea_tabla_slc``
    and ``ejecuta_seleccion`` to produce the selection.
    """
    unid_select = 15
    y, r, items = importa_tablas_2(origen_datos)
    r0 = r[:, id_user]
    # Build the list of (Levenshtein) distances, one per item.
    palabra_busq = palabra_busq.lower()
    distancia = []
    for i in range(len(r0)):
        dist_min = np.inf
        for p in items.loc[i][0].lower().split():
            dist = jel.levenshtein_distance(palabra_busq, p)
            if palabra_busq in p:
                # A word containing the search word is a perfect hit.
                dist = 0
            if dist < dist_min:
                dist_min = dist
        distancia.append(dist_min)
    jugado = 3
    tabla_slc = crea_tabla_slc(distancia, r0, False, jugado)
    return ejecuta_seleccion(id_user, items, y, r, unid_select, tabla_slc)
def get_levenshtein_avg(row1, row2):
    """Return the mean normalized Levenshtein similarity of columns 1..14.

    Each pair of cells is scored as ``1 - distance / max_length``; two empty
    cells are identical and score 1. The fourteen scores are averaged.
    """
    total = 0
    # range() works on both Python 2 and 3 (xrange is Python 2 only).
    for column_index in range(1, 15):
        a = row1[column_index]
        b = row2[column_index]
        longest = max(len(a), len(b))
        if longest == 0:
            # Both cells are empty; avoid dividing by zero.
            total += 1
        else:
            total += 1 - jellyfish.levenshtein_distance(a, b) / float(longest)
    return total / 14.0
def commission_name_parse(self, string):
    """
    Map a commission name to its genre by fuzzy keyword matching.

    All ASCII characters are removed from the name, then the keyword from
    ``dictionary_jp`` with the smallest Levenshtein distance is looked up.
    A match is accepted only when that distance is below 3; otherwise a
    warning is logged, ``self.valid`` is set to False and an empty string is
    returned.

    Args:
        string (str): Commission name, such as 'NYB要员护卫'.

    Returns:
        str: Commission genre, such as 'urgent_gem'.
    """
    import jellyfish
    string = re.sub(r'[\x00-\x7F]', '', string)
    best_key, best_distance = '', 100
    for key, keywords in dictionary_jp.items():
        for keyword in keywords:
            keyword_distance = jellyfish.levenshtein_distance(keyword, string)
            if keyword_distance < best_distance:
                best_key, best_distance = key, keyword_distance
    if best_distance >= 3:
        logger.warning(f'Name with unknown genre: {string}')
        self.valid = False
        return ''
    return best_key
def near_dup_search(self, data, max_dist, content, md5, query, db_conn):
    """Find exact and approximate duplicates of a document.

    Looks up documents whose ``content_sg`` equals the concatenated ``query``
    values, whose ``_id`` is greater than that of ``data`` and whose
    ``catalogue_url`` differs from it. A result with the same ``md5`` is an
    exact duplicate; otherwise it is an approximate duplicate when its
    Levenshtein distance to ``content`` is below
    ``max_dist * len(content)``.

    Returns a defaultdict with an ``'Exact'`` and/or ``'Approximate'`` list.
    When exact duplicates exist the approximate ones are dropped.
    """
    q = ''.join(str(mh) for mh in query)
    results = db_conn[self.db][self.collection].find({
        'content_sg': q,
        '_id': {'$gt': ObjectId(data['_id'])},
        'catalogue_url': {'$ne': data['catalogue_url']},
    })
    matches = defaultdict(list)
    for result in results:
        if md5 == result['md5_hash']:
            matches['Exact'].append(result)
        elif jellyfish.levenshtein_distance(content, result['content']) < max_dist * len(content):
            matches['Approximate'].append(result)
    # Both keys must be tested explicitly: ('Exact' and 'Approximate') is just
    # the string 'Approximate', so the old check never removed anything.
    if 'Exact' in matches and 'Approximate' in matches:
        del matches['Approximate']
    return matches
def alldist(filex, filey):
    """Compute several distance measures between the contents of two files.

    Returns a tuple ``(res, dres, jaro, jarowink, ham, kl, spsum)``:
    Levenshtein, Damerau-Levenshtein and Hamming distances each divided by
    100, one minus the Jaro and Jaro-Winkler scores, the KL divergence of
    the tokenized texts, and ``(100 - spamsum match) / 100``.
    The Hamming value is also printed.
    """
    # Context managers close the files even when reading fails.
    with open(filex, 'r') as handle_x:
        xread = handle_x.read()
    with open(filey, 'r') as handle_y:
        yread = handle_y.read()
    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)
    spsum = spamsum.match(xread, yread)
    spsum = 100 - spsum
    spsum = float(spsum / 100.00)
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)
    jaro = jellyfish.jaro_distance(xread, yread)
    # Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread, yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
    ham = jellyfish.hamming_distance(xread, yread)
    ham = float(ham / 100.00)
    # Single-argument print(...) keeps the output identical on Python 2 and 3
    # (two spaces: the label's trailing space plus the old print separator).
    print("Hamming Distance =  %s" % ham)
    kl = kldiv(tokenize(xread), tokenize(yread))
    return res, dres, jaro, jarowink, ham, kl, spsum
def add_query_features(df, inc, exc, k1list, k2list):
    """
    Return a copy of a dataframe with summary features added for the
    named text files defining the query.

    Added columns: the number of entries and the longest entry length of
    ``k1list`` and ``k2list``, plus the Jaro, Levenshtein, Jaccard, Sorensen
    and Ratcliff-Obershelp measures between ``inc`` and ``exc``. Every added
    column holds the same value in all rows.
    """
    enriched = df.copy()
    k1lens = [len(entry) for entry in k1list]
    k2lens = [len(entry) for entry in k2list]
    k1max = max(k1lens)
    k2max = max(k2lens)
    enriched['k1_count'] = len(k1list)
    enriched['k2_count'] = len(k2list)
    enriched['k1_max'] = k1max
    enriched['k2_max'] = k2max

    # Measures are computed in this order before any of them is stored.
    measures = [
        ('inc_jaro_exc', jellyfish.jaro_distance(inc, exc)),
        ('inc_lev_exc', jellyfish.levenshtein_distance(inc, exc)),
        ('inc_ji_exc', textdistance.jaccard(inc, exc)),
        ('inc_sd_exc', textdistance.sorensen(inc, exc)),
        ('inc_ro_exc', textdistance.ratcliff_obershelp(inc, exc)),
    ]
    for column, value in measures:
        enriched[column] = value
    return enriched
def get_insee(postcode, name, distmax=5):
    """
    Convert a postcode to an insee code. If no exact match, choose best
    candidate but record it as problematic.

    Any text matched by ``reg_cedex`` is removed from ``name`` first.

    Returns a ``(town_name, insee_code)`` tuple, or None when the postcode is
    unknown (it is then added to ``problematicPost``) or when the best
    candidate is at an edit distance of ``distmax`` or more. A town name
    without an exact match is added to ``problematicTown``.
    """
    global problematicTown
    global problematicPost
    # Handle cedex stuff
    if reg_cedex.search(name):
        name = reg_cedex.sub("", name)
    # ``in`` replaces dict.has_key(), which does not exist on Python 3.
    if postcode not in post2insee:
        # No match on postcode...
        problematicPost.add(postcode)
        return None
    towns = post2insee[postcode]
    town = name.upper()
    if town in towns:
        # Perfect match!
        return (town, towns[town])
    # No perfect match, look for best candidate (first one wins on ties).
    best = None
    best_score = None
    for candidate in towns:
        score = jellyfish.levenshtein_distance(town, candidate)
        if best_score is None or score < best_score:
            best_score = score
            best = candidate
    problematicTown.add(town)
    if best is not None and best_score < distmax:
        return (best, towns[best])
    return None
def find_min_dist(lyrics): nonlocal min_dist nonlocal min_dist_idx nonlocal phrase nonlocal idx # Find best match phrase in lyrics min_dist_this_lyrics = 10000 min_dist_start_idx = 0 min_dist_end_idx = 0 lyrics_met = jellyfish.metaphone(lyrics).split(' ') for i in range(0, len(lyrics_met) - len(test_met)): this_lyrics_met = lyrics_met[i:i + len(test_met)] if this_lyrics_met[0] == test_met[0]: dist = jellyfish.levenshtein_distance(''.join(test_met), ''.join(this_lyrics_met)) if dist < min_dist_this_lyrics: min_dist_this_lyrics = dist min_dist_start_idx = i min_dist_end_idx = i + len(test_met) # Check against global min if min_dist_this_lyrics < min_dist: min_dist = min_dist_this_lyrics min_dist_idx = idx phrase = ' '.join(lyrics.split(' ')[min_dist_start_idx:min_dist_end_idx]) # Increment global idx idx += 1
def alldist(filex, filey):
    """Compute several distance measures between the contents of two files.

    Returns a tuple ``(res, dres, jaro, jarowink, ham, kl)``: Levenshtein,
    Damerau-Levenshtein and Hamming distances each divided by 100, one minus
    the Jaro and Jaro-Winkler scores, and the KL divergence of the tokenized
    texts.
    """
    # Context managers close the files even when reading fails.
    with open(filex, "r") as handle_x:
        xread = handle_x.read()
    with open(filey, "r") as handle_y:
        yread = handle_y.read()
    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)
    # The values below are part of the returned tuple, so they have to be
    # computed; with these lines disabled the return statement raised
    # NameError on every call.
    jaro = 1.0 - jellyfish.jaro_distance(xread, yread)
    # Added jaro-winkler distance by fahim 20111011
    jarowink = 1.0 - jellyfish.jaro_winkler(xread, yread)
    ham = float(jellyfish.hamming_distance(xread, yread) / 100.00)
    # NOTE(review): kldiv and tokenize are expected to be defined in this
    # module - confirm.
    kl = kldiv(tokenize(xread), tokenize(yread))
    return res, dres, jaro, jarowink, ham, kl
def levenshtein_similarity(self, s, t):
    """Levenshtein Similarity.

    Returns ``1 - distance / max(len(s), len(t))``, a value in [0, 1].
    Two empty strings are identical and score 1.0.
    """
    longest = max(len(s), len(t))
    if longest == 0:
        # Nothing to compare; avoid dividing by zero.
        return 1.0
    return 1.0 - jellyfish.levenshtein_distance(s, t) / float(longest)
def get_avg_word_distance(target_words, predicted_words):
    """Return the mean normalized Levenshtein similarity of paired words.

    Words are stripped and paired position by position; each pair is scored
    as ``1 - distance / max_length``. Returns 0 when a division by zero
    occurs, i.e. when there are no pairs or a pair consists of two empty
    words.
    """
    try:
        targets = [word.strip() for word in target_words]
        predictions = [word.strip() for word in predicted_words]
        similarities = []
        for target, predicted in zip(targets, predictions):
            longest = max(len(target), len(predicted))
            similarities.append(
                1 - jellyfish.levenshtein_distance(target, predicted) / longest)
        return sum(similarities) / len(similarities)
    except ZeroDivisionError:
        return 0
def levenshtein_apply(x):
    """Return the normalized Levenshtein similarity of ``x[0]`` and ``x[1]``.

    The similarity is ``1 - distance / max_length``. If the computation
    fails and either value is null, NaN is returned; any other failure is
    re-raised.
    """
    try:
        distance = jellyfish.levenshtein_distance(x[0], x[1])
        longest = np.max([len(x[0]), len(x[1])])
        return 1 - distance / longest
    except Exception:
        if not (pandas.isnull(x[0]) or pandas.isnull(x[1])):
            raise
        return np.nan
def find_nearest_neighbour(self, e, neighbours):
    """Return the neighbour closest to ``e`` and its Levenshtein distance.

    Both values are converted with ``unicode`` before comparison and the
    first neighbour wins on ties. With no neighbours the result is
    ``("", 100000000000.0)``.
    """
    nearest = ""
    nearest_distance = 100000000000.0
    for candidate in neighbours:
        candidate_distance = jellyfish.levenshtein_distance(
            unicode(e), unicode(candidate))
        if candidate_distance < nearest_distance:
            nearest, nearest_distance = candidate, candidate_distance
    return nearest, nearest_distance
def test_levenshtein_distance(self):
    """Check levenshtein_distance against known (first, second, distance) triples."""
    expectations = (
        ("", "", 0),
        ("abc", "", 3),
        ("bc", "abc", 1),
        ("kitten", "sitting", 3),
        ("Saturday", "Sunday", 3),
    )
    for first, second, expected in expectations:
        actual = jellyfish.levenshtein_distance(first, second)
        self.assertEqual(actual, expected)
def compare_strings(str1, str2):
    """Compares 2 strings with the Levenshtein distance and returns a
    normalized value between 0.0 and 1.0 (meaning totally different and
    exactly the same respectively).

    Equal inputs return 1.0 without computing a distance; unequal inputs
    that are both of length zero return 0.0.
    """
    if str1 == str2:
        return 1.0
    longest = max(len(str1), len(str2))
    if longest == 0:
        return 0.0
    edits = jellyfish.levenshtein_distance(str1, str2)
    return (longest - edits) / float(longest)
def bestcandidate(wrd):
    """Return a replacement candidate for ``wrd`` from its Brown clusters.

    Every word sharing a Brown cluster with ``wrd`` is a candidate if the
    spell checker accepts it and it is within 2 edits of ``wrd`` or within 1
    edit of it phonetically (metaphone). The last accepted candidate is
    returned. Returns 'No' when there is no candidate or when any error
    occurs along the way.
    """
    accepted = []
    try:
        # Check the Brown word clusters
        for membership in bcluster._word[wrd]:
            for entry in bcluster._cluster[membership['cluster']]:
                candidate = entry['word']
                char_distance = jellyfish.levenshtein_distance(wrd, candidate)
                word_sound = jellyfish.metaphone(wrd)
                candidate_sound = jellyfish.metaphone(candidate)
                if not chant.check(candidate):
                    continue
                # Filter the candidates within a specific character and
                # phonetic distance
                if char_distance <= 2 or jellyfish.levenshtein_distance(
                        word_sound, candidate_sound) <= 1:
                    accepted.append((candidate, entry['count']))
        return accepted[-1][0]
    except Exception:
        return 'No'
def _string_dist_basic(str1, str2):
    """Basic edit distance between two strings, ignoring non-alphanumeric
    characters and case. Comparisons are based on a transliteration/lowering
    to ASCII characters. Normalized by string length.

    Returns 0.0 when both strings are empty after cleaning.
    """
    def simplify(text):
        # Transliterate to ASCII, lowercase, keep only letters and digits.
        return re.sub(r'[^a-z0-9]', '', unidecode(text).lower())

    first = simplify(str1)
    second = simplify(str2)
    if not (first or second):
        return 0.0
    longest = max(len(first), len(second))
    return levenshtein_distance(first, second) / float(longest)
def find_similar_pws(pw, pw_list, num_passwords):
    """Sample passwords from a middle band of similarity to ``pw``.

    All entries of ``pw_list`` are ranked by Levenshtein distance to ``pw``.
    The entries ranked 2000 to 99999 are kept, and up to 1000 of them are
    drawn at random (in rank order) and handed to ``lookup_pwds`` as
    ``(index, distance)`` tuples.

    Returns whatever ``lookup_pwds`` returns.
    """
    scored = [
        (index, jf.levenshtein_distance(pw, each))
        for index, each in enumerate(pw_list)]
    pwd_tuples = sorted(scored, key=lambda tup: tup[1])
    pwd_tuples = pwd_tuples[2000:100000]
    # Never ask for more samples than there are candidates; range() works on
    # both Python 2 and 3 (xrange is Python 2 only).
    sample_size = min(1000, len(pwd_tuples))
    chosen = sorted(random.sample(range(len(pwd_tuples)), sample_size))
    pwd_tuples = [pwd_tuples[i] for i in chosen]
    return lookup_pwds(pwd_tuples, pw_list, num_passwords)
def compute(self, m0, m1, keys=['DOC_SIM', 'WIN_SIM', 'SENT_SIM', 'OVERLAP']):
    """Return the similarity features between two mentions.

    The result has two entries: ``WIN_SIM``, the cosine similarity of the
    mentions' ``win_tf_idf`` vectors, and ``win_SIMVe``, the cosine
    similarity of the vectors produced by ``extractTF_IDF`` for the ``'win'``
    scope. ``keys`` is accepted for compatibility and is not used.
    """
    # The mention-text edit distance used to be computed here but was never
    # part of the result, so that per-pair cost has been dropped.
    sims = {}
    sims['WIN_SIM'] = self.cos_sim(m0['win_tf_idf'], m1['win_tf_idf'])
    sims['win_SIMVe'] = self.cos_sim(
        self.extractTF_IDF(m0, 'win'), self.extractTF_IDF(m1, 'win'))
    return sims
def string_compare(str1, str2, method='JARO'):
    '''
    (string, string, string) -> number

    Compare str1 and str2 with the measure selected by method:

    "JARO" -- result of jellyfish.jaro_distance on the strings as given.
    "LEV"  -- Levenshtein distance (an integer >= 0) between the lowercased
              strings.

    Any other method prints an error message and returns None.
    '''
    if method == "JARO":
        return jellyfish.jaro_distance(str1, str2)
    if method == "LEV":
        return jellyfish.levenshtein_distance(str1.lower(), str2.lower())
    print("ERROR: Choose the right string similarity measure : LEV or JARO")
def calculator(aid, pid):
    """Edit distance between an author's affiliation and the paper's record.

    Looks up the author row for ``aid`` and the paper-author row for
    ``(pid, aid)`` and compares their affiliation fields after ASCII
    transliteration and lowercasing.

    :param aid: author id.
    :param pid: paper id.
    :return: the Levenshtein distance between the two affiliations, or
        ``np.nan`` when either row is missing or either affiliation is the
        empty string.
    """
    a_row = authors.get(aid)
    pa_row = paper_authors.get(pid, aid)
    if a_row is None or pa_row is None:
        return np.nan

    author_aff = a_row[Authors.IDX_AFF]
    paper_aff = pa_row[PaperAuthors.IDX_AFF]
    # Each affiliation is compared to '' on its own: wrapping the `or` in
    # parentheses before the comparison would let an empty author
    # affiliation slip through.
    if author_aff == '' or paper_aff == '':
        return np.nan

    return levenshtein_distance(
        unidecode(author_aff).lower(),
        unidecode(paper_aff).lower(),
    )
def distance(string_1, string_2):
    """Return several string distance/similarity measures as a JSON response.

    Each measure is called with ``(string_1, string_2)`` and reported under
    its own key in the object passed to ``jsonify``.
    """
    measures = (
        ("levenshtein", jellyfish.levenshtein_distance),
        ("damerau-levenshtein", jellyfish.damerau_levenshtein_distance),
        ("jaro", jellyfish.jaro_distance),
        ("jaro-winkler", jellyfish.jaro_winkler),
        ("match_rating_codex", jellyfish.match_rating_comparison),
        ("sift3", pymailcheck.sift3_distance),
    )
    payload = {}
    for label, measure in measures:
        payload[label] = measure(string_1, string_2)
    return jsonify(payload)
def levenshteincmpr(string, list):
    """Find the entry of ``list`` closest to ``string`` by Levenshtein distance.

    Both ``string`` and every entry are converted with ``str``, lowercased,
    passed through ``strip_name`` and stripped of surrounding whitespace
    before being compared.  When ``options['Global']['debug']`` equals 1 the
    progress of the comparison is printed.

    :param string: the name to look up.
    :param list: candidate names.
    :return: ``False`` when ``list`` is empty; otherwise a dict with the
        smallest distance under ``'lev'`` and the normalized text of the first
        entry reaching that distance under ``'title'``.
    """
    if len(list) == 0:
        return False

    wanted = strip_name(str(string).lower()).strip()
    verbose = options['Global']['debug'] == 1
    shortest = 999999999
    for item in list:
        if verbose:
            print(".....Literating through {}".format(item))
        contender = strip_name(str(item).lower()).strip()
        gap = levenshtein_distance(contender, wanted)
        if verbose:
            print("..........file <{}> vs imdb <{}> gave {} levenshtein distance".format(wanted, contender, gap))
        # Strict comparison: on a tie the earlier entry is kept.
        if gap < shortest:
            shortest = gap
            closest = contender
    return {'lev': shortest, 'title': closest}
def test_edit_dist(x):
    """Time three edit-distance implementations over ``int(x)`` repetitions.

    The same pair of digit sequences is compared with
    ``jellyfish.levenshtein_distance``, with ``Levenshtein.editops`` followed
    by ``Levenshtein.distance``, and with ``align.Align.edit_dist``.  The
    last two results and their elapsed times are printed; the jellyfish loop
    is timed but its result and timing are not reported.
    """
    s1 = '12012014321231200112211'
    s2 = '1300201231200112211'
    # The integer sequences are the digits of the two strings above.
    seq1 = [int(ch) for ch in s1]
    seq2 = [int(ch) for ch in s2]

    # Points come in vertical pairs (y = 0 and y = 1) sharing an x value, so
    # the two points of each pair are nearest neighbours of one another.
    pos = np.asarray(
        [[col, row] for col in (0, 2, 4, 6, 8, 9, 10) for row in (0, 1)],
        dtype=float)
    #modify this to ensure it is a non-connected k-nn
    nn = distance.ann(pos, 1)[1][:, 1:]
    k = 0
    rp = 1
    costs = {
        'M': lambda x: 0,
        'I': lambda x: 1,
        'D': lambda x: 1,
        'S': lambda x: 2,
        'P': lambda x: 0.5,
    }
    aligner = align.Align(costs, rp, nn, k)

    editdist_result = 0
    # When no repetition runs, the cost table itself is what gets printed as
    # the sequence result.
    seq_result = costs
    reps = int(x)

    jelly_start = time.time()
    for _ in range(reps):
        jellyfish.levenshtein_distance(s1, s2)
    jelly_end = time.time()

    lev_start = time.time()
    for _ in range(reps):
        Levenshtein.editops(s1, s2)
        editdist_result = Levenshtein.distance(s1, s2)
    lev_end = time.time()

    seq_start = time.time()
    for _ in range(reps):
        seq_result = aligner.edit_dist(seq1, seq2)
    seq_end = time.time()

    print('editdist dist = %s' % editdist_result)
    print('seq edit dist = %s' % seq_result)
    print('editdist runtime is %s seconds' % (lev_end - lev_start))
    # NOTE(review): this line reports the Align runtime under a
    # 'seq edit dist' label -- confirm whether that is intended.
    print('seq edit dist = %s' % (seq_end - seq_start))
def diff_string(string1, string2, algorithm="RO"):
    """Similarity of two strings, where 1 means identical and 0 means no match.

    ``algorithm`` selects the measure:

    * ``"RO"`` (default) -- Ratcliff-Obershelp, via ``SequenceMatcher.ratio``.
    * ``"LE"`` -- Levenshtein distance, rescaled to the same 0..1 scale by
      dividing by the length of the longer string and subtracting from 1.

    Any other value raises ``Exception``.
    """
    if algorithm == "RO":
        return SequenceMatcher(None, string1, string2).ratio()

    if algorithm == "LE":
        edits = jf.levenshtein_distance(string1, string2)
        # Identical strings (including two empty ones) short-circuit here,
        # which also avoids dividing by a zero length.
        if edits == 0:
            return 1
        longest = max(len(string1), len(string2))
        return 1 - float(edits) / longest

    raise Exception("Wrong algorithm chosen for difference match:" + algorithm)
def results(self, query):
    """Return reconciliation candidates for a legislator name query.

    Legislators whose ``full_name`` contains ``query['query']`` as a
    case-insensitive substring are looked up, optionally filtered by the
    ``state`` and ``chamber`` properties of the query.

    Scoring:

    * exactly one legislator found -- score 100 and ``match`` is True;
    * ``last_name`` equals the query exactly -- score 90;
    * otherwise ``100.0 / (1 + d)``, where ``d`` is the Levenshtein distance
      between the lowercased full name and the lowercased query.

    :param query: dict with a ``'query'`` string and an optional
        ``'properties'`` list of ``{'pid': ..., 'v': ...}`` dicts.
    :return: list of candidate dicts sorted by descending score; candidates
        with equal scores keep the order in which they were found.
    """
    term = query['query']

    # Escape the term so regex metacharacters typed by the user are matched
    # literally instead of breaking or altering the pattern.
    pattern = re.compile(".*%s.*" % re.escape(term), re.IGNORECASE)
    spec = {'full_name': pattern}

    for prop in query.get('properties', []):
        # Allow filtering by state or chamber for now
        if prop['pid'] in ('state', 'chamber'):
            spec[prop['pid']] = prop['v']

    legislators = db.legislators.find(spec)
    # Ask for the count once rather than once per legislator.
    single_hit = legislators.count() == 1

    results = []
    for leg in legislators:
        if single_hit:
            match = True
            score = 100
        else:
            match = False
            if leg['last_name'] == term:
                score = 90
            else:
                distance = levenshtein_distance(leg['full_name'].lower(),
                                                term.lower())
                score = 100.0 / (1 + distance)

        # Note: There's a bug in Refine that causes reconciliation
        # scores to be overwritten if the same legislator is returned
        # for multiple queries. see:
        # http://code.google.com/p/google-refine/issues/detail?id=185
        results.append({"id": leg['_id'],
                        "name": leg['full_name'],
                        "score": score,
                        "match": match,
                        "type": [
                            {"id": "/openstates/legislator",
                             "name": "Legislator"}]})

    # key/reverse sorting is stable, so ties stay in retrieval order.
    return sorted(results, key=lambda item: item['score'], reverse=True)