def generate_dataframe_with_scores2(self, data, tokens, question_category):
    data_df = pd.DataFrame(columns=["url", "score", "category"])
    for t in tokens:
        # print("Generating for token:", t)
        for (url, keywords, category) in data:
            text = ""
            for word in keywords:
                text += word
            text = text.replace("'", "").replace(",", "")
            # print(text)
            data_df = data_df.append(
                {
                    'url': url,
                    # 'keywords': text,
                    'score': round(lev.jaro(os.path.basename(url), t), 3) * 100 + 20
                             if not category == question_category
                             else round(lev.jaro(os.path.basename(url), t), 3) * 100,
                    'category': category
                },
                ignore_index=True)
    data_df = data_df.sort_values(by=['score'], ascending=False)
    return data_df
def calculateMethoddistance1(methodinfo1, methodinfo2):
    distance1_singlemethodlist = []
    distance1_singlemethodlist.append(Levenshtein.jaro(methodinfo1.getMethodname(), methodinfo2.getMethodname()))
    distance1_singlemethodlist.append(Levenshtein.jaro(methodinfo1.getReturntype(), methodinfo2.getReturntype()))
    if methodinfo1.getTotalparameter() != 0:
        distance1_singlemethodlist.append(
            abs(methodinfo1.getTotalparameter() - methodinfo2.getTotalparameter()) / methodinfo1.getTotalparameter())
        distance1_singlemethodlist.append(
            calculateparameter(methodinfo1, methodinfo2)
            / (methodinfo1.getTotalparameter() + methodinfo2.getTotalparameter()))
    elif methodinfo2.getTotalparameter() == 0:
        distance1_singlemethodlist.append(0)
        distance1_singlemethodlist.append(0)
    else:
        distance1_singlemethodlist.append(1)
        distance1_singlemethodlist.append(1)
    if methodinfo1.getMethodLOC() != 0:
        distance1_singlemethodlist.append(
            abs(methodinfo1.getMethodLOC() - methodinfo2.getMethodLOC()) / methodinfo1.getMethodLOC())
    elif methodinfo2.getMethodLOC() == 0:
        distance1_singlemethodlist.append(0)
    else:
        distance1_singlemethodlist.append(1)
    distance1_method = 0.0
    for i in distance1_singlemethodlist:
        distance1_method = distance1_method + i / 5
    return distance1_method
def getTreffer(self):
    # returns a list of ids
    # print "get Treffer"
    daten = self.Datenbank.getDataAsList(
        "select deutsch, fremd from vokabeln where id like " + str(self.ids))
    # print "comparison between " + str(daten[0][1]) + " and " + str(self.wort)
    if self.richtung == 1:
        if leve.distance(daten[0][1], self.wort) <= int(self.distanz) \
                and leve.jaro(daten[0][1], self.wort) > round((self.minTreffer / 100), 2):
            self.direktTreffer = True
            # print self.ids
            return [self.ids]
    else:
        if leve.distance(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) <= int(self.distanz) \
                and leve.jaro(self.Vergeleichsfaehigkeit(daten[0][0]), self.Vergeleichsfaehigkeit(self.wort)) > round((self.minTreffer / 100), 2):
            # print "Levenshtein comparison between " + str(daten[0][0]) + " and " + str(self.wort)
            self.direktTreffer = True
            # print self.id
            return [self.ids]
    rueckgabe = []
    for i in self.liste:
        # print "current comparison " + unicode(i[0]) + " and " + unicode(self.wort)
        if leve.distance(i[0], self.wort) <= int(self.distanz) and leve.jaro(i[0], self.wort) > 0.7:
            rueckgabe.append(i[1])
    return rueckgabe
def prettyprint(self):
    print "Timestamp: " + self.data["timeseed"]
    print "Expected Data: " + self.data["expected_data"]
    print "PSK31 Data: " + self.data["psk_data"]
    print "PSK31 Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"], self.data["psk_data"]))
    print "DOMEX8 Data: " + self.data["domex_data"]
    print "DOMEX Jaro Dist: " + str(Levenshtein.jaro(self.data["expected_data"], self.data["domex_data"]))
def test_compare_implementations():
    # Compare the implementations of python-Levenshtein to our
    # pure-Python implementations
    if Levenshtein is False:
        raise unittest.SkipTest
    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
    # Test on random strings
    for string1, string2 in _random_string_pairs(n_pairs=50):
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))
def top_sample(ratio=0.2):
    data_cases = []
    names_cases = []
    data_controls = []
    names_controls = []
    path_cases = run_path + "/cases_encoding_str.txt"
    f = open(path_cases, 'r', encoding="UTF-8")
    for line in f:
        data_cases.append(line.split(":")[-1])
        names_cases.append(line.split(":")[0])
    f.close()
    acc_cases = []
    for d in data_cases:
        total = 0
        for ds in data_cases:
            total += Levenshtein.jaro(d, ds)
        result = total / len(data_cases)
        acc_cases.append(result)
    result = dict(zip(names_cases, acc_cases))
    result = sorted(result.items(), key=lambda x: -x[-1])
    number = int(len(data_cases) * ratio)
    # low = int(number * (0.5 - ratio / 2))
    # high = int(number * (0.5 + ratio / 2))  # for taking the middle band (median) instead
    f = open(run_path + "/top_cases.csv", "w", encoding="UTF-8")
    first_line = "name,acc\n"
    f.write(first_line)
    for a in range(number):
        result_tmp = "%s,%.4f\n" % (result[a][0], result[a][1])
        print(result_tmp)
        f.write(result_tmp)
    f.close()
    path_cases = run_path + "/controls_encoding_str.txt"
    f = open(path_cases, 'r', encoding="UTF-8")
    for line in f:
        data_controls.append(line.split(":")[-1])
        names_controls.append(line.split(":")[0])
    f.close()
    acc_controls = []
    for d in data_controls:
        total = 0
        for ds in data_controls:
            total += Levenshtein.jaro(d, ds)
        result = total / len(data_controls)
        acc_controls.append(result)
    result = dict(zip(names_controls, acc_controls))
    result = sorted(result.items(), key=lambda x: -x[-1])
    number = int(len(data_controls) * ratio)
    # low = int(number * (0.5 - ratio / 2))
    # high = int(number * (0.5 + ratio / 2))  # for taking the middle band (median) instead
    f = open(run_path + "/top_controls.csv", "w", encoding="UTF-8")
    first_line = "name,acc\n"
    f.write(first_line)
    for a in range(number):
        result_tmp = "%s,%.4f\n" % (result[a][0], result[a][1])
        print(result_tmp)
        f.write(result_tmp)
    f.close()
    return True
def __filter_res(self, itunes_res_list, music_tag):
    # print("Music Info:\n File: " + music_tag_dict[gl.FILE_PATH] + "\nAlbum: " + music_album + "\n Artist: " + music_album_artist + '\n Track Artist: ' + music_track_artist + '\n')
    # Best match so far
    best_match = {}
    # Best match ratio so far
    best_ratio = 0.0
    for dic in itunes_res_list:
        ratio_album = 0.0
        ratio_artist = 0.0
        ratio_track_artist = 0.0
        if music_tag[music.ALBUM] != "":
            ratio_album = Levenshtein.jaro(
                music_tag[music.ALBUM],
                process_album_info(dic.get(self.ITUNES_COLLECTION_NAME, ""))
            )
            # print("Apple Info: \n Album:" + process_album_info(dic['collectionName']) + "\n")
        if music_tag[music.ALBUM_ARTIST] != "":
            ratio_artist = Levenshtein.jaro(
                music_tag[music.ALBUM_ARTIST],
                process_artist_info(dic.get(self.ITUNES_ARTIST_NAME, ""))
            )
            # print("Artist: " + dic['artistName'] + "\n")
        if music_tag[music.ARTIST] != "":
            ratio_track_artist = Levenshtein.jaro(
                music_tag[music.ARTIST],
                process_artist_info(dic.get(self.ITUNES_ARTIST_NAME, ""))
            )
        ratio_artist = ratio_artist if ratio_artist > ratio_track_artist else ratio_track_artist
        cur_ratio = (ratio_album + ratio_artist) / 2
        if cur_ratio > best_ratio:
            best_ratio = cur_ratio
            best_match = dic
    # If the best ratio exceeds the user-configured threshold, treat it as a hit
    if best_ratio > self._trust_prob:
        return best_match
    else:
        return {}
def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)
    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)
    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))
    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie
            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(),
                                      clf.decision_function(temp), clf.predict(temp)))
            f.write("%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n" % (
                dvd.rstrip(), maxMovie.rstrip(),
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
                clf.decision_function(temp), clf.predict(temp),
            ))
def similarity(str1, str2):
    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    # print('difflib similarity1: ', ratio)
    # A result above 0.6 counts as very similar; the synonym dictionary is
    # currently built automatically on top of this similarity.
    # 3. Edit distance: the minimum number of operations (insert, delete,
    # substitute) needed to turn one string into the other
    sim1 = Levenshtein.distance(str1, str2)
    # print('Levenshtein similarity: ', sim1)
    # 4. Levenshtein ratio
    sim2 = Levenshtein.ratio(str1, str2)
    # print('Levenshtein.ratio similarity: ', sim2)
    # 5. Jaro similarity
    sim3 = Levenshtein.jaro(str1, str2)
    # print('Levenshtein.jaro similarity: ', sim3)
    # 6. Jaro-Winkler similarity
    sim4 = Levenshtein.jaro_winkler(str1, str2)
    # print('Levenshtein.jaro_winkler similarity: ', sim4)
    if ratio > 0.6 or sim1 < 50 or ((sim2 + sim3 + sim4) / 3) > 0.8:
        return True
def doCompare(str1, str2):
    print(u"%s - %s similarity comparison" % (str1, str2))
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()  # similarity, maximum is 1
    print(u"difflib similarity: %s" % ratio)
    # Edit distance: smaller is better, but it does not reflect
    # the similarity of long strings well
    sim = Levenshtein.distance(str1, str2)
    print(u"Levenshtein edit distance: %s" % sim)
    # In tests this matches difflib
    levenRatio = Levenshtein.ratio(str1, str2)
    print(u"Levenshtein ratio: %s" % levenRatio)
    # In tests this matches difflib
    sim = Levenshtein.seqratio(str1, str2)
    print(u"Levenshtein sequence similarity: %s" % sim)  # larger is better, maximum is 1
    jaro = Levenshtein.jaro(str1, str2)
    print(u"Levenshtein Jaro similarity: %s" % jaro)
    # Larger is better; identical strings score 1. We have often seen strings
    # that are not identical still score 1 -- e.g. if the only differences are
    # digits and symbols, the result can still be 1.
    jaroWinkler = Levenshtein.jaro_winkler(str1, str2)
    print(u"Levenshtein Jaro-Winkler similarity: %s" % jaroWinkler)
    print("\n")
def get_closest_email(self, emails, entity):
    """
    Compares and scores each email in an array against the entity provided
    by Amazon Lex. Returns a string containing the best-matching email.

    :param emails: an array of emails returned by get_emails()
    :type emails: array
    :param entity: an entity derived from the user's input which originates
        from Amazon's Lex services
    :type entity: str
    """
    # Create an empty DataFrame with column names only
    df = pd.DataFrame(columns=['entity', 'score'])
    # Loop through each email...
    for email in emails:
        # Append a new row: (email, comparison between this email and the entity)
        df = df.append(
            {
                'entity': email,
                'score': round(SequenceMatcher.jaro(email, entity), 3) * 1000
            },
            ignore_index=True)
    # Sort the dataframe from highest to lowest score
    df = df.sort_values(by=['score'], ascending=False)
    # Build the answer string with the top-scoring email
    answer = f"\nHere are a few possible answers:\n{df.iloc[0]['entity']}"
    return answer
def extract_features(document_tfidf, question_tfidf, answer_tfidf,
                     document, question, answer):
    qa_cos_d = spatial.distance.cosine(question_tfidf, answer_tfidf)
    qd_cos_d = spatial.distance.cosine(question_tfidf, document_tfidf)
    ad_cos_d = spatial.distance.cosine(answer_tfidf, document_tfidf)
    qa_euc_d = np.linalg.norm(question_tfidf - answer_tfidf)
    qd_euc_d = np.linalg.norm(question_tfidf - document_tfidf)
    ad_euc_d = np.linalg.norm(answer_tfidf - document_tfidf)
    qa_lev_d = Levenshtein.distance(question, answer)
    qa_lev_r = Levenshtein.ratio(question, answer)
    qa_jar_s = Levenshtein.jaro(question, answer)
    qa_jaw_s = Levenshtein.jaro_winkler(question, answer)
    qa_tfidf_score = np.sum(question_tfidf * answer_tfidf.T)
    qd_tfidf_score = np.sum(question_tfidf * document_tfidf.T)
    ad_tfidf_score = np.sum(answer_tfidf * document_tfidf.T)
    document_tfidf_sum = np.sum(document_tfidf)
    question_tfidf_sum = np.sum(question_tfidf)
    answer_tfidf_sum = np.sum(answer_tfidf)
    f = [
        qa_cos_d, qd_cos_d, ad_cos_d,
        qa_euc_d, qd_euc_d, ad_euc_d,
        qa_lev_d, qa_lev_r, qa_jar_s, qa_jaw_s,
        qa_tfidf_score, qd_tfidf_score, ad_tfidf_score,
        document_tfidf_sum, question_tfidf_sum, answer_tfidf_sum
    ]
    return f
def matchKeyWords(path, keys, keysWeight, colsWeight, ansNum):
    # Read the relevant information from the Excel file
    data = xlrd.open_workbook(path)
    sheet1 = data.sheet_by_name('sheet1')
    rowsNum = sheet1.nrows  # total number of rows
    colsNum = 3
    # Initialize all values to zero
    value = [0 for x in range(0, rowsNum)]
    # Match each keyword against every string in the sheet
    for i in range(0, 5):
        key = keys[i]
        for row in range(1, rowsNum):
            arrRow = sheet1.row_values(row)  # the strings in row `row`
            for col in range(0, 3):
                # Add the weighted similarity to the row's score
                value[row] += Levenshtein.jaro(key, arrRow[col]) * colsWeight[col] * keysWeight[i]
    # Pair each row index with its value and sort by value, descending
    ans = []
    for i in range(0, rowsNum):
        ans.append((value[i], i))
    ans = sorted(ans, reverse=True)
    for i in range(0, ansNum):
        print(ans[i][1])
def __get_suggest(self, word, rating_limit, count):
    word_len = str(len(word) / 2)
    trigrammed_word = '"{}"/1'.format(trigram(word))
    self.__configure(SphinxConfig.index_sugg, word_len)
    result = self.client_sugg.Query(trigrammed_word, SphinxConfig.index_sugg)
    # If no suggestions were found for this word (can that even happen?),
    # return []
    if not result['matches']:
        return []
    maxrank = result['matches'][0]['attrs']['krank']
    maxleven = None
    outlist = list()
    for match in result['matches']:
        if len(outlist) >= count:
            break
        if maxrank - match['attrs']['krank'] < self.default_rating_delta:
            jaro_rating = Levenshtein.jaro(word, match['attrs']['word'])
            if not maxleven:
                maxleven = jaro_rating - jaro_rating * self.regression_coef
            if jaro_rating >= rating_limit and jaro_rating >= maxleven:
                outlist.append([match['attrs']['word'], jaro_rating])
            del jaro_rating
    outlist.sort(key=lambda x: x[1], reverse=True)
    return outlist
def calculateD(example):
    '''
    Compute several distances.
    :param example:
    :param request_template:
    :return: the weighted distance against every template; the weighting here
        is simple, just an average
    '''
    sim_all = []
    for request_M in request_template:
        if example != request_M['request_data']:
            sim = {'hamming': 0, 'distance': 0, 'Leven': 0, 'jaro': 0,
                   'jaro_winkler': 0, 'function': request_M['function'], 'sum': 0}
            sim['distance'] = 1 / Levenshtein.distance(example, request_M['request_data'])
            sim['Leven'] = Levenshtein.ratio(example, request_M['request_data'])
            sim['jaro'] = Levenshtein.jaro(example, request_M['request_data'])
            sim['jaro_winkler'] = Levenshtein.jaro_winkler(example, request_M['request_data'])
            try:
                sim['hamming'] = 1 / Levenshtein.hamming(example, request_M['request_data'])
            except ValueError:
                sim['hamming'] = 0
            sim['sum'] = (sim['hamming'] + sim['distance'] + sim['Leven']
                          + sim['jaro'] + sim['jaro_winkler']) / 5
            sim_all.append(sim)
        else:
            return [{'hamming': 1, 'distance': 1, 'Leven': 1, 'jaro': 1,
                     'jaro_winkler': 1, 'function': request_M['function'], 'sum': 1}]
    return sim_all
def similarity(str1, str2):
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    sim3 = Levenshtein.jaro(str1, str2)
    sim4 = jaro.jaro_metric(str1, str2)
    if ratio > 0.731104540194254 and (sim3 + sim4) / 2 > 0.7890962851907381:
        return True
def find(s):
    getVec = 0
    query = 0
    linearSearch = 0
    t1 = time.time()
    sv = getvec(s)
    t2 = time.time()
    res = lsh.query(sv, num_results=20)
    t3 = time.time()
    resList = []
    choice = (0, 'none')
    aboveThresh = 0
    for r in res:
        resList.append([nameDict[toStr(r[0])], r[1]])
    t4 = time.time()
    if len(resList) >= 1:
        rlen = len(resList)
        for i in range(rlen):
            candidate = resList[i][0][0]
            resList[i].append(Levenshtein.jaro(candidate, s))
        if len(resList) > 1:
            resList = sorted(resList, reverse=True, key=distSort)
        choice = (resList[0][2], resList[0][0][0])  # choice = (dist, name)
        if choice[0] >= thresh:
            aboveThresh = 1
    t5 = time.time()
    getVec = (t2 - t1)
    query = (t3 - t2)
    linearSearch = (t5 - t4)
    timeList = [getVec, query, linearSearch]
    return (aboveThresh, choice, timeList)
def rec_results_parser(tbody, rec_BDMC):
    """ Returns the index of the closest song name using Jaro similarity """
    tr = tbody.findAll('tr')
    # print tr
    rec_BDMC = unicode(rec_BDMC)
    print '\n', rec_BDMC
    ratios = []
    for i, entries in enumerate(tr[1:]):
        entry = entries.findAll('td')
        sco = entry[0].text
        rec = entry[1].text
        art = entry[3].text
        rel = entry[4].text
        if sco == str(100):
            ratio = l.jaro(rec_BDMC, rec)
            ratios.append(ratio)
            # print sco, '\t', rec, ratio, '\t', art, '\t', rel
    idx = ratios.index(max(ratios))
    return idx
def getMatches(name, inDict=False):
    name = name.lower()
    ti = time.time()
    aboveThresh = 1
    tup = tuple(lsh.hshingle(name, num_shingles))
    sig = c.signer.sign(tup)
    resSet = set()
    choice = (0, "none")
    matchList = []
    for band_inx, hshval in enumerate(c.hasher.hash(sig)):
        for h in c.hashmaps[band_inx][hshval]:
            resSet.add(h)
    for r in resSet:
        sim = lev.jaro(r, name)
        if sim > .7:
            matchList.append((sim, r))
    dt = time.time() - ti
    if len(matchList) > 0:
        matchList = sorted(matchList, reverse=True, key=simSort)
        choice = tuple(matchList[0])
        if inDict and choice[0] == 1:
            # print "skipping match"
            choice = tuple(matchList[int(inDict)])
    if choice[0] < thresh:
        aboveThresh = 0
    return (aboveThresh, choice, [0, dt, 0])
def check_cons(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio > .6 or jaro > .7 or jaro_winkler > .7:
        return True
    else:
        return False
def check_sure(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
        return True
    else:
        return False
def gensamples(
        skips, k, batch_size, short, temperature, use_unk, model, sequence,
        data, idx2word, maxlen, maxlenh, maxlend, oov0, glove_idx2idx,
        vocab_size, nb_unknown_words):
    """Generate text samples."""
    X_test, Y_test = data  # unpack data
    i = random.randint(0, len(X_test) - 1)
    print('HEAD:', ' '.join(idx2word[w] for w in Y_test[i][:maxlenh]))
    print('DESC:', ' '.join(idx2word[w] for w in X_test[i][:maxlend]))
    sys.stdout.flush()
    print('HEADS:')
    x = X_test[i]
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend, len(x)), max(maxlend, len(x)),
                      abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s], maxlend, eos)
        fold_start = vocab_fold(start, oov0, glove_idx2idx, vocab_size, nb_unknown_words)
        sample, score = beamsearch(
            predict=keras_rnn_predict,
            start=fold_start,
            k=k,
            maxsample=maxlen,
            empty=empty,
            eos=eos,
            temperature=temperature,
            use_unk=use_unk,
            nb_unknown_words=nb_unknown_words,
            vocab_size=vocab_size,
            model=model,
            maxlen=maxlen,
            maxlend=maxlend,
            sequence=sequence,
            batch_size=batch_size
        )
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s, start, scr) for s, scr in zip(sample, score)]
    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample, oov0)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print(score, ' '.join(words))
        else:
            print(score, ' '.join(words))
        codes.append(code)
def get_simhash_dis(str1, str2):
    """Compute a blended simhash-based similarity between two texts."""
    simhash_str1 = simhash.Simhash(str1)
    simhash_str2 = simhash.Simhash(str2)
    dis_simhash = 1 - simhash_str1.distance(simhash_str2) / 64
    dis_ratio = Levenshtein.ratio(str1, str2)
    dis_jaro = Levenshtein.jaro(str1, str2)
    res = (dis_simhash + dis_ratio + dis_jaro) / 3
    return res
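# A quick sanity check for get_simhash_dis above -- a hedged sketch, assuming
# the `simhash` and `python-Levenshtein` packages are installed and that the
# function above is in scope; the sample sentences are made up.
import Levenshtein
import simhash

a = "the quick brown fox jumps over the lazy dog"
b = "the quick brown fox jumped over a lazy dog"
print(get_simhash_dis(a, a))  # identical texts: all three components are 1, so 1.0
print(get_simhash_dis(a, b))  # near-duplicates: close to 1.0
print(get_simhash_dis(a, "completely unrelated text"))  # much lower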
def gensamples(skips=2, k=10, batch_size=constants.BATCH_SIZE, short=True,
               temperature=1., use_unk=False):
    i = random.randint(0, len(X_test) - 1)
    # print('DESC:', ' '.join(index2word[w] for w in Y_test[i][:DESC_SEQ_LEN]))
    # print('CONTENT:', ' '.join(index2word[w] for w in X_test[i][:CONTENT_SEQ_LEN]))
    sys.stdout.flush()
    print('DESCRIPTION:')
    x = X_test[i]
    samples = []
    if CONTENT_SEQ_LEN == 0:
        skips = [0]
    else:
        skips = range(min(CONTENT_SEQ_LEN, len(x)), max(CONTENT_SEQ_LEN, len(x)),
                      abs(CONTENT_SEQ_LEN - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start)
        print('Length of list of foldstart: ', len(list(fold_start)))
        sample, score = beamsearch(
            predict=keras_rnn_predict,
            start=fold_start,
            k=k,
            temperature=temperature,
            use_unk=use_unk)  # k=10, use_unk=False, temperature=1.
        try:
            assert all(s[CONTENT_SEQ_LEN] == constants.eos for s in sample)
        except:
            print("Assertion error in gensamples ---- proceed")
        samples += [(s, start, scr) for s, scr in zip(sample, score)]
    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample)[len(start):]
        for w in sample:
            if w == constants.eos:
                break
            words.append(index2word[w])
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print(score, ' '.join(words))
                # print '%s (%.2f) %f' % (' '.join(words), score, distance)
        else:
            print(score, ' '.join(words))
        codes.append(code)
def mysimilar():
    import difflib
    import Levenshtein as ls
    str1 = "我的骨骼雪白 也长不出青稞"
    str2 = "雪的日子 我只想到雪中去si"
    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    ratio = seq.ratio()
    print('difflib similarity1: ', ratio)
    # difflib, ignoring characters we do not want to compare
    seq = difflib.SequenceMatcher(lambda x: x in ' 我的雪', str1, str2)
    ratio = seq.ratio()
    print('difflib similarity2: ', ratio)
    # 2. Hamming distance: str1 and str2 must have equal length; counts the
    # positions at which the two equal-length strings differ
    # sim = ls.hamming(str1, str2)
    # print 'hamming similarity: ', sim
    # 3. Edit distance: the minimum number of insert/delete/substitute
    # operations needed to turn one string into the other
    sim = ls.distance(str1, str2)
    print('ls similarity: ', sim)
    # 4. Levenshtein ratio
    sim = ls.ratio(str1, str2)
    print('ls.ratio similarity: ', sim)
    # 5. Jaro similarity
    sim = ls.jaro(str1, str2)
    print('ls.jaro similarity: ', sim)
    # 6. Jaro-Winkler similarity
    sim = ls.jaro_winkler(str1, str2)
    print('ls.jaro_winkler similarity: ', sim)
def test(name, key):
    """
    Test whether the two names are identical.
    :param name: one of the names
    :param key: the other name
    :return: 1 if Levenshtein.jaro scores the pair as identical
    """
    if Levenshtein.jaro(name, key) == 1:
        return 1
def check_beli(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio >= 0.9 or jaro >= 0.9 or jaro_winkler >= 0.9:
        return True
    elif ratio >= .7 and jaro >= .8 and jaro_winkler >= .8:
        return True
    else:
        return False
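# The three checks above (check_cons, check_sure, check_beli) read like
# successively stricter confidence tiers over the same three metrics. A hedged
# sketch of how they might be combined -- the tier labels are illustrative,
# not from the source:
def match_confidence(name1, name2):
    if check_sure(name1, name2):    # all metrics very high
        return "sure"
    if check_beli(name1, name2):    # one metric >= 0.9, or all moderately high
        return "believable"
    if check_cons(name1, name2):    # any metric above a loose floor
        return "considerable"
    return "none"

# e.g. match_confidence("Jonathan Smith", "Jonathon Smith") likely lands in a high tier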
def match_two_list_jaro(listx, listy):
    res = {}
    for animLine in listx:
        animTerm = animLine.split(";")[1]
        for dbpediaLine in listy:
            dbpediaTerm = dbpediaLine.split(";")[1]
            if Levenshtein.jaro(animTerm, dbpediaTerm) > 0.83 \
                    and len(animTerm) > 4 and len(dbpediaTerm) > 4:
                res[animLine.split(";")[0]] = dbpediaLine.split(";")[0]
    return res
def autocomplete(string, sl):
    bestMatch = None
    value = 0
    for item in sl.items:
        tmpValue = Levenshtein.jaro(string.upper(), itemStr(item).upper())
        if not value or tmpValue > value:
            value = tmpValue
            bestMatch = item
    return bestMatch, value
def gensamples(X=None, X_test=None, Y_test=None, avoid=None, avoid_score=1,
               skips=2, k=10, batch_size=batch_size, short=True,
               temperature=1., use_unk=True):
    if X is None or isinstance(X, int):
        if X is None:
            i = random.randint(0, len(X_test) - 1)
        else:
            i = X
        print 'HEAD %d:' % i, ' '.join(idx2word[w] for w in Y_test[i])
        print 'DESC:', ' '.join(idx2word[w] for w in X_test[i])
        sys.stdout.flush()
        x = X_test[i]
    else:
        x = [word2idx[w.rstrip('^')] for w in X.split()]
    if avoid:
        # avoid is a list of avoids. Each avoid is a string or a list of word indices
        if isinstance(avoid, str) or isinstance(avoid[0], int):
            avoid = [avoid]
        avoid = [a.split() if isinstance(a, str) else a for a in avoid]
        avoid = [vocab_fold([w if isinstance(w, int) else word2idx[w] for w in a])
                 for a in avoid]
    print 'HEADS:'
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend, len(x)), max(maxlend, len(x)),
                      abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start)
        sample, score = beamsearch(predict=keras_rnn_predict, start=fold_start,
                                   avoid=avoid, avoid_score=avoid_score,
                                   k=k, temperature=temperature, use_unk=use_unk)
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s, start, scr) for s, scr in zip(sample, score)]
    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print score, ' '.join(words)
                # print '%s (%.2f) %f' % (' '.join(words), score, distance)
        else:
            print score, ' '.join(words)
        codes.append(code)
    return samples
def gensamples(skips=2, k=10, batch_size=batch_size, short=True,
               temperature=1., use_unk=True):
    i = random.randint(0, len(X_test) - 1)
    print 'HEAD:', ' '.join(idx2word[w] for w in Y_test[i][:maxlenh])
    print 'DESC:', ' '.join(idx2word[w] for w in X_test[i][:maxlend])
    sys.stdout.flush()
    print 'HEADS:'
    x = X_test[i]
    samples = []
    if maxlend == 0:
        skips = [0]
    else:
        skips = range(min(maxlend, len(x)), max(maxlend, len(x)),
                      abs(maxlend - len(x)) // skips + 1)
    for s in skips:
        start = lpadd(x[:s])
        fold_start = vocab_fold(start, vocab_size, glove_idx2idx)
        sample, score = beamsearch_t(start=fold_start, k=k, temperature=temperature,
                                     use_unk=use_unk, maxsample=maxlen,
                                     vocab_size=vocab_size, model=model,
                                     maxlen=maxlen, maxlend=maxlend,
                                     sequence=sequence)
        assert all(s[maxlend] == eos for s in sample)
        samples += [(s, start, scr) for s, scr in zip(sample, score)]
    samples.sort(key=lambda x: x[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        sample = vocab_unfold(start, sample, oov0)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(idx2word[w])
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            distance = min([100] + [-Levenshtein.jaro(code, c) for c in codes])
            if distance > -0.6:
                print score, ' '.join(words)
                # print '%s (%.2f) %f' % (' '.join(words), score, distance)
        else:
            print score, ' '.join(words)
        codes.append(code)
def mention_estimate(mention, mentions):
    best_href = ''
    if mention != '':
        # print mention
        best = 0.
        for href in mentions[mention]:
            l = Levenshtein.jaro(href[6:], mention)
            if l >= best:
                best = l
                best_href = href
    return best_href
def get_simm(str1, str2):
    # 1. difflib
    seq = difflib.SequenceMatcher(None, str1, str2)
    sim1 = seq.ratio()
    # 2. Levenshtein ratio
    sim2 = Levenshtein.ratio(str1, str2)
    # 3. Jaro similarity
    sim3 = Levenshtein.jaro(str1, str2)
    # 4. Jaro-Winkler similarity
    sim4 = Levenshtein.jaro_winkler(str1, str2)
    return (sim1 + sim2 + sim3 + sim4) / 4
def calculate_lev_distance(name_1, name_2, ln_length, jw):
    if jw:
        # print(name_2)
        # print(ln_length)
        if ln_length <= 5 and ln_length > 0:
            return Levenshtein.jaro_winkler(name_1, name_2, .1)
        else:
            return Levenshtein.jaro_winkler(name_1, name_2, .13)
    else:
        return Levenshtein.jaro(name_1, name_2)
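# Illustrates the third positional argument used above: python-Levenshtein's
# jaro_winkler accepts an optional prefix weight, so larger weights reward a
# shared prefix more. A hedged sketch; the sample names are made up.
import Levenshtein

print(Levenshtein.jaro("martinez", "martines"))               # plain Jaro
print(Levenshtein.jaro_winkler("martinez", "martines", .1))   # weight used above for short last names
print(Levenshtein.jaro_winkler("martinez", "martines", .13))  # heavier prefix bonus, used for longer ones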
def processSentence(sentence, i):
    newX = []
    for word in sentence.split(" "):
        if word in words:
            newX.append(words.index(word))
        else:
            jaros = [Levenshtein.jaro(word, w) for w in words]
            highest_index = jaros.index(max(jaros))
            newX.append(highest_index)
    newX = torch.tensor(newX).to(device).long()
    print(str(i / total_samples * 100.0) + "%\r", end="")
    return newX
def get_closest_match_leven(text, comparison_list, minimum_match_value):
    closest_match = ''
    closest_match_value = 0
    for comparison_text in comparison_list:
        temp_match_value = leven.jaro(text, comparison_text)
        if temp_match_value > closest_match_value:
            closest_match = comparison_text
            closest_match_value = temp_match_value
    if closest_match_value > minimum_match_value:
        return closest_match
    else:
        return ''
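# Example use of get_closest_match_leven above -- a hedged sketch; the
# candidate list and threshold are illustrative.
import Levenshtein as leven

candidates = ["alpha", "alpine", "albany"]
print(get_closest_match_leven("alpnie", candidates, 0.8))  # likely 'alpine'
print(get_closest_match_leven("zzz", candidates, 0.8))     # '' when nothing clears the bar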
def getSortedJaroScoreList(name, refIndexNames):
    scoredIndexedNames = []
    for r in refIndexNames:
        jaroTests = []
        test1 = Levenshtein.jaro(name['WrittenFirst'].lower().replace(' ', ''),
                                 r['YearbookFirst'].lower().replace(' ', ''))
        test2 = Levenshtein.jaro(name['WrittenLast'].lower().replace(' ', ''),
                                 r['YearbookLast'].lower().replace(' ', ''))
        jaroTests.append((test1, test2))
        test1 = Levenshtein.jaro(name['WrittenFirst'].lower().replace(' ', ''),
                                 r['YearbookLast'].lower().replace(' ', ''))
        test2 = Levenshtein.jaro(name['WrittenLast'].lower().replace(' ', ''),
                                 r['YearbookFirst'].lower().replace(' ', ''))
        jaroTests.append((test1, test2))
        test1 = Levenshtein.jaro((name['WrittenFirst'] + name['WrittenLast']).lower().replace(' ', ''),
                                 (r['YearbookFirst'] + r['YearbookLast']).lower().replace(' ', ''))
        test2 = test1
        jaroTests.append((test1, test2))
        test1 = Levenshtein.jaro((name['WrittenFirst'] + name['WrittenLast']).lower().replace(' ', ''),
                                 (r['YearbookLast'] + r['YearbookFirst']).lower().replace(' ', ''))
        test2 = test1
        jaroTests.append((test1, test2))
        jaroScore = max(map(lambda t: (t[0] + t[1]) / 2, jaroTests))
        spellingDict = {'Spelling': str(jaroScore)}
        spellingDict.update(r)
        # print(str(spellingDict) + '\n')
        scoredIndexedNames.append(spellingDict)
        if jaroScore == 1:
            # if we found an exact match, exit early
            break
    return sorted(scoredIndexedNames, key=lambda k: k['Spelling'], reverse=True)
def _find_song(name):
    max_similarity = -0.1
    ind = -1
    threshold = 0.5
    print("match with name %s" % name)
    for i in song_name_dict.keys():
        sim = leven.jaro(song_name_dict[i], name)
        print("similarity %f" % sim)
        if sim >= max_similarity:
            ind = i
            max_similarity = sim
    return ind if max_similarity >= threshold else None
def get_matches(needle, haystack, ratio=0.6):
    needle = unicode(needle)
    result = {}
    for s in haystack:
        if s != needle:
            assert unicode(s)
            distance = Levenshtein.jaro(needle, unicode(s))
            if distance > ratio:
                result[s] = distance
    return result
def getClosest(name):
    aboveThresh = 0
    choice = (0, 'none')
    ti = time.time()
    for n in nameList:
        dist = Levenshtein.jaro(n, name)
        if dist > choice[0]:
            choice = (dist, n)
    tf = time.time()
    dt = tf - ti
    if choice[0] >= thresh:
        aboveThresh = 1
    return (aboveThresh, choice, [0, dt, 0])
def search_vindicat(name):
    results = []
    url = 'https://vcat.pl/gielda-dlugow/oferty/api/?draw=2&columns[0][data]=title&columns[0][name]=&columns[0][searchable]=true&columns[0][orderable]=true&columns[0][search][value]=&columns[0][search][regex]=false&columns[1][data]=firm_name&columns[1][name]=&columns[1][searchable]=true&columns[1][orderable]=true&columns[1][search][value]={}&columns[1][search][regex]=false&columns[2][data]=city&columns[2][name]=&columns[2][searchable]=true&columns[2][orderable]=false&columns[2][search][value]=&columns[2][search][regex]=false&columns[3][data]=claim_type&columns[3][name]=&columns[3][searchable]=true&columns[3][orderable]=false&columns[3][search][value]=&columns[3][search][regex]=false&columns[4][data]=debts_sum&columns[4][name]=&columns[4][searchable]=true&columns[4][orderable]=false&columns[4][search][value]=0%2C0&columns[4][search][regex]=false&columns[5][data]=for_sale&columns[5][name]=&columns[5][searchable]=true&columns[5][orderable]=false&columns[5][search][value]=&columns[5][search][regex]=false&columns[6][data]=site_details&columns[6][name]=&columns[6][searchable]=true&columns[6][orderable]=false&columns[6][search][value]=&columns[6][search][regex]=false&order[0][column]=0&order[0][dir]=desc&start=0&length=10000&search[value]=&search[regex]=false&_=1615275634918'.format(
        name.split(' ')[1])
    data_json = requests.get(url).json()
    for elem in data_json['packages']:
        name_part = name.split(' ')[0]
        surname_part = name.split(' ')[1]
        if Levenshtein.jaro(elem['firm_name'], name) > 0.8 \
                and elem['firm_name'].find(surname_part) != -1:
            results.append(elem)
    return results
def valueOf(self, _word):
    if _word in self.aggSum:
        # Stop if the word already exists
        return self.aggSum[_word]
    else:
        prev = 0
        for i, word in enumerate(self.aggSum):
            # https://rawgit.com/ztane/python-Levenshtein/master/docs/Levenshtein.html#Levenshtein-jaro
            jaro = Levenshtein.jaro(_word, word)
            if jaro > prev:
                nArr = self.aggSum[word]
                prev = jaro
        return nArr
def getClosest(name):
    aboveThresh = 0
    choice = (0, 'none')
    ti = time.time()
    for n in surnames.dic.keys():
        dist = lev.jaro(n, name)
        if dist > choice[0]:
            choice = (dist, n)
    tf = time.time()
    dt = tf - ti
    if choice[0] >= thresh:
        aboveThresh = 1
    return (aboveThresh, choice, [0, dt, 0])
def distance_feature(self, df):
    column1, column2 = self.column1, self.column2
    columns = df.columns
    df['distance'] = df[[column1, column2]].apply(
        lambda x: Levenshtein.distance(x[column1], x[column2]), axis=1)
    df['ratio'] = df[[column1, column2]].apply(
        lambda x: Levenshtein.ratio(x[column1], x[column2]), axis=1)
    df['jaro'] = df[[column1, column2]].apply(
        lambda x: Levenshtein.jaro(x[column1], x[column2]), axis=1)
    df['jaro_winkler'] = df[[column1, column2]].apply(
        lambda x: Levenshtein.jaro_winkler(x[column1], x[column2]), axis=1)
    new_columns = list(set(df.columns) - set(columns))
    return df[new_columns]
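# A minimal driver for distance_feature above -- a sketch assuming the method
# lives on an object that stores the two column names; _Cfg is a hypothetical
# stand-in for that object, and the sample data is made up.
import pandas as pd
import Levenshtein

class _Cfg:
    column1 = "name_a"
    column2 = "name_b"

pairs_df = pd.DataFrame({"name_a": ["smith", "jones"],
                         "name_b": ["smyth", "johns"]})
# Calling the function with an explicit self works because it is a plain method
print(distance_feature(_Cfg(), pairs_df))  # distance / ratio / jaro / jaro_winkler columns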
def getEntities(sentence):
    tagged_sent = nltk.pos_tag(nltk.word_tokenize(sentence))
    tree = nltk.ne_chunk(tagged_sent)
    print tagged_sent
    print tree
    """ Entities is a list of lists to facilitate pairing with multiple
    entities in a sentence """
    entities = []
    for subtree in tree.subtrees():
        if subtree.node == 'PERSON':
            nelist = []
            for child in subtree:
                name, tag = child
                if tag == 'NNP':
                    nelist.append(name)
            if len(nelist) > 0:
                entities.append(nelist)
    actors = [line.split(" ", 1) for line in open('actors_index.txt').readlines()]
    print entities
    matches = {}
    for nelist in entities:
        for pair in pairs(nelist):
            first, last = pair
            name = first + ' ' + last
            print name
            ratios = []
            for id, actor in actors:
                s = actor.strip().replace(' ', '').split(',')
                if len(s) == 1:
                    actor = s[0]
                else:
                    actor = s[1] + ' ' + s[0]
                ratio = lev.jaro(name, actor)
                if ratio >= 0.9:
                    ratios.append((ratio, id, actor))
            ratios.sort(key=lambda x: x[0], reverse=True)
            if len(ratios) > 0:
                r, id, actor = ratios[0]
                matches[pair] = {'id': id, 'name': actor, 'class': 'actor'}
    return matches
def train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
          dictTrainBigrams, lenGram, delete=[]):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()
                temp = [
                    1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                    lev.jaro(st1, st2),
                    lev.jaro_winkler(st1, st2),
                    lev.ratio(st1, st2),
                    distance.sorensen(st1, st2),
                    jaccard(set(st1), set(st2)),
                    1. - distance.nlevenshtein(st1, st2, method=1),
                    1. - distance.nlevenshtein(st1, st2, method=2),
                    dice_coefficient(st1, st2, lenGram=2),
                    dice_coefficient(st1, st2, lenGram=3),
                    dice_coefficient(st1, st2, lenGram=4),
                    cosineWords(st1, st2, dictTrain, tfidf_matrix_train),
                    cosineBigrams(st1, st2, dictTrainBigrams, tfidf_matrix_trainBigrams, lenGram)
                ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))
    X = np.array(allTrainX, dtype=float)
    y = np.array(allTrainY, dtype=float)
    clf = svm.LinearSVC(C=1., dual=False, loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1., dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)
    return clf, clf2
def get_psk_jaro(self):
    return Levenshtein.jaro(self.data["expected_data"], self.data["psk_data"])
# Jaro Distance
# Jaro-Winkler Distance
# Match Rating Approach Comparison
# Hamming Distance
# Phonetic encoding:
# American Soundex
# Metaphone
# NYSIIS (New York State Identification and Intelligence System)
# Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance with transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'
##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance: str1 and str2 must be the same length; counts the positions at which the two equal-length strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit (Levenshtein) distance: the minimum number of insert/delete/substitute operations needed to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a weighted edit distance
# Note: this weighted edit distance is not the edit distance above. There, each of the three operations costs 1;
# here, insertion and deletion still cost 1, but substitution costs 2.
# The reason: for ratio('a', 'c'), sum = 2, so the plain formula would give (2 - 1) / 2 = 0.5 even though
# 'a' and 'c' have nothing in common; charging 2 for substitution fixes this.
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro similarity, originally used for record linkage in health surveys
print(Levenshtein.jaro_winkler('hello', 'helol'))  # 0.9533333333333333; Jaro-Winkler similarity
def stats(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
          dictTrainBigrams, lenGram, delete=[], plotX=False):
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            pass
    dimMatrix = 16
    predict = np.zeros((i + 1, dimMatrix))
    clf1, clf2 = train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
                       dictTrainBigrams, lenGram, delete=delete)
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            a = line.rstrip().split("\t")
            # create the same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()
            temp = [
                1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                lev.jaro(st1, st2),
                lev.jaro_winkler(st1, st2),
                lev.ratio(st1, st2),
                distance.sorensen(st1, st2),
                jaccard(set(st1), set(st2)),
                1. - distance.nlevenshtein(st1, st2, method=1),
                1. - distance.nlevenshtein(st1, st2, method=2),
                dice_coefficient(st1, st2, lenGram=2),
                dice_coefficient(st1, st2, lenGram=3),
                dice_coefficient(st1, st2, lenGram=4),
                cosineWords(st1, st2),
                cosineBigrams(st1, st2)]
            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.
            predict[i, :-3] = temp
            predict[i, -3] = clf1.decision_function(np.array(temp, dtype=float))
            predict[i, -2] = clf2.decision_function(np.array(temp, dtype=float))
            predict[i, -1] = a[-1]
    if plotX:
        labelsM = ["Lev", "Jaro", "Jaro-Winkler", "Ratio", "Sorensen", "Jaccard",
                   "Lev1", "Lev2", "Dice_2", "Dice_3", "Dice_4", "cosineWords",
                   "cosineBigrams", "SVM", "Logit"]
        f1matrix = np.zeros((100, dimMatrix - 1))
        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0, 1, 100):
            iC += 1
            for j in range(dimMatrix - 1):
                t = np.array(predict[:, j])
                if j >= dimMatrix - 3:
                    t = (t - np.min(t)) / (np.max(t) - np.min(t))
                f1matrix[iC, j] = f1_score(y_pred=t > i, y_true=predict[:, -1])
        F1scores = []
        for j in range(dimMatrix - 1):
            F1scores.append(np.max(f1matrix[:, j]))
            # ax.plot(np.linspace(0, 1, 100), f1matrix[:, j], label=labelsM[j], color=tableau20[j])
        ax.bar(range(dimMatrix - 1), F1scores)
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()
        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        AUCScores = []
        for j in range(dimMatrix - 1):
            # Compute ROC curve and the area under the curve
            fpr, tpr, thresholds = roc_curve(predict[:, -1], predict[:, j])
            AUCScores.append(auc(fpr, tpr))
            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j], color=tableau20[j])
        ax.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()
        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix - 1), AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
print "Sequences:", asmLCS.seq.sequences # asmLCS.seq is the LCSequence object print "Substrings:", asmLCS.substr.substrings lenSeqOne = (float)(len(asmLCS.seq.seqOne)) lenSeqOneBuiltin = (float)(asmLCS.seq.matrix.seqOneLen) lenSeqTwo = (float)(len(asmLCS.seq.seqTwo)) lenSeqTwoBuiltin = (float)(asmLCS.seq.matrix.seqTwoLen) lenLCSeq = (float)(len(asmLCS.seq)) lenLCSub = (float)(len(asmLCS.substr)) perSim = ((lenLCSeq / lenSeqOne) + (lenLCSeq / lenSeqTwo)) / 2 perExact = ((lenLCSub / lenSeqOne) + (lenLCSub / lenSeqTwo)) / 2 print "Length of SeqOne:", lenSeqOne print "Length of SeqOne (builtin):", lenSeqOneBuiltin print "Length of SeqTwo:", lenSeqTwo print "Length of SeqTwo (builtin):", lenSeqTwoBuiltin print "Length of LCSeq:", lenLCSeq print "Length of LCSub:", lenLCSub print "Substring in SeqOne starts at postion:", asmLCS.seq.seqOne.find(list(asmLCS.substr.substrings)[0]) print "Substring in SeqTwo starts at postion:", asmLCS.seq.seqTwo.find(list(asmLCS.substr.substrings)[0]) print "Percent Similar:", perSim print "Percent Exact Copy:", perExact print "Levenshtein Distance:", ldistance.distance(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Jaro Similarity:", ldistance.jaro(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Jaro-Winkler:", ldistance.jaro_winkler(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Simlarity ratio:", ldistance.ratio(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "\nSeconds to process and calculate:", time.time() - start_time # Levenshtein distance - character operations (add, remove, swap) needed to transform one string into the other. # Jaro Similarity - similarity of short strings; 0 if completely different, 1 if identical # Jaro-Winkler - Prefix weighted version of Jaro, because typos and divergence happens near the end of seqs # Similarity Ratio - The real minimal edit distance, aka diff sequence matching
def get_domex_jaro(self):
    return Levenshtein.jaro(self.data["expected_data"], self.data["domex_data"])
def jaroDistance(form1, form2):
    return Levenshtein.jaro(form1, form2) if (len(form1) * len(form2) > 0) else 0.0
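# Why the length guard above matters -- a hedged note: python-Levenshtein
# treats two empty strings as identical (jaro("", "") == 1.0), which is rarely
# what you want when comparing word forms, so empty input is mapped to 0.0.
import Levenshtein

print(jaroDistance("casa", "cosa"))  # ordinary Jaro similarity
print(jaroDistance("casa", ""))      # 0.0 thanks to the guard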
import time
import os
import difflib
import json
import Levenshtein

start = time.time()
with open('/Users/fan/anaconda/bin/Workspace/data/openriceName.json', 'r') as f:
    openrice = json.load(f, encoding='utf8')
with open('/Users/fan/anaconda/bin/Workspace/data/ifoodName.json', 'r') as f:
    ifood = json.load(f, encoding='utf8')

for o in ifood:
    ifname = ifood[o].split('|')[0]
    ifaddress = ifood[o].split('|')[1]
    temp = {}
    for p in openrice:
        opname = openrice[p].split('|')[0]
        opaddress = openrice[p].split('|')[1]
        jw = Levenshtein.jaro_winkler(ifname, opname, 0.25)
        if jw not in temp:
            temp[jw] = '%s|%s' % (opname, opaddress)
        else:
            addressjw0 = Levenshtein.jaro(ifaddress, temp[jw].split('|')[1])
            addressjw1 = Levenshtein.jaro(ifaddress, opaddress)
            if addressjw1 > addressjw0:
                temp[jw] = '%s|%s' % (opname, opaddress)
    print '%s|%s' % (ifname, temp[max(temp.keys())].split('|')[0])
    print '%s|%s' % (ifaddress, temp[max(temp.keys())].split('|')[1])
def main():
    ifName = '梁記麻辣火鍋冰棒豆腐'
    orName = '桔園'
    orName2 = '火鍋冰棒豆腐'
    orName3 = '梁記'
    orName4 = '梁記麻辣火鍋'
    orName5 = '梁記石頭火鍋'
    orName6 = '梁記火鍋'
    print 'jaro'
    print orName, ':', Levenshtein.jaro(ifName, orName)
    print orName2, ':', Levenshtein.jaro(ifName, orName2)
    print orName3, ':', Levenshtein.jaro(ifName, orName3)
    print orName4, ':', Levenshtein.jaro(ifName, orName4)
    print orName5, ':', Levenshtein.jaro(ifName, orName5)
    print orName6, ':', Levenshtein.jaro(ifName, orName6)
    print '---------------------------'
    print 'jaro_winkler'
    print orName, ':', Levenshtein.jaro_winkler(ifName, orName, 0.25)
    print orName2, ':', Levenshtein.jaro_winkler(ifName, orName2, 0.25)
    print orName3, ':', Levenshtein.jaro_winkler(ifName, orName3, 0.25)
    print orName4, ':', Levenshtein.jaro_winkler(ifName, orName4, 0.25)
    print orName5, ':', Levenshtein.jaro_winkler(ifName, orName5, 0.25)
    print orName6, ':', Levenshtein.jaro_winkler(ifName, orName6, 0.25)
    print '---------------------------'
    print 'distance'
    print orName, ':', Levenshtein.distance(ifName, orName)
    print orName2, ':', Levenshtein.distance(ifName, orName2)
    print orName3, ':', Levenshtein.distance(ifName, orName3)
    print orName4, ':', Levenshtein.distance(ifName, orName4)
    print orName5, ':', Levenshtein.distance(ifName, orName5)
    print orName6, ':', Levenshtein.distance(ifName, orName6)
    print '---------------------------'
    print 'ratio'
    print orName, ':', Levenshtein.ratio(ifName, orName)
    print orName2, ':', Levenshtein.ratio(ifName, orName2)
    print orName3, ':', Levenshtein.ratio(ifName, orName3)
    print orName4, ':', Levenshtein.ratio(ifName, orName4)
    print orName5, ':', Levenshtein.ratio(ifName, orName5)
    print orName6, ':', Levenshtein.ratio(ifName, orName6)
    print '---------------------------'
    print 'fuzzywuzzyRatio'
    print orName, ':', fuzz.ratio(ifName, orName)
    print orName2, ':', fuzz.ratio(ifName, orName2)
    print orName3, ':', fuzz.ratio(ifName, orName3)
    print orName4, ':', fuzz.ratio(ifName, orName4)
    print orName5, ':', fuzz.ratio(ifName, orName5)
    print orName6, ':', fuzz.ratio(ifName, orName6)
    print '---------------------------'
    print 'fuzzywuzzyPartial_ratio'
    print orName, ':', fuzz.partial_ratio(ifName, orName)
    print orName2, ':', fuzz.partial_ratio(ifName, orName2)
    print orName3, ':', fuzz.partial_ratio(ifName, orName3)
    print orName4, ':', fuzz.partial_ratio(ifName, orName4)
    print orName5, ':', fuzz.partial_ratio(ifName, orName5)
    print orName6, ':', fuzz.partial_ratio(ifName, orName6)
    print '---------------------------'
    print 'fuzzywuzzyToken_sort_ratio'
    print orName, ':', fuzz.token_sort_ratio(ifName, orName)
    print orName2, ':', fuzz.token_sort_ratio(ifName, orName2)
    print orName3, ':', fuzz.token_sort_ratio(ifName, orName3)
    print orName4, ':', fuzz.token_sort_ratio(ifName, orName4)
    print orName5, ':', fuzz.token_sort_ratio(ifName, orName5)
    print orName6, ':', fuzz.token_sort_ratio(ifName, orName6)
    print '---------------------------'
    print 'fuzzywuzzyToken_set_ratio'
    print orName, ':', fuzz.token_set_ratio(ifName, orName)
    print orName2, ':', fuzz.token_set_ratio(ifName, orName2)
    print orName3, ':', fuzz.token_set_ratio(ifName, orName3)
    print orName4, ':', fuzz.token_set_ratio(ifName, orName4)
    print orName5, ':', fuzz.token_set_ratio(ifName, orName5)
    print orName6, ':', fuzz.token_set_ratio(ifName, orName6)
def distances(st1, st2):
    return lev.jaro(st1, st2)
def build_spanish(args, prefixes_gen=None):
    corpus_files = os.listdir(args.es_corpus)
    sentences = []
    wordcount = 0
    for filename in corpus_files:
        print 'Processing %s' % filename
        with codecs.open(os.path.join(args.es_corpus, filename), encoding='utf-8') as f:
            sent = []
            for line_no, line in enumerate(f):
                if line.strip() == '' or line.strip()[0] == '<':
                    sentences.append(sent)
                    sent = []
                else:
                    try:
                        word, lemm, morph, num = line.strip().split(' ')
                        if num != '0':  # 0 is for punctuation and other trash
                            sent.append((word, lemm, morph))
                            wordcount += 1
                    except Exception:
                        print line_no
                        print line
    print 'Loaded sentences: %d, wordcount = %d' % (len(sentences), wordcount)
    word_to_sents = defaultdict(set)
    words = []
    sent_sets = []
    for sent_no, s in enumerate(sentences):
        sent_set = set()
        for i, w in enumerate(s):
            words.append((w[0], w[1], sent_no, len(words)))
            sent_set.add(w[0].lower())
        for w in sent_set:
            word_to_sents[w].add(len(sent_sets))
        sent_sets.append(sent_set)
    goodwords = []
    word_used = defaultdict(int)
    for w in words:
        key = (w[0], w[1])
        if word_used[key] < 10:
            word_used[key] += 1
            goodwords.append(w)
    print 'Total goodwords: %d' % len(goodwords)
    prefix_map = defaultdict(list)
    for w in goodwords:
        prefix_map[w[0][:2].lower()].append(w)
    print 'Splitting prefix map'
    while True:
        splitted = False
        for k in prefix_map.keys():
            if len(prefix_map[k]) > PREFIX_THRESH:
                lst = prefix_map.pop(k)
                print 'splitting %d items for key %s' % (len(lst), k)
                splitted = True
                for w in lst:
                    prefix_map[w[0][:(len(k) + 1)].lower()].append(w)
                break
        if not splitted:
            break
    prefix_map_keys = prefix_map.keys()
    print 'Prefix spanish map split into %d groups' % len(prefix_map)
    print 'Prefix map spanish max pairs: %d' % sum([len(v) ** 2 for k, v in prefix_map.iteritems()])
    print 'Loading word2vec data...'
    vectors = wv_common.load_text_vectors(args.es_text_vectors)
    articles = []  # wikipedia articles
    word_to_articles = defaultdict(set)
    word_to_id = {}
    id_to_word = {}
    word_to_freq = defaultdict(int)
    print 'Processing wiki data...'
    with codecs.open(args.spanish_wikidata, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            elems = line.strip().split()
            if elems[0] == 'WRD':
                article_words_ids = set()
                for word in elems[1:]:
                    wl = word.lower()
                    word_to_freq[wl] += 1
                    if wl not in word_to_id:
                        idx = len(word_to_id)
                        word_to_id[wl] = idx
                        id_to_word[idx] = wl
                    idx = word_to_id[wl]
                    article_words_ids.add(idx)
                    word_to_articles[wl].add(len(articles))
                articles.append(article_words_ids)
            if line_no % 10000 == 0:
                print 'Processed %d lines of wikidata' % line_no
            if line_no > 100000:
                break
    print 'Total wiki vocab size: %d' % (len(word_to_freq))
    total_wiki_words = float(sum(word_to_freq.values()))
    alphabet = set()
    for w in word_to_freq.iterkeys():
        alphabet |= set(w)
    print 'Alphabet size: %d' % len(alphabet)
    trie_fwd = FreqTrie(alphabet)
    trie_inv = FreqTrie(alphabet)
    print 'Building tries'
    alphabet = set(ES_ALPHABET)
    trie_skipped = 0
    for wf_no, wf in enumerate(word_to_freq.iteritems()):
        if len(set(wf[0]) & alphabet) > 0:
            trie_fwd.add(wf[0], wf[1])
            trie_inv.add(wf[0][::-1], wf[1])
        else:
            trie_skipped += 1
        if wf_no % 10000 == 0:
            print 'Added to trie: %d words, nodecounts: %d %d skipped: %d' % (
                wf_no, trie_fwd.nodecount(), trie_inv.nodecount(), trie_skipped)
    print 'Building freqs'
    # 2-, 3- and 4-suffixes
    suffix_freqs = defaultdict(int)
    for w in goodwords:
        wl = w[0].lower()
        suffix_freqs[wl[-2:]] += 1
        suffix_freqs[wl[-3:]] += 1
        suffix_freqs[wl[-4:]] += 1
    word_freqs = defaultdict(int)
    for w in goodwords:
        word_freqs[w[0].lower()] += 1
    prc = 0
    tst = 1000000
    for i in xrange(tst):
        w1, w2 = random.sample(goodwords, 2)
        if w1[1] == w2[1]:
            prc += 1
    print 'prc:', prc, tst
    lemm_map = defaultdict(dict)  # lemm -> forms -> positions
    for w in goodwords:
        lemm_map[w[1]][w[0]] = set()
    for i, w in enumerate(goodwords):
        lemm_map[w[1]][w[0]].add(i)
    positive_examples = gen_positive_examples(lemm_map)
    print 'Generated %d positive examples' % len(positive_examples)
    print 'Generating features...'
    features = []
    if prefixes_gen == None:
        for i in xrange(args.gen_pos + args.gen_neg):
            if i != 0 and (i % 10000) == 0:
                print '%d...' % i
            answer = True if i < args.gen_pos else False
            w1 = -1
            w2 = -1
            feature = None
            if answer:
                w1, w2 = positive_examples[i]
                feature = {'answer': answer, 'id1': goodwords[w1][3], 'id2': goodwords[w2][3]}
            else:
                prefix_group = None
                while True:
                    prefix = random.choice(prefix_map_keys)
                    prefix_group = prefix_map[prefix]
                    w1 = random.randint(0, len(prefix_group) - 1)
                    w2 = random.randint(0, len(prefix_group) - 1)
                    t1 = words[prefix_group[w1][3]][0].lower()
                    t2 = words[prefix_group[w2][3]][0].lower()
                    if w1 == w2 or answer != (prefix_group[w1][1].lower() == prefix_group[w2][1].lower()) or t1 == t2:
                        continue
                    break
                feature = {'answer': answer, 'id1': prefix_group[w1][3], 'id2': prefix_group[w2][3]}
            features.append(feature)
    else:
        pos_ex = set([(min(wid1, wid2), max(wid1, wid2)) for wid1, wid2 in positive_examples])

        def fgen():
            for prefix_no, prefix in enumerate(sorted(prefix_map_keys)):
                print 'Processing prefix: %s (%d of %d) with %d words' % (
                    prefix, prefix_no, len(prefix_map_keys), len(prefix_map[prefix]))
                prefix_group = []
                wcnt = defaultdict(int)
                for line in prefix_map[prefix]:
                    if wcnt[line[0].lower()] < 1:
                        wcnt[line[0].lower()] += 1
                        prefix_group.append(line)
                print 'Lines in prefix group: %d' % (len(prefix_group))
                for i in xrange(len(prefix_group)):
                    for j in xrange(i + 1, len(prefix_group)):
                        wid1 = prefix_group[i][3]
                        wid2 = prefix_group[j][3]
                        p_key = (min(wid1, wid2), max(wid1, wid2))
                        feature = {'answer': p_key in pos_ex, 'id1': wid1, 'id2': wid2}
                        yield feature

        features = fgen()
    print 'Filling features...'
    not_found_vectors_count = 0
    zero_mutual_info_corpus_count = 0
    zero_mutual_info_wiki_count = 0
    for feature_no, f in enumerate(features):
        i1 = f['id1']; w1 = words[i1][0]; wl1 = w1.lower()
        i2 = f['id2']; w2 = words[i2][0]; wl2 = w2.lower()
        f['tag1'] = '0'
        f['tag2'] = '0'
        f['w1'] = w1
        f['w2'] = w2
        # TODO: add tags: "max common length", "left-pos-tag" and "right-pos-tag" for both of them
        f['common_prefix_len'] = common_prefix_len(w1.lower(), w2.lower())
        f['common_prefix_len_rel'] = f['common_prefix_len'] * 2.0 / (len(w1) + len(w2))
        f['levenshtein'] = levenshtein.distance(wl1, wl2)
        f['jaro_winkler'] = levenshtein.jaro_winkler(wl1, wl2)
        f['jaro'] = levenshtein.jaro(wl1, wl2)
        common_prefix = wl1[:f['common_prefix_len']]
        suffix1 = wl1[f['common_prefix_len']:]
        suffix2 = wl2[f['common_prefix_len']:]
        f['freq_common_prefix'] = trie_fwd.get(common_prefix) * 1.0 / trie_fwd.getsum()
        f['freq_suffix1'] = trie_inv.get(suffix1) * 1.0 / trie_inv.getsum()
        f['freq_suffix2'] = trie_inv.get(suffix2) * 1.0 / trie_inv.getsum()
        f['freq1'] = word_freqs[w1.lower()]
        f['freq2'] = word_freqs[w2.lower()]
        f['suf2freq1'] = suffix_freqs[wl1[-2:]]
        f['suf2freq2'] = suffix_freqs[wl2[-2:]]
        f['suf3freq1'] = suffix_freqs[wl1[-3:]]
        f['suf3freq2'] = suffix_freqs[wl2[-3:]]
        f['suf4freq1'] = suffix_freqs[wl1[-4:]]
        f['suf4freq2'] = suffix_freqs[wl2[-4:]]
        if wl1 in vectors and wl2 in vectors:
            f['wv_dist'] = vectors[wl1]['vec'].dot(vectors[wl2]['vec'])
        else:
            not_found_vectors_count += 1
            f['wv_dist'] = 1.0
        # calculating mutual information
        w1fsc = len(word_to_sents[wl1])
        w2fsc = len(word_to_sents[wl2])
        w12fsc = len(word_to_sents[wl1] & word_to_sents[wl2])
        if w12fsc > 0:
            w1fsc /= 1.0 * len(sent_sets)
            w2fsc /= 1.0 * len(sent_sets)
            w12fsc /= 1.0 * len(sent_sets)
            f['mut_info_corpus'] = math.log(w1fsc) + math.log(w2fsc) - math.log(w12fsc)
        else:
            f['mut_info_corpus'] = 0.0
            zero_mutual_info_corpus_count += 1
        w1fsw = len(word_to_articles[wl1])
        w2fsw = len(word_to_articles[wl2])
        w12fsw = len(word_to_articles[wl1] & word_to_articles[wl2])
        if w12fsw > 0:
            w1fsw /= 1.0 * len(articles)
            w2fsw /= 1.0 * len(articles)
            w12fsw /= 1.0 * len(articles)
            f['mut_info_wiki'] = math.log(w1fsw) + math.log(w2fsw) - math.log(w12fsw)
        else:
            f['mut_info_wiki'] = 0.0
            zero_mutual_info_wiki_count += 1
        if feature_no % 1000 == 0:
            print 'Samples processed: %d' % (feature_no)
        if prefixes_gen != None:
            yield f
    if prefixes_gen == None:
        print 'Not found word vectors for: %d pairs of %d' % (not_found_vectors_count, len(features))
        print 'Zeroed mutual corpus info for: %d pairs of %d' % (zero_mutual_info_corpus_count, len(features))
        print 'Zeroed mutual wiki info for: %d pairs of %d' % (zero_mutual_info_wiki_count, len(features))
        print 'Saving features...'
        save_features(args.es_features_output, features)
def domex_valid(self):
    return Levenshtein.jaro(self.data["expected_data"], self.data["domex_data"]) > self.threshold