def test(clf):
    dvds = []
    with open("dvd.csv") as f:
        for i, j in enumerate(f):
            dvds.append(j)
    movies = []
    with open("movies.csv") as f:
        for i, j in enumerate(f):
            movies.append(j)
    dvds = [dvd for dvd in dvds if dvd > "B"]
    movies = [movie for movie in movies if movie > "B"]
    print(len(dvds), len(movies))
    with open("test.csv", "w") as f:
        i = 0
        for dvd in dvds:
            prefix = dvd[0]
            i += 1
            maxSimil = 0.0
            maxMovie = None  # guard: a title may have no same-prefix candidate at all
            for movie in movies:
                if movie[0] == prefix:
                    tempSim = lev.jaro(dvd, movie)
                    if tempSim > maxSimil:
                        maxSimil = tempSim
                        maxMovie = movie
            if maxMovie is None:
                continue  # original code crashed on an undefined maxMovie here
            temp = [
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
            ]
            print("%s\t%s\t%f\t%f" % (dvd.rstrip(), maxMovie.rstrip(),
                                      clf.decision_function(temp), clf.predict(temp)))
            f.write("%s\t%s\t%f\t%f\t%f\t%f\t%f\t%i\n" % (
                dvd.rstrip(),
                maxMovie.rstrip(),
                1.0 - (lev.distance(dvd, maxMovie) / len(dvd)),
                lev.jaro(dvd, maxMovie),
                lev.jaro_winkler(dvd, maxMovie),
                lev.ratio(dvd, maxMovie),
                clf.decision_function(temp),
                clf.predict(temp),
            ))
def are_similar(name1, name2):
    name1, name2 = (mangle_name(s) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    # TODO: remove this debug print
    if ratio < 0.8:
        print " * ratio = %s => name1 = '%s' vs name2 = '%s'" % (ratio, name1, name2)
    return ratio >= 0.8
def predictionRatio(df, metric="Levenshtein"):
    # Generate all possible combinations for string matching
    soc_media_1, soc_media_2 = df.columns
    # Convert everything to lower case
    df[soc_media_1] = df[soc_media_1].str.lower()
    df[soc_media_2] = df[soc_media_2].str.lower()
    df_known = DataFrame([df[soc_media_1].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_search = DataFrame([df[soc_media_2].tolist()] * df.shape[0], index=df.index, columns=df.index)
    df_known_list = df_known.applymap(lambda x: list([x]))
    df_search_list = df_search.applymap(lambda x: list([x]))
    df_search_list = df_known_list + df_search_list.T
    # Find the index of the best column for each row based on the metric:
    # for Levenshtein take the min., for Jaro-Winkler take the max.
    if metric == 'Levenshtein':
        search_res = df_search_list.applymap(lambda x: Levenshtein.distance(x[0], x[1]))
        indices = search_res.idxmin(axis=1)
    else:
        search_res = df_search_list.applymap(lambda x: Levenshtein.jaro_winkler(x[0], x[1]))
        indices = search_res.idxmax(axis=1)
    # Get the matches for the social media account
    # (.ix was removed from pandas; .loc keeps the same label-based lookup)
    match = df[soc_media_2].loc[indices]
    df_t = DataFrame()
    df_t['actual'] = df[soc_media_2].reset_index(drop=True)
    df_t['match'] = match.reset_index(drop=True)
    # Find the ratio of correct matches
    match_count = (df_t.actual == df_t.match).value_counts()
    ratio = float(match_count[True]) / (match_count[True] + match_count[False])
    return ratio
def response(db, user, inStr):
    inStr = common.que_init(inStr)
    ans = ''
    colls = db.collection_names()
    random.shuffle(colls)
    for coll in colls:
        if coll[-4:] != '_yml':
            continue
        reqs = db[coll].find_one({'tag': 'dia'})
        if not reqs:
            continue
        qas = reqs['qas']
        if not qas:
            continue
        random.shuffle(qas)
        for qa in qas:
            ques = qa['que']
            random.shuffle(ques)
            for que in ques:
                que = str(que)
                que = common.que_init(que)
                if Leven.jaro_winkler(inStr, que) > JARO_WINKLER_PERCENT:
                    ans = qa['ans']
                    if type(ans) is list:
                        ans = random.choice(ans)
                    return ans
    return ans
def __get_match(self, query, words):
    _match = []
    for word in words:
        # jaro_winkler returns a similarity in [0, 1], higher is closer,
        # despite this variable being named "distance"
        distance = Levenshtein.jaro_winkler(word, query, self.weight)
        if distance > self.accuracy:
            _match.append((distance, word))
    return _match
def check_cons(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio > .6 or jaro > .7 or jaro_winkler > .7:
        return True
    else:
        return False
def check_sure(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio >= 0.9 and jaro >= 0.95 and jaro_winkler >= 0.95:
        return True
    else:
        return False
def choose(s, possibilities, threshold=.6):
    """
    Returns the closest match to string s if it exceeds threshold, else returns None
    """
    if s in possibilities:
        return s
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if len(startswith) == 1:
        return startswith[0]
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if len(contained) == 1:  # was `> 1`, which returned an arbitrary match on ties
        return contained[0]
    close = sorted([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities],
                   key=itemgetter(1))
    best = max([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities],
               key=itemgetter(1))
    if best[1] < threshold:
        print 'returning None because', best, 'is below threshold of', threshold
        print 'out of', close
        return None
    return best[0]
def clusterStrings(self, stringList):
    for string_1 in stringList:
        for string_2 in stringList:
            if string_1 is string_2:
                continue  # skip self-comparison, which always scores 1.0 and drowns out real pairs
            similarity = Levenshtein.jaro_winkler(string_1, string_2)
            if similarity > 0.95:
                print similarity
                print string_1
                print string_2
                break
def check_beli(name1, name2):
    ratio = Levenshtein.ratio(name1, name2)
    jaro = Levenshtein.jaro(name1, name2)
    jaro_winkler = Levenshtein.jaro_winkler(name1, name2)
    if ratio >= 0.9 or jaro >= 0.9 or jaro_winkler >= 0.9:
        return True
    elif ratio >= .7 and jaro >= .8 and jaro_winkler >= .8:
        return True
    else:
        return False
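# Not from the original sources: a minimal demo of why check_cons, check_sure
# and check_beli above combine all three scores. The metrics react differently
# to the same typo'd pair, so their thresholds are not interchangeable.
import Levenshtein

a, b = "jonathan smith", "johnathan smyth"
print("ratio:       ", Levenshtein.ratio(a, b))         # edit-operation based
print("jaro:        ", Levenshtein.jaro(a, b))          # match/transposition based
print("jaro_winkler:", Levenshtein.jaro_winkler(a, b))  # Jaro boosted by the shared prefix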
def find_closest_string(query, dictionary, thresh=0.90):
    """
    Returns the closest match for a query string against a dictionary of
    terms, using Jaro-Winkler similarity (not raw Levenshtein distance).
    """
    dist = {i: Levenshtein.jaro_winkler(query, i) for i in dictionary}
    dist = sorted(dist.items(), key=operator.itemgetter(1), reverse=True)
    if dist[0][1] >= thresh:
        return dist[0][0]
    else:
        return None
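# Assumed usage of find_closest_string above (operator and Levenshtein must
# already be imported, as the function body implies):
terms = {"apple", "apricot", "banana"}
print(find_closest_string("appel", terms, thresh=0.90))  # "apple" clears 0.90
print(find_closest_string("xyz", terms))                 # None: best score is far below thresh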
def dp(s1, s2):
    # Best total Jaro-Winkler score obtainable by assigning each word of s1
    # to a distinct word of s2 (or leaving it unmatched); memoised in d.
    key = (tuple(s1), tuple(s2))
    if key in d:
        return d[key]
    if not s1 or not s2:
        return 0
    best = dp(s1[1:], s2)  # option: leave s1[0] unmatched
    for s2i in s2:
        w = Levenshtein.jaro_winkler(s1[0], s2i)
        best = max(best, w + dp(s1[1:], s2 - set([s2i])))
    d[key] = best
    return best
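# Sketch of a harness for dp above (assumed, not from the original source):
# d is the module-level memo table the function reads and writes, s1 is a
# sequence of words and s2 a set of candidate words.
import Levenshtein

d = {}
print(dp(["new", "york"], {"york", "newark"}))  # best one-to-one assignment score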
def __best_country_match(self, raw):
    max_jw = 0
    max_country = ''
    for country in self.country_list:
        jw = lev.jaro_winkler(country, raw)
        if jw > max_jw:
            max_jw = jw
            max_country = country
    if max_jw > self.threshhold_jw:
        latitude, longitude = self.countries[max_country]
        return max_country, max_country, latitude, longitude
    else:
        return None, None, None, None
def __call__(self, ua, devices):
    """
    @param ua: The user agent
    @type ua: string
    @param devices: The devices object to search
    @type devices: Devices

    @rtype: Device
    @raises pywurfl.DeviceNotFound
    """
    match = max((Levenshtein.jaro_winkler(x, ua, self.weight), x)
                for x in devices.devuas)
    if match[0] >= self.accuracy:
        return devices.devuas[match[1]]
    else:
        raise DeviceNotFound(ua)
def choose(s, possibilities, threshold=.6):
    """
    Returns the closest match to string s if it exceeds threshold, else returns None
    """
    if not possibilities:
        return None
    if s in possibilities:
        return s
    if s == '':
        return None
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if len(startswith) == 1:
        return startswith[0]
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if len(contained) == 1:
        return contained[0]
    best = max([(x, Levenshtein.jaro_winkler(s, x, .05)) for x in possibilities],
               key=itemgetter(1))
    if best[1] < threshold:
        #print 'did you mean %s?' % best[0]
        return None
    return best[0]
def train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, delete=[]):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()
                temp = [
                    1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                    lev.jaro(st1, st2),
                    lev.jaro_winkler(st1, st2),
                    lev.ratio(st1, st2),
                    distance.sorensen(st1, st2),
                    jaccard(set(st1), set(st2)),
                    1. - distance.nlevenshtein(st1, st2, method=1),
                    1. - distance.nlevenshtein(st1, st2, method=2),
                    dice_coefficient(st1, st2, lenGram=2),
                    dice_coefficient(st1, st2, lenGram=3),
                    dice_coefficient(st1, st2, lenGram=4),
                    cosineWords(st1, st2, dictTrain, tfidf_matrix_train),
                    cosineBigrams(st1, st2, dictTrainBigrams, tfidf_matrix_trainBigrams, lenGram),
                ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))
    X = np.array(allTrainX, dtype=float)
    y = np.array(allTrainY, dtype=float)
    # note: current scikit-learn spells this loss 'squared_hinge'; 'l2' is the old alias
    clf = svm.LinearSVC(C=1., dual=False, loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1., dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)
    return clf, clf2
def suggestions(s, possibilities):
    # TODO don't use jaro_winkler, or use it more intelligently;
    # i.e. break up words and match on each of them
    # (jaro_winkler weighs the front of the string more)
    startswith = [x for x in possibilities if x.lower().startswith(s.lower())]
    if startswith:
        return startswith
    contained = [x for x in possibilities if s.lower() in x.lower()]
    if contained:
        return contained
    jws = [(x, Levenshtein.jaro_winkler(s, x)) for x in possibilities]
    jws.sort(key=lambda x: 0 - x[1])
    diffs = [x[1] - y[1] for x, y in zip(jws[:-1], jws[1:])]
    output = []
    for (card_name, score), diff in zip(jws[:-1], diffs):
        output.append(card_name)
        print diff
        if diff > .05:
            break
        if len(output) > 5:
            break
    return output
def numMatch(boxesds, num):
    # Returns a Jaro-Winkler similarity per box against the recognised number,
    # or None when there is no number to match.
    matchedProb = None
    if num is None:
        return matchedProb
    matchedProb = []
    for item in boxesds:
        tempSim = Levenshtein.jaro_winkler(str(item.number), num)
        matchedProb.append(tempSim)
    return matchedProb
def __call__(self, ua, devices):
    """
    @param ua: The user agent
    @type ua: string
    @param devices: The devices object to search
    @type devices: Devices

    @rtype: Device
    @raises pywurfl.DeviceNotFound
    """
    match = max((Levenshtein.jaro_winkler(x, ua, self.weight), x)
                for x in devices.devuas)
    if match[0] >= self.accuracy:
        dev_clone = copy.copy(devices.devuas[match[1]])
        dev_clone.accuracy = match[0]
        # print "Got accuracy " + match[1] + " " + str(match[0])
        return dev_clone
    else:
        raise DeviceNotFound(ua)
def jaro_winkler(str1, str2):
    jaro_dist = Levenshtein.jaro_winkler(str1, str2)
    return jaro_dist
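# Behaviour check for the thin wrapper above; python-Levenshtein returns a
# similarity in [0, 1], not a distance, despite the variable name jaro_dist.
print(jaro_winkler("martha", "marhta"))  # high: one transposition
print(jaro_winkler("dwayne", "duane"))   # mid: shared prefix helps
print(jaro_winkler("abc", "xyz"))        # 0.0: nothing in common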
print "Sequences:", asmLCS.seq.sequences # asmLCS.seq is the LCSequence object print "Substrings:", asmLCS.substr.substrings lenSeqOne = (float)(len(asmLCS.seq.seqOne)) lenSeqOneBuiltin = (float)(asmLCS.seq.matrix.seqOneLen) lenSeqTwo = (float)(len(asmLCS.seq.seqTwo)) lenSeqTwoBuiltin = (float)(asmLCS.seq.matrix.seqTwoLen) lenLCSeq = (float)(len(asmLCS.seq)) lenLCSub = (float)(len(asmLCS.substr)) perSim = ((lenLCSeq / lenSeqOne) + (lenLCSeq / lenSeqTwo)) / 2 perExact = ((lenLCSub / lenSeqOne) + (lenLCSub / lenSeqTwo)) / 2 print "Length of SeqOne:", lenSeqOne print "Length of SeqOne (builtin):", lenSeqOneBuiltin print "Length of SeqTwo:", lenSeqTwo print "Length of SeqTwo (builtin):", lenSeqTwoBuiltin print "Length of LCSeq:", lenLCSeq print "Length of LCSub:", lenLCSub print "Substring in SeqOne starts at postion:", asmLCS.seq.seqOne.find(list(asmLCS.substr.substrings)[0]) print "Substring in SeqTwo starts at postion:", asmLCS.seq.seqTwo.find(list(asmLCS.substr.substrings)[0]) print "Percent Similar:", perSim print "Percent Exact Copy:", perExact print "Levenshtein Distance:", ldistance.distance(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Jaro Similarity:", ldistance.jaro(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Jaro-Winkler:", ldistance.jaro_winkler(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "Simlarity ratio:", ldistance.ratio(asmLCS.seq.seqOne, asmLCS.seq.seqTwo) print "\nSeconds to process and calculate:", time.time() - start_time # Levenshtein distance - character operations (add, remove, swap) needed to transform one string into the other. # Jaro Similarity - similarity of short strings; 0 if completely different, 1 if identical # Jaro-Winkler - Prefix weighted version of Jaro, because typos and divergence happens near the end of seqs # Similarity Ratio - The real minimal edit distance, aka diff sequence matching
def main():
    ifName = '梁記麻辣火鍋冰棒豆腐'
    # candidates, in the original comparison order
    candidates = ['桔園', '火鍋冰棒豆腐', '梁記', '梁記麻辣火鍋', '梁記石頭火鍋', '梁記火鍋']

    print 'jaro'
    for name in candidates:
        print name, ':', Levenshtein.jaro(ifName, name)
    print '---------------------------'
    print 'jaro_winkler'
    for name in candidates:
        print name, ':', Levenshtein.jaro_winkler(ifName, name, 0.25)
    print '---------------------------'
    print 'distance'
    for name in candidates:
        print name, ':', Levenshtein.distance(ifName, name)
    print '---------------------------'
    print 'ratio'
    for name in candidates:
        print name, ':', Levenshtein.ratio(ifName, name)
    print '---------------------------'
    print 'fuzzywuzzyRatio'
    for name in candidates:
        print name, ':', fuzz.ratio(ifName, name)
    print '---------------------------'
    print 'fuzzywuzzyPartial_ratio'
    for name in candidates:
        print name, ':', fuzz.partial_ratio(ifName, name)
    print '---------------------------'
    print 'fuzzywuzzyToken_sort_ratio'
    for name in candidates:
        print name, ':', fuzz.token_sort_ratio(ifName, name)
    print '---------------------------'
    print 'fuzzywuzzyToken_set_ratio'
    for name in candidates:
        print name, ':', fuzz.token_set_ratio(ifName, name)
s = len(res2_unmatch)
if s == 0:
    console('All records were matched exactly!')
    res_final = pd.concat([res1_remain, res2_remain], axis=0).reset_index(drop=True)
    res_final.to_csv(path + 'output_data.csv', index=False)  # write the exact-match result directly
else:
    console('%d codes remain for fuzzy matching' % s)  # report how many need fuzzy matching
    # fuzzy-match using the tokenised names
    df = pd.DataFrame(columns=['icd_code_yb', 'name_yb'])
    console("Starting fuzzy matching")
    for item in list(zip(res2_unmatch['icd_code'], res2_unmatch['icd_name'])):
        count += 1
        console("Fuzzy-match progress: matching code number %d" % count)
        yb_diag['codeScore'] = yb_diag['icd_code_yb'].apply(
            lambda x: Levenshtein.jaro_winkler(item[0][:-1], x)
        )  # use Levenshtein.jaro_winkler for the ICD code
        # order-insensitive matching
        yb_diag['nameScore'] = yb_diag['icd_name_yb'].apply(
            lambda x: fuzz.token_sort_ratio(item[1], x) / 100
        )  # use fuzz.token_sort_ratio for the ICD name
        yb_diag['finalScore'] = yb_diag[['codeScore', 'nameScore']].apply(
            lambda x: Score(x['nameScore'], x['codeScore']), axis=1)  # combine code and name similarity into a final score
        df1 = yb_diag.iloc[yb_diag.finalScore.argmax(), :]  # [['icd_code_yb','icd_name_yb']]
        df = df.append(df1)  # note: DataFrame.append is deprecated in recent pandas; pd.concat is the replacement
        if count % 10 == 0:
            _ = count // 10 * 10
def jaroWinklerDistance(form1, form2):
    # prefix weight 0.1 is the classic Winkler setting; empty inputs score 0.0
    return Levenshtein.jaro_winkler(form1, form2, 0.1) if (len(form1) * len(form2) > 0) else 0.0
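# Not from the original source: how the prefix-weight argument used above
# changes the score (python-Levenshtein's jaro_winkler(s1, s2, prefix_weight)).
import Levenshtein

for w in (0.0, 0.1, 0.25):
    # w = 0.0 degenerates to plain Jaro; larger w rewards the common prefix more
    print(w, Levenshtein.jaro_winkler("prefixes", "prefixed", w))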
def stats(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, delete=[], plotX=False):
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            pass
    dimMatrix = 16
    predict = np.zeros((i + 1, dimMatrix))
    clf1, clf2 = train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, delete=delete)
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            a = line.rstrip().split("\t")
            ## create the same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()
            temp = [
                1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                lev.jaro(st1, st2),
                lev.jaro_winkler(st1, st2),
                lev.ratio(st1, st2),
                distance.sorensen(st1, st2),
                jaccard(set(st1), set(st2)),
                1. - distance.nlevenshtein(st1, st2, method=1),
                1. - distance.nlevenshtein(st1, st2, method=2),
                dice_coefficient(st1, st2, lenGram=2),
                dice_coefficient(st1, st2, lenGram=3),
                dice_coefficient(st1, st2, lenGram=4),
                cosineWords(st1, st2),
                cosineBigrams(st1, st2)]
            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.
            predict[i, :-3] = temp
            predict[i, -3] = clf1.decision_function(np.array(temp, dtype=float))
            predict[i, -2] = clf2.decision_function(np.array(temp, dtype=float))
            predict[i, -1] = a[-1]
    if plotX:
        labelsM = ["Lev", "Jaro", "Jaro-Winkler", "Ratio", "Sorensen", "Jaccard", "Lev1", "Lev2",
                   "Dice_2", "Dice_3", "Dice_4", "cosineWords", "cosineBigrams", "SVM", "Logit"]
        f1matrix = np.zeros((100, dimMatrix - 1))
        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0, 1, 100):
            iC += 1
            for j in range(dimMatrix - 1):
                t = np.array(predict[:, j])
                if j >= dimMatrix - 3:
                    t = (t - np.min(t)) / (np.max(t) - np.min(t))
                f1matrix[iC, j] = f1_score(y_pred=t > i, y_true=predict[:, -1])
        F1scores = []
        for j in range(dimMatrix - 1):
            F1scores.append(np.max(f1matrix[:, j]))
            #ax.plot(np.linspace(0,1,100), f1matrix[:,j], label=labelsM[j], color=tableau20[j])
        ax.bar(range(dimMatrix - 1), F1scores)
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        AUCScores = []
        for j in range(dimMatrix - 1):
            # compute ROC curve and the area under it
            fpr, tpr, thresholds = roc_curve(predict[:, -1], predict[:, j])
            AUCScores.append(auc(fpr, tpr))
            # plot the ROC curve
            ax.plot(fpr, tpr, label=labelsM[j], color=tableau20[j])
        ax.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix - 1), AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
def make_feature(data_or, vec_model):
    print('get features:')
    from gensim.models import Word2Vec
    vec_model = Word2Vec.load('pretrain_model/w2v_300.model')
    dictionary = corpora.Dictionary.load('temp_data/train_dictionary.dict')
    tfidf = models.TfidfModel.load("temp_data/train_tfidf.model")
    index = similarities.SparseMatrixSimilarity.load('temp_data/train_index.index')
    item_id_list = joblib.load('temp_data/paper_id.pkl')
    with open('temp_data/train_content.pkl', 'rb') as fr:
        corpus = pickle.load(fr)
    data = data_or.copy()
    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: np.nan if str(x) == 'nan' or len(x) < 9 else x)
    data['abstract_pre'] = data['abstract_pre'].apply(
        lambda x: 'none' if str(x) == 'nan' or str(x).split(' ') == ['n', 'o', 'n', 'e'] else x)
    data['key_text_pre'] = data['key_text_pre'].fillna('none')
    data['description_text'] = data['description_text'].fillna('none')
    data['title_pro'] = data['title_pro'].fillna('none')
    data['description_text_pre'] = data['description_text_pre'].fillna('none')
    prefix = 'num_'

    # text lengths
    data[prefix + 'key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data[prefix + 'description_text_len'] = data['description_text'].apply(lambda x: len(x.split(' ')))
    data.loc[data[prefix + 'key_text_len'] < 7, 'key_text_pre'] = \
        data[data[prefix + 'key_text_len'] < 7]['description_text'].apply(
            lambda x: ' '.join(pre_process(re.sub(r'[\[|,]+\*\*\#\#\*\*[\]|,]+', '', x)))).values

    # is the abstract empty?
    data[prefix + 'cate_pa_isnull'] = data['abstract_pre'].apply(lambda x: 1 if str(x) == 'none' else 0)
    # are the keywords empty?
    data[prefix + 'cate_pkeywords_isnull'] = data['keywords'].apply(lambda x: 1 if str(x) == 'nan' else 0)

    # how many keywords occur in the description text
    def get_num_key(x, y):
        if str(y) == 'nan':
            return -1
        y = y.strip(';').split(';')
        num = 0
        for i in y:
            if i in x:
                num += 1
        return num

    data[prefix + 'key_in_key_word_number'] = list(
        map(lambda x, y: get_num_key(x, y), data['key_text_pre'], data['keywords']))
    # keyword hits divided by the number of keywords
    data[prefix + 'key_in_key_word_number_rate'] = list(
        map(lambda x, y: 0 if x == -1 else x / len(y.strip(';').split(';')),
            data[prefix + 'key_in_key_word_number'], data['keywords']))
    data[prefix + 'key_in_key_word_number2'] = list(
        map(lambda x, y: get_num_key(x, y), data['description_text'], data['keywords']))
    data[prefix + 'key_in_key_word_number2_rate'] = list(
        map(lambda x, y: 0 if x == -1 else x / len(y.strip(';').split(';')),
            data[prefix + 'key_in_key_word_number2'], data['keywords']))

    # counts and ratios of words shared between two text columns
    def get_num_common_words_and_ratio(merge, col):
        merge = merge[col]
        merge.columns = ['q1', 'q2']
        merge['q2'] = merge['q2'].apply(lambda x: 'none' if str(x) == 'nan' else x)
        q1_word_set = merge.q1.apply(lambda x: x.split(' ')).apply(set).values
        q2_word_set = merge.q2.apply(lambda x: x.split(' ')).apply(set).values
        q1_word_len = merge.q1.apply(lambda x: len(x.split(' '))).values
        q2_word_len = merge.q2.apply(lambda x: len(x.split(' '))).values
        q1_word_len_set = merge.q1.apply(lambda x: len(set(x.split(' ')))).values
        q2_word_len_set = merge.q2.apply(lambda x: len(set(x.split(' ')))).values
        result = [len(q1_word_set[i] & q2_word_set[i]) for i in range(len(q1_word_set))]
        result_ratio_q = [result[i] / q1_word_len[i] for i in range(len(q1_word_set))]
        result_ratio_t = [result[i] / q2_word_len[i] for i in range(len(q1_word_set))]
        result_ratio_q_set = [result[i] / q1_word_len_set[i] for i in range(len(q1_word_set))]
        result_ratio_t_set = [result[i] / q2_word_len_set[i] for i in range(len(q1_word_set))]
        return result, result_ratio_q, result_ratio_t, q1_word_len, q2_word_len, \
               q1_word_len_set, q2_word_len_set, result_ratio_q_set, result_ratio_t_set

    data[prefix + 'common_words_k_pt'], \
    data[prefix + 'common_words_k_pt_k'], \
    data[prefix + 'common_words_k_pt_pt'], \
    data[prefix + 'k_len'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set'], \
    data[prefix + 'common_words_k_pt_pt_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'title_pro'])

    data[prefix + 'common_words_k_at'], \
    data[prefix + 'common_words_k_at_k'], \
    data[prefix + 'common_words_k_at_at'], \
    data[prefix + 'k_len'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set'], \
    data[prefix + 'common_words_k_at_at_set'] = get_num_common_words_and_ratio(data, ['key_text_pre', 'abstract_pre'])

    data[prefix + 'common_words_k_pt_2'], \
    data[prefix + 'common_words_k_pt_k_2'], \
    data[prefix + 'common_words_k_pt_pt_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'pt_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'pt_len_set'], \
    data[prefix + 'common_words_k_pt_k_set_2'], \
    data[prefix + 'common_words_k_pt_pt_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'title_pro'])

    data[prefix + 'common_words_k_at_2'], \
    data[prefix + 'common_words_k_at_k_2'], \
    data[prefix + 'common_words_k_at_at_2'], \
    data[prefix + 'k_len_2'], \
    data[prefix + 'at_len'], \
    data[prefix + 'k_len_set_2'], \
    data[prefix + 'at_len_set'], \
    data[prefix + 'common_words_k_at_k_set_2'], \
    data[prefix + 'common_words_k_at_at_set_2'] = get_num_common_words_and_ratio(data, ['description_text', 'abstract_pre'])

    # Jaccard similarity (character level)
    def jaccard(x, y):
        if str(y) == 'nan':
            y = 'none'
        x = set(x)
        y = set(y)
        return float(len(x & y) / len(x | y))

    data[prefix + 'jaccard_sim_k_pt'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['abstract_pre']))
    data[prefix + 'jaccard_sim_k_pt2'] = list(map(lambda x, y: jaccard(x, y), data['description_text'], data['title_pro']))
    data[prefix + 'jaccard_sim_k_pa2'] = list(map(lambda x, y: jaccard(x, y), data['key_text_pre'], data['description_text']))

    # edit-distance features
    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_ratio'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'edict_distance_k_pa'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_jaro_pa'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))

    print('get edit distance:')
    data[prefix + 'edict_distance_k_pt_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_ratio_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_jaro_winkler_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'edict_distance_k_pa_2'] = list(
        map(lambda x, y: Levenshtein.distance(x, y) / (len(x) + 1), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_jaro_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_ratio_pa_2'] = list(
        map(lambda x, y: Levenshtein.ratio(x, y), tqdm(data['description_text']), data['abstract_pre']))
    data[prefix + 'edict_jaro_winkler_pa_2'] = list(
        map(lambda x, y: Levenshtein.jaro_winkler(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # cosine similarity through the tf-idf index
    def get_sim(doc, corpus):
        corpus = corpus.split(' ')
        corpus_vec = [dictionary.doc2bow(corpus)]
        corpus_tfidf = tfidf[corpus_vec]
        featurenum = len(dictionary.token2id.keys())
        index_i = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
        doc = doc.split(' ')
        vec = dictionary.doc2bow(doc)
        vec_tfidf = tfidf[vec]
        sim = index_i.get_similarities(vec_tfidf)
        return sim[0]

    data[prefix + 'sim'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'sim_pa'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'sim_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['title_pro']))
    data[prefix + 'sim_pa_2'] = list(map(lambda x, y: get_sim(x, y), tqdm(data['description_text']), data['abstract_pre']))

    # tf-idf weighted similarities
    def get_simlilary(query, title):
        def get_weight_counter_and_tf_idf(x, y):
            x = x.split()
            y = y.split()
            corups = x + y
            obj = dict(collections.Counter(corups))
            x_weight = []
            y_weight = []
            idfs = []
            for key in obj.keys():
                idf = 1
                w = obj[key]
                if key in x:
                    idf += 1
                    x_weight.append(w)
                else:
                    x_weight.append(0)
                if key in y:
                    idf += 1
                    y_weight.append(w)
                else:
                    y_weight.append(0)
                idfs.append(math.log(3.0 / idf) + 1)
            return [np.array(x_weight), np.array(y_weight),
                    np.array(x_weight) * np.array(idfs),
                    np.array(y_weight) * np.array(idfs),
                    np.array(list(obj.keys()))]

        weight = list(map(lambda x, y: get_weight_counter_and_tf_idf(x, y), tqdm(query), title))
        x_weight_couner = []
        y_weight_couner = []
        x_weight_tfidf = []
        y_weight_tfidf = []
        words = []
        for i in weight:
            x_weight_couner.append(i[0])
            y_weight_couner.append(i[1])
            x_weight_tfidf.append(i[2])
            y_weight_tfidf.append(i[3])
            words.append(i[4])

        # Manhattan distance
        def mhd_simlilary(x, y):
            return np.linalg.norm(x - y, ord=1)

        mhd_simlilary_counter = list(map(lambda x, y: mhd_simlilary(x, y), x_weight_couner, y_weight_couner))
        mhd_simlilary_tfidf = list(map(lambda x, y: mhd_simlilary(x, y), x_weight_tfidf, y_weight_tfidf))

        # cosine similarity
        def cos_simlilary(x, y):
            return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

        cos_simlilary_counter = list(map(lambda x, y: cos_simlilary(x, y), x_weight_couner, y_weight_couner))
        cos_simlilary_tfidf = list(map(lambda x, y: cos_simlilary(x, y), x_weight_tfidf, y_weight_tfidf))

        # Euclidean distance
        def Euclidean_simlilary(x, y):
            return np.sqrt(np.sum(x - y) ** 2)

        Euclidean_simlilary_counter = list(map(lambda x, y: Euclidean_simlilary(x, y), x_weight_couner, y_weight_couner))
        Euclidean__simlilary_tfidf = list(map(lambda x, y: Euclidean_simlilary(x, y), x_weight_tfidf, y_weight_tfidf))

        return mhd_simlilary_counter, mhd_simlilary_tfidf, cos_simlilary_counter, \
               cos_simlilary_tfidf, Euclidean_simlilary_counter, Euclidean__simlilary_tfidf

    data[prefix + 'mhd_similiary'], data[prefix + 'tf_mhd_similiary'], \
    data[prefix + 'cos_similiary'], data[prefix + 'tf_cos_similiary'], \
    data[prefix + 'os_similiary'], data[prefix + 'tf_os_similiary'] = get_simlilary(data['key_text_pre'], data['title_pro'])

    data[prefix + 'mhd_similiary_pa'], data[prefix + 'tf_mhd_similiary_pa'], \
    data[prefix + 'cos_similiary_pa'], data[prefix + 'tf_cos_similiary_pa'], \
    data[prefix + 'os_similiary_pa'], data[prefix + 'tf_os_similiary_pa'] = get_simlilary(data['key_text_pre'], data['abstract_pre'])

    # similarity of averaged word vectors
    def get_vec(x):
        vec = []
        for word in x.split():
            if word in vec_model:
                vec.append(vec_model[word])
        if len(vec) == 0:
            return np.nan
        else:
            return np.mean(np.array(vec), axis=0)

    data['key_text_pre_vec'] = data['key_text_pre'].progress_apply(lambda x: get_vec(x))
    data['title_pro_vec'] = data['title_pro'].progress_apply(lambda x: get_vec(x))
    data['abstract_pre_vec'] = data['abstract_pre'].progress_apply(lambda x: get_vec(x))
    data['description_text_vec'] = data['description_text'].progress_apply(lambda x: get_vec(x))

    # cosine
    data[prefix + 'cos_mean_word2vec'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                  tqdm(data['key_text_pre_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec'] = data[prefix + 'cos_mean_word2vec'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    # Euclidean distance
    data[prefix + 'os_mean_word2vec'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                 tqdm(data['key_text_pre_vec']), data['title_pro_vec']))
    # Manhattan distance
    data[prefix + 'mhd_mean_word2vec'] = list(map(
        lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1),
        tqdm(data['key_text_pre_vec']), data['title_pro_vec']))

    data[prefix + 'cos_mean_word2vec_pa'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                     tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa'] = data[prefix + 'cos_mean_word2vec_pa'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_pa'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                    tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))
    data[prefix + 'mhd_mean_word2vec_pa'] = list(map(
        lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1),
        tqdm(data['key_text_pre_vec']), data['abstract_pre_vec']))

    data[prefix + 'cos_mean_word2vec_2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                    tqdm(data['description_text_vec']), data['title_pro_vec']))
    data[prefix + 'cos_mean_word2vec_2'] = data[prefix + 'cos_mean_word2vec_2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                   tqdm(data['description_text_vec']), data['title_pro_vec']))
    data[prefix + 'mhd_mean_word2vec_2'] = list(map(
        lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1),
        tqdm(data['description_text_vec']), data['title_pro_vec']))

    data[prefix + 'cos_mean_word2vec_pa2'] = list(map(lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)),
                                                      tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    data[prefix + 'cos_mean_word2vec_pa2'] = data[prefix + 'cos_mean_word2vec_pa2'].progress_apply(
        lambda x: np.nan if np.isnan(x).any() else x)
    data[prefix + 'os_mean_word2vec_pa2'] = list(map(lambda x, y: np.sqrt(np.sum((x - y) ** 2)),
                                                     tqdm(data['description_text_vec']), data['abstract_pre_vec']))
    data[prefix + 'mhd_mean_word2vec_pa2'] = list(map(
        lambda x, y: np.nan if np.isnan(x).any() or np.isnan(y).any() else np.linalg.norm(x - y, ord=1),
        tqdm(data['description_text_vec']), data['abstract_pre_vec']))

    # n-gram distance features
    data[prefix + 'n_gram_sim'], data[prefix + 'sim_numeber_rate'] = get_df_grams(data, 2, ['key_text_pre', 'title_pro'])
    data[prefix + 'n_gram_sim_pa'], data[prefix + 'sim_numeber_rate_pa'] = get_df_grams(data, 2, ['key_text_pre', 'abstract_pre'])
    data[prefix + 'n_gram_sim_2'], data[prefix + 'sim_numeber_rate_2'] = get_df_grams(data, 2, ['description_text', 'title_pro'])
    data[prefix + 'n_gram_sim_pa_2'], data[prefix + 'sim_numeber_rate_pa_2'] = get_df_grams(data, 2, ['description_text', 'abstract_pre'])

    ########################### BM25 features: already produced elsewhere ###########################
    # def apply_fun(df):
    #     df.columns = ['d_id', 'key', 'doc']
    #     df['d_id'] = df['d_id'].fillna('always_nan')
    #     query_id_group = df.groupby(['d_id'])
    #     bm_list = []
    #     for name, group in tqdm(query_id_group):
    #         corpus = group['doc'].values.tolist()
    #         corpus = [sentence.strip().split() for sentence in corpus]
    #         query = group['key'].values[0].strip().split()
    #         bm25Model = BM25(corpus)
    #         bmscore = bm25Model.get_scores(query)
    #         bm_list.extend(bmscore)
    #     return bm_list
    # data[prefix + 'bm25'] = apply_fun(data[['description_id', 'key_text_pre', 'title_pro']])
    # data[prefix + 'bm25_pa'] = apply_fun(data[['description_id', 'key_text_pre', 'abstract_pre']])
    # data[prefix + 'bm25_2'] = apply_fun(data[['description_id', 'description_text', 'title_pro']])
    # data[prefix + 'bm25_pa_2'] = apply_fun(data[['description_id', 'description_text', 'abstract_pre']])
    # def get_bm25(p_id, query):
    #     query = query.split(' ')
    #     score = bm25Model.get_score(query, item_id_list.index(p_id))
    #     return score
    # data[prefix + 'bm_25_all'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['key_text_pre']))
    # data[prefix + 'bm_25_all_2'] = list(map(lambda x, y: get_bm25(x, y), tqdm(data['paper_id']), data['description_text']))
    #################################################################################################

    data[prefix + 'Hamming_kt'] = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_dt'] = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_ka'] = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_da'] = list(map(lambda x, y: textdistance.Hamming(qval=None).normalized_distance(x, y),
                                           tqdm(data['description_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_kt'] = list(map(lambda x, y: textdistance.Hamming(qval=None).similarity(x, y),
                                               tqdm(data['key_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_dt'] = list(map(lambda x, y: textdistance.Hamming(qval=None).similarity(x, y),
                                               tqdm(data['description_text_pre']), data['title_pro']))
    data[prefix + 'Hamming_sim_ka'] = list(map(lambda x, y: textdistance.Hamming(qval=None).similarity(x, y),
                                               tqdm(data['key_text_pre']), data['abstract_pre']))
    data[prefix + 'Hamming_sim_da'] = list(map(lambda x, y: textdistance.Hamming(qval=None).similarity(x, y),
                                               tqdm(data['description_text_pre']), data['abstract_pre']))

    # word-level edit distance
    def edit_distance(df, w1, w2):
        word1 = df[w1].split()
        word2 = df[w2].split()
        len1 = len(word1)
        len2 = len(word2)
        dp = np.zeros((len1 + 1, len2 + 1))
        for i in range(len1 + 1):
            dp[i][0] = i
        for j in range(len2 + 1):
            dp[0][j] = j
        for i in range(1, len1 + 1):
            for j in range(1, len2 + 1):
                delta = 0 if word1[i - 1] == word2[j - 1] else 1
                dp[i][j] = min(dp[i - 1][j - 1] + delta, min(dp[i - 1][j] + 1, dp[i][j - 1] + 1))
        return dp[len1][len2]

    data[prefix + 'edit_distance_kt'] = data.apply(edit_distance, axis=1, args=('key_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_dt'] = data.apply(edit_distance, axis=1, args=('description_text_pre', 'title_pro'))
    data[prefix + 'edit_distance_ka'] = data.apply(edit_distance, axis=1, args=('key_text_pre', 'abstract_pre'))
    data[prefix + 'edit_distance_da'] = data.apply(edit_distance, axis=1, args=('description_text_pre', 'abstract_pre'))

    # shared-word position and ratio features
    def get_same_word_features(query, title):
        q_list = query.split()
        t_list = title.split()
        set_query = set(q_list)
        set_title = set(t_list)
        count_words = len(set_query.union(set_title))
        comwords = [word for word in t_list if word in q_list]
        comwords_set = set(comwords)
        unique_rate = len(comwords_set) / count_words
        same_word1 = [w for w in q_list if w in t_list]
        same_word2 = [w for w in t_list if w in q_list]
        same_len_rate = (len(same_word1) + len(same_word2)) / (len(q_list) + len(t_list))
        if len(comwords) > 0:
            com_index1 = len(comwords)
            same_word_q = com_index1 / len(q_list)
            same_word_t = com_index1 / len(t_list)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                com_index1 += sum(index_list)
            q_loc = com_index1 / (len(q_list) * len(comwords))
            com_index2 = len(comwords)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                com_index2 += sum(index_list)
            t_loc = com_index2 / (len(t_list) * len(comwords))
            same_w_set_q = len(comwords_set) / len(set_query)
            same_w_set_t = len(comwords_set) / len(set_title)
            word_set_rate = 2 * len(comwords_set) / (len(set_query) + len(set_title))
            com_set_query_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(q_list) if x == word]
                if len(index_list) > 0:
                    com_set_query_index += index_list[0]
            loc_set_q = com_set_query_index / (len(q_list) * len(comwords_set))
            com_set_title_index = len(comwords_set)
            for word in comwords_set:
                index_list = [i for i, x in enumerate(t_list) if x == word]
                if len(index_list) > 0:
                    com_set_title_index += index_list[0]
            loc_set_t = com_set_title_index / (len(t_list) * len(comwords_set))
            set_rate = (len(comwords_set) / len(comwords))
        else:
            unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, \
            word_set_rate, loc_set_q, loc_set_t, set_rate = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        return unique_rate, same_len_rate, same_word_q, same_word_t, q_loc, t_loc, same_w_set_q, same_w_set_t, \
               word_set_rate, loc_set_q, loc_set_t, set_rate

    data[prefix + "unique_rate_kt"], data[prefix + "same_len_rate_kt"], data[prefix + "same_word_q_kt"], \
    data[prefix + "same_word_t_kt"], data[prefix + "q_loc_kt"], data[prefix + "t_loc_kt"], \
    data[prefix + "same_w_set_q_kt"], data[prefix + "same_w_set_t_kt"], data[prefix + "word_set_rate_kt"], \
    data[prefix + "loc_set_q_kt"], data[prefix + "loc_set_t_kt"], data[prefix + "set_rate_kt"] = zip(
        *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["title_pro"]), axis=1))
    data[prefix + "unique_rate_dt"], data[prefix + "same_len_rate_dt"], data[prefix + "same_word_q_dt"], \
    data[prefix + "same_word_t_dt"], data[prefix + "q_loc_dt"], data[prefix + "t_loc_dt"], \
    data[prefix + "same_w_set_q_dt"], data[prefix + "same_w_set_t_dt"], data[prefix + "word_set_rate_dt"], \
    data[prefix + "loc_set_q_dt"], data[prefix + "loc_set_t_dt"], data[prefix + "set_rate_dt"] = zip(
        *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["title_pro"]), axis=1))
    data[prefix + "unique_rate_ka"], data[prefix + "same_len_rate_ka"], data[prefix + "same_word_q_ka"], \
    data[prefix + "same_word_t_ka"], data[prefix + "q_loc_ka"], data[prefix + "t_loc_ka"], \
    data[prefix + "same_w_set_q_ka"], data[prefix + "same_w_set_t_ka"], data[prefix + "word_set_rate_ka"], \
    data[prefix + "loc_set_q_ka"], data[prefix + "loc_set_t_ka"], data[prefix + "set_rate_ka"] = zip(
        *data.apply(lambda line: get_same_word_features(line["key_text_pre"], line["abstract_pre"]), axis=1))
    data[prefix + "unique_rate_da"], data[prefix + "same_len_rate_da"], data[prefix + "same_word_q_da"], \
    data[prefix + "same_word_t_da"], data[prefix + "q_loc_da"], data[prefix + "t_loc_da"], \
    data[prefix + "same_w_set_q_da"], data[prefix + "same_w_set_t_da"], data[prefix + "word_set_rate_da"], \
    data[prefix + "loc_set_q_da"], data[prefix + "loc_set_t_da"], data[prefix + "set_rate_da"] = zip(
        *data.apply(lambda line: get_same_word_features(line["description_text_pre"], line["abstract_pre"]), axis=1))

    # 3-gram distance features
    def get_df_grams_3(train_sample, values, cols):
        def create_ngram_set(input_list, ngram_value=3):
            return set(zip(*[input_list[i:] for i in range(ngram_value)]))

        def get_n_gram(df, values=3):
            train_query = df.values
            train_query = [[word for word in str(sen).replace("'", '').split(' ')] for sen in train_query]
            train_query_n = []
            for input_list in train_query:
                train_query_n_gram = set()
                for value in range(3, values + 1):
                    train_query_n_gram = train_query_n_gram | create_ngram_set(input_list, value)
                train_query_n.append(train_query_n_gram)
            return train_query_n

        train_query = get_n_gram(train_sample[cols[0]], values)
        train_title = get_n_gram(train_sample[cols[1]], values)
        sim = list(map(lambda x, y: len(x) + len(y) - 2 * len(x & y), train_query, train_title))
        sim_number_rate = list(map(lambda x, y: len(x & y) / len(x) if len(x) != 0 else 0, train_query, train_title))
        return sim, sim_number_rate

    data[prefix + '3_gram_sim'], data[prefix + 'sim_numeber_rate_3'] = get_df_grams_3(data, 3, ['key_text_pre', 'title_pro'])
    data[prefix + '3_gram_sim_pa'], data[prefix + 'sim_numeber_rate_pa_3'] = get_df_grams_3(data, 3, ['key_text_pre', 'abstract_pre'])
    data[prefix + '3_gram_sim_2'], data[prefix + 'sim_numeber_rate_2_3'] = get_df_grams_3(data, 3, ['description_text_pre', 'title_pro'])
    data[prefix + '3_gram_sim_pa_2'], data[prefix + 'sim_numeber_rate_pa_2_3'] = get_df_grams_3(data, 3, ['description_text_pre', 'abstract_pre'])

    # longest common word-substring features
    def get_son_str_feature(query, title):
        q_list = query.split()
        query_len = len(q_list)
        t_list = title.split()
        title_len = len(t_list)
        count1 = np.zeros((query_len + 1, title_len + 1))
        index = np.zeros((query_len + 1, title_len + 1))
        for i in range(1, query_len + 1):
            for j in range(1, title_len + 1):
                if q_list[i - 1] == t_list[j - 1]:
                    count1[i][j] = count1[i - 1][j - 1] + 1
                    index[i][j] = index[i - 1][j - 1] + j
                else:
                    count1[i][j] = 0
                    index[i][j] = 0
        max_count1 = count1.max()
        if max_count1 != 0:
            row = int(np.where(count1 == np.max(count1))[0][0])
            col = int(np.where(count1 == np.max(count1))[1][0])
            mean_pos = index[row][col] / (max_count1 * title_len)
            begin_loc = (col - max_count1 + 1) / title_len
            rows = np.where(count1 != 0.0)[0]
            cols = np.where(count1 != 0.0)[1]
            total_loc = 0
            for i in range(0, len(rows)):
                total_loc += index[rows[i]][cols[i]]
            density = total_loc / (query_len * title_len)
            rate_q_len = max_count1 / query_len
            rate_t_len = max_count1 / title_len
        else:
            begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len = 0, 0, 0, 0, 0, 0
        return max_count1, begin_loc, mean_pos, total_loc, density, rate_q_len, rate_t_len

    data[prefix + "long_same_max_count1_kt"], data[prefix + "long_same_local_begin_kt"], \
    data[prefix + "long_same_local_mean_kt"], data[prefix + "long_same_total_loc_kt"], \
    data[prefix + "long_same_density_kt"], data[prefix + "long_same_rate_q_len_kt"], \
    data[prefix + "long_same_rate_t_len_kt"] = zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["title_pro"]), axis=1))
    data[prefix + "long_same_max_count1_dt"], data[prefix + "long_same_local_begin_dt"], \
    data[prefix + "long_same_local_mean_dt"], data[prefix + "long_same_total_loc_dt"], \
    data[prefix + "long_same_density_dt"], data[prefix + "long_same_rate_q_len_dt"], \
    data[prefix + "long_same_rate_t_len_dt"] = zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["title_pro"]), axis=1))
    data[prefix + "long_same_max_count1_da"], data[prefix + "long_same_local_begin_da"], \
    data[prefix + "long_same_local_mean_da"], data[prefix + "long_same_total_loc_da"], \
    data[prefix + "long_same_density_da"], data[prefix + "long_same_rate_q_len_da"], \
    data[prefix + "long_same_rate_t_len_da"] = zip(
        *data.apply(lambda line: get_son_str_feature(line["description_text_pre"], line["abstract_pre"]), axis=1))
    data[prefix + "long_same_max_count1_ka"], data[prefix + "long_same_local_begin_ka"], \
    data[prefix + "long_same_local_mean_ka"], data[prefix + "long_same_total_loc_ka"], \
    data[prefix + "long_same_density_ka"], data[prefix + "long_same_rate_q_len_ka"], \
    data[prefix + "long_same_rate_t_len_ka"] = zip(
        *data.apply(lambda line: get_son_str_feature(line["key_text_pre"], line["abstract_pre"]), axis=1))

    # counts of words shared between query and title
    def q_t_common_words(query, title):
        query = set(query.split(' '))
        title = set(title.split(' '))
        return len(query & title)

    data[prefix + 'common_words_kt'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.title_pro), axis=1)
    data[prefix + 'common_words_dt'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.title_pro), axis=1)
    data[prefix + 'common_words_ka'] = data.apply(lambda index: q_t_common_words(index.key_text_pre, index.abstract_pre), axis=1)
    data[prefix + 'common_words_da'] = data.apply(lambda index: q_t_common_words(index.description_text_pre, index.abstract_pre), axis=1)

    data['key_text_len'] = data['key_text_pre'].apply(lambda x: len(x.split(' ')))
    data['description_text_pre_len'] = data['description_text_pre'].apply(lambda x: len(x.split(' ')))
    data['title_pro_len'] = data['title_pro'].apply(lambda x: len(x.split(' ')))
    data['abstract_pre_len'] = data['abstract_pre'].apply(lambda x: len(x.split(' ')))
    data[prefix + 'common_words_kt_rate_k'] = data[prefix + 'common_words_kt'] / data['key_text_len']
    data[prefix + 'common_words_kt_rate_t'] = data[prefix + 'common_words_kt'] / data['title_pro_len']
    data[prefix + 'common_words_dt_rate_d'] = data[prefix + 'common_words_dt'] / data['description_text_pre_len']
    data[prefix + 'common_words_dt_rate_t'] = data[prefix + 'common_words_dt'] / data['title_pro_len']
    data[prefix + 'common_words_ka_rate_k'] = data[prefix + 'common_words_ka'] / data['key_text_len']
    data[prefix + 'common_words_ka_rate_a'] = data[prefix + 'common_words_ka'] / data['abstract_pre_len']
    data[prefix + 'common_words_da_rate_d'] = data[prefix + 'common_words_da'] / data['description_text_pre_len']
    data[prefix + 'common_words_da_rate_a'] = data[prefix + 'common_words_da'] / data['abstract_pre_len']

    # keep only the id columns plus the generated num_ features
    feat = ['description_id', 'paper_id']
    for col in data.columns:
        if re.match('num_', col) != None:
            feat.append(col)
    data = data[feat]
    return data
for index, row in df_train.iterrows():
    jacc = jaccard_similarity(row.question1, row.question2)
    tr_jacc_coef.append(jacc)
train_feat['jacc_coef'] = tr_jacc_coef

# jaccard coefficient of test set
te_jacc_coef = list()
for index, row in df_test.iterrows():
    jacc = jaccard_similarity(row.question1, row.question2)
    te_jacc_coef.append(jacc)
test_feat['jacc_coef'] = te_jacc_coef

# jarowinkler
tr_jarowinkler = list()
for index, row in df_train.iterrows():
    jaro = Levenshtein.jaro_winkler(row.question1, row.question2)
    tr_jarowinkler.append(jaro)
train_feat['jarowinkler'] = tr_jarowinkler

te_jarowinkler = list()
for index, row in df_test.iterrows():
    jaro = Levenshtein.jaro_winkler(row.question1, row.question2)
    te_jarowinkler.append(jaro)
test_feat['jarowinkler'] = te_jarowinkler

# dice distance
tr_dice = list()
for i in range(len(train_q1_words_s)):
    total = len(train_q1_words_s[i]) + len(train_q2_words_s[i])
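# The per-row loops above can be collapsed with DataFrame.apply; a sketch
# under the same assumptions (df_train with question1/question2 columns):
train_feat['jarowinkler'] = df_train.apply(
    lambda r: Levenshtein.jaro_winkler(r.question1, r.question2), axis=1)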
def value(self, word, correction):
    return Levenshtein.jaro_winkler(word, correction)
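# Plausible use of a value()-style scorer (harness assumed, not from the
# original source): rank candidate corrections for a misspelling.
import Levenshtein

candidates = ["receive", "recipe", "recede"]
print(sorted(candidates,
             key=lambda c: Levenshtein.jaro_winkler("recieve", c),
             reverse=True))  # "receive" should come first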
def getwikidatacity(_step, list_wikidataid, ne_fid, ne_xid, ne_lon, ne_lat, ne_wikidataid, ne_name ,ne_namealt ,ne_adm0name,ne_adm1name,ne_ls_name,ne_geonameid, ne_scalerank,ne_labelrank,ne_natscale): query_template=""" PREFIX geo: <http://www.opengis.net/ont/geosparql#> SELECT ?place ?placeLabel ?placeDescription (group_concat(distinct ?pLabel ; separator = "#") as ?type_grp) (group_concat(distinct ?placeLabelru ; separator = "#") as ?placeLabelru) (group_concat(distinct ?sitelink_en ; separator = "#") as ?sitelink_en) (group_concat(distinct ?sitelink_es ; separator = "#") as ?sitelink_es) (group_concat(distinct ?sitelink_ru ; separator = "#") as ?sitelink_ru) (group_concat(distinct ?sitelink_zh ; separator = "#") as ?sitelink_zh) (group_concat(distinct ?sitelink_ceb ; separator = "#") as ?sitelink_ceb) (group_concat(distinct ?countryLabelx; separator = "#") as ?countryLabel) (SAMPLE(?sistercity) as ?sistercity_sample) (AVG(?distance) as ?distance ) (MAX(?population) as ?max_population ) (group_concat(distinct ?place_alternative ; separator = "#") as ?place_alternative_grp) (group_concat(distinct ?GeoNames_ID ; separator = "#") as ?GeoNames_ID_grp) WITH { SELECT DISTINCT ?place ?distance { #S1# ?place p:P31/ps:P31 wd:Q515. #S2# ?place p:P31/ps:P31 wd:Q3957. #S3# {?place (p:P31/wdt:P31/wdt:P279*) wd:Q532. } #S3# UNION {?place p:P31/ps:P31 wd:Q532. } #S3# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q15078955.} #S3# UNION {?place p:P31/ps:P31 wd:Q15078955.} #S3# UNION { #S3# ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 . #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }. #S3# FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }. #S3# ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en"). #S3# } #S3# UNION { #S3# ?place p:P31/ps:P31 wd:Q486972. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }. #S3# FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }. #S3# ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en"). #S3# } #S3# UNION { #S3# ?place p:P31/ps:P31/wdt:P279* wd:Q486972. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }. #S3# FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }. #S3# FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }. #S3# ?place rdfs:label ?placeLabel_en FILTER (lang(?placeLabel_en) = "en"). #S3# } #S4# {?place (p:P31/wdt:P31/wdt:P279*) wd:Q2039348. } #S4# UNION {?place p:P31/ps:P31 wd:Q2039348. } #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1867183. } #S4# UNION {?place p:P31/ps:P31 wd:Q1867183. } #S4# UNION {?place wdt:P1376 ?admin_ara. } #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1637706. } #S4# UNION {?place p:P31/ps:P31 wd:Q1637706. } #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q16861602.} #S4# UNION {?place p:P31/ps:P31 wd:Q16861602.} #S4# UNION {?place p:P31/ps:P31 wd:Q188509. ?place p:P17/ps:P17 wd:Q408. } #S4# UNION {?place (p:P31/wdt:P31/wdt:P279*) wd:Q1070990. } #S4# UNION {?place p:P31/ps:P31 wd:Q1070990. } #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q748149. } #S4# UNION {?place p:P31/ps:P31 wd:Q748149. } #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q735428. } #S4# UNION {?place p:P31/ps:P31 wd:Q735428. } #S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q318727. } #S4# UNION {?place p:P31/ps:P31 wd:Q318727. 
}
#S4# UNION {?place p:P31/wdt:P31/wdt:P279* wd:Q15284. }
#S4# UNION {?place p:P31/ps:P31 wd:Q15284. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q15284. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q532. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q15078955.}
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q498162. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3389680. }
#S4# UNION {?place p:P31/ps:P31 wd:Q1639634. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1639634. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2112349. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q749622. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q11618417. }
#S4# UNION {?place p:P31/ps:P31 wd:Q11618417. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q640364. }
#S4# UNION {?place p:P31/ps:P31 wd:Q640364. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2555896. }
#S4# UNION {?place p:P31/ps:P31 wd:Q2555896. }
#S4# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q109108. }
#S4# UNION {?place p:P31/ps:P31 wd:Q109108. }
#S5# {?place p:P31/ps:P31/wdt:P279* wd:Q1763214. }
#S5# UNION {?place p:P31/ps:P31 wd:Q1763214. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1840161. }
#S5# UNION {?place p:P31/ps:P31 wd:Q1840161. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q4249901. }
#S5# UNION {?place p:P31/ps:P31 wd:Q4249901. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685463. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3685463. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q12081657. }
#S5# UNION {?place p:P31/ps:P31 wd:Q12081657. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q27676416. }
#S5# UNION {?place p:P31/ps:P31 wd:Q27676416. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3076994. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3076994. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3360771. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3360771. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685463. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3685463. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q605291. }
#S5# UNION {?place p:P31/ps:P31 wd:Q605291. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1539014. }
#S5# UNION {?place p:P31/ps:P31 wd:Q1539014. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q7830262. }
#S5# UNION {?place p:P31/ps:P31 wd:Q7830262. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3327862. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3327862. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q956318. }
#S5# UNION {?place p:P31/ps:P31 wd:Q956318. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q155239. }
#S5# UNION {?place p:P31/ps:P31 wd:Q155239. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q27676428. }
#S5# UNION {?place p:P31/ps:P31 wd:Q27676428. }
#S5# UNION {?place p:P31/ps:P31 wd:Q5084. ?place p:P17/ps:P17 wd:Q16. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q17305746. }
#S5# UNION {?place p:P31/ps:P31 wd:Q17305746. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q14762300. }
#S5# UNION {?place p:P31/ps:P31 wd:Q14762300. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q17366755. }
#S5# UNION {?place p:P31/ps:P31 wd:Q17366755. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3327873. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3327873. }
#S5# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3788231. }
#S5# UNION {?place p:P31/ps:P31 wd:Q3788231. }
# --- S6 -------------------
#S6# {?place p:P31/ps:P31/wdt:P279* wd:Q6609799. }
#S6# UNION {?place p:P31/ps:P31 wd:Q6609799. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q3685430. }
#S6# UNION {?place p:P31/ps:P31 wd:Q3685430. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2679157. }
#S6# UNION {?place p:P31/ps:P31 wd:Q2679157. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2989470. }
#S6# UNION {?place p:P31/ps:P31 wd:Q2989470. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q6593035. }
#S6# UNION {?place p:P31/ps:P31 wd:Q6593035. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q43742. }
#S6# UNION {?place p:P31/ps:P31 wd:Q43742. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q83020. }
#S6# UNION {?place p:P31/ps:P31 wd:Q83020. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2706302. }
#S6# UNION {?place p:P31/ps:P31 wd:Q2706302. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q482821. }
#S6# UNION {?place p:P31/ps:P31 wd:Q482821. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q2225003. }
#S6# UNION {?place p:P31/ps:P31 wd:Q2225003. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q133442. }
#S6# UNION {?place p:P31/ps:P31 wd:Q133442. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1500350. }
#S6# UNION {?place p:P31/ps:P31 wd:Q1500350. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q16725943. }
#S6# UNION {?place p:P31/ps:P31 wd:Q16725943. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q9316670. }
#S6# UNION {?place p:P31/ps:P31 wd:Q9316670. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1065118. }
#S6# UNION {?place p:P31/ps:P31 wd:Q1065118. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1289426. }
#S6# UNION {?place p:P31/ps:P31 wd:Q1289426. }
#S6# UNION {?place p:P31/ps:P31/wdt:P279* wd:Q1336099. }
#S6# UNION {?place p:P31/ps:P31 wd:Q1336099. }
#S6# {
#S6#   ?place (p:P31/wdt:P31/wdt:P279*) wd:Q486972 .
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
#S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
#S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
#S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
#S6# }
#S6# UNION {
#S6#   ?place p:P31/ps:P31 wd:Q486972.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
#S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
#S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
#S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
#S6# }
#S6# UNION {
#S6#   ?place p:P31/ps:P31/wdt:P279* wd:Q486972.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q131596. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q5084. }.
#S6#   FILTER NOT EXISTS { ?place wdt:P31 wd:Q2514025 }.
#S6#   FILTER NOT EXISTS { ?place wdt:P36 ?capitalplace }.
#S6#   # FILTER(NOT EXISTS { ?item rdfs:label ?lang_labelx. FILTER(LANG(?lang_labelx) = "en") }).
#S6#   ?place rdfs:label ?placeLabel_xru FILTER (lang(?placeLabel_xru) = "ru").
#S6# }
#S7# FILTER EXISTS { ?place wdt:P190 ?sistercity_x.}
#S8# VALUES ?GeoNames_ID {"3383494"}
#S8# ?place wdt:P1566 ?GeoNames_ID.
#S9# VALUES ?searchnames {"#ne_name#"@en "#ne_name#"@es "#ne_name#"@sv
#S9#                      "#ne_name#"@de "#ne_name#"@fr "#ne_name#"@pt
#S9#                      "#ne_name#"@it "#ne_name#"@da "#ne_name#"@pl
#S9#                      "#ne_name#"@cz "#ne_name#"@sk "#ne_name#"@hu
#S9#                      "#ne_name#"@lt "#ne_name#"@et "#ne_name#"@lv
#S9#                      "#ne_name#"@no "#ne_name#"@nl "#ne_name#"@fi }
#S9# ?place rdfs:label ?searchnames .
SERVICE wikibase:around {   # "#ne_name#" , "#ne_adm0name#"
  ?place wdt:P625 ?location.
  bd:serviceParam wikibase:center "Point(16.373064 48.20833)"^^geo:wktLiteral.
  bd:serviceParam wikibase:radius "#distance#".
  bd:serviceParam wikibase:distance ?distance.
}
}
} AS %places
WHERE {
  INCLUDE %places .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en".}
  OPTIONAL {?place rdfs:label ?placeLabelru FILTER (lang(?placeLabelru)="ru").}
  OPTIONAL {?place wdt:P31 ?property. ?property rdfs:label ?pLabel FILTER (lang(?pLabel)="en").}
  OPTIONAL {?place wdt:P17 ?country. ?country rdfs:label ?countryLabelx FILTER (lang(?countryLabelx)="en").}
  OPTIONAL {?place wdt:P17 ?country.}
  OPTIONAL {?place wdt:P1566 ?GeoNames_ID.}
  OPTIONAL {?place wdt:P190 ?sistercity.}
  OPTIONAL {?place wdt:P1082 ?population.}
  OPTIONAL {?sitelink_en schema:about ?place . ?sitelink_en schema:isPartOf <https://en.wikipedia.org/>.}
  OPTIONAL {?sitelink_es schema:about ?place . ?sitelink_es schema:isPartOf <https://es.wikipedia.org/>.}
  OPTIONAL {?sitelink_ru schema:about ?place . ?sitelink_ru schema:isPartOf <https://ru.wikipedia.org/>.}
  OPTIONAL {?sitelink_zh schema:about ?place . ?sitelink_zh schema:isPartOf <https://zh.wikipedia.org/>.}
  OPTIONAL {?sitelink_ceb schema:about ?place . ?sitelink_ceb schema:isPartOf <https://ceb.wikipedia.org/>.}
  OPTIONAL {?place skos:altLabel ?place_alternative FILTER((LANG(?place_alternative)) = "en").}
}
GROUP BY ?place ?placeLabel ?placeDescription
ORDER BY ?distance
"""

q = query_template.replace('16.373064', ne_lon).replace('48.20833', ne_lat)
q = q.replace('#ne_name#', ne_name).replace('#ne_adm0name#', ne_adm0name)
q = q.replace('"3383494"', '"' + ne_geonameid + '"')

# Activate the clauses of the current step by stripping their #S<n># prefix.
if 1 <= _step <= 9:
    q = q.replace('#S%d#' % _step, '')
else:
    print("Internal error, _step: ", _step)
    sys.exit(1)

# Use a smaller search radius inside Europe (roughly lon -10..60, lat > 30),
# a larger one elsewhere; step 8 (GeoNames id lookup) always searches widely.
if (-10 <= float(ne_lon) <= 60) and (float(ne_lat) > 30):
    search_distance = {8: 1200, 9: 100}.get(_step, 50)
else:
    search_distance = {1: 150, 2: 150, 3: 120, 8: 1200}.get(_step, 100)
print("_step:", _step, " search_distance=", search_distance)

# Collapse runs of spaces.
while '  ' in q:
    q = q.replace('  ', ' ')

# Drop comment lines and the still-inactive #S<n># clauses.
qs = ''
for line in q.splitlines():
    if len(line) > 0 and line[:2] != ' #' and line[:2] != '#S':
        qs += line + '\n'
q = qs

ts = datetime.datetime.now()
max_score = -1000
results = None
retries = 0
max_retries = 14
while results is None and retries < max_retries:
    try:
        results = None
        sleeptime = retries * 10 + 5
        qs = q.replace('#distance#', str(search_distance))
        print("distance-ok")
        if retries > 0:
            print("Try - retries:", retries, " Distance:", search_distance, " Sleeptime:", sleeptime)
        if args.filter_name != '':
            print(qs)
        sparql.setQuery(qs)
        sparql.setTimeout(2000)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
    except SPARQLExceptions.EndPointNotFound:
        print("ERRwikidata-SPARQLExceptions-EndPointNotFound: Retrying in (seconds) : ", sleeptime, flush=True)
        time.sleep(sleeptime)
        retries += 1
        continue
    except SPARQLExceptions.EndPointInternalError:
        print("ERRwikidata-SPARQLExceptions-EndPointInternalError: Retrying in (seconds) : ", sleeptime, flush=True)
        time.sleep(sleeptime)
        retries += 1
        # Shrink the search radius after repeated endpoint failures.
        if retries > 3:
            search_distance = int(search_distance * 0.9)
        continue
    except TimeoutError:
        print("ERRwikidata-SPARQLExceptions TimeOut : Retrying in (seconds) : ", sleeptime, flush=True)
        time.sleep(sleeptime)
        retries += 1
        continue
    except SPARQLExceptions.QueryBadFormed:
        print("ERRwikidata-SPARQLExceptions-QueryBadFormed : Check! ", flush=True)
        return "error"
    except HTTPError:
        print("ERRwikidata: Got an HTTPError while querying. Retrying in (seconds) : ", sleeptime, flush=True)
        time.sleep(sleeptime)
        retries += 1
        continue
    except Exception:
        print("ERRwikidata: other error. Retrying in (seconds) : ", sleeptime, flush=True)
        time.sleep(sleeptime)
        retries += 1
        continue

if results is None and retries >= max_retries:
    print("Wikidata request failed ; system stopped! ")
    sys.exit(1)

_runtime = (datetime.datetime.now() - ts).total_seconds()
rc_list_wikidataid = []
# TODO: handle an empty answer.
for result in results['results']['bindings']:
    _score = 0
    wd_id = result['place']['value'].split('/')[4]
    wd_distance = float(result['distance']['value'])
    wd_label = result['placeLabel']['value'] if 'placeLabel' in result else ''

    # Skip places that have already been queried.
    if wd_id in list_wikidataid:
        print("Already exists:", wd_id, wd_label)
        continue
    rc_list_wikidataid.append(wd_id)

    wd_label_ru = result['placeLabelru']['value'] if 'placeLabelru' in result else ''
    wd_description = result['placeDescription']['value'] if 'placeDescription' in result else ''
    wd_type = '#' + result['type_grp']['value'] + '#' if 'type_grp' in result else ''

    if 'countryLabel' in result:
        wd_countrylabel = result['countryLabel']['value']
        # Penalty grows as the country labels diverge: 0 when identical, down to -20.
        cldiff = -(20 - (20 * Levenshtein.jaro_winkler(
            unidecode.unidecode(ne_adm0name), unidecode.unidecode(wd_countrylabel))))
        _score += cldiff
    else:
        wd_countrylabel = ''

    wd_sitelink_en = result['sitelink_en']['value'] if 'sitelink_en' in result else ''
    _score += 40 if wd_sitelink_en != '' else -120

    wd_sitelink_es = result['sitelink_es']['value'] if 'sitelink_es' in result else ''
    wd_sitelink_ru = result['sitelink_ru']['value'] if 'sitelink_ru' in result else ''
    wd_sitelink_zh = result['sitelink_zh']['value'] if 'sitelink_zh' in result else ''
    wd_sitelink_ceb = result['sitelink_ceb']['value'] if 'sitelink_ceb' in result else ''
    if wd_sitelink_en == '':
        if wd_sitelink_es != '':
            _score += 100
        elif wd_sitelink_ru != '':
            _score += 80
        elif wd_sitelink_zh != '':
            _score += 60
        elif wd_sitelink_ceb != '':
            _score += -1000  # penalty for a ceb-only import

    wd_geonames_id_grp = '#' + result['GeoNames_ID_grp']['value'] + '#' if 'GeoNames_ID_grp' in result else ''

    if 'max_population' in result:
        wd_max_population = result['max_population']['value']
        if wd_max_population != '':
            _score += 8
    else:
        wd_max_population = ''

    wd_place_alternative_grp = '#' + result['place_alternative_grp']['value'] + '#' if 'place_alternative_grp' in result else ''
    if '#' + ne_name + '#' in wd_place_alternative_grp:
        _in_altnames = 'Y'
        _score += 72
    if '#' + unidecode.unidecode(ne_name) + '#' in unidecode.unidecode(wd_place_alternative_grp):
        _in_altnames = 'Y'
        _score += 58
    else:
        _in_altnames = 'N'

    wd_has_sistercity = ""
    if 'sistercity_sample' in result:
        if result['sistercity_sample']['value'] != '':
            wd_has_sistercity = "Y"
            _score += 15

    uni_ne_name = unidecode.unidecode(ne_name)
    uni_ne_ls_name = unidecode.unidecode(ne_ls_name)
    uni_ne_namealt = unidecode.unidecode(ne_namealt)
    uni_ne_adm0name = unidecode.unidecode(ne_adm0name)
    uni_ne_adm1name = unidecode.unidecode(ne_adm1name)
    uni_wd_name = unidecode.unidecode(wd_label)

    if wd_label == wd_id and wd_label_ru != '':
        _lev_jaro_winkler_ru = Levenshtein.jaro_winkler(uni_ne_name, unidecode.unidecode(wd_label_ru))
    else:
        _lev_jaro_winkler_ru = 0

    _lev_ratio = Levenshtein.ratio(uni_ne_name, uni_wd_name)
    _lev_distance = Levenshtein.distance(uni_ne_name, uni_wd_name)
    _lev_jaro = Levenshtein.jaro(uni_ne_name, uni_wd_name)
    _lev_jaro_winkler = Levenshtein.jaro_winkler(uni_ne_name, uni_wd_name)
    _lev_jaro_winkler_ls = Levenshtein.jaro_winkler(uni_ne_ls_name, uni_wd_name)
    _lev_jaro_winkler_alt = Levenshtein.jaro_winkler(uni_ne_namealt, uni_wd_name)
    _lev_jaro_winkler_adm0 = Levenshtein.jaro_winkler(uni_ne_name + ',' + uni_ne_adm0name, uni_wd_name)
    _lev_jaro_winkler_adm1 = Levenshtein.jaro_winkler(uni_ne_name + ',' + uni_ne_adm1name, uni_wd_name)
    _max_lev_jaro_winkler = max(_lev_jaro_winkler, _lev_jaro_winkler_ls, _lev_jaro_winkler_alt,
                                _lev_jaro_winkler_adm0, _lev_jaro_winkler_adm1, _lev_jaro_winkler_ru)
    _match_rating_comparison = jellyfish.match_rating_comparison(uni_ne_name, uni_wd_name)
    _damerau_levenshtein_distance = jellyfish.damerau_levenshtein_distance(uni_ne_name, uni_wd_name)
    _hamming_distance = jellyfish.hamming_distance(uni_ne_name, uni_wd_name)

    _score += _max_lev_jaro_winkler * 10

    if ne_name == wd_label:
        _name_status = 'R01-Equal'
        _score += 100
    elif ne_name.lower() == wd_label.lower():
        _name_status = 'R12-Lowcase_equal'
        _score += 99
    elif uni_ne_name == uni_wd_name:
        _name_status = 'R13-Unidecode_equal'
        _score += 90
    elif uni_ne_ls_name == uni_wd_name:
        _name_status = 'R31-ls_name eq'
        _score += 60
    elif uni_ne_namealt == uni_wd_name:
        _name_status = 'R32-namealt eq'
        _score += 60
    # NOTE: the original also had an identical 'R33-namealt eq' branch here; it was unreachable.
    elif _max_lev_jaro_winkler == 1.0:
        _name_status = 'R41- max(jaro_winkler)=1'
        _score += 50
    elif _max_lev_jaro_winkler >= 0.9:
        _name_status = 'R42- max(jaro_winkler) 0.9-1.0'
        _score += 40
    elif _max_lev_jaro_winkler >= 0.8:
        _name_status = 'R43- max(jaro_winkler) 0.8-0.9'
        _score += 30
    else:
        _name_status = ''

    if wd_distance < 5:
        _score += 10
    elif wd_distance < 10:
        _score += 5
    elif wd_distance > 60:
        _score += -30
    elif wd_distance > 30:
        _score += -15
    elif wd_distance > 15:
        _score += -5

    if ne_geonameid != '' and ('#' + ne_geonameid + '#' in wd_geonames_id_grp):
        _geonames_status = 'EQ'
        _score += 40
    elif (ne_geonameid != '' and ne_geonameid != '-1' and wd_geonames_id_grp != '##'
          and ('#' + ne_geonameid + '#' not in wd_geonames_id_grp)):
        _geonames_status = 'NE'
        _score += 0
    else:
        _geonames_status = 'Na'

    if ne_wikidataid != '' and wd_id != '' and ne_wikidataid == wd_id:
        _wikidata_status = 'EQ'
        _score += 15
    elif ne_wikidataid != '' and wd_id != '':
        _wikidata_status = 'NE'
        # A smaller (older) wikidata id is sometimes the better match.
        if float(ne_wikidataid[1:]) > float(wd_id[1:]):
            _score += 3
        else:
            _score += -3
    else:
        _wikidata_status = 'Na'

    if _score > max_score:
        max_score = _score

    if _score > 140:
        print("@@_score>140:", ne_name, " :: ", wd_id, wd_label, wd_description, wd_type)

    c.execute("INSERT INTO wd VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", (
        ne_fid, ne_wikidataid, wd_id, ne_name, wd_label,
        ne_adm0name, wd_countrylabel, ne_adm1name, ne_ls_name, ne_namealt,
        wd_description, wd_type, ne_geonameid, wd_geonames_id_grp, _geonames_status,
        wd_place_alternative_grp, wd_sitelink_en, wd_sitelink_es, wd_sitelink_ru, wd_sitelink_zh,
        wd_sitelink_ceb, wd_label_ru, wd_has_sistercity, wd_max_population, wd_distance,
        _step, _score, _name_status, _wikidata_status, _in_altnames,
        _lev_ratio, _lev_distance, _lev_jaro, _lev_jaro_winkler,
        ne_scalerank, ne_labelrank, ne_natscale, ne_xid, ts, search_distance,
        retries, _runtime))
    conn.commit()
    sys.stdout.flush()

if max_score <= 30:
    print(" Low score .. stop ", max_score)

return list_wikidataid + rc_list_wikidataid, max_score
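
# The matcher above scores a candidate by the *best* Jaro-Winkler similarity
# over several name variants, after ASCII-folding both sides with unidecode.
# A minimal sketch of that max-over-variants idea; the helper name and the
# sample data below are illustrative, not part of the original matcher.
import unidecode
import Levenshtein

def best_name_similarity(ne_name, ne_variants, wd_label):
    """Best Jaro-Winkler similarity between any known name variant and the
    Wikidata label, after ASCII-folding both sides."""
    uni_wd = unidecode.unidecode(wd_label)
    return max(Levenshtein.jaro_winkler(unidecode.unidecode(v), uni_wd)
               for v in [ne_name] + ne_variants)

# Accent differences disappear after unidecode, so the score reflects the
# name itself, not its diacritics; the "Wien" variant yields a perfect 1.0.
print(best_name_similarity("Vienna", ["Wien", "Vienne"], "Wien"))  # 1.0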
# -*- coding: utf-8 -*-
# @ auth ivan
# @ time 2021-02-09
# @ goal 105. Test Levenshtein distance
import Levenshtein

s1, s2 = "ABCD", "ACE"
print(Levenshtein.distance(s1, s2),
      Levenshtein.distance(s2, s1),
      Levenshtein.editops(s1, s2),
      Levenshtein.ratio(s1, s2),
      Levenshtein.jaro(s1, s2),
      Levenshtein.jaro_winkler(s1, s2))
# s1, s2 = "广东省广州市番禺区luoxi海bin花园A座1房", "广州洛溪海滨花园A座1房"
def jarowinkler_sim(field_1, field_2):
    similarity = Levenshtein.jaro_winkler(field_1, field_2)
    return similarity
def Levenshtein_jaro_winkler(text1, text2):
    text1 = text1.replace(" ", "")
    text2 = text2.replace(" ", "")
    return Levenshtein.jaro_winkler(text1, text2)
def lvmatch(s1, s2):
    "how well does s2 match s1?"
    return int(Levenshtein.jaro_winkler(s1, s2) * 100)
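
# A quick sanity check of lvmatch's 0-100 scaling; the martha/marhta pair is
# the classic Jaro-Winkler example. The middle score may vary slightly by
# python-Levenshtein version, but identical strings always score 100.
print(lvmatch("martha", "martha"))  # 100
print(lvmatch("martha", "marhta"))  # ~96: transposed letters barely hurt Jaro-Winkler
print(lvmatch("martha", "zebra"))   # well below the previous two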
def test_jaro_winkler(s, arr):
    for x in arr:
        print(Levenshtein.jaro_winkler(s, x))
def proba_duplicate(cls, person1, person2):
    """
    SEE: check with levenshtein / soundex.....
    Probabilities are calibrated against the worst case (the most popular
    French names, Martin and Marie, since 1881).
    Order matters for some tests (C03): person1 has to be the old value,
    person2 the new value.
    Checks:
    C01: same lastname, firstnames, birthday => we are sure!
    C02: missing other firstnames, same lastname, firstname, birthday
    C03: married :-) lastname equals birthname, same firstnames, birthdays
    C04: fuzzy: errors in lastname, firstname, but same birthday
    C05: same names, birthdays close but not equal (both not null)
    C06: same names and firstnames, one birthday missing
    C07: same names, one birthday missing
    """
    # Check C01
    if (person1.lastname == person2.lastname
            and person1.firstname == person2.firstname
            and person1.firstnames == person2.firstnames
            and person1.birthday == person2.birthday
            and person1.birthday is not None):
        return 1.0
    # Check C02
    if (person1.lastname == person2.lastname
            and person1.firstname == person2.firstname
            and person1.birthday is not None
            and person1.birthday == person2.birthday):
        return 1.0
    # Check C03
    if (person2.lastname == person1.birthname
            and person1.firstname == person2.firstname
            and person1.firstnames == person2.firstnames
            and person1.birthday == person2.birthday
            and person1.birthday is not None):
        return 1.0
    # Check C04 -- str() replaces the original Python 2 unicode() calls
    if (Levenshtein.jaro_winkler(str(person1.lastname), str(person2.lastname)) > 0.85
            and Levenshtein.jaro_winkler(str(person1.firstname), str(person2.firstname)) > 0.85
            and person1.birthday == person2.birthday
            and person1.birthday is not None):
        return 0.96
    # Check C05 -- birthdays within a small edit distance of each other
    if (person1.lastname == person2.lastname
            and person1.firstname == person2.firstname
            and person1.firstnames == person2.firstnames
            and person1.birthday is not None
            and person2.birthday is not None
            and Levenshtein.distance(
                datetime.datetime.strftime(person1.birthday, '%d/%m/%Y'),
                datetime.datetime.strftime(person2.birthday, '%d/%m/%Y')) < 3):
        return 0.96
    # Check C06
    if (person1.lastname == person2.lastname
            and person1.firstname == person2.firstname
            and len(person1.firstnames) > 2
            and person1.firstnames == person2.firstnames
            and (person1.birthday is None or person2.birthday is None)):
        return 0.96
    # Check C07
    if (person1.lastname == person2.lastname
            and person1.firstname == person2.firstname
            and (person1.birthday is None or person2.birthday is None)):
        return 0.5
    return 0
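
# A sketch of how proba_duplicate is meant to be called. The Person namedtuple
# here is a hypothetical stand-in for whatever model class the original uses,
# carrying only the fields the checks above read; the function is called as a
# plain function (cls=None) since the class context is not shown.
import datetime
from collections import namedtuple

Person = namedtuple("Person", "lastname firstname firstnames birthname birthday")

old = Person("Martin", "Marie", "Marie Claire", "Martin", datetime.datetime(1981, 5, 12))
new = Person("Martin", "Marie", "Marie Claire", "Martin", datetime.datetime(1981, 5, 12))
# Identical lastname, firstname, firstnames and birthday -> check C01 fires.
print(proba_duplicate(None, old, new))  # 1.0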
def org_alternate_names(self, slot_type):
    # Load the China province/city list.
    china_province_city = cPickle.load(open('data/dict/china_province_city.pkl', 'rb'))
    city_list = []
    for p in china_province_city:
        # type 0 marks a municipality directly administered by the central government
        # (the original compared against (u'台湾' or u'臺灣'), which only ever
        # tested the first spelling; a membership test expresses the intent).
        if p['type'] == 0 and p['name'] not in (u'台湾', u'臺灣'):
            continue
        for c in p['sub']:
            city_list.append(c['name'])
            if p['name'] in (u'台湾', u'臺灣'):
                continue
            for d in c['sub']:
                city_list.append(d['name'])

    # Load the China province list.
    province_dict = []
    f = io.open('data/dict/china_province_dict', 'r', -1, 'utf-8')
    for line in f:
        province_dict.append(line.strip())

    # Load the country list.
    country_list = []
    f = io.open('data/dict/country_list', 'r', -1, 'utf-8')
    for line in f:
        country_list.append(line.strip())

    line_outputs = []

    # Find the segmentation of the query name.
    query_name_seg = []
    for e in self.evidences[slot_type]:
        if self.query.name not in ''.join(e.parse_result['text']):
            continue
        org_list = self.find_org(e.parse_result['words'])
        for org in org_list:
            if self.query.name in ''.join([word[0] for word in org]):
                query_name_seg = org

    for e in self.evidences[slot_type]:
        org_list = self.find_org(e.parse_result['words'])
        alternate_name = []
        for org in org_list:
            org_name = ''.join([w[0] for w in org])
            if org_name == self.query.name:
                continue

            # ==================== organization name pattern match ==================== #
            # Edit distance.
            simi_score = Levenshtein.distance(self.query.name, org_name)
            if simi_score < 2:
                alternate_name.append(org)
                continue

            # An alternate name must consist of characters from the query name.
            if set(org_name) - set(self.query.name):
                continue

            # The org name should not be the name of a single city, country or
            # state/province (renamed from foo(); logic unchanged).
            def is_place_name():
                for element in itertools.chain(city_list, province_dict, country_list):
                    if org_name in element:
                        return True
                return False
            if is_place_name():
                continue

            # Abbreviation match.
            query_name_abbre = ''.join(w[0][0] for w in query_name_seg)
            if query_name_abbre in org_name or org_name in query_name_abbre:
                alternate_name.append(org)
                continue

            # Jaro-Winkler score: the closer a character is to the beginning,
            # the higher its weight.
            simi_score = Levenshtein.jaro_winkler(self.query.name, org_name)
            if simi_score > 0.8:
                alternate_name.append(org)
                continue

        for org in alternate_name:
            slot_filler = ''.join([w[0] for w in org])
            l = self.create_line_output(e, slot_filler, 0, slot_type, combined_slot_filler=True)
            line_outputs.append(l)
    return line_outputs
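
# Why the original (u'台湾' or u'臺灣') comparison was a bug: Python's `or`
# returns its first truthy operand, so the parenthesized expression collapses
# to a single string before any comparison happens.
print('台湾' or '臺灣')             # -> '台湾'
print('臺灣' == ('台湾' or '臺灣'))  # -> False, although "either spelling" was intended
print('臺灣' in ('台湾', '臺灣'))    # -> True; the membership test expresses the intent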
# Jaro Distance
# Jaro-Winkler Distance
# Match Rating Approach Comparison
# Hamming Distance
# Phonetic encoding:
#   American Soundex
#   Metaphone
#   NYSIIS (New York State Identification and Intelligence System)
#   Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))        # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))               # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')) # 1; edit distance that also counts transpositions
print(jellyfish.metaphone('Jellyfish'))                                 # 'JLFX'
print(jellyfish.soundex('Jellyfish'))                                   # 'J412'
print(jellyfish.nysiis('Jellyfish'))                                    # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))                        # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance: str1 and str2 must have equal length; counts positions where the characters differ
print(Levenshtein.distance('hello', 'helol')) # 2; Levenshtein (edit) distance: minimum number of insertions, deletions and substitutions to turn one string into the other
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf')) # 5
print(Levenshtein.ratio('hello', 'helol'))    # 0.8; Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a weighted edit distance
# Note: ldist is not the plain edit distance above. Insertions and deletions still cost 1, but a substitution costs 2.
# Rationale: for ratio('a', 'c'), sum = 2; with the plain distance the result would be (2-1)/2 = 0.5 even though
# 'a' and 'c' share nothing. Charging 2 for the substitution yields 0, which matches intuition.
print(Levenshtein.jaro('hello', 'helol'))         # 0.9333333333333332; Jaro distance, originally developed for census record linkage
print(Levenshtein.jaro_winkler('hello', 'helol')) # 0.9533333333333333; Jaro-Winkler distance
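
# A quick check of the ratio formula described above, using the same pairs.
import Levenshtein

# 'hello' vs 'helol': the transposition costs one deletion plus one insertion,
# so ldist = 2 and sum = 10, giving (10 - 2) / 10 = 0.8.
assert Levenshtein.ratio('hello', 'helol') == 0.8
# 'a' vs 'c': the substitution costs 2, so (2 - 2) / 2 = 0.0, not 0.5.
assert Levenshtein.ratio('a', 'c') == 0.0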
def get_videos():
    """
    @api {GET} /api/v1/videos/videos Get videos (done)
    @apiName Get videos (done)
    @apiGroup Videos
    @apiVersion 1.0.0
    @apiDescription Fetches all videos, a given number of videos, videos in a given
                    sort order, keyword-matched videos, or a page of videos.
    @apiHeader {String="application/json"} Content-Type Request encoding type
    @apiParam {String="likes -- by likes, descending","views -- by views, descending","releaseTime -- by release time, descending","total -- combined"} [order="likes"] Sort order for the videos.
    @apiParam {String} [searchValue] Search keyword
    @apiParam {Boolean} [isPagination=false] Whether to paginate
    @apiParam {Number{>0}} [pageNumber=1] Page number
    @apiParam {Number{>0}} [pageSize=10] Page size
    @apiParamExample {json} Example parameters
    {
        "order": "likes",
        "searchValue": "hello",
        "isPagination": true,
        "pageNumber": 1,
        "pageSize": 5
    }
    @apiUse Success200
    @apiSuccess {object[]} data.videos The videos found
    @apiSuccess {Number} data.videos.id Video id
    @apiSuccess {String} data.videos.name Video name
    @apiSuccess {Number} data.videos.authorId Author id
    @apiSuccess {String} data.videos.authorName Author name
    @apiSuccess {String} data.videos.introduction Video description
    @apiSuccess {Number} data.videos.likes Number of likes
    @apiSuccess {Number} data.videos.views Number of views
    @apiSuccess {String} data.videos.releaseTime Release time
    @apiSuccess {String} data.videos.imageUrl Cover image url
    @apiSuccess {String} data.videos.videoUrl Video url
    @apiSuccessExample {json} Example response
    {
        "result": true,
        "code": 200,
        "message": "",
        "header": {},
        "data": {
            "videos": [
                {
                    "id": 12,
                    "name": "testVideo",
                    "authorId": 1,
                    "authorName": "testMan",
                    "introduction": "test video",
                    "likes": 50,
                    "views": 5023,
                    "releaseTime": "2021-4-21",
                    "imageUrl": "http://xxx",
                    "videoUrl": "http://xxx"
                },
                {...},
            ]
        }
    }
    @apiUse Errors
    """
    data = request.args
    order = data.get("order")
    search_value = data.get("searchValue")
    is_pagination = data.get("isPagination")
    page_number = data.get("pageNumber")
    page_size = data.get("pageSize")
    video_filter = None

    # Handle search.
    if search_value is not None:
        # Names of the videos that matched the query.
        match_video_name_list = list()
        # Fetch all video names from the database.
        videos_name = Videos.query.with_entities(Videos.name).all()
        for video_name in videos_name:
            # Similarity between the search term and the video name.
            similarity = Levenshtein.jaro_winkler(search_value, video_name[0])
            # print(video_name[0] + ':' + str(similarity))
            # Keep the name if the similarity reaches the configured threshold.
            if similarity >= Config.SEARCH_SIMILARITY:
                match_video_name_list.append(video_name[0])
        video_filter = Videos.query.filter(Videos.name.in_(match_video_name_list))
    else:
        video_filter = Videos.query

    # Handle ordering.
    if order is None:
        order = "likes"
    if order == "likes":
        video_filter = video_filter.order_by(Videos.likes.desc())
    if order == "views":
        video_filter = video_filter.order_by(Videos.views.desc())
    if order == "releaseTime":
        video_filter = video_filter.order_by(Videos.releaseTime.desc())

    if is_pagination is None:
        is_pagination = False
    if is_pagination == 'true':
        is_pagination = True
    elif is_pagination == 'false':
        is_pagination = False

    if is_pagination:
        # Apply paging defaults.
        if page_size is None:
            page_size = 10
        if page_number is None:
            page_number = 1
        try:
            page_size = int(page_size)
            page_number = int(page_number)
        except ValueError:
            return jsonify(result=False, code=400, message="Wrong parameter type!", header={}, data={}), 400
        video_filter = video_filter.paginate(page_number, page_size, False).items
        # print(video_filter)
    else:
        video_filter = video_filter.all()
        # print(video_filter)

    return_data = {
        "videos": []
    }
    for v in video_filter:
        author_name = Users.query.filter_by(id=v.author_id).with_entities(Users.username).first()[0]
        video_dict = {
            "id": v.id,
            "name": v.name,
            "authorId": v.author_id,
            "authorName": author_name,
            "introduction": v.introduction,
            "likes": v.likes,
            "views": v.views,
            "releaseTime": str(v.releaseTime.strftime("%Y-%m-%d %H:%M")),
            "imageUrl": v.image_url,
            "videoUrl": v.video_url
        }
        return_data.get("videos").append(video_dict)
    # print(return_data)
    return jsonify(result=True, code=200, message="", header={}, data=return_data), 200
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2)
    return ratio >= 0.8 or name1 in name2 or name2 in name1
def are_similar(name1, name2):
    name1, name2 = (asciipunct(s.strip().lower()) for s in (name1, name2))
    ratio = Levenshtein.jaro_winkler(name1, name2, 0.0)  # no common prefix length
    return ratio >= 0.8
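
# Passing 0.0 as the prefix weight makes jaro_winkler degenerate to plain Jaro:
# the Winkler adjustment is jw = jaro + l * p * (1 - jaro), so p = 0 removes the
# common-prefix bonus entirely. A quick check of that equivalence (relying only
# on the python-Levenshtein API already used above):
import Levenshtein

s1, s2 = "martha", "marhta"
jw_no_prefix = Levenshtein.jaro_winkler(s1, s2, 0.0)
assert jw_no_prefix == Levenshtein.jaro(s1, s2)          # p = 0 disables the prefix bonus
assert Levenshtein.jaro_winkler(s1, s2) >= jw_no_prefix  # the default p = 0.1 can only raise the score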
def jaro_winkler_distance(str1, str2):
    # Despite the conventional "distance" name, jaro_winkler returns a
    # similarity in [0, 1]: higher means more alike.
    sim = Levenshtein.jaro_winkler(str1, str2)
    return sim