def GetScore(src_name, input_name, min_score):
    src_name = src_name.translate(str.maketrans('', '', string.punctuation))
    input_name = input_name.translate(str.maketrans('', '', string.punctuation))
    jarowinkler = JaroWinkler()
    result = []
    total_score_scr_part = 0
    for input_name_part in input_name.split():
        column = []
        for src_name_part in src_name.split():
            winkler_part = jarowinkler.similarity(input_name_part, src_name_part)
            difference = ParsedDifference(input_name_part, src_name_part)
            avg = (winkler_part + difference) / 2
            column.append(avg)
        result.append(max(column))
    full_inputted_jaro = jarowinkler.similarity(input_name, src_name)
    score = Average(result)
    if full_inputted_jaro > score:
        score = full_inputted_jaro
    return score * 100

def check_repeat_similar(table_info: TableInfo, target_list: List, target_tag: str) -> TableInfo:
    # Already marked as a duplicate, so return immediately.
    if table_info.result == 1:
        return table_info
    jk = JaroWinkler()
    similar_list: List[RepeatInfo] = list()
    for t_name in target_list:
        similar_point = jk.similarity(t_name, table_info.t_name)
        restore_result(t_name, similar_point, similar_list)
    # Post-process the comparison results.
    similar_msg_list = list()
    similar_result = 0
    for repeat_info in similar_list:
        if repeat_info.similar_point > 0.97:
            similar_result = 1
            # Message: "exact duplicate of <t_name> (<target_tag>)"
            similar_msg_list.append('与' + repeat_info.t_name + '(' + target_tag + ')完全重复')
            # An exact duplicate was found, so there is no need to keep looking for suspected duplicates.
            break
        elif repeat_info.similar_point > 0.7:
            similar_result = similar_result if similar_result == 1 else 2
            # Message: "suspected duplicate of <t_name> (<target_tag>)"
            similar_msg_list.append('与' + repeat_info.t_name + '(' + target_tag + ')疑似重复')
    table_info.result = similar_result
    table_info.msg = similar_msg_list
    return table_info

def fuzzy_line_equality_detection(self, lines):
    new_lines = []
    jarowinkler = JaroWinkler()
    split_lines = lines.split("\n")
    # Compare all lines against each other
    for k in range(len(split_lines)):
        max_sim = 0
        for l in range(len(split_lines)):
            if k == l:
                continue
            jaro_sim = jarowinkler.similarity(split_lines[k].lower(), split_lines[l].lower())
            # Get maximum similarity
            if jaro_sim > max_sim:
                max_sim = jaro_sim
        # If maximum similarity >= similarity threshold and the line is not blank:
        # mark all tokens as technical (T), otherwise as natural (N)
        if max_sim >= self.similarity_threshold and split_lines[k].replace(" ", ""):
            new_lines.append(" ".join([w + "_T" for w in split_lines[k].split(" ")]))
        else:
            new_lines.append(" ".join([w + "_N" for w in split_lines[k].split(" ")]))
    return "\n".join(new_lines)

def best_match(self, search_track, tracks):
    jw = JaroWinkler()
    title_similarities = []
    artists_similarities = []
    totals = []
    for track in tracks:
        title_similarity = jw.similarity(search_track.title.lower(), track.title.lower())
        title_similarities.append(title_similarity)
        artists_similarity = jw.similarity(search_track.artists.lower(), track.artists.lower())
        artists_similarities.append(artists_similarity)
        totals.append(artists_similarity + title_similarity)
    max_index = totals.index(max(totals))
    max_total = totals[max_index]
    if max_total > 1.5:
        return tracks[max_index]
    else:
        return None

def correct(word, dictionary):
    if is_correct(word, dictionary):
        return (word, 1.0)
    else:
        jarowinkler = JaroWinkler()
        score = []
        for dict_word in dictionary:
            score.append(jarowinkler.similarity(word, dict_word))
        return (dictionary[score.index(max(score))], max(score))

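# --- Hypothetical usage sketch for correct() above; not part of the original source. ---
# It assumes JaroWinkler comes from the strsimpy package and uses a stand-in
# is_correct() helper (plain membership test); the project's real helper may differ.
from strsimpy.jaro_winkler import JaroWinkler

def is_correct(word, dictionary):
    # stand-in helper: a word is "correct" if it already appears in the dictionary
    return word in dictionary

print(correct("recieve", ["receive", "recipe", "relieve"]))
# expected: ('receive', <Jaro-Winkler score close to 1.0>)
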
def get_similarity_score(name_1, name_2):
    if name_1 == '' and name_2 == '':
        return 1
    if (name_1 == '' and name_2 != '') or (name_1 != '' and name_2 == ''):
        return 0
    c_name_1 = normalize_text(curate_author_name(name_1)).lower()
    c_name_2 = normalize_text(curate_author_name(name_2)).lower()
    jarowinkler = JaroWinkler()
    similarity_score = jarowinkler.similarity(c_name_1, c_name_2)
    return similarity_score

def jw(df):
    jarowinkler = JaroWinkler()
    df["jarowinkler_sim"] = [
        jarowinkler.similarity(i, j)
        for i, j in zip(df["Tags2"], df["UserInput"])
    ]
    df.sort_values(by=['jarowinkler_sim'], inplace=True, ascending=False)
    final = df.drop(['Category', 'ReviewText2', 'Tags2'], axis=1).iloc[:5, :]
    return final

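# --- Hypothetical usage sketch for jw() above; not part of the original source. ---
# The column names and sample rows are illustrative only; JaroWinkler is assumed
# to come from the strsimpy package, and pandas must be installed.
import pandas as pd
from strsimpy.jaro_winkler import JaroWinkler

sample = pd.DataFrame({
    "Category": ["phones", "phones", "laptops"],
    "ReviewText2": ["review text 1", "review text 2", "review text 3"],
    "Tags2": ["battery life", "screen quality", "battery charger"],
    "UserInput": ["battery", "battery", "battery"],
})
# Returns up to five rows whose Tags2 value is most similar to UserInput.
print(jw(sample))
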
def find_most_apt(name, results):
    jarowinkler = JaroWinkler()
    deg = []
    for el in results:
        if name.upper() == el.upper():
            return el
        else:
            deg.append(jarowinkler.similarity(name.upper(), el.upper()))
    indd = int(deg.index(max(deg)))
    mostapt = results[indd]
    return mostapt

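# --- Hypothetical usage sketch for find_most_apt() above; not part of the original source. ---
# Assumes JaroWinkler is importable from the strsimpy package.
from strsimpy.jaro_winkler import JaroWinkler

candidates = ["Berlin", "Bern", "Barcelona"]
# "Berln" has no exact match, so the candidate closest by Jaro-Winkler is returned.
print(find_most_apt("Berln", candidates))  # expected: "Berlin"
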
class DuplicatesListPipeline(object):
    def __init__(self):
        self.jaro_winkler = JaroWinkler()
        self.list_check = list()

    def process_item(self, item, spider):
        name_tour = item['name_tour']
        url_tour = item['url_tour']
        number_date = item['number_date']
        if number_date is None:
            number_date = ''
        start_date = item['start_date']
        if start_date is None:
            start_date = ''
        check = name_tour + url_tour + str(number_date) + start_date
        list_new = self.list_check
        score = 0
        for value in list_new:
            name_tour_new = value['name_tour']
            if name_tour_new is None:
                name_tour_new = ''
            url_tour_new = value['url_tour']
            number_date_new = value['number_date']
            if number_date_new is None:
                number_date_new = ''
            start_date_new = value['start_date']
            if start_date_new is None:
                start_date_new = ''
            check_new = name_tour_new + url_tour_new + str(number_date_new) + start_date_new
            score = self.jaro_winkler.similarity(check, check_new)
            if score >= 0.85:
                print("SCORE: " + str(score))
                raise DropItem("Duplicate item found")
            else:
                self.list_check.append(item)
                return item
        if len(list_new) == 0:
            self.list_check.append(item)
            return item

def are_names_similar(name_1, name_2, use_approximation_algorithm=False, similarity_threshold=0.95):
    if name_1 == '' and name_2 == '':
        return True
    if (name_1 == '' and name_2 != '') or (name_1 != '' and name_2 == ''):
        return False
    c_name_1 = normalize_text(curate_author_name(name_1)).lower()
    c_name_2 = normalize_text(curate_author_name(name_2)).lower()
    if use_approximation_algorithm:
        jarowinkler = JaroWinkler()
        similarity_score = jarowinkler.similarity(c_name_1, c_name_2)
        return similarity_score > similarity_threshold
    else:
        return c_name_1 == c_name_2

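# --- Hypothetical usage sketch for are_names_similar() above; not part of the original source. ---
# normalize_text and curate_author_name are stand-ins here (simple whitespace/case
# cleanup); the project's real helpers may do more. JaroWinkler is assumed to come
# from the strsimpy package.
from strsimpy.jaro_winkler import JaroWinkler

def curate_author_name(name):
    # stand-in helper: trim surrounding whitespace
    return name.strip()

def normalize_text(text):
    # stand-in helper: collapse internal whitespace
    return " ".join(text.split())

print(are_names_similar("Jane  Doe", "jane doe"))                                  # equal after normalization -> True
print(are_names_similar("Jane Doe", "Jane Do", use_approximation_algorithm=True))  # fuzzy match above 0.95 -> True
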
def __affiliations_to_save(affiliations, new_affiliations):
    jarowinkler = JaroWinkler()
    similarity_threshold = 0.95
    affiliations_to_save = []
    for new_affiliation in new_affiliations:
        exist_affiliation = False
        for affiliation in affiliations:
            # normalize text before comparison
            affiliation_nor = normalize_text(affiliation)
            new_affiliation_nor = normalize_text(new_affiliation)
            similarity_score = jarowinkler.similarity(
                affiliation_nor.lower(), new_affiliation_nor.lower())
            if similarity_score >= similarity_threshold:
                exist_affiliation = True
        if not exist_affiliation:
            affiliations_to_save.append(new_affiliation)
    return affiliations_to_save

def similarity(self, question, answer):
    stopword = self.read_from(folder_path + '上证专用停用词.txt')
    stopwords = []
    for sw in stopword:
        sw = sw.strip('\n')
        sw = sw.strip(' ')
        stopwords.append(sw)
    # print(stopwords)
    meaningful_words1 = []
    meaningful_words2 = []
    words2 = jieba.cut(str(question))
    words3 = jieba.cut(str(answer))
    for word in words2:
        if word not in stopwords:
            meaningful_words1.append(word)
    for word in words3:
        if word not in stopwords:
            meaningful_words2.append(word)
    s2 = ''.join(meaningful_words1)
    # print(s2)
    s3 = ''.join(meaningful_words2)
    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())
    line_sim = []
    cos_s = a1.similarity(s2, s3)
    line_sim.append(cos_s)
    cos_d = a1.distance(s2, s3)
    line_sim.append(cos_d)
    dam = b1.distance(s2, s3)
    line_sim.append(dam)
    jac_d = c1.distance(s2, s3)
    line_sim.append(jac_d)
    jac_s = c1.similarity(s2, s3)
    line_sim.append(jac_s)
    jar_d = d1.distance(s2, s3)
    line_sim.append(jar_d)
    jar_s = d1.similarity(s2, s3)
    line_sim.append(jar_s)
    lev = e1.distance(s2, s3)
    line_sim.append(lev)
    lon = f1.distance(s2, s3)
    line_sim.append(lon)
    met = g1.distance(s2, s3)
    line_sim.append(met)
    ngr = h1.distance(s2, s3)
    line_sim.append(ngr)
    nor_d = i1.distance(s2, s3)
    line_sim.append(nor_d)
    nor_s = i1.similarity(s2, s3)
    line_sim.append(nor_s)
    opt = j1.distance(s2, s3)
    line_sim.append(opt)
    qgr = k1.distance(s2, s3)
    line_sim.append(qgr)
    sor_d = l1.distance(s2, s3)
    line_sim.append(sor_d)
    sor_s = l1.similarity(s2, s3)
    line_sim.append(sor_s)
    wei = m1.distance(s2, s3)
    line_sim.append(wei)
    return line_sim

print("Matching name", name1, "from", index_path) numMatch = 0 numMatchApprox = 0 aux = True for name2, ids2 in map2.items(): if name1.lower() == name2.lower(): for id1 in ids1: for id2 in ids2: matches.add((index_path, id1, id2[0], id2[1])) numMatch += 1 aux = False sim_threshold = 0.995 while aux and sim_threshold >= 0.9: aux = True for name2, ids2 in map2.items(): if jarowinkler.similarity(name1.lower()[::-1], name2.lower()[::-1]) > sim_threshold: for id1 in ids1: for id2 in ids2: matches.add((index_path, id1, id2[0], id2[1])) numMatchApprox += 1 aux = False sim_threshold -= 0.005 num_index_places += 1 if numMatch > 0 or numMatchApprox > 0: num_index_places_matched += 1 if numMatch == 0 and numMatchApprox > 0: num_index_places_matched_approx += 1 if numMatch == 1 or (numMatch == 0 and numMatchApprox == 1): num_index_places_matched_single += 1 results = [] for match in matches:
class TextProcessing(object):
    def __init__(self):
        self.stemmer = SnowballStemmer("english")
        self.jaroWinkler = JaroWinkler()
        self.diceScore = SorensenDice()  # used by textSimilarity()

    def removeStopWords(self, text):
        stopList = {'the', 'of', 'by', 'in', 'on', 'at', 'for', 'an'}
        textsplited = text.replace("_", " ").split(" ")
        return " ".join([w for w in textsplited if w not in stopList])

    def removeSpecialCharacters2(self, text):
        # _text = text.replace("\'", "")
        pos = text.split("__")
        if len(pos) > 1:
            _text = pos[1]
            pos = pos[0] + "__"
        else:
            pos = ''
            _text = text
        _text = _text.replace("_", " ")
        _text = _text.replace('&', ' and ').strip()
        _text = _text.replace('%', ' percentage ').strip()
        _text = _text.replace('#', ' no. ').strip()
        _text = _text.replace('$', ' currency ').strip()
        characters = [
            '/', '\\', '>', '<', "'s", "(s)", '\"', "[", "]", "(", ")", "{",
            "}", "."
        ]
        for c in characters:
            if c in _text:
                _text = _text.replace(c, ' ').strip()
        _text = re.sub('\\s+', ' ', _text).strip()
        return pos + _text.replace(" ", "_")
        # result = re.sub(r'[?|$|.|!\-\[\]/\(\)#,:]', r'', result)

    def stemWord(self, text):
        pos = text.split("__")
        if len(pos) > 1:
            _text = pos[1]
            pos = pos[0] + "__"
        else:
            pos = ''
            _text = text
        _text = " ".join(_text.split("_"))
        result = [self.stemmer.stem(t) for t in _text.split(" ")]
        result = " ".join(result)
        return pos + result.strip().replace(" ", "_")

    def cleanForSimilarity(self, text):
        if "protag_article" in text:
            return ""
        if "__" in text:
            cleant = text.split("__")[1]
        else:
            cleant = text
        if " :" in cleant:
            cleant = cleant.split(" :")[1]
        if "@en" in cleant:
            cleant = cleant.split("@")[0]
        cleant = [self.stemmer.stem(t) for t in cleant.split(" ")]
        cleant = "".join(cleant)
        cleant = cleant.replace("*", "").replace("_", "").replace("spancol", "").split("@")[0]
        return cleant

    def textSimilarity(self, text1, text2):
        # print("text inicial: ", text1, text2)
        t1 = self.cleanForSimilarity(text1)
        t2 = self.cleanForSimilarity(text2)
        if len(t1) == 0 and len(t2) == 0:
            return 0
        score1 = self.jaroWinkler.similarity(t1, t2)
        score2 = self.diceScore.similarity(t1, t2)
        mins = min([score1, score2])
        # print(t1, t2, mins)
        return mins

    def cleanCellHeader(self, cellText):
        _cellText = re.sub('\\s+', ' ', cellText).strip()
        _cellText = self.removeSpecialCharacters2(_cellText)
        _cellText = self.stemWord(_cellText)
        return _cellText

    def orderHeaders(self, headers):
        _headers = headers[:]
        hd = {hi: [] for hi in _headers}
        for i, hi in enumerate(_headers):
            hd[hi].append(i)
        headersD = {}
        for k, v in hd.items():
            if len(v) > 1:
                _v = v
                _v.sort()
                i = 1
                for posh in _v:
                    headersD[posh] = str(i) + "__" + k
                    i += 1
            else:
                headersD[v[0]] = k
        _headers = []
        for i in range(len(headers)):
            _headers.append(headersD.get(i))
        return _headers

    def cleanTableHeader(self, headers):
        dataTypes = [h.split("@")[len(h.split("@")) - 1] for h in headers]
        _headers = [h.split("@")[0] for h in headers]
        _headers = [self.removeSpecialCharacters2(h) for h in _headers]
        _headers = [self.stemWord(h) for h in _headers]
        _headers = ['spancol' if hi == "" else hi for hi in _headers]
        hd = {hi: [] for hi in _headers}
        for i, hi in enumerate(_headers):
            hd[hi].append(i)
        headersD = {}
        for k, v in hd.items():
            if len(v) > 1:
                _v = v
                _v.sort()
                i = 1
                for posh in _v:
                    headersD[posh] = str(i) + "__" + k
                    i += 1
            else:
                headersD[v[0]] = k
        _headers = []
        for k, v in headersD.items():
            _headers.append(v + "@" + dataTypes[k])
        return _headers

temp_article.append(data[i][0])
print(len(data))
my_string = "human moblity prediction spatiotemporal next place future location point-of-interest hotspot forecasting modelling mobility behaviors traffic trajectory mobile phone"
p = []
filter_thresh_45 = []
for i in range(len(temp_article)):
    jarowinkler = JaroWinkler()
    sim = jarowinkler.similarity(my_string, temp_article[i])
    if sim > 0.45:
        filter_thresh_45.append(data[i])
normalized_levenshtein = NormalizedLevenshtein()
filter_normalized_levenshtein = []
for i in range(len(filter_thresh_45)):
    sim = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])
    if sim >= 0.7:

def search():
    global result
    result = []
    data = json.loads(request.get_data())
    jarowinkler = JaroWinkler()
    page_list = []
    suchwort = []
    first_set = []
    second_set = []
    nlp = spacy.load('de_core_news_sm')
    # nlp = spacy.load('en_core_web_sm', disable=["parser", 'ner'])
    word = ' '.join([i.capitalize() for i in data['nlp']['source'].split(' ')])
    doc = nlp(word)
    for token in doc:
        # if token.tag_ in ['NNP', 'NNPS', 'NN', 'NNS']:
        if token.tag_ in ['NE', 'NNE', 'NN']:
            suchwort.append(token.text)
    print(word)
    if suchwort:
        if len(suchwort) >= 2:
            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-1].lower()) > 0.95:
                            first_set.append(key)
            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-2].lower()) > 0.95:
                            second_set.append(key)
            found_pages = list(set(first_set).intersection(set(second_set)))
        else:
            for d in dict_list_bereinigt:
                for key, value in d.items():
                    for i in value:
                        if jarowinkler.similarity(i.lower(), suchwort[-1].lower()) > 0.95:
                            first_set.append(key)
            found_pages = first_set
        searchlist = list(set(found_pages))
        page_list = [int(i[0]) for i in [i.split('.') for i in searchlist]]
        # "I was also able to find {} page(s) in the script for {}"
        sentence = "Außerdem habe {} Seite(n) im Skript mit {} finden können".format(len(page_list), ' '.join(suchwort))
        pic_urls = [dictionary[sorted(searchlist)[i]] for i in range(0, len(searchlist), 3)]
        # "Here are a few examples"
        result.append({'type': 'text', 'content': sentence + ". Hier sind ein paar Beispiele " + " ".join(str(i) for i in sorted(page_list))})
        for i in pic_urls:
            myd = {'type': 'picture', 'content': ''}
            myd['content'] = i
            result.append(myd)
    if len(page_list) == 0:
        # "I could not find anything in the script for the word {}"
        result = [{'type': 'text', 'content': 'Ich konnte nichts im Skript zum Wort {} finden'.format(suchwort[0])}]
    replies = result
    # return replies
    return jsonify(
        status=200,
        replies=result,
        conversation={
            'memory': {
                'key': 'value'
            }
        }
    )

def similarityFind(srcText, srcStart, dstText, maxWords=30):
    jarowinkler = JaroWinkler()
    dstText = dstText.lower().strip()
    dstLen = len(dstText)
    lastword = dstText.split()[-1]
    maxSim = {'sim': 0, 'begin': -1, 'end': -1}
    try:
        idx = srcStart
        count = 0
        while count < maxWords:
            # Work out the start position.
            begin = idx
            while srcText[begin] == ' ':
                begin += 1
            end = begin + dstLen
            while srcText[end] != ' ':
                end += 1
            # If the last word does not appear in the search window, widen the window a little.
            tempIdx = srcText[begin:end].lower().rfind(lastword)
            if tempIdx < 0:
                tempIdx = srcText[end:end + 15].lower().find(lastword)
                if tempIdx > 0:
                    end += tempIdx + len(lastword)
                    while srcText[end] != ' ':
                        end += 1
                else:
                    # Otherwise try to end at a punctuation mark.
                    tempIdx2 = srcText[begin:end].lower().rfind(', ')
                    if tempIdx2 > tempIdx:
                        end = begin + tempIdx2 + 1
                    else:
                        tempIdx2 = srcText[begin:end].lower().rfind('. ')
                        if tempIdx2 > tempIdx:
                            end = begin + tempIdx2 + 1
                        else:
                            tempIdx2 = srcText[begin:end].lower().rfind('! ')
                            if tempIdx2 > tempIdx:
                                end = begin + tempIdx2 + 1
            # Strip punctuation.
            temp = srcText[begin:end].lower()
            temp = temp.replace('"', '')
            temp = temp.replace('!', '')
            temp = temp.replace('?', '')
            temp = temp.replace('.', '')
            temp = temp.replace(',', '')
            temp = temp.replace('“', '')
            temp = temp.replace('”', '')
            temp = temp.replace('’', '')
            print('try:%s' % (temp))
            # Check how similar the candidate span is.
            sim = jarowinkler.similarity(temp, dstText)
            print('sim:', sim)
            # Track the best span; once the similarity starts to drop, return the result.
            if sim > maxSim['sim']:
                maxSim['sim'] = sim
                maxSim['begin'] = begin
                maxSim['end'] = end
            else:
                srcWordList = srcText[maxSim['begin']:maxSim['end']].split()
                if len(srcWordList) > 0 and lastword != srcWordList[-1]:
                    print('aaaaaaaaaaaaaaaa', srcWordList)
                    print('bbbbbbbbbbbbbbbb', lastword)
                    for i in range(len(srcWordList) - 1, -1, -1):
                        if srcWordList[i].find(lastword) >= 0:
                            temp = ' '.join(srcWordList[0:i + 1]).lower()
                            temp = temp.replace('"', '')
                            temp = temp.replace('!', '')
                            temp = temp.replace('?', '')
                            temp = temp.replace('.', '')
                            temp = temp.replace(',', '')
                            temp = temp.replace('“', '')
                            temp = temp.replace('”', '')
                            temp = temp.replace('’', '')
                            print('ccccccccccccccccc1', temp)
                            print('ccccccccccccccccc2', dstText)
                            sim = jarowinkler.similarity(temp, dstText)
                            print('ccccccccccccccccc3', sim)
                            if sim > maxSim['sim']:
                                maxSim['sim'] = sim
                                end = srcText.rfind(lastword, begin, maxSim['end'])
                                while srcText[end] != ' ':
                                    end += 1
                                maxSim['end'] = end
                                print('eeeeeeeeeeeeeeeeeeee', srcText[maxSim['begin']:maxSim['end']])
                            break
                return maxSim
            # Continue the comparison from the next word.
            while srcText[begin] != ' ':
                begin += 1
            idx = begin
            count += 1
    except IndexError as e:
        print('error:', e)
    return maxSim