def retrieve_inner(context, result):
    """Append *context* to *result* when the best retrieved passage overlaps it.

    The top-1 passage from the global ``retriever`` is compared with the
    context; the context is kept only if their longest common substring
    covers at least half of the shorter of the two texts.
    """
    hits = retriever.retrieve(context, top_k=1)
    best_text = hits[0].text
    overlap = pylcs.lcs2(best_text, context)
    threshold = 0.5 * min(len(best_text), len(context))
    if overlap >= threshold:
        result.append(context)
def _score(self, title, ent_info):
    """Score *title* against an entity record.

    One point is awarded for each of the ``company``, ``functions`` and
    ``bases`` fields that is non-null and shares a common substring of
    length > 1 with *title*; the points are summed.
    """
    total = 0
    for field in ("company", "functions", "bases"):
        value = ent_info[field]
        if DataUtil.is_null(value):
            continue
        if pylcs.lcs2(title, value) > 1:
            total += 1
    return total
def triple_word_sim(triple: dict, word: str):
    """Compute the similarity between a triple and a word.

    Two scores are produced: one against the literal object ``triple["o"]``
    and one against the subject entity's name. Each score is the average of
    the Levenshtein ratio (r = (sum - ldist) / sum, where *sum* is the total
    length of both strings and substitution costs 2 in the edit distance)
    and the longest-common-substring length normalized by the combined
    string length.

    :param triple: dict where ``triple["s"]`` is an entity, ``triple["r"]``
        a relation and ``triple["o"]`` a literal string
    :param word: word to compare against
    :return: (similarity with the literal, similarity with the subject name)
    """
    subject_name = sparql_get_name(triple["s"])
    literal = triple["o"]

    def _sim(text):
        # Average of Levenshtein ratio and normalized LCS-substring length.
        lev = Levenshtein.ratio(text, word)
        lcs = (pylcs.lcs2(text, word) * 2) / (len(text) + len(word))
        return (lev + lcs) / 2

    return _sim(literal), _sim(subject_name)
def longest_common_subsequence(self, config, sentence1, sentence2):
    """Length of the longest common subsequence or substring of two sentences.

    ``config['mode']`` selects the variant; it defaults to ``'subsequence'``
    when absent, and any other value means ``'substring'``.

    L. C. Subsequence example:
        ("We ate a delicious pizza", "We ate a not so delicious pizza")
        -> "We ate a delicious pizza"
    L. C. Substring example:
        ("We ate a delicious pizza", "We ate a not so delicious pizza")
        -> " delicious pizza"
    """
    mode = config.get('mode', 'subsequence')
    if mode == 'subsequence':
        return pylcs.lcs(sentence1[1], sentence2[1])
    # any other mode -> substring
    return pylcs.lcs2(sentence1[1], sentence2[1])
def score(self, query, ent_info):
    """Return the longest-common-substring length between *query* and the
    entity's ``functions`` field.

    Bug fix: the original computed ``pylcs.lcs2`` and discarded the result,
    so the method always returned ``None``; the value is now returned
    (consistent with the sibling ``_score`` method).
    """
    return pylcs.lcs2(query, ent_info["functions"])
def lcs_distance(A, B):
    """Distance derived from the longest common substring of *A* and *B*.

    Returns 1 when the strings share no substring at all; otherwise the
    reciprocal of the shared-substring length, so a longer overlap yields
    a smaller distance.
    """
    overlap = pylcs.lcs2(A, B)
    return 1 if overlap == 0 else 1 / overlap
def relative_lcs(A, B):
    """Relative longest-common-substring distance in [0, 1].

    0 means one string is fully contained in the other; 1 means the
    strings share no common substring.

    Bug fix: when both strings are empty the original raised
    ``ZeroDivisionError`` (``max(len(A), len(B)) == 0``); two empty
    strings are now treated as identical (distance 0.0).
    """
    longest = max(len(A), len(B))
    if longest == 0:  # both inputs empty -> identical
        return 0.0
    return 1 - pylcs.lcs2(A, B) / longest
def preproc(question, article):
    """Convert QA annotations into SQuAD-style records.

    For each question, the dialogue of its article segment is flattened into
    a single lowercase context string ("speaker: text ..."). Answerable
    questions whose answer span can be located in the context become normal
    records; questions with no annotated answer become ``is_impossible``
    records. Answerable questions whose span cannot be located are counted
    and collected separately.

    Bug fix: the prefix fallback used ``[:lsc - 1]``, dropping the last
    character of the longest-common-substring match (the suffix fallback
    symmetrically uses ``[-lsc:]``); it now uses the full ``[:lsc]`` prefix.

    :param question: list of dicts with "article_segment_id", "question",
        "id" and "answers" keys
    :param article: mapping from segment id to a dict holding "seg_dialog"
    :return: (dataset, error_dataset)
    """
    cnt_error = 0
    dataset = []
    error_dataset = []
    for q in question:
        dialogue_history = " ".join(
            f'{c["speaker"]}: {c["text"]}'.replace("\n", " ").replace("\t", " ")
            for c in article[q["article_segment_id"]]['seg_dialog']
        )
        context = dialogue_history.lower()
        if len(q["answers"]) > 0:
            answer = q["answers"][0].lower()
            idx_start = context.find(answer)
            lsc = pylcs.lcs2(answer, context)
            # Fallbacks when the exact answer is not found verbatim:
            # search for the LCS-sized prefix, then the LCS-sized suffix.
            if idx_start == -1 and lsc != 0:
                idx_start = context.find(answer[:lsc])  # was [:lsc-1] (off-by-one)
            if idx_start == -1 and lsc != 0:
                idx_start = context.find(answer[-lsc:])
            if idx_start != -1 and answer != "":
                temp = {
                    "paragraphs": [
                        {
                            "context": context,
                            "qas": [
                                {
                                    "question": q["question"].lower(),
                                    "id": q["id"],
                                    "answers": [
                                        {
                                            "answer_start": idx_start,
                                            "text": answer,
                                        }
                                    ],
                                    "is_impossible": False,
                                }
                            ],
                        }
                    ]
                }
                dataset.append(temp)
            else:
                # Span could not be located: keep the record aside for inspection.
                temp = {
                    "paragraphs": [
                        {
                            "context": context,
                            "qas": [
                                {
                                    "question": q["question"].lower(),
                                    "id": q["id"],
                                    "answers": [{"text": answer}],
                                }
                            ],
                        }
                    ]
                }
                error_dataset.append(temp)
                cnt_error += 1
        else:
            # No annotated answer -> unanswerable (SQuAD v2 style) record.
            temp = {
                "paragraphs": [
                    {
                        "context": context,
                        "qas": [
                            {
                                "plausible_answers": [],
                                "question": q["question"].lower(),
                                "id": q["id"],
                                "answers": [],
                                "is_impossible": True,
                            }
                        ],
                    }
                ]
            }
            dataset.append(temp)
    print(f"NUMBER OF SKIPPED QA CANNOT FIND SPAN: {cnt_error}")
    print(f"NUMBER of QA FOUND: {len(dataset)}")
    return dataset, error_dataset
score = 0 total_combinations = 0 levenstein_score = 0 substring_score = 0 for i in range(len(phonetic_result)): for j in range(i + 1, len(phonetic_result)): total_combinations += 1 (word1, sound1) = phonetic_result[i] (word2, sound2) = phonetic_result[j] levenstein_distance = pylcs.levenshtein_distance( sound1, sound2) longest_substr_len = pylcs.lcs2(sound1, sound2) levenstein_score += levenstein_distance / max( len(sound1), len(sound2)) substring_score += longest_substr_len / min( len(sound1), len(sound2)) score += 0.5 * ((1 - (levenstein_score / total_combinations)) * 20) score += 0.5 * ((substring_score / total_combinations) * 20) scores[subject][variable][Scores.PHONETIC_SCORE] = int( round(score)) # Determine TOTAL_SCORE scores[subject][variable][Scores.TOTAL_SCORE] = int(
def test_lcs2():
    """Sanity checks for pylcs.lcs2 (longest-common-substring length)."""
    cases = [
        (("aaa", "bbb"), 0),
        (("aaa", "aabbbaa"), 2),
        (("你好", "中国"), 0),
        (("aaa你好", "好呀你"), 1),
    ]
    for (left, right), expected in cases:
        assert pylcs.lcs2(left, right) == expected