Exemplo n.º 1
0
def retrieve_inner(context, result):
    """Append ``context`` to ``result`` when its best retrieval hit overlaps it.

    The module-level ``retriever`` is asked for the single best hit
    (``top_k=1``) for ``context``. The value returned by ``pylcs.lcs2`` for
    the hit's ``text`` and ``context`` is compared with half the length of
    the shorter of the two strings; when that threshold is reached,
    ``context`` is appended to ``result``.

    :param context: query string; also the value that gets appended.
    :param result: collection mutated in place through ``append``.
    :return: None.
    """
    hits = retriever.retrieve(context, top_k=1)
    # With no hits there is nothing to compare against; indexing the empty
    # result would raise IndexError, so treat it as "no match".
    if len(hits) == 0:
        return
    text = hits[0].text
    overlap = pylcs.lcs2(text, context)
    if overlap >= min(len(text), len(context)) * 0.5:
        result.append(context)
Exemplo n.º 2
0
 def _score(self, title, ent_info):
     """Count how many entity fields share a substring with ``title``.

     The ``"company"``, ``"functions"`` and ``"bases"`` entries of
     ``ent_info`` are inspected in that order. A field contributes one point
     when ``DataUtil.is_null`` reports it as non-null and
     ``pylcs.lcs2(title, field)`` is greater than 1.

     :param title: text the entity fields are compared with.
     :param ent_info: mapping providing the three field values.
     :return: integer between 0 and 3.
     """
     matched_fields = 0
     for field_name in ("company", "functions", "bases"):
         field_value = ent_info[field_name]
         if DataUtil.is_null(field_value):
             continue
         if pylcs.lcs2(title, field_value) > 1:
             matched_fields += 1
     return matched_fields
def triple_word_sim(triple: dict, word: str):
    """
    Compute the similarity between a triple and a word.

    Two similarities are produced: one against the literal ``triple["o"]``
    and one against the subject name from ``sparql_get_name(triple["s"])``.
    Each is the mean of two components:

    * the Levenshtein ratio, r = (sum - ldist) / sum, where sum is the
      combined length of both strings and ldist is an edit-like distance in
      which insertion and deletion cost 1 but substitution costs 2;
    * a longest-common-substring ratio,
      2 * pylcs.lcs2(a, b) / (len(a) + len(b)).

    :param triple: dict; triple["s"] is an entity, triple["r"] a relation,
        triple["o"] a literal string.
    :param word: the word to compare with.
    :return: tuple ``(similarity to the literal, similarity to the subject
        name)``.
    """

    def substring_ratio(left, right):
        total_length = len(left) + len(right)
        # Two empty strings would divide by zero. They share no substring,
        # so this component is defined as 0 in that case.
        if total_length == 0:
            return 0.0
        return (pylcs.lcs2(left, right) * 2) / total_length

    s_name = sparql_get_name(triple["s"])
    literal = triple["o"]

    literal_similarity = Levenshtein.ratio(literal, word)
    literal_similarity += substring_ratio(literal, word)

    subject_similarity = Levenshtein.ratio(s_name, word)
    subject_similarity += substring_ratio(s_name, word)

    return literal_similarity / 2, subject_similarity / 2
Exemplo n.º 4
0
 def longest_common_subsequence(self, config, sentence1, sentence2):
     """
     Return the length of the longest common subsequence or substring of two sentences.

     ``config['mode']`` selects the measure and defaults to 'subsequence'
     when the key is absent; any other value selects the substring measure.
     The texts compared are ``sentence1[1]`` and ``sentence2[1]``.

     Subsequence example:
       ("We ate a delicious pizza", "We ate a not so delicious pizza") -> "We ate a delicious pizza"
     Substring example:
       ("We ate a delicious pizza", "We ate a not so delicious pizza") -> " delicious pizza"
     """
     mode = 'subsequence'
     if 'mode' in config:
         mode = config['mode']
     # Every mode other than 'subsequence' is handled as 'substring'.
     measure = pylcs.lcs if mode == 'subsequence' else pylcs.lcs2
     return measure(sentence1[1], sentence2[1])
Exemplo n.º 5
0
 def score(self, query, ent_info):
     """Score ``query`` against the entity's ``"functions"`` field.

     :param query: text to compare.
     :param ent_info: mapping providing the ``"functions"`` value.
     :return: the value of ``pylcs.lcs2(query, ent_info["functions"])``.
     """
     # The result has to be returned; otherwise callers always receive None.
     return pylcs.lcs2(query, ent_info["functions"])
Exemplo n.º 6
0
def lcs_distance(A, B):
    """Return the reciprocal of ``pylcs.lcs2(A, B)``, or 1 when that value is 0."""
    shared = pylcs.lcs2(A, B)
    # Guard the division: nothing in common maps to the distance 1.
    return 1 / shared if shared != 0 else 1
Exemplo n.º 7
0
def relative_lcs(A, B):
    """Return ``1 - pylcs.lcs2(A, B) / max(len(A), len(B))``.

    Two empty inputs are identical, so their distance is defined as 0.0
    instead of dividing by zero.

    :param A: first sequence (passed to ``pylcs.lcs2``).
    :param B: second sequence (passed to ``pylcs.lcs2``).
    :return: float distance.
    """
    longest = max(len(A), len(B))
    if longest == 0:
        return 0.0
    return 1 - pylcs.lcs2(A, B) / longest
Exemplo n.º 8
0
def preproc(question, article):
    """Turn question records into nested ``paragraphs`` / ``qas`` entries.

    For every question the dialogue of its article segment is flattened into
    one lower-cased context string (``"speaker: text"`` pieces joined by
    spaces, with newlines and tabs replaced by spaces). Then:

    * a question without answers becomes an ``is_impossible: True`` entry in
      the dataset;
    * a question whose first answer can be located in the context becomes an
      entry with ``answer_start`` in the dataset;
    * otherwise (span not found, or empty answer text) the entry goes to the
      error dataset without ``answer_start``.

    Only the first answer of each question is used.

    :param question: iterable of dicts with the keys ``"article_segment_id"``,
        ``"answers"``, ``"question"`` and ``"id"``.
    :param article: mapping from segment id to a dict whose ``"seg_dialog"``
        is a list of dicts with ``"speaker"`` and ``"text"``.
    :return: tuple ``(dataset, error_dataset)``.
    """
    dataset = []
    error_dataset = []
    for q in question:
        segment = article[q["article_segment_id"]]
        dialogue_history = " ".join(
            f'{c["speaker"]}: {c["text"]}'.replace("\n", " ").replace("\t", " ")
            for c in segment['seg_dialog']
        )
        # Lower-case once; every comparison and every stored field uses it.
        context = dialogue_history.lower()

        if len(q["answers"]) == 0:
            dataset.append(_wrap_qa(context, {
                "plausible_answers": [],
                "question": q["question"].lower(),
                "id": q["id"],
                "answers": [],
                "is_impossible": True,
            }))
            continue

        answer = q["answers"][0].lower()
        idx_start = _locate_answer(answer, context)

        if idx_start != -1 and answer != "":
            dataset.append(_wrap_qa(context, {
                "question": q["question"].lower(),
                "id": q["id"],
                "answers": [
                    {
                        "answer_start": idx_start,
                        "text": answer,
                    }
                ],
                "is_impossible": False,
            }))
        else:
            error_dataset.append(_wrap_qa(context, {
                "question": q["question"].lower(),
                "id": q["id"],
                "answers": [
                    {
                        "text": answer,
                    }
                ],
            }))

    cnt_error = len(error_dataset)
    print(f"NUMBER OF SKIPPED QA CANNOT FIND SPAN: {cnt_error}")
    print(f"NUMBER of QA FOUND: {len(dataset)}")
    return dataset, error_dataset


def _locate_answer(answer, context):
    """Return the offset of ``answer`` in ``context``, or -1 when not found.

    An exact match is tried first. Failing that, with ``n`` being the value
    of ``pylcs.lcs2(answer, context)``, the first ``n - 1`` characters of the
    answer are searched for, and then its last ``n`` characters.
    """
    start = context.find(answer)
    if start != -1:
        return start

    # The common-substring length only matters for the fallbacks, so it is
    # computed only after the exact match has failed.
    shared = pylcs.lcs2(answer, context)
    if shared == 0:
        return -1

    prefix = answer[:shared - 1]
    # str.find("") returns 0, so an empty prefix (shared == 1) must not be
    # searched for: it would report a bogus match at offset 0.
    if prefix:
        start = context.find(prefix)
    if start == -1:
        start = context.find(answer[-shared:])
    return start


def _wrap_qa(context, qa):
    """Wrap one question/answer record in the paragraphs/qas nesting."""
    return {"paragraphs": [{"context": context, "qas": [qa]}]}
Exemplo n.º 9
0
            score = 0

            total_combinations = 0
            levenstein_score = 0
            substring_score = 0

            for i in range(len(phonetic_result)):
                for j in range(i + 1, len(phonetic_result)):
                    total_combinations += 1

                    (word1, sound1) = phonetic_result[i]
                    (word2, sound2) = phonetic_result[j]

                    levenstein_distance = pylcs.levenshtein_distance(
                        sound1, sound2)
                    longest_substr_len = pylcs.lcs2(sound1, sound2)

                    levenstein_score += levenstein_distance / max(
                        len(sound1), len(sound2))

                    substring_score += longest_substr_len / min(
                        len(sound1), len(sound2))

            score += 0.5 * ((1 - (levenstein_score / total_combinations)) * 20)
            score += 0.5 * ((substring_score / total_combinations) * 20)

            scores[subject][variable][Scores.PHONETIC_SCORE] = int(
                round(score))

            # Determine TOTAL_SCORE
            scores[subject][variable][Scores.TOTAL_SCORE] = int(
Exemplo n.º 10
0
def test_lcs2():
    """Check ``pylcs.lcs2`` on ASCII and CJK inputs against known results."""
    cases = (
        ("aaa", "bbb", 0),
        ("aaa", "aabbbaa", 2),
        ("你好", "中国", 0),
        ("aaa你好", "好呀你", 1),
    )
    for left, right, expected in cases:
        assert pylcs.lcs2(left, right) == expected