def calc_diffscore(blocks1, blocks2):
    """Score how much the text of same-index blocks differs.

    For every index ``i`` of ``blocks1`` the ``.text`` attributes of
    ``blocks1[i]`` and ``blocks2[i]`` are compared.  A falsy text counts as
    empty; any other text is converted with ``str`` and has leading and
    trailing tabs, newlines and spaces removed.

    The score of a pair is ``(w1 + w2 - 2 * lcs) / (w1 + w2)``, where ``w1``
    and ``w2`` are the cleaned lengths and ``lcs`` is ``pylcs.lcs`` of the
    cleaned texts.  A pair whose cleaned texts are both empty scores ``0.0``.

    Returns:
        A list with one score per element of ``blocks1``.
    """

    def _clean(raw):
        # Falsy values (None, "", ...) are treated as empty text.
        return str(raw).strip("\t\n ") if raw else ""

    result = []
    for idx, left in enumerate(blocks1):
        raw1 = left.text
        # blocks2 is indexed, not zipped, so a shorter blocks2 still fails
        # loudly instead of being silently truncated.
        raw2 = blocks2[idx].text

        text1 = _clean(raw1)
        text2 = _clean(raw2)
        total = len(text1) + len(text2)

        if total == 0:
            result.append(0.0)
            continue

        result.append((total - 2 * pylcs.lcs(text1, text2)) / total)

    return result
示例#2
0
    def train_evaluate(self, batch_pred_tag, batch_text, batch_arguments,
                       text_map_seg_idxs, seg_idx_map_bert_idxs,
                       bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts):
        """Accumulate soft-match statistics for one batch of predictions.

        Evaluation helper (not guaranteed to equal the official scorer, but
        close to it).  All arguments are parallel per-sample sequences that
        are iterated together with ``zip``.

        Returns:
            A tuple ``(X, Y, Z)``: the summed match score, the number of
            predicted arguments and the number of labelled arguments.  Each
            starts at ``1e-10`` so a caller can divide by them safely; the
            caller is expected to derive f1 / precision / recall, e.g.
            ``2 * X / (Y + Z)``, ``X / Y`` and ``X / Z``.
        """
        matched = predicted = expected = 1e-10

        samples = zip(batch_pred_tag, batch_text, batch_arguments,
                      text_map_seg_idxs, seg_idx_map_bert_idxs,
                      bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts)

        for (pred_tag, text, arguments, text_map_seg_idx,
             seg_idx_map_bert_idx, bert_idx_map_seg_idx, seg_idx_map_text,
             raw_text) in samples:

            # Invert {span: role} into {role: span}; when several spans share
            # a role only the last one survives the inversion.
            gold_by_role = {role: span for span, role in arguments.items()}
            extracted = bert_extract_arguments(
                text,
                pred_tag,
                self.schema,
                class_id=self.class_id,
                text_map_seg_idx=text_map_seg_idx,
                seg_idx_map_bert_idx=seg_idx_map_bert_idx,
                bert_idx_map_seg_idx=bert_idx_map_seg_idx,
                seg_idx_map_text=seg_idx_map_text,
                raw_text=raw_text)
            pred_by_role = {role: span for span, role in extracted.items()}

            predicted += len(pred_by_role)
            expected += len(gold_by_role)

            for role, span in pred_by_role.items():
                if role not in gold_by_role:
                    continue
                gold_span = gold_by_role[role]
                # Match degree: the pylcs.lcs length, normalised by the two
                # span lengths (1.0 for an exact match).
                overlap = pylcs.lcs(span, gold_span)
                matched += 2. * overlap / (len(span) + len(gold_span))

        return matched, predicted, expected
示例#3
0
def evaluate(data):
    """Compute soft f1, precision and recall over ``data``.

    Evaluation function (not guaranteed to equal the official scorer, but
    close to it).  ``data`` yields ``(text, arguments)`` pairs where
    ``arguments`` maps an argument span to its label; the loop comments in
    the original show labels of the form ``(event_type, role)``.

    For every sample the gold and predicted mappings are inverted to
    ``{label: span}``.  Each predicted label that also occurs in the gold
    mapping contributes ``2 * lcs / (len(pred_span) + len(gold_span))``:
    an exact match contributes 1, and a 3-character prediction whose
    ``pylcs.lcs`` with a 2-character gold span is 2 contributes
    ``2 * 2 / (3 + 2) = 0.8``.

    Returns:
        ``(f1, precision, recall)`` computed as ``2 * X / (Y + Z)``,
        ``X / Y`` and ``X / Z``, where ``X`` is the summed contribution,
        ``Y`` the number of predicted labels and ``Z`` the number of gold
        labels (all three start at ``1e-10`` to avoid division by zero).
    """
    hit, n_pred, n_gold = 1e-10, 1e-10, 1e-10

    for text, arguments in tqdm(data):  # one sample per iteration
        # Gold labels, inverted to {label: span}.
        gold = {label: span for span, label in arguments.items()}
        # Predicted labels for the same text, inverted the same way.
        predicted = extract_arguments(text)
        pred = {label: span for span, label in predicted.items()}

        n_pred += len(pred)
        n_gold += len(gold)

        for label, span in pred.items():
            if label not in gold:
                continue
            gold_span = gold[label]
            # Match degree measured with the pylcs.lcs length.
            overlap = pylcs.lcs(span, gold_span)
            hit += 2. * overlap / (len(span) + len(gold_span))

    return 2 * hit / (n_pred + n_gold), hit / n_pred, hit / n_gold
示例#4
0
def get_stop_id(stop_name):
    """Return the ``StopId`` of the stop whose name best matches ``stop_name``.

    Every entry of the module-level ``stops`` collection is scored with
    ``pylcs.lcs(stop["Name"], stop_name)``.  The first entry with the
    strictly highest positive score wins.

    Returns:
        The winning entry's ``"StopId"``, or ``-1`` when no entry scores
        above zero (including when ``stops`` is empty).
    """
    best_id, best_overlap = -1, 0
    for stop in stops:
        overlap = pylcs.lcs(stop["Name"], stop_name)
        # Strict comparison keeps the earliest stop on ties.
        if overlap > best_overlap:
            best_overlap = overlap
            best_id = stop["StopId"]
    return best_id
示例#5
0
文件: score.py 项目: hiepbkhn/ml2017
def compute_acc_list(true_list, pred_list):
    """Compute per-pair precision, recall and f1 from ``pylcs.lcs`` lengths.

    For each ``(truth, guess)`` pair, precision is ``lcs / len(guess)``,
    recall is ``lcs / len(truth)`` (each ``0.0`` when the respective
    sequence is empty) and f1 is their harmonic mean (``0.0`` when the LCS
    length is zero).

    Args:
        true_list: reference sequences.
        pred_list: predicted sequences; must have the same length as
            ``true_list`` (checked with ``assert``).

    Returns:
        A tuple of three lists ``(precisions, recalls, f1s)``.
    """
    assert len(true_list) == len(pred_list)

    precisions, recalls, f1_scores = [], [], []
    for truth, guess in zip(true_list, pred_list):
        common = pylcs.lcs(truth, guess)

        if len(guess) > 0:
            precision = float(common) / len(guess)
        else:
            precision = 0.0

        if len(truth) > 0:
            recall = float(common) / len(truth)
        else:
            recall = 0.0

        # common > 0 implies both precision and recall are positive, so the
        # denominator below cannot be zero.
        if common > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    return precisions, recalls, f1_scores
示例#6
0
 def longest_common_subsequence(self, config, sentence1, sentence2):
     """
     Return the length of the longest common subsequence or substring of two
     sentences, selected by ``config['mode']`` (default ``'subsequence'``).

     Only the element at index 1 of each sentence argument is compared
     (presumably the sentence text of an ``(id, text)``-style pair - confirm
     against the caller).

     Subsequence example:
       ("We ate a delicious pizza", "We ate a not so delicious pizza") -> "We ate a delicious pizza"
     Substring example:
       ("We ate a delicious pizza", "We ate a not so delicious pizza") -> " delicious pizza"
     """
     mode = config.get('mode', 'subsequence')
     first, second = sentence1[1], sentence2[1]
     if mode != 'subsequence':
         # Any other mode value is handled as 'substring'.
         return pylcs.lcs2(first, second)
     return pylcs.lcs(first, second)
def checkPatternSimilarity(parentPatternsDict, toCheckDict):
    """Score every pattern to check against every parent pattern.

    Both arguments map a key to a sequence whose first element is the
    pattern that gets compared.  The score of a (checked, parent) pair is
    ``pylcs.lcs(checked, parent) / len(checked)``.

    An empty checked pattern shares nothing with any parent, so it scores
    ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        parentPatternsDict: ``{parent_key: (pattern, ...)}``.
        toCheckDict: ``{check_key: (pattern, ...)}``.

    Returns:
        ``-1`` (after printing a message) when ``toCheckDict`` is empty;
        otherwise a ``defaultdict(dict)`` of the form
        ``{check_key: {parent_key: score}}``.
    """
    if not toCheckDict:
        print("The new program doest not contain OpenMP Code")
        return -1

    similarity = defaultdict(dict)

    for check_key, check_value in toCheckDict.items():
        for parent_key, parent_value in parentPatternsDict.items():
            pattern = check_value[0]
            if len(pattern) == 0:
                # Nothing can be shared with an empty pattern; avoid 0 / 0.
                similarity[check_key][parent_key] = 0.0
                continue
            shared = pylcs.lcs(pattern, parent_value[0])
            similarity[check_key][parent_key] = shared / len(pattern)

    return similarity
示例#8
0
def producer2(queue, datum):
    """Rank candidate names for each input name and push the best ones to ``queue``.

    ``datum`` unpacks into ``(names_to_match, candidate_names)``.  For every
    name to match, each candidate is scored with
    ``(ed.eval(name, candidate) - pylcs.lcs(name, candidate)) / log(len(candidate))``
    (``ed.eval`` is presumably an edit distance - confirm), candidates are
    sorted by ascending score and ``[name, best_candidates]`` with at most
    10 candidates is put on ``queue``.

    NOTE(review): the sanity ``assert`` indexes the two best scores, so it
    needs at least two candidates; ``log(len(candidate))`` is zero for a
    one-character candidate.
    """
    jader_names, bank_names = datum

    def _score(jader_name, bank_name):
        # Lower is better: few edits and a long common subsequence.
        overlap = pylcs.lcs(jader_name, bank_name)
        edits = ed.eval(jader_name, bank_name)
        return (edits - overlap) / np.log(len(bank_name))

    for jader_name in jader_names:
        scores = np.asarray(
            [_score(jader_name, bank_name) for bank_name in bank_names])
        ranking = np.argsort(scores)
        assert scores[ranking[0]] <= scores[ranking[1]]

        best = [bank_names[i] for i in ranking[:10]]
        queue.put([jader_name, best])
示例#9
0
def evaluate(data, model, CRF):
    """Compute soft f1, precision and recall of ``extract_arguments`` on ``data``.

    Evaluation function (not guaranteed to equal the official scorer, but
    close to it).  ``data`` yields ``(text, arguments)`` pairs where
    ``arguments`` maps a span to its label; ``model`` and ``CRF`` are
    forwarded unchanged to ``extract_arguments``.

    Returns:
        ``(f1, precision, recall)`` computed as ``2 * X / (Y + Z)``,
        ``X / Y`` and ``X / Z``, where ``X`` sums the per-label match scores
        and ``Y`` / ``Z`` count predicted / gold labels (all three start at
        ``1e-10`` to avoid division by zero).
    """
    hit, n_pred, n_gold = 1e-10, 1e-10, 1e-10

    for text, arguments in tqdm(data):
        # Invert {span: label} into {label: span} for gold and prediction.
        gold = {label: span for span, label in arguments.items()}
        predicted = extract_arguments(text, model, CRF)
        pred = {label: span for span, label in predicted.items()}

        n_pred += len(pred)
        n_gold += len(gold)

        for label, span in pred.items():
            if label not in gold:
                continue
            gold_span = gold[label]
            # Match degree measured with the pylcs.lcs length.
            overlap = pylcs.lcs(span, gold_span)
            hit += 2. * overlap / (len(span) + len(gold_span))

    return 2 * hit / (n_pred + n_gold), hit / n_pred, hit / n_gold
def evaluate(data):
    """Compute soft f1, precision and recall, evaluated per subject.

    Evaluation function (not guaranteed to equal the official scorer, but
    close to it).  ``data`` yields ``(text, arguments)`` pairs where
    ``arguments`` maps a subject to an iterable of 2-item pairs.  For each
    subject the pairs become a ``{first: second}`` dict of gold values and
    ``extract_arguments(text, subject)`` supplies the predictions, which
    are inverted from ``{value: key}`` to ``{key: value}``.

    Returns:
        ``(f1, precision, recall)`` computed as ``2 * X / (Y + Z)``,
        ``X / Y`` and ``X / Z``, where ``X`` sums the per-key match scores
        and ``Y`` / ``Z`` count predicted / gold entries (all three start at
        ``1e-10`` to avoid division by zero).
    """
    hit, n_pred, n_gold = 1e-10, 1e-10, 1e-10

    for text, arguments in tqdm(data):
        for subject, pairs in arguments.items():
            gold = {first: second for first, second in pairs}
            predicted = extract_arguments(text, subject)
            pred = {key: value for value, key in predicted.items()}

            n_pred += len(pred)
            n_gold += len(gold)

            for key, value in pred.items():
                if key not in gold:
                    continue
                gold_value = gold[key]
                # Match degree measured with the pylcs.lcs length.
                overlap = pylcs.lcs(value, gold_value)
                hit += 2. * overlap / (len(value) + len(gold_value))

    return 2 * hit / (n_pred + n_gold), hit / n_pred, hit / n_gold
示例#11
0
    def evaluate(self, batch_pred_tag, batch_text, batch_arguments,
                 text_map_seg_idxs, seg_idx_map_bert_idxs,
                 bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts):
        """Accumulate per-argument f1 statistics for one batch of predictions.

        Evaluation helper (not guaranteed to equal the official scorer, but
        close to it).  All arguments are parallel per-sample sequences that
        are iterated together with ``zip``.

        For each predicted role that also appears in the labels, the labelled
        value is cut at its first ``'_'`` and compared with the predicted
        span: precision is ``lcs / len(pred)``, recall is ``lcs / len(gold)``
        (each shifted by ``0.000001``), and their harmonic mean is added to
        the match total.

        Returns:
            A tuple ``(X, Y, Z)``: the summed match score, the number of
            predicted arguments and the number of labelled arguments, each
            starting at ``1e-10``.
        """
        matched = predicted = expected = 1e-10

        samples = zip(batch_pred_tag, batch_text, batch_arguments,
                      text_map_seg_idxs, seg_idx_map_bert_idxs,
                      bert_idx_map_seg_idxs, seg_idx_map_texts, raw_texts)

        for (pred_tag, text, arguments, text_map_seg_idx,
             seg_idx_map_bert_idx, bert_idx_map_seg_idx, seg_idx_map_text,
             raw_text) in samples:

            # Invert {span: role} into {role: span}; when several spans share
            # a role only the last one survives the inversion.
            gold_by_role = {role: span for span, role in arguments.items()}
            extracted = bert_extract_arguments(
                text,
                pred_tag,
                self.schema,
                class_id=self.class_id,
                text_map_seg_idx=text_map_seg_idx,
                seg_idx_map_bert_idx=seg_idx_map_bert_idx,
                bert_idx_map_seg_idx=bert_idx_map_seg_idx,
                seg_idx_map_text=seg_idx_map_text,
                raw_text=raw_text)
            pred_by_role = {role: span for span, role in extracted.items()}

            predicted += len(pred_by_role)
            expected += len(gold_by_role)

            for role, span in pred_by_role.items():
                if role not in gold_by_role:
                    continue
                # Only the part of the label before the first '_' is compared.
                gold_span = gold_by_role[role].split('_')[0]
                # Match degree measured with the pylcs.lcs length.
                overlap = pylcs.lcs(span, gold_span)
                # The small offset keeps precision + recall non-zero.
                precision = overlap / len(span) + 0.000001
                recall = overlap / len(gold_span) + 0.000001
                matched += 2 * precision * recall / (precision + recall)

        return matched, predicted, expected
示例#12
0
文件: test.py 项目: yuconan/baidu_ee
    def evaluate(batch_pred_tag, batch_text, batch_arguments):
        """Accumulate soft-match statistics for one batch of predictions.

        Evaluation helper (not guaranteed to equal the official scorer, but
        close to it).  The three arguments are parallel per-sample sequences;
        ``schema`` is taken from the enclosing scope.

        Returns:
            A tuple ``(X, Y, Z)``: the summed match score, the number of
            predicted arguments and the number of labelled arguments, each
            starting at ``1e-10``.  The caller can derive f1 / precision /
            recall as ``2 * X / (Y + Z)``, ``X / Y`` and ``X / Z``.
        """
        matched = predicted = expected = 1e-10

        for pred_tag, text, arguments in zip(batch_pred_tag, batch_text,
                                             batch_arguments):
            # Invert {span: role} into {role: span} for labels and predictions.
            gold_by_role = {role: span for span, role in arguments.items()}
            extracted = bert_extract_arguments(text, pred_tag, schema)
            pred_by_role = {role: span for span, role in extracted.items()}

            predicted += len(pred_by_role)
            expected += len(gold_by_role)

            for role, span in pred_by_role.items():
                if role not in gold_by_role:
                    continue
                gold_span = gold_by_role[role]
                # Match degree measured with the pylcs.lcs length.
                overlap = pylcs.lcs(span, gold_span)
                matched += 2. * overlap / (len(span) + len(gold_span))

        return matched, predicted, expected
    def search(self, key, value):
        """
        Tell whether this rep matches a searched property value.

        Supported keys: 'source', 'name', 'chamber', 'alive', 'party',
        'state', 'district' and 'active'.  Any other key prints a notice and
        yields False.

        :param key: A string property
        :param value: The value searched for (for 'district' a
            ``(state, district)`` pair)
        :return: True or False if this rep is part of the search
        """

        if key == 'source':
            return value in self.sources.values()

        if key == 'name':
            # Compare on lowercase ASCII letters only; the rep matches when
            # every searched letter appears, in order, in the rep's name.
            wanted = ''.join(
                [ch for ch in value.lower() if 'a' <= ch <= 'z'])
            actual = ''.join(
                [ch for ch in self.basics['name'].lower() if 'a' <= ch <= 'z'])
            return pylcs.lcs(wanted, actual) == len(wanted)

        if key == 'chamber':
            if value == 'House':
                return self.basics['title'] == 'Representative'
            if value == 'Senate':
                return self.basics['title'] == 'Senator'
            # Unrecognised chamber: no match, and no "unknown property" notice.
            return False

        if key == 'alive':
            return not self.basics['death'] == value

        if key == 'party':
            return value == self.get_current_party()

        if key == 'state':
            wanted_state = us.states.lookup(value).name
            return wanted_state == self.get_state()

        if key == 'district':
            state_query, district = value
            wanted_state = us.states.lookup(state_query).name
            return (wanted_state == self.get_state()
                    and district == self.get_district())

        if key == 'active':
            return value == self.get_active()

        print('Unknown property for representative. Returning False')
        return False
示例#14
0
def pseudo_summary(texts):
    """Build a pseudo-labelled (source, target) summary pair from ``texts``.

    Indices are moved greedily, one per round, from the source set to the
    target set: each round tries every remaining index and keeps the one
    whose move gives the largest ``pylcs.lcs`` between the joined source
    and joined target.  The loop stops once a single source index is left
    or ``len(target) / len(source)`` exceeds the module-level
    ``summary_rate``.

    Returns:
        ``(source, target)`` as produced by ``gather_join``, swapped if
        necessary so that ``source`` is not shorter than ``target``.
    """
    remaining = list(range(len(texts)))
    chosen = []

    while True:
        overlaps = []
        for candidate in remaining:
            trial_source_idxs = [j for j in remaining if j != candidate]
            trial_target_idxs = sorted(chosen + [candidate])
            trial_source = gather_join(texts, trial_source_idxs)
            trial_target = gather_join(texts, trial_target_idxs)
            overlaps.append(pylcs.lcs(trial_source, trial_target))

        # np.argmax returns the first maximum, so ties go to the lowest
        # position in the remaining list.
        picked = remaining[np.argmax(overlaps)]
        remaining.remove(picked)
        chosen = sorted(chosen + [picked])

        source = gather_join(texts, remaining)
        target = gather_join(texts, chosen)

        only_one_left = len(remaining) == 1
        if only_one_left or 1.0 * len(target) / len(source) > summary_rate:
            break

    if len(source) < len(target):
        source, target = target, source
    return source, target
示例#15
0
 def compute_normalized(s1, s2):
     """Return the ``pylcs.lcs`` similarity of ``s1`` and ``s2`` in [0, 1].

     The value is ``1 - (max_len - lcs) / max_len`` where ``max_len`` is the
     length of the longer input.

     Two empty inputs are identical, so they score ``1.0``; previously this
     case raised ``ZeroDivisionError``.
     """
     max_len = max(len(s1), len(s2))
     if max_len == 0:
         # Nothing to compare: avoid dividing by zero.
         return 1.0
     lcs = float(pylcs.lcs(s1, s2))
     return 1 - float(max_len - lcs) / float(max_len)
示例#16
0
 def lcs(s1, s2):
     """Thin wrapper: return the result of ``pylcs.lcs(s1, s2)`` unchanged."""
     return pylcs.lcs(s1, s2)
示例#17
0
 def pylcs_len(self, A, B):
     """Thin wrapper: return the result of ``pylcs.lcs(A, B)`` unchanged."""
     return pylcs.lcs(A, B)
示例#18
0
def lexsim(val1, val2):
    """Return a lexical similarity score for ``val1`` and ``val2``.

    The longest-common-subsequence ratio (``pylcs.lcs`` divided by the
    longer input's length) is divided by ``edit_distance(val1, val2)`` and
    scaled by 100.

    NOTE(review): the score divides by the edit distance, so inputs for
    which ``edit_distance`` returns 0 (presumably identical values) raise
    ``ZeroDivisionError``, as do two empty inputs.
    """
    common = pylcs.lcs(val1, val2)  # longest common subsequence (LCS)
    ratio = common / max(len(val1), len(val2))  # LCS ratio (LCSR)
    return ratio / edit_distance(val1, val2) * 100
示例#19
0
def test_lcs():
    """Check ``pylcs.lcs`` on ASCII and non-ASCII inputs."""
    cases = (
        ("aaa", "bbb", 0),
        ("aaa", "aabbbaa", 3),
        ("你好", "中国", 0),
        ("aaa你好", "你好呀", 2),
    )
    for left, right, expected in cases:
        assert pylcs.lcs(left, right) == expected
示例#20
0
    n2 = len(x2)
    
    S = np.zeros((n1, n2))
    
    for i in range(n1):
        for j in range(n2):
            if x1[i] == x2[j]:
                S[i, j] = s
            else:
                S[i, j] = -s
    
    H = np.zeros((n1 + 1, n2 + 1))
    max2 = np.zeros(n1 + 1)
    
    for j in range(1, n2 + 1):
        for t in range(n1 + 1):
            max2[t] = max(max2[t], H[t, j - 1]) - w
        max1 = 0
        for i in range(1, n1 + 1):
            max1 = max(H[i - 1, j], max1) - w
            H[i, j] = max(max(H[i - 1, j - 1] + S[i - 1, j - 1], max1), max2[i], 0)
    return np.max(H)

# Demo: print pylcs.lcs of x and y divided by the length of the longer string.
x = 'aaabaaab'
y = 'aaabaaabaaababba'
a = pylcs.lcs(x, y) / max(len(x), len(y))
print(a)
# NOTE(review): `pylcs.smith_w` takes two strings and two 0.03 parameters here;
# presumably a Smith-Waterman style local-alignment score - confirm against
# the pylcs build in use.
a = pylcs.smith_w('aaabaaabaaabaaab', 'aaabaaabaaababba', 0.03, 0.03)
print(a)
# NOTE(review): `pylcs.get_lcs` is called with integer lists (one of them
# empty); its return value is not visible from here.
c = pylcs.get_lcs([45, 65], [])
print(c)
示例#21
0
 def compare_password(self, password, new_password):
     """Tell whether ``new_password`` is sufficiently different from ``password``.

     Returns True when ``pylcs.lcs(password, new_password)`` covers less than
     80% of ``new_password``'s length, False otherwise.

     NOTE(review): an empty ``new_password`` divides by zero.
     """
     shared = pylcs.lcs(password, new_password)
     return (shared / len(new_password)) < 0.8
示例#22
0
def cal_lcs(text1, text2):
    """Return ``pylcs.lcs(text1, text2)`` divided by the longer input's length.

    Two empty inputs are identical, so they score ``1.0``; previously this
    case raised ``ZeroDivisionError``.
    """
    longest = max(len(text1), len(text2))
    if longest == 0:
        # Nothing to compare: avoid dividing by zero.
        return 1.0
    return pylcs.lcs(text1, text2) / longest