Exemplo n.º 1
0
def check(sent):
    """Find the longest leading run of tokens of ``sent`` that matches a known name.

    The sentence is tokenized and its prefixes are tried from longest to
    shortest (one trailing token is dropped per attempt). A prefix matches
    when its ``fuzz.UQRatio`` against any entry of the module-level ``items``
    collection, or failing that ``places``, is strictly greater than 80.

    Returns:
        A ``(text, code)`` tuple: the matching prefix joined by single spaces
        and ``'item'`` or ``'place'``; ``('', '')`` when no prefix matches.
    """
    tokens = word_tokenize(sent)
    for end in range(len(tokens), 0, -1):
        phrase = " ".join(tokens[:end])
        # Items take precedence over places for the same prefix.
        if any(fuzz.UQRatio(phrase, item) > 80 for item in items):
            return phrase, 'item'
        if any(fuzz.UQRatio(phrase, place) > 80 for place in places):
            return phrase, 'place'
    return '', ''
Exemplo n.º 2
0
    def match_constraint(self, qa_res, constraint, id2linked_ent):
        """Score every answer in ``qa_res`` against the question constraints, in place.

        For each answer the linked entity is looked up by ``ans['id']`` and
        handed to ``self.constr_extractor.match_constraint``. The answer's
        ``'constr_score'`` is reset to 0 and then adjusted:

        * extractor returned a non-empty mapping: +0.3 for every matched
          property (its name and value are stored under ``'constr_name'`` /
          ``'constr_val'``; a later match overwrites an earlier one) and
          -0.2 for every property that did not match;
        * extractor returned nothing or failed: for each non-empty constraint,
          if its first value fuzzily equals the answer's entity name
          (``UQRatio >= 60``) the answer's ``'link_score'`` is lowered by 0.3,
          otherwise ``'constr_score'`` is lowered by 0.2.

        Args:
            qa_res: iterable of answer dicts; each is read for ``'id'`` (and
                ``'entity'`` / ``'link_score'`` on the fallback path) and
                mutated in place.
            constraint: mapping of constraint name to its value(s); entries
                that are ``''`` or ``None`` are ignored by the fallback path.
            id2linked_ent: mapping of answer id to its linked-entity dict,
                whose ``'ent'`` entry holds the entity properties.
        """
        non_empty_constr = [(constr_name, constr_val)
                            for constr_name, constr_val in constraint.items()
                            if constr_val != '' and constr_val is not None]
        for ans in qa_res:
            linked_ent = id2linked_ent[ans['id']]
            match_res = None
            try:
                match_res = self.constr_extractor.match_constraint(
                    constraint, linked_ent)
            except Exception:
                # Best effort: a failing extractor must not abort scoring, but
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                traceback.print_exc()

            ans['constr_score'] = 0
            if match_res:
                for constr, is_match in match_res.items():
                    if is_match:
                        ans['constr_score'] += 0.3
                        ans['constr_name'] = constr
                        ans['constr_val'] = linked_ent['ent'][constr]
                    else:
                        ans['constr_score'] -= 0.2
            else:
                for _, constr_val in non_empty_constr:
                    if fuzz.UQRatio(constr_val[0], ans['entity']) >= 60:
                        # The "constraint" is really the entity name itself,
                        # so the link (not the constraint) is penalised.
                        ans['link_score'] -= 0.3
                    else:
                        ans['constr_score'] -= 0.2
Exemplo n.º 3
0
    def match_constraint(self, constraint: dict, linked_ent):
        """Check which properties of a linked entity satisfy the given constraints.

        Step 1 (availability): every constraint whose value is neither ``''``
        nor ``None`` and whose first value does not fuzzily equal the entity
        name (``UQRatio < 50``) must hit at least one property, i.e. the
        constraint name or its first value occurs in the property name or in
        the stringified property value. Properties listed in
        ``self.remove_prop`` are ignored. If such a constraint hits nothing,
        ``None`` is returned.

        Step 2 (filtering): each hit is re-checked by constraint kind and
        flipped to ``False`` when the check fails:

        * name contains '地点': every value of ``constraint['地点']`` must be a
          substring of the lower-cased property value;
        * name contains '时间': the keywords '24小时' / '最早' / '最晚' must occur
          in the property value; a value without '时' must lie strictly
          between the two numbers found in the property value (only checked
          when exactly two numbers are found);
        * name contains '币种', '银行', '航空公司' or '价格': every value of
          ``constraint[constr_name]`` must be a substring of the property value.

        Args:
            constraint: mapping of constraint name to its values (indexed and
                iterated here, so presumably a list of strings; confirm
                against the caller).
            linked_ent: dict whose ``'ent'`` entry is the entity's property
                dict; it must contain ``'name'``.

        Returns:
            ``None`` when an active constraint matches no property, otherwise
            a dict mapping each matching property name to ``True``/``False``.
        """
        # Step 1: check whether each constraint is available on the entity.
        exist_constr = []
        res_constr = {}
        ent = linked_ent['ent']
        for constr_name, constr_val in constraint.items():
            if constr_val == '' or constr_val is None:
                continue
            # A value that is (nearly) the entity name is not a real constraint.
            if fuzz.UQRatio(constr_val[0], ent['name']) >= 50:
                continue
            match_constr = False
            for rel, rel_val in ent.items():
                if rel in self.remove_prop:
                    continue
                rel_val = str(rel_val)
                if (constr_name in rel
                        or constr_name in rel_val
                        or constr_val[0] in rel
                        or constr_val[0] in rel_val):
                    match_constr = True
                    res_constr[rel] = True
                    exist_constr.append((rel, constr_name))
            if not match_constr:
                return None

        # Step 2: filter the hits.
        # NOTE(review): the character class lists ':' twice; one of them was
        # possibly meant to be a full-width colon. Confirm before changing.
        time_pattern = re.compile(r'\d+[:, :]')
        for rel, constr_name in exist_constr:
            # str() mirrors step 1: property values are not guaranteed to be
            # strings, and calling .lower() on e.g. an int would raise.
            rel_val = str(ent[rel]).lower()
            if '地点' in constr_name:
                for item in constraint['地点']:
                    if item not in rel_val:
                        res_constr[rel] = False
            elif '时间' in constr_name:
                # The numbers in the property value do not depend on the
                # constraint item, so they are parsed once per property.
                bg_ed = [int(x[:-1]) for x in time_pattern.findall(rel_val)]
                for item in constraint['时间']:
                    if item in ('24小时', '最早', '最晚') and item not in rel_val:
                        res_constr[rel] = False
                        continue
                    if '时' not in item and len(bg_ed) == 2:
                        # NOTE(review): int(item) raises ValueError for a
                        # non-numeric item such as '最早' that reaches here.
                        if not (bg_ed[0] < int(item) < bg_ed[1]):
                            res_constr[rel] = False

            elif ('币种' in constr_name or '银行' in constr_name
                  or '航空公司' in constr_name or '价格' in constr_name):
                for item in constraint[constr_name]:
                    if item not in rel_val:
                        res_constr[rel] = False

        return res_constr
Exemplo n.º 4
0
 def link(self, sent, sent_cut, pos_tag, limits=None):
     """Link the mentions of a question to entities by fuzzy name matching.

     Mentions come from ``retrieve_mention(sent_cut, pos_tag)``. Each one is
     lower-cased, skipped if ``self.is_not_entity`` rejects it, and expanded
     into candidate names via ``self.convert_mention2ent``. Every candidate
     is scored against every entity's rewritten name as the equal-weight
     mean of ``cosine_word_similarity`` and ``fuzz.UQRatio / 100``. For each
     mention the three best pairs are kept if they score above
     ``config.simi_ths``.

     ``sent`` and ``limits`` are accepted but not used.

     Returns:
         A list of dicts with keys ``'ent'``, ``'mention'``, ``'id'``,
         ``'score'`` and ``'source'``, sorted by score in descending order;
         ``[]`` when no mention is retrieved.
     """
     mention_list = retrieve_mention(sent_cut, pos_tag)
     if mention_list == []:
         return []
     logger.debug('指称: ' + str(mention_list))

     # Weight of the embedding similarity; the rest goes to the string ratio.
     blend = 0.5
     linked = []
     for raw_mention in mention_list:
         mention = raw_mention.lower()
         if self.is_not_entity(mention):
             continue
         cand_names = self.convert_mention2ent(mention)
         scored = []
         for ent in self.id2ent.values():
             ent_name = ent['name']
             rewritten = self.rewrite_ent_name(ent_name)
             if rewritten == '':
                 continue
             for cand_name in cand_names:
                 # Skip pairs where the candidate contains Chinese (or
                 # English) but the entity name does not.
                 script_mismatch = (
                     (contain_chinese(cand_name)
                      and not contain_chinese(ent_name))
                     or (contain_english(cand_name)
                         and not contain_english(ent_name)))
                 if script_mismatch:
                     continue
                 semantic = cosine_word_similarity(cand_name, rewritten)
                 lexical = fuzz.UQRatio(cand_name, rewritten) / 100
                 scored.append({
                     'ent': ent,
                     'mention': mention,
                     'id': ent['neoId'],
                     'score': blend * semantic + (1 - blend) * lexical,
                     'source': 'rule'
                 })
         scored.sort(key=lambda hit: hit['score'], reverse=True)
         linked.extend(
             hit for hit in scored[:3] if hit['score'] > config.simi_ths)
     linked.sort(key=lambda hit: hit['score'], reverse=True)
     return linked
Exemplo n.º 5
0
    def link(self, sent, sent_cut, pos_tag, limits=None):
        """Link the mentions of a question to knowledge-graph entities.

        Mentions come from ``retrieve_mention(sent_cut, pos_tag)``. Three
        strategies are applied:

        1. List questions (the mention list contains '哪些', "which ones"):
           if both '中国' and '航空公司' are mentioned, every entity whose
           '类别' is '国内航空公司' is returned; else if one of the known
           countries and '航空公司' are mentioned, every '国外航空公司' entity
           whose alias ('别名') or company name ('公司名称') contains that
           country is returned. All hits get the fixed score 1.5, and the
           result (possibly empty) is returned without sorting.
        2. A mention without Chinese characters is treated as a code: when
           '机场' or '航空公司' is also mentioned, entities carrying the
           corresponding code fields are compared, upper-cased, with the
           IATA/ICAO/airport codes; exact hits get the fixed score 2.5. Such
           mentions never reach strategy 3.
        3. Any other mention is expanded by ``self.convert_mention2ent`` and
           scored against every entity's rewritten name as the equal-weight
           mean of ``cosine_word_similarity`` and ``fuzz.UQRatio / 100``; the
           three best pairs per mention are kept if they score above
           ``config.simi_ths``. Airport entities are only considered when the
           mention itself contains '机场'.

        ``sent`` and ``limits`` are accepted but not used.

        Returns:
            A list of dicts with keys ``'ent'``, ``'mention'``, ``'id'``,
            ``'score'`` and ``'source'``; for strategies 2 and 3 it is sorted
            by score in descending order. ``[]`` when no mention is retrieved.
        """
        mention_list = retrieve_mention(sent_cut, pos_tag)
        is_list = '哪些' in mention_list
        logger.debug('指称: ' + str(mention_list))
        if mention_list == []:
            return []
        res = []
        country_list = ['俄罗斯', '挪威', '美国', '蒙古', '泰国']
        if is_list:
            if all(word in mention_list for word in ['中国', '航空公司']):
                for ent in self.id2ent.values():
                    if ent.get('类别') == '国内航空公司':
                        res.append({
                            'ent': ent,
                            'mention': '中国航空公司',
                            'id': ent['neoId'],
                            'score': 1.5,
                            'source': 'rule'
                        })
            elif any(word in mention_list
                     for word in country_list) and '航空公司' in mention_list:
                country = [word for word in mention_list
                           if word in country_list][0]
                for ent in self.id2ent.values():
                    if ent.get('类别') != '国外航空公司':
                        continue
                    flag = False
                    # NOTE(review): eval() executes whatever is stored in
                    # '别名'; if that data can come from an untrusted source,
                    # switch to ast.literal_eval.
                    if '别名' in ent and any(
                            country in name for name in eval(ent['别名'])):
                        flag = True
                    if '公司名称' in ent and country in ent['公司名称']:
                        flag = True
                    if flag:
                        print(ent['name'], ent)
                        res.append({
                            'ent': ent,
                            'mention': '国外航空公司',
                            'id': ent['neoId'],
                            'score': 1.5,
                            'source': 'rule'
                        })
            return res

        for mention in mention_list:
            mention = mention.lower()
            one_res = []
            if not contain_chinese(mention):
                # Code lookup: which code fields qualify an entity depends on
                # whether the question talks about airports or airlines.
                search_list = []
                if '机场' in mention_list:
                    search_list = ['机场三字码', 'ICAO机场代码']
                elif '航空公司' in mention_list:
                    search_list = ['IATA代码', 'ICAO代码']
                if search_list:
                    code = mention.upper()
                    for ent in self.id2ent.values():
                        if not any(key in ent for key in search_list):
                            continue
                        # A missing code field counts as the empty string.
                        if (code == ent.get('IATA代码', '')
                                or code == ent.get('ICAO代码', '')
                                or code == ent.get('机场三字码', '')
                                or code == ent.get('ICAO机场代码', '')):
                            res.append({
                                'ent': ent,
                                'mention': mention,
                                'id': ent['neoId'],
                                'score': 2.5,
                                'source': 'rule'
                            })
                continue
            if self.is_not_entity(mention):
                continue
            cand_names = self.convert_mention2ent(mention)  # alias expansion
            for ent in self.id2ent.values():
                if 'name' not in ent:
                    continue
                # .get(): entities without a '类别' exist (the list branch
                # guards for them), so indexing here would raise KeyError.
                if '机场' not in mention and ent.get('类别') in ('国外机场',
                                                             '国内机场'):
                    continue
                ent_name = ent['name']
                ent_name_rewrite = self.rewrite_ent_name(ent_name)
                if ent_name_rewrite == '':
                    continue
                for cand_name in cand_names:
                    # Skip pairs where only one side is Chinese / English.
                    # Reason: when matching by similarity over BERT encodings,
                    # text containing English sometimes gets a high score even
                    # though it is semantically unrelated, so cases where only
                    # one side contains English are filtered out.
                    if contain_chinese(cand_name) and not contain_chinese(
                            ent_name) or contain_english(
                                cand_name) and not contain_english(ent_name):
                        continue

                    RATIO = 0.5
                    score = cosine_word_similarity(cand_name, ent_name_rewrite)
                    score1 = fuzz.UQRatio(cand_name, ent_name_rewrite) / 100
                    score = RATIO * score + (1 - RATIO) * score1
                    one_res.append({
                        'ent': ent,
                        'mention': mention,
                        'id': ent['neoId'],
                        'score': score,
                        'source': 'rule'
                    })
            one_res.sort(key=lambda x: x['score'], reverse=True)
            for a_res in one_res[:3]:
                if a_res['score'] > config.simi_ths:
                    res.append(a_res)
        res.sort(key=lambda x: x['score'], reverse=True)
        return res
Exemplo n.º 6
0
    top_ratio = 0
    top_index = 0
    for index in xrange(len(broker_data)):
        ratio = fuzz.partial_token_sort_ratio(item, broker_data[index])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = index
    print "Partial Token Sort Ratio: {0} : {1} - {2}%".format(
        item, broker_data[top_index], top_ratio)

# NOT BAD!!!
for item in raw_data:
    # Linear scan for the broker_data entry with the highest UQRatio against
    # this item; the comparison is strict, so ties keep the earliest entry.
    top_ratio, top_index = 0, 0
    for index, candidate in enumerate(broker_data):
        ratio = fuzz.UQRatio(item, candidate)
        if ratio > top_ratio:
            top_ratio, top_index = ratio, index
    # A single parenthesised argument prints the same under the Python 2
    # print statement used by the rest of this script.
    print("UQRatio: {0} : {1} - {2}%".format(
        item, broker_data[top_index], top_ratio))

for item in raw_data:
    top_ratio = 0
    top_index = 0
    for index in xrange(len(broker_data)):
        ratio = fuzz.UWRatio(item, broker_data[index])
        if top_ratio < ratio:
            top_ratio = ratio
            top_index = index
    print "UWRatio: {0} : {1} - {2}%".format(item, broker_data[top_index],
Exemplo n.º 7
0
    def extract_rel(self,
                    sent_cut,
                    linked_ent,
                    limits=None,
                    thresh=config.prop_ths):
        """Pick the entity property (relation) the question most likely asks about.

        Every question word that is not part of the mention and does not
        contain '机场' is compared with every property name of the linked
        entity (except ``'name'`` and those in ``self.remove_prop``). The
        score of a (word, property) pair is ``0.6 * cosine similarity +
        0.4 * UQRatio / 100``. It is replaced by ``self.normalize_ratio`` when
        that returns more than 1, and then multiplied by 1.2 when the word
        (longer than one character) occurs in the property name. Pairs
        scoring above ``thresh`` are kept once each.

        The former limit-checking step (``cal_limit``) is disabled, so
        ``limits`` is accepted but not used.

        Args:
            sent_cut: iterable of the question's segmented words.
            linked_ent: dict with ``'ent'`` (entity properties, including
                ``'name'`` and ``'neoId'``), ``'score'`` and optionally
                ``'mention'`` (defaults to the entity name).
            limits: unused.
            thresh: minimum score for a pair to be kept.

        Returns:
            ``[]`` when nothing passes the threshold; otherwise the best
            result dict, plus the second best when its ``'rel_score'``
            exceeds 0.91.
        """
        ent = linked_ent['ent']
        mention = linked_ent.get('mention', ent['name'])

        # Stringified view of all properties that may be asked about.
        props_dict = {
            prop: str(value)
            for prop, value in ent.items()
            if prop not in self.remove_prop
        }

        rest_words = [
            w for w in sent_cut if not (w in mention or '机场' in w)
        ]
        props_set = list(props_dict)
        props_set.remove('name')

        cos_weight = 0.6
        res = []
        used_pairs = set()
        for old_prop in props_set:
            prop = old_prop
            for word in rest_words:
                # The suffix '服务' is dropped from the name before comparing.
                prop = prop.replace('服务', '')
                cos_score = cosine_word_similarity(word,
                                                   self.normalize_prop(prop))
                text_score = fuzz.UQRatio(word, prop) / 100
                score = cos_weight * cos_score + (1 - cos_weight) * text_score
                rule_score = self.normalize_ratio(word, prop)
                if rule_score > 1:
                    score = rule_score
                if word in prop and len(word) > 1:
                    score *= 1.2
                pair = (word, prop)
                if score > thresh and pair not in used_pairs:
                    used_pairs.add(pair)
                    res.append({
                        'id': ent['neoId'],
                        'mention': mention,
                        'entity': ent['name'],
                        'rel_name': old_prop,
                        'rel_val': props_dict[old_prop],
                        'link_score': linked_ent['score'],
                        'rel_score': score,
                        'rel_source': 'match'
                    })
        if not res:
            return []
        res.sort(key=lambda hit: hit['rel_score'], reverse=True)
        # If the top two properties both score very high, return two answers.
        sel_num = 2 if len(res) > 1 and res[1]['rel_score'] > 0.91 else 1
        return res[:sel_num]
Exemplo n.º 8
0
    def testFuzzy(self):
        """Print every ``fuzz`` scorer's result for a short/long name pair.

        Each scorer is called twice, first as (short, long) and then as
        (long, short), so that any asymmetry between the argument orders is
        visible. Nothing is asserted; the output is for manual inspection.
        """
        short_name = 'MISSION HOSPITAL'
        long_name = 'MISSION HOSPITAL REGIONAL MEDICAL CENTER'
        scorer_names = (
            'ratio',
            'partial_ratio',
            'token_sort_ratio',
            'partial_token_sort_ratio',
            'token_set_ratio',
            'partial_token_set_ratio',
            'QRatio',
            'UQRatio',
            'WRatio',
            'UWRatio',
        )
        for scorer_name in scorer_names:
            scorer = getattr(fuzz, scorer_name)
            print(scorer_name, scorer(short_name, long_name))
            print(scorer_name, scorer(long_name, short_name))
Exemplo n.º 9
0
        ele1 = data_line[0].split('||')
        ele2 = data_line[1].split('||')
        kod1 = ele1[0]
        firm1 = ele1[1]
        kod2 = ele2[0]
        firm2 = ele2[1]
        # score = fuzz.token_set_ratio(firm1, firm2)

        score_r = fuzz.ratio(firm1, firm2)
        score_pr = fuzz.partial_ratio(firm1, firm2)
        score_tsor = fuzz.token_sort_ratio(firm1, firm2)
        score_tser = fuzz.token_set_ratio(firm1, firm2)
        score_ptsor = fuzz.partial_token_sort_ratio(firm1, firm2)
        score_ptser = fuzz.partial_token_set_ratio(firm1, firm2)
        score_qr = fuzz.QRatio(firm1, firm2)
        score_uqr = fuzz.UQRatio(firm1, firm2)
        score_wr = fuzz.WRatio(firm1, firm2)
        score_uwr = fuzz.UWRatio(firm1, firm2)

        # print('kod1:' + kod1)
        # print('firm1:' + firm1)
        # print('kod2:' + kod2)
        # print('firm2:' + firm2)
        # print('score:' + str(score))

        # if score_r > 90 or score_pr > 90 or score_tsor > 90 or score_tser > 90 or score_ptsor > 90 or score_ptser > 90 \
        #         or score_qr > 90 or score_uqr > 90 or score_wr > 90 or score_uwr > 90:

        if score_tser > 90:
            temp3 = (
                kod1, firm1, kod2, firm2, score_r, score_pr, score_tsor, score_tser, score_ptsor, score_ptser, score_qr,