Example #1
def dataset_test():
    '''
    Use the entities and attributes from the training QA pairs to query the knowledge base directly,
    in order to measure the upper bound of QA accuracy.
    :return:
    '''
    with open(file) as f:
        total = 0
        recall = 0
        correct = 0

        for line in f:
            question, entity, attribute, answer, ner = line.split("\t")
            ner = ner.replace("#", "").replace("[UNK]", "%")
            # case1: entity and attribute Exact Match
            sql_e1_a1 = "select * from nlpccQA where entity='" + \
                entity+"' and attribute='"+attribute+"' limit 10"
            result_e1_a1 = upload_data(sql_e1_a1)

            # case2: entity Fuzzy Match and attribute Exact Match
            sql_e0_a1 = "select * from nlpccQA where entity like '%" + \
                entity + "%' and attribute='" + attribute + "' limit 10"
            #result_e0_a1 = upload_data(sql_e0_a1, True)

            # case3: entity Exact Match and attribute Fuzzy Match
            sql_e1_a0 = "select * from nlpccQA where entity like '" + \
                entity + "' and attribute='%" + attribute + "%' limit 10"
            #result_e1_a0 = upload_data(sql_e1_a0)

            if len(result_e1_a1) > 0:
                recall += 1
                for l in result_e1_a1:
                    if l[2] == answer:
                        correct += 1
            else:
                result_e0_a1 = upload_data(sql_e0_a1)
                if len(result_e0_a1) > 0:
                    recall += 1
                    for l in result_e0_a1:
                        if l[2] == answer:
                            correct += 1
                else:
                    result_e1_a0 = upload_data(sql_e1_a0)
                    if len(result_e1_a0) > 0:
                        recall += 1
                        for l in result_e1_a0:
                            if l[2] == answer:
                                correct += 1
                    else:
                        loginfo.logger.info(sql_e1_a0)
            if total > 100:
                break
            total += 1
            time.sleep(1)
            loginfo.logger.info(
                "total: {}, recall: {}, correct: {}, accuracy: {}%".format(
                    total, recall, correct,
                    correct * 100.0 / recall if recall else 0.0))
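The three queries above are assembled by string concatenation, which breaks as soon as an entity or attribute contains a quote character and is open to SQL injection. Below is a minimal sketch of the same three-tier lookup using parameter binding; it assumes direct access to the nlpccQA table through Python's sqlite3 module, whereas the actual backend behind upload_data is not shown in these examples.

import sqlite3

def lookup_triples(conn, entity, attribute):
    # Hypothetical helper: three-tier fallback as in dataset_test():
    # exact/exact, fuzzy entity + exact attribute, exact entity + fuzzy attribute.
    queries = [
        ("select * from nlpccQA where entity=? and attribute=? limit 10",
         (entity, attribute)),
        ("select * from nlpccQA where entity like ? and attribute=? limit 10",
         ("%" + entity + "%", attribute)),
        ("select * from nlpccQA where entity=? and attribute like ? limit 10",
         (entity, "%" + attribute + "%")),
    ]
    for sql, params in queries:
        rows = conn.execute(sql, params).fetchall()
        if rows:
            return rows
    return []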
Example #2
def kb_fuzzy_classify_test():
    '''
    Run the QA test:
    1. Entity retrieval: run NER on the input question to get the entity set, then retrieve from the
       database all triples related to those entities.
    2. Attribute mapping -- BERT classification / text similarity
        + Non-semantic matching: if a retrieved triple's relation (attribute) is a substring of the
          input question, compare that triple's answer with the gold answer; if they match, correct += 1.
        + Semantic matching: use BERT to score the similarity between the input question and each
          retrieved triple's relation (attribute), take the answer of the most similar triple, and
          compare it with the gold answer; if they match, correct += 1.
    3. Answer combination
    :return:
    '''
    with open(file, encoding='utf-8') as f:
        total = 0
        recall = 0
        correct = 0
        ambiguity = 0  # attribute matched correctly but the answer is wrong

        for line in f:
            try:
                total += 1
                question, entity, attribute, answer, ner = line.split("\t")
                ner = ner.replace("#", "").replace("[UNK]",
                                                   "%").replace("\n", "")
                # case: entity Fuzzy Match
                # retrieve all triples whose entity contains the recognized entity string
                sql_e0_a1 = "select * from nlpccQA where entity like '%" + \
                    ner + "%' order by length(entity) asc limit 20"
                # the SQL query returns a tuple; convert it to a list to avoid errors later
                result_e0_a1 = list(upload_data(sql_e0_a1))

                if len(result_e0_a1) > 0:
                    recall += 1

                    flag_fuzzy = True
                    # non-semantic matching first, to speed things up
                    # l[0]: entity
                    # l[1]: attribute
                    # l[2]: answer
                    flag_ambiguity = True
                    for l in result_e0_a1:
                        if (l[1] in question or l[1].lower() in question
                                or l[1].upper() in question):
                            flag_fuzzy = False

                            if estimate_answer(l[2], answer):
                                correct += 1
                                flag_ambiguity = False
                            else:
                                loginfo.logger.info("\t".join(l))

                    # non-semantic matching succeeded; move on to the next question
                    if not flag_fuzzy:

                        if flag_ambiguity:
                            ambiguity += 1

                        time.sleep(1)
                        loginfo.logger.info(
                            "total: {}, recall: {}, correct:{}, accuracy: {}%, ambiguity:{}"
                            .format(total, recall, correct,
                                    correct * 100.0 / recall, ambiguity))
                        continue

                    # semantic matching
                    result_df = pd.DataFrame(
                        result_e0_a1, columns=['entity', 'attribute', 'value'])
                    # loginfo.logger.info(result_df.head(100))

                    attribute_candicate_sim = [
                        (k, bs.predict(question, k)[0][1])
                        for k in result_df['attribute'].tolist()
                    ]
                    attribute_candicate_sort = sorted(
                        attribute_candicate_sim,
                        key=lambda candicate: candicate[1],
                        reverse=True)
                    loginfo.logger.info("\n".join([
                        str(k) + " " + str(v)
                        for (k, v) in attribute_candicate_sort
                    ]))

                    answer_candicate_df = result_df[
                        result_df["attribute"] == attribute_candicate_sort[0][0]]
                    for row in answer_candicate_df.index:
                        if estimate_answer(
                                answer_candicate_df.loc[row, "value"], answer):
                            correct += 1
                        else:
                            loginfo.logger.info("\t".join(
                                answer_candicate_df.loc[row].tolist()))
                time.sleep(1)
                loginfo.logger.info(
                    "total: {}, recall: {}, correct: {}, accuracy: {}%, ambiguity: {}"
                    .format(total, recall, correct,
                            correct * 100.0 / recall if recall else 0.0,
                            ambiguity))
            except Exception as e:
                loginfo.logger.info("the question id % d occur error %s" %
                                    (total, repr(e)))
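The semantic branch above scores every candidate attribute against the question with BertSim and keeps the best-scoring one. A minimal sketch that factors out that ranking step, assuming, as in the code above, that bs.predict(text_a, text_b)[0][1] is the predicted "similar" probability:

def rank_attributes(bs, question, attributes):
    # Score each distinct candidate attribute against the question and
    # return (attribute, score) pairs, highest similarity first.
    scored = [(attr, bs.predict(question, attr)[0][1]) for attr in set(attributes)]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# e.g. best_attribute = rank_attributes(bs, question, result_df['attribute'].tolist())[0][0]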
Example #3
def kbqa_api(sentence):
    """
    Do online prediction, one instance at a time (switch to batching if needed).

    :param sentence: the input question string
    :return: the answer value of the best-matching triple, or None if no entity or answer is found
    """
    def convert(line):
        feature = convert_single_example(0, line, label_list, FLAGS.max_seq_length, tokenizer, 'p')
        input_ids = np.reshape([feature.input_ids],(batch_size, FLAGS.max_seq_length))
        input_mask = np.reshape([feature.input_mask],(batch_size, FLAGS.max_seq_length))
        segment_ids = np.reshape([feature.segment_ids],(batch_size, FLAGS.max_seq_length))
        label_ids =np.reshape([feature.label_ids],(batch_size, FLAGS.max_seq_length))
        return input_ids, input_mask, segment_ids, label_ids

    global graph
    with graph.as_default():
        print(id2label)
        sentence = str(sentence)
        start = datetime.now()
        if len(sentence) < 2:
            print(sentence)
            return None
        sentence = tokenizer.tokenize(sentence)
        # print('your input is:{}'.format(sentence))
        input_ids, input_mask, segment_ids, label_ids = convert(sentence)

        feed_dict = {input_ids_p: input_ids,
                     input_mask_p: input_mask,
                     segment_ids_p:segment_ids,
                     label_ids_p:label_ids}
        # run session get current feed_dict result
        pred_ids_result = sess.run([pred_ids], feed_dict)
        pred_label_result = convert_id_to_label(pred_ids_result, id2label)
        print(pred_label_result)
        # TODO: combination strategy for the predicted labels
        result = strage_combined_link_org_loc(sentence, pred_label_result[0], True)
        print('recognized entity: {}'.format(''.join(result)))
        #print('Time used: {} sec'.format((datetime.now() - start).seconds))
        ner = ''.join(result)
        ner = ner.replace("#", "").replace("[UNK]", "%").replace("\n", "")
        if len(ner) == 0:
            print('cannot recognize an entity in this question')
            return None
        sql_e0_a1 = "select * from nlpccQA where entity like '%" + ner + "%' order by length(entity) asc limit 20"
        result_e0_a1 = list(upload_data(sql_e0_a1))
        if len(result_e0_a1) == 0:
            print('cannot find this entity in the KB')
        else:
            result_df = pd.DataFrame(result_e0_a1, columns=['entity', 'attribute', 'value'])
            attribute_candicate_sim = [(k, bs.predict(sentence, k)[0][1]) for k in result_df['attribute'].tolist()]
            attribute_candicate_sort = sorted(attribute_candicate_sim, key=lambda candicate: candicate[1], reverse=True)
            print('\nrelated entities in the KB: ', result_df)
            print('\nattribute similarity ranking:')
            print("\n".join([str(k)+" "+str(v) for (k, v) in attribute_candicate_sort]))
            print('\nquestion:', sentence)
            answer_candicate_df = result_df[result_df["attribute"] == attribute_candicate_sort[0][0]]
            for row in answer_candicate_df.index:
                print('\nrecognized entity: ', ner, 'most similar relation:', attribute_candicate_sort[0][0], 'answer:', answer_candicate_df.loc[row, 'value'])
                return answer_candicate_df.loc[row, 'value']
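kbqa_api above returns the answer value of the top-ranked triple, or None when the question is too short, no entity is recognized, or nothing is found in the knowledge base. A hedged usage sketch (the sample question is only illustrative):

answer = kbqa_api("《三体》的作者是谁?")
print(answer if answer is not None else "no answer found")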
Example #4
def kbqa_api(str_input, str_output):
    """
    Do online prediction, one instance at a time (switch to batching if needed).

    :param str_input: a JSON string containing a 'question' field
    :param str_output: placeholder; overwritten with the JSON response and returned
    :return: a JSON string with 'status' and, on success, 'entity', 'answer', 'attribute', 'kb', 'time'
    """
    dict_output = {'status': 'yes'}
    dict_input = json.loads(str_input)
    print(dict_input, type(dict_input))
    if 'question' in dict_input:
        dict_output['question'] = dict_input['question']
        sentence = dict_input['question']
    else:
        dict_output['status'] = 'no "question" key in the input JSON'
        str_output = json.dumps(dict_output)
        return str_output

    def convert(line):
        feature = convert_single_example(0, line, label_list,
                                         FLAGS.max_seq_length, tokenizer, 'p')
        input_ids = np.reshape([feature.input_ids],
                               (batch_size, FLAGS.max_seq_length))
        input_mask = np.reshape([feature.input_mask],
                                (batch_size, FLAGS.max_seq_length))
        segment_ids = np.reshape([feature.segment_ids],
                                 (batch_size, FLAGS.max_seq_length))
        label_ids = np.reshape([feature.label_ids],
                               (batch_size, FLAGS.max_seq_length))
        return input_ids, input_mask, segment_ids, label_ids

    global graph
    with graph.as_default():
        print(id2label)
        sentence = str(sentence)
        start = datetime.now()
        if len(sentence) < 2:
            print(sentence)
            dict_output['status'] = 'question value is too short'
            str_output = json.dumps(dict_output)
            return str_output
        sentence = tokenizer.tokenize(sentence)
        print('your input is:{}'.format(sentence))
        input_ids, input_mask, segment_ids, label_ids = convert(sentence)

        feed_dict = {
            input_ids_p: input_ids,
            input_mask_p: input_mask,
            segment_ids_p: segment_ids,
            label_ids_p: label_ids
        }
        pred_ids_result = sess.run([pred_ids], feed_dict)
        pred_label_result = convert_id_to_label(pred_ids_result, id2label)
        print(pred_label_result)
        # TODO: combination strategy for the predicted labels
        result = strage_combined_link_org_loc(sentence, pred_label_result[0],
                                              True)
        print('recognized entity: {}'.format(''.join(result)))
        ner = ''.join(result)
        ner = ner.replace("#", "").replace("[UNK]", "%").replace("\n", "")
        if len(ner) == 0:
            print('cannot recognize an entity in this question')
            dict_output['status'] = 'cannot recognize entity'
            str_output = json.dumps(dict_output)
            return str_output
        else:
            dict_output['entity'] = ner
        sql_e0_a1 = "select * from nlpccQA where entity like '%" + ner + "%' order by length(entity) asc limit 20"
        result_e0_a1 = list(upload_data(sql_e0_a1))
        #print('Time used: {} sec'.format((datetime.now() - start).seconds))
        dict_output['time'] = (datetime.now() - start).microseconds / 1000.0
        if len(result_e0_a1) == 0:
            print('cannot find this entity in the KB')
            dict_output['status'] = 'cannot find this entity in kb'
            str_output = json.dumps(dict_output)
            return str_output
        else:
            result_df = pd.DataFrame(result_e0_a1,
                                     columns=['entity', 'attribute', 'value'])
            list_df = []
            for i in range(0, len(result_df)):
                list_df.append([
                    result_df.iloc[i]['entity'],
                    result_df.iloc[i]['attribute'], result_df.iloc[i]['value']
                ])
            attribute_candicate_sim = [
                (k, bs.predict(sentence, k)[0][1])
                for k in result_df['attribute'].tolist()
            ]
            attribute_candicate_sort = sorted(
                attribute_candicate_sim,
                key=lambda candicate: candicate[1],
                reverse=True)
            print('\nrelated entities in the KB: ', result_df)
            print('\nattribute similarity ranking:')
            print("\n".join([
                str(k) + " " + str(v) for (k, v) in attribute_candicate_sort
            ]))
            print('\nquestion:', sentence)
            answer_candicate_df = result_df[result_df["attribute"] ==
                                            attribute_candicate_sort[0][0]]
            print(answer_candicate_df.index)
            for row in answer_candicate_df.index:
                print(row)
                print('\nrecognized entity: ', ner, 'most similar relation:',
                      attribute_candicate_sort[0][0], 'answer:',
                      answer_candicate_df.loc[row, 'value'])
                dict_output['status'] = 'ok'
                dict_output['answer'] = answer_candicate_df.loc[row, 'value']
                dict_output['kb'] = list_df
                dict_output['attribute'] = "\n".join([
                    str(k) + " " + str(v)
                    for (k, v) in attribute_candicate_sort
                ])
                str_output = json.dumps(dict_output)
                return str_output
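This variant wraps the same pipeline in a JSON request/response contract: the input must be a JSON string with a 'question' field, and the response JSON carries 'status' plus, on success, 'entity', 'answer', 'attribute', 'kb' and 'time'. A hedged usage sketch (the sample question is illustrative; the second argument is only a placeholder that the function overwrites):

import json

request = json.dumps({"question": "《三体》的作者是谁?"}, ensure_ascii=False)
response = json.loads(kbqa_api(request, ""))
if response["status"] == "ok":
    print(response["entity"], "->", response["answer"])
else:
    print("lookup failed:", response["status"])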
Example #5
from run_similarity import BertSim
import tensorflow as tf
from datetime import datetime  # needed for the timing call below
from global_config import Logger

loginfo = Logger("recommend_articles.log", "info")
bs = BertSim()
bs.set_mode(tf.estimator.ModeKeys.PREDICT)

while True:
    choice = {}
    question = input("question:")
    start1 = datetime.now()
    ner = predict_service(question)
    print("识别出的实体:{}".format(ner))
    sql_e1 = "select * from nlpccQA where entity ='" + ner + "' order by length(entity) asc "
    result_e1 = list(upload_data(sql_e1))
    print("从数据库中精确找到实体{}个".format(len(result_e1)))
    result = result_e1
    if len(result_e1) == 0:
        print("精确查找没有查找到实体,采用模糊查找")
        sql_e0 = "select * from nlpccQA where entity like '%" + ner + "%' order by length(entity) asc "
        result_e0 = list(upload_data(sql_e0))
        print(result_e0)
        if len(result_e0) == 0:
            print("这个问题我也不知道呀~~")
            continue
        k = 1
        entity_candidate = [result_e0[0][0], 0]
        # [entity, start, end]
        flag = 0
        for i in range(1, len(result_e0)):