Пример #1
0
def entity_error_check():
    ht0 = HarvestText()
    typed_words = {"人名":["武磊"]}
    ht0.add_typed_words(typed_words)
    sent1 = "武磊和吴力只差一个拼音"
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_recheck=True))
    sent2 = "武磊和吴磊只差一个字"
    print(sent2)
    print(ht0.entity_linking(sent2, char_recheck=True))
    sent3 = "吴磊和吴力都可能是武磊的代称"
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_recheck=True, char_recheck=True))
Пример #2
0
def filter_el_with_rule():
    # 当候选实体集很大的时候,实体链接得到的指称重可能有很多噪声,可以利用一些规则进行筛选
    # 1. 词性:全部由动词v,形容词a, 副词d, 连词c,介词p等组成的,一般不是传统意义上会关心的实体
    # 2. 词长:指称长度只有1的,一般信息不足
    # 由于这些规则可以高度定制化,所以不直接写入库中,而在外部定义。这段代码提供一个示例:
    def el_filtering(entities_info, ch_pos):
        return [([l, r], (entity0, type0))
                for [l, r], (entity0, type0) in entities_info if not all(
                    bool(re.search("^(v|a|d|c|p|y|z)", pos))
                    for pos in ch_pos[l:r]) and (r - l) > 1]

    ht0 = HarvestText()
    text = "《记得》:谁还记得 是谁先说 永远的爱我"
    entity_mention_dict = {
        '记得(歌曲)': ['记得', '《记得》'],
        "我(张国荣演唱歌曲)": ['我', '《我》']
    }
    entity_type_dict = {'记得(歌曲)': '歌名', '我(张国荣演唱歌曲)': '歌名'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    entities_info, ch_pos = ht0.entity_linking(
        text, with_ch_pos=True)  # 显式设定了with_ch_pos=True才有
    print("filter_el_with_rule")
    print("Sentence:", text)
    print("Original Entities:", entities_info)
    filtered_entities = el_filtering(entities_info, ch_pos)
    # 我 因为词长被过滤,而 记得 因为是纯动词而被过滤,但是《记得》包括了标点,不会被过滤
    print("filtered_entities:", filtered_entities)
Пример #3
0
def test_entity_error_check():
    sys.stdout, expected = open(get_current_function_name()+"_current","w"), open(get_current_function_name()+"_expected").read()
    ht0 = HarvestText()
    typed_words = {"人名":["武磊"]}
    ht0.add_typed_words(typed_words)
    sent1 = "武磊和吴力只差一个拼音"
    print(sent1)
    print(ht0.entity_linking(sent1, pinyin_recheck=True))
    sent2 = "武磊和吴磊只差一个字"
    print(sent2)
    print(ht0.entity_linking(sent2, char_recheck=True))
    sent3 = "吴磊和吴力都可能是武磊的代称"
    print(sent3)
    print(ht0.get_linking_mention_candidates(sent3, pinyin_recheck=True, char_recheck=True))

    sys.stdout.close()
    assert open(get_current_function_name() + "_current").read() == expected
Пример #4
0
def el_keep_all():
    ht0 = HarvestText()
    entity_mention_dict = {'李娜1': ['李娜'], "李娜2": ['李娜']}
    entity_type_dict = {'李娜1': '运动员', '李娜2': '歌手'}
    ht0.add_entities(entity_mention_dict, entity_type_dict)
    print(ht0.entity_linking("打球的李娜和唱歌的李娜不是一个人", keep_all=True))