Example #1
    def semantic_role_label(self):
        # dependency parsing
        parser = Parser()
        parser.load('ltp_data/parser.model')
        arcs = parser.parse(self.words, self.postags)
        parser.release()

        labeller = SementicRoleLabeller()
        labeller.load('ltp_data/srl')
        roles = labeller.label(self.words, self.postags, self.netags, arcs)

        Label_AX = []  # list of arguments labelled A0 or A1
        for role in roles:
            Label_AX.extend([
                arg for arg in role.arguments
                if arg.name == "A0" or arg.name == "A1"
            ])
        for label in Label_AX:
            # skip A0/A1 agents or patients whose span length looks abnormal
            if label.range.end - label.range.start > 0 and label.range.end - label.range.start < 10:
                for i in range(label.range.start, label.range.end + 1):
                    # keep common nouns, person names and place names from the
                    # agent/patient span as entities
                    if self.postags[i] in ("n", "ns", "nh", "ni"):
                        self.entity.append(self.words[i])
        labeller.release()
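This snippet is a method fragment: it assumes `self.words`, `self.postags`, `self.netags` and `self.entity` were prepared earlier in the pipeline. A minimal sketch of that surrounding setup (the class name and the `ltp_data/` paths are assumptions):

from pyltp import Segmentor, Postagger, NamedEntityRecognizer

class EntityExtractor:
    def __init__(self, text):
        segmentor = Segmentor()
        segmentor.load('ltp_data/cws.model')            # assumed path
        self.words = list(segmentor.segment(text))      # tokens
        segmentor.release()

        postagger = Postagger()
        postagger.load('ltp_data/pos.model')            # assumed path
        self.postags = list(postagger.postag(self.words))
        postagger.release()

        recognizer = NamedEntityRecognizer()
        recognizer.load('ltp_data/ner.model')           # assumed path
        self.netags = list(recognizer.recognize(self.words, self.postags))
        recognizer.release()

        self.entity = []  # filled by semantic_role_label()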
Example #2
class LTP_word():
    """docstring for parser_word
    deal处理文本,返回词表、词性及依存关系,语义,命名实体五个值
    release释放缓存"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # word segmentation instance
        self.segmentor.load_with_lexicon(path.join(self.model_path, 'cws.model'), path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging instance
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load model
        self.recognizer = NamedEntityRecognizer()  # named entity recognition instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing instance
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labelling instance
        self.labeller.load(path.join(self.model_path, 'srl'))
    def deal(self, text):  # run the whole pipeline and return everything we need
        words = self.segmentor.segment(text)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labelling
        return words, postags, arcs, roles, netags
    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
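A hypothetical usage of the class above; the model directory name and the sentence are assumptions:

ltp = LTP_word('ltp_data')   # assumed directory containing cws.model, pos.model, ner.model, parser.model, srl and dictionary_kfc.txt
words, postags, arcs, roles, netags = ltp.deal('李克强总理今天来我家了。')
for role in roles:
    print(role.index, [(arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
ltp.release()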
Example #3
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # customized segmentation; POS tags are adjusted in post-processing
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

        self.sentenceSplitter = SentenceSplitter()
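A short sketch of how the singleton behaves, assuming the model files exist under ./ltp_data: repeated construction returns the same instance, so the models are loaded only once.

loader_a = ModelLoader()
loader_b = ModelLoader()
assert loader_a is loader_b                      # same object, models loaded once
words = loader_a.segmentor.segment('他叫汤姆去拿外衣。')
print(list(words))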
Example #4
    def role_label(self, words, postags, arcs):
        """
        语义角色标注
        :param words:
        :param postags:
        :param arcs:
        :return:
        """
        srl_model = os.path.join(self.MODEL_PATH, 'pisrl_win.model')

        labeller = SementicRoleLabeller()  # create instance
        labeller.load(srl_model)  # load model

        roles = labeller.label(words, postags, arcs)  # semantic role labelling

        for role in roles:
            print(
                role.index, "".join([
                    "{0}:({1},{2})".format(arg.name, arg.range.start,
                                           arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()

        return "roles{}".format(roles)
Example #5
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """
        资源释放
        """
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.labeller.release()
Example #6
def role(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    # labeller.load('/usr/local/src/ltp_data/srl')  # load model
    labeller.load(srl_model_path)  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    """
    #arg.name is the semantic role label
    #arg.range.start is the index of the first word of the argument
    #arg.range.end is the index of the last word
    roletype = {'C-A0':'施事','A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有','DIS': '转折'}
    postype = {'A0':'施事','A1':'受事','A2':'间接对象','A3':'直接目标','A4':'直接方法','A5':'其它','ADV':'附词','BNE':'受益人'
        , 'CND': '条件','DIR':'方向','DGR':'程度','EXT':'扩展','FRQ':'频率','LOC':'地点','MNR':'方式','PRP':'目的或原因'
        , 'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有'}
    for role in roles:
        #print role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

        outstr = ""
        for arg in role.arguments:
            block = ''

            for num in range(arg.range.start, arg.range.end+1):
                block = block + words[num]+'[%d-%s]'%(num,postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print '%d-%s'%(role.index,words[role.index])+ ":"+outstr
    """
    labeller.release()  # release the model
    return roles
Example #7
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    '''Semantic role labelling'''
    def format_labelrole(self, words, postags):
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {arg.name:[arg.name,arg.range.start, arg.range.end] for arg in role.arguments}
        return roles_dict

    '''Dependency parsing: for each word in the sentence keep a dict of its dependent (child) nodes'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:   # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # head index of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word ('Root' for the sentence root)
        for i in range(len(words)):
            # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i]-1, postags[rely_id[i]-1]]
            format_parse_list.append(a)

        return child_dict_list, format_parse_list

    '''Main entry point of the parser'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
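A hypothetical call to parser_main, assuming the LTP 3.4.0 models are available under ./ltp_data_v3.4.0; the sample sentence mirrors the inline comment above.

ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main('李克强总理今天来我家了。')
print(words)                  # ['李克强', '总理', ...]
print(roles_dict)             # {predicate_index: {'A0': ['A0', start, end], ...}}
print(format_parse_list[0])   # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']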
Example #8
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(srl_model_path)  # load model
    roles = labeller.label(words, postags, arcs)  # semantic role labelling (netags is not used here)
    #for role in roles:
    #   print (role.index, "".join(   ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
Example #9
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print (role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Example #10
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
Example #11
def yuyijuese(words, postags, netags, arcs):
    """语义角色标注  """
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "srl/"))
    roles = labeller.label(words, postags, netags, arcs)

    for role in roles:
        print (role.index, "".join(
                ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
Example #12
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    # path of the semantic role labelling model; for the old `srl` model this is a directory, not a single file
    # on Windows, use the pisrl_win.model model
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(srl_model_path)  # load model
    # arcs is the result of dependency parsing
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    labeller.release()  # release the model
    return roles
Example #13
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #14
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load('../ltp_data/srl')  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    # print '----------------'
    # for role in roles:
    #     print role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
    # print '----------------'
    labeller.release()  # release the model
    return roles
Example #15
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(os.path.join(LTP_DATA_DIR, 'srl'))  # load model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #16
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # create instance
    # segmentor.load(cws_model_path)  # load model
    segmentor.load_with_lexicon(cws_model_path,
                                'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # segmentation
    k = 1
    for word in words:
        print(word + str(k) + '  ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    segmentor.release()  # release the model
    wordslist = list(words)

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path,
                                'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()

    # wordslist = ['人力资源社会保障局','主管','医疗保险','工作']
    # postags = ['n','v','n','v']

    from pyltp import Parser
    parser = Parser()  # create instance
    parser.load(par_model_path)  # load model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(srl_model_path)  # load model
    # arcs is the result of dependency parsing
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labelling

    # print the results
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #17
 def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
     roles_list = list()
     # path of the semantic role labelling model; the file is named 'pisrl.model'
     srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
     labeller = SementicRoleLabeller()
     labeller.load(srl_model_path)
     roles = labeller.label(words_list, postags_list, arcs_list)
     labeller.release()
     # try to free the memory
     # import gc
     # del labeller
     # gc.collect()
     # never mind, that does not work
     roles_list = list(roles)
     return roles_list
Example #18
    def get_role_list(self, words, postags):
        parser = Parser()
        parser.load(Dependency.par_model)

        rolelabel = SementicRoleLabeller()
        rolelabel.load(Dependency.pisrl_model)
        try:
            parsers = parser.parse(words, postags)
            roles = rolelabel.label(words, postags, parsers)
        except Exception as e:
            roles = [[]]
        finally:
            parser.release()
            rolelabel.release()
            return roles
Example #19
def srl(words, postags, arcs):
    global labeller
    if labeller is None:
        srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # path of the SRL model directory `srl`; note this path is a directory, not a file
        labeller = SementicRoleLabeller()  # create instance
        labeller.load(srl_model_path)  # load model

    # arcs is the result of dependency parsing
    roles = labeller.label(words, postags, arcs)  # semantic role labelling

    # collect the results
    role_list = []
    for role in roles:
        for arg in role.arguments:
            args = (role.index, arg.name, arg.range.start, arg.range.end)
            role_list.append(args)
    return role_list
Example #20
def get_srl(sentence):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(srl_model_path)  # load model
    words = list(pyltp_cut(sentence))  # pyltp segmentation
    postags = list(postagger.postag(words))  # POS tagging
    arcs = get_parsing(sentence)
    # arcs is the result of dependency parsing
    roles = labeller.label(words, postags, arcs)  # semantic role labelling

    # print the results
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()  # release the model
Example #21
    def get_srl(self, words):
        # semantic role labelling
        labeller = SementicRoleLabeller()  # create instance
        labeller.load(self.srl_model_path)  # load model
        # arcs is the result of dependency parsing
        postags = self.get_postags(words)
        arcs = self.get_dependency(words)
        roles = labeller.label(words, postags, arcs)  # semantic role labelling

        # print the results
        for role in roles:
            print(
                role.index, "".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in role.arguments
                ]))
        labeller.release()  # release the model
        return roles
Example #22
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc: semantic role labelling
    Args: word_tag (dict)   word -> POS tag mapping
          arcs              dependency arcs
          srl_model_path    path of the SRL model
    '''

    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(list(word_tag.keys()), list(word_tag.values()),
                           arcs)
    for role in roles:
        print(
            role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()
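A hypothetical call chain for the labeller() helper above; the model directory, file names and the sentence are assumptions, and note that the word-to-POS dict collapses repeated words into a single key.

import os
from pyltp import Segmentor, Postagger, Parser

LTP_DATA_DIR = 'ltp_data_v3.4.0'                     # assumed model directory
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
postagger = Postagger()
postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
parser = Parser()
parser.load(os.path.join(LTP_DATA_DIR, 'parser.model'))

words = list(segmentor.segment('欧洲东部的罗马尼亚发生了地震。'))
postags = list(postagger.postag(words))
arcs = parser.parse(words, postags)
word_tag = dict(zip(words, postags))                 # word -> POS mapping expected by labeller()
labeller(word_tag, arcs, os.path.join(LTP_DATA_DIR, 'pisrl.model'))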
Example #23
def sentence_label(parse_result):
    labeller = SementicRoleLabeller()  # create instance
    labeller.load(srl_model_path)  # load model
    i = 0
    final_result = []

    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            print('休息一下')
            time.sleep(5)
        words = value[0]
        postags = value[1]
        arcs = value[2]
        roles = labeller.label(words, postags, arcs)
        final_result.append(roles)  # collect the roles of each parsed sentence

    print('done')
    print(final_result)
    labeller.release()
Example #24
class LtpParser(object):
    def __init__(self, data_dir: str):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {}
        for role in roles:
            srlabels[role.index] = {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end
                }
                for arg in role.arguments
            }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
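A hypothetical usage of the class above; data_dir must point at a directory containing the LTP 3.4.0 model files.

ltp = LtpParser("./ltp_data_v3.4.0")
result = ltp.parse("李克强总理今天来我家了。")
print(result["tokens"])
print(result["srlabels"])     # {predicate_index: {'A0': {'start': ..., 'end': ...}, ...}}
ltp.release()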
Example #25
def ltp(t_str):
    segmentor = Segmentor()
    segmentor.load('cws.model')
    postagger = Postagger()  # create instance
    postagger.load('pos.model')  # load model
    recognizer = NamedEntityRecognizer()  # create instance
    recognizer.load('ner.model')  # load model
    labeller = SementicRoleLabeller()  # create instance
    labeller.load('pisrl.model')  # load model
    parser = Parser()
    parser.load('parser.model')

    cut_line = '\t'.join(segmentor.segment(t_str))
    words_list = cut_line.split('\t')  # segmentation

    postags = postagger.postag(words_list)  # POS tagging
    pos_line = '\t'.join(postags)
    pos_list = pos_line.split('\t')

    netags = recognizer.recognize(words_list, pos_list)  # named entity recognition
    ner_line = '\t'.join(netags)
    ner_list = ner_line.split('\t')

    arcs = parser.parse(words_list, pos_list)  # dependency parsing
    arcs_line = "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
    arcs_list = arcs_line.split('\t')
    i = 0
    for word, arc in zip(words_list, arcs):
        i = i + 1
        print(
            str(i) + '/' + word + '/' + str(arc.head) + '/' +
            str(arc.relation))

    # roles = labeller.label(words_list, pos_list, arcs)  # semantic role labelling
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

    words_list = merge_base(words_list, pos_list, ner_list, arcs_list)
    print(words_list)
Example #26
class Ltp_parser:
    def __init__(self):
        self.segmentor = Segmentor()
        self.segmentor.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/cws.model')
        self.postagger = Postagger()
        self.postagger.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/pos.model')
        self.parser = Parser()
        self.parser.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/parser.model')
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/ner.model')
        self.labeller = SementicRoleLabeller()
        self.labeller.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/pisrl.model')

    '''Dependency parsing'''
    def get_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        # arcs = ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
        return arcs

    '''Named entity recognition'''
    def get_name_entity(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        netags = list(netags)
        return netags

    '''Release the LTP models'''
    def ltp_release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()

    '''Main LTP entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.get_parser(words, postags)
        netags = self.get_name_entity(words, postags)
        return words, postags, arcs, netags
Example #27
class LtpParser:
    def __init__(self):
        LTP_DIR = 'D:\LTP\MODEL\ltp_data'  # path of the LTP model directory
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))  # segmentation model, file name `cws.model`

        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))  # POS tagging model, file name `pos.model`

        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))  # dependency parsing model, file name `parser.model`

        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # named entity recognition model, file name `ner.model`

        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))  # semantic role labelling model; on Windows the file is `pisrl_win.model`
    def ner(self, words, postags):
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        #for word, ntag in zip(words, netags):
        #   print(word + '/' + ntag)
        return netags
Example #28
print "\t".join(words)

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported in 0.1.5
#postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
#labeller.load("/home/yjliu/ltp/model/srl/")
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print(role.index, "".join([
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ]))
Example #29
class AnsExtractor(object):

    # __init__ does three things:
    # 1. load the models
    # 2. load the synonym thesaurus (Tongyici Cilin)
    # 3. load the rule lists that supplement question classification
    # the remaining parameters are passed to the main routine do_ans_extract
    def __init__(self):
        self.segmentor = Segmentor()  # create instance
        self.segmentor.load(cws_model_path)  # load model
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        self.recognizer = NamedEntityRecognizer()  # create instance
        self.recognizer.load(ner_model_path)  # load model
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(srl_model_path)
        # the rule lists below supplement the question classifier
        self.istime_lst = [
            '年份是', "时间是", "哪一年", "何时", "什么时候", "什么时间", "哪一月", "哪一日"
        ]
        self.iscolor_lst = ['什么颜色', "哪种颜色", "哪个颜色", "颜色是"]
        self.unit_lst = [
            "回", "对", "山", "只", "刀", "群", "江", "条", "个", "打", "尾", "手", "双",
            "张", "溪", "挑", "坡", "首", "令", "网", "辆", "座", "阵", "队", "顶", "匹",
            "担", "墙", "壳", "炮", "场", "扎", "棵", "支", "颗", "钟", "单", "曲", "客",
            "罗", "岭", "阙", "捆", "丘", "腔", "贯", "袭", "砣", "窠", "岁", "倍", "枚",
            "次"
        ]
        self.islocation_lst = [
            '哪个城市', "哪个国家", '国籍是', "什么国籍", "哪个省", "哪座城市", "县份是", "地址在哪里", "哪里",
            "何处", "何地", "哪儿", "什么地方", "什么地点"
        ]
        self.isorganization_lst = ['哪个组织', "组织是", "哪个机构", "什么组织", "什么机构"]
        self.isperson_lst = [
            '哪个皇帝', "是谁", "什么名字", "者是", "身份是", "学家是", "什么人", "哪个人"
        ]
        self.isnum_lst = list()
        for unit in self.unit_lst:
            self.isnum_lst.append("多少" + unit)
        self.stop_words = []  # stop words, not used yet
        self.sim_word_code = {}  # each word maps to a list of its thesaurus codes (possibly several)
        self.get_sim_cloud()

    # read the synonym thesaurus (Tongyici Cilin)
    def get_sim_cloud(self):
        """
        同义词词林中的词有三种关系,同义、相关(不一定同义)、独立词
        如果用于计算相似度的话,相关的词语具有相同的code,也是能接受的
        所以并没有区分词关系,而是直接读取了词的code
        填充sim_word_code
        """
        sim_file = open("similarity.txt", 'r', encoding="utf-8")
        lines = sim_file.readlines(1000000)
        # 对行按格式进行处理
        for line in lines:
            code = line[0:7]
            the_type = line[7]
            words = line[9:]
            words = words.split(' ')
            # 解析过的一行,放进模型
            for word in words:
                if word in self.sim_word_code:
                    self.sim_word_code[word].append(code)
                else:
                    self.sim_word_code[word] = []
                    self.sim_word_code[word].append(code)
        sim_file.close()
        pass
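    # A thesaurus line is assumed to look like "Aa01A01= 人 士 人物 ..." (hypothetical sample):
    # a 7-character code, one relation marker character, a space, then the words.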

    def get_all_NER(self, ans_sentence, type):
        '''
        Collect all named entities of a given type from the answer sentence
        :param ans_sentence: answer sentence
        :param type: named entity type
        :return: list of all entities of that type
        '''
        words = self.segmentor.segment(ans_sentence)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entity recognition

        ner_lst = list()  # collected named entities
        temp_str = ''
        for i in range(len(netags)):
            if netags[i] == 'S-' + type:
                ner_lst.append(words[i])
            elif netags[i] == 'B-' + type:
                temp_str = words[i]
            elif netags[i] == 'I-' + type:
                temp_str += words[i]
            elif netags[i] == 'E-' + type:
                temp_str += words[i]
                ner_lst.append(temp_str)
        return ner_lst

    def get_pos_lst(self, sentence, type):
        '''
        Collect the words of a given POS type from the sentence
        :param sentence:
        :return: list of matched word groups
        '''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))

        temp_tag = ''
        postag_lst = list()
        for i in range(len(postags)):
            if postags[i] == type:
                temp_tag += words[i]
            else:
                if temp_tag != '':
                    postag_lst.append(temp_tag)
                    temp_tag = ''
        return postag_lst

    def get_context_type(self, ques):
        '''
        Decide whether the question asks for the previous or the next line
        :param ques: question text
        :return: '上文' (previous) or '下文' (next)
        '''
        next_word = ['下句', '下一句', '下文', '后文']
        for word in next_word:
            if ques.find(word) != -1:
                return '下文'
        return '上文'

    def get_parse_oneclass(self, sent):
        '''
        Get the second-layer nodes of the sentence's dependency tree
        :param sent:
        :return: the head word and a dict mapping each word to its dependency relation and POS tag
        '''
        words = list(self.segmentor.segment(sent))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        result_arc = dict()
        # 依存关系的下标 是从1开始的   0表示root
        i = 0
        for arc in arcs:
            if arc.head == 0:
                root = {
                    'word': words[i],
                    'rel': arc.relation,
                    'rel_index': i + 1,
                    'tag': postags[i]
                }
                head = words[i]
            i = i + 1
        i = 0
        for arc in arcs:
            if arc.head == root['rel_index']:
                result_arc[words[i]] = {'rel': arc.relation, 'tag': postags[i]}
            i = i + 1

        return head, result_arc

    def list_has_intersection(self, lsta, lstb):
        '''
        Check whether any word in lstb is a substring of some word in lsta
        :param lsta:
        :param lstb:
        :return: the matching word from lsta, or None
        '''
        for wa in lsta:
            for wb in lstb:
                if wa.find(wb) > -1:
                    return wa
        return None

    def get_arc_by_index(self, arcs, index):
        '''
        Get the arc at a given index
        :param index: index consistent with the dependency tree, i.e. starting from 1
        :return: arc
        '''
        i = 1
        for arc in arcs:
            if i == index:
                return arc
            i = i + 1

    def has_spe_words(self, text, lst):
        '''
        Check whether the text contains any word from the list
        :param text:
        :param lst:
        :return:
        '''
        for word in lst:
            if text.find(word) > -1:
                return True
        return False

    def get_core_rel(self, arcs, words, word):
        index = words.index(word) + 1
        arc = self.get_arc_by_index(arcs, index)
        while (arc.relation == 'ATT'):
            arc = self.get_arc_by_index(arcs, arc.head)
        arc = self.get_arc_by_index(arcs, arc.head)
        return arc

    def get_index_list(self, str, word):
        '''
        Get the list of positions where word occurs in str
        :param str:
        :param word:
        :return: a list of indices; empty if not found
        '''
        start = 0
        lst = list()
        while start < len(str):
            index = str.find(word, start)
            if index == -1:
                break
            else:
                lst.append(index)
                start = index + 1
        return lst

    def cal_dis_with_dict(self, ques_kw_dic, ans_kw_dic):
        '''
        Compute the distance between the question keyword positions and the answer keyword positions
        :param ques_kw_dic:
        :param ans_kw_dic:
        :return: list of (keyword, distance) tuples sorted by distance, ascending
        '''
        result_dic = dict()
        for ans_kw, ans_kw_index_lst in ans_kw_dic.items():
            temp = 99999
            for ans_kw_index in ans_kw_index_lst:
                ans_kw_dis = 0
                for ques_kw, ques_kw_index_lst in ques_kw_dic.items():
                    temp_dis = 9999
                    for ques_kw_index in ques_kw_index_lst:
                        if abs(ques_kw_index - ans_kw_index) < temp_dis:
                            temp_dis = abs(ques_kw_index - ans_kw_index)
                    ans_kw_dis += temp_dis
                if ans_kw_dis < temp:
                    temp = ans_kw_dis
            result_dic[ans_kw] = temp
        # 排序 从小到大
        result_tup = sorted(result_dic.items(), key=lambda item: item[1])
        return result_tup

    def calc_dis_ner_with_dict(self, ques_kw_dic, ner_lst, ans):
        '''
        For a set of entities, find those closest to the question keywords
        :param ques_kw_dic:
        :param ner_lst:
        :param ans:
        :return: list of (entity, distance) tuples
        '''
        ner_dic = dict()
        for ner in ner_lst:
            temp = self.get_index_list(ans, ner)
            if temp:
                ner_dic[ner] = temp
        return self.cal_dis_with_dict(ques_kw_dic, ner_dic)

    def get_final_result(self, result_lst):
        '''
        Wrap the final result; if there is no result, return "未找到准确答案" (no exact answer found)
        '''
        if len(result_lst) > 0:
            return result_lst[0][0]
        else:
            return "未找到准确答案"

    def gen_short_ans(self, ques_kw_lst, ans):
        '''
        Truncate answers longer than 20 characters
        :param ques_kw_lst:
        :param ans:
        :return: the (shortened) answer
        '''
        if len(ans) <= 20:
            return ans
        pattern = r"[,,。\.!!?\?]"
        lst = re.split(pattern, ans)
        result_dic = dict()
        for senten in lst:
            score = 0
            for kw in ques_kw_lst:
                if senten.find(kw) > -1:
                    score += 1
            result_dic[senten] = score
        result_dic = sorted(result_dic.items(),
                            key=lambda item: item[1],
                            reverse=True)
        if len(result_dic) > 0:
            return result_dic[0][0][0:20]
        else:
            return "未找到精确答案"

    def do_ans_extract(self, sents, key_words, ques_type, ques, a, b):
        '''
        Main routine of answer extraction; the return value is the answer
        :params: candidate answer sentences, keyword list (unused), question type, question text, the two algorithm parameters
        :return: the final answer
            Usually a single word; if a word cannot be returned,
            a sentence of at most 20 characters is returned instead.
            If neither a word nor a sentence is found,
                "未找到准确答案" (no exact answer) is returned.
            If no suitable candidate answer sentences were obtained,
                "没有找到相关内容" (no relevant content) is returned.
        '''
        self.sentences = sents  # candidate answer sentences
        self.key_words = key_words  # keyword list
        self.question_type = ques_type  # question type
        self.question = ques  # question text
        self.a = a
        self.b = b  # the two parameters of the syntactic similarity measure
        tfidf = analyse.extract_tags

        # 问题 关键词 位置
        ques_kw_lst = tfidf(ques)
        ques_kw_dic = dict()
        for kw in ques_kw_lst:
            lst = self.get_index_list(ques, kw)
            if lst:
                ques_kw_dic[kw] = lst
        # 补充的基于规则的问题分类
        if self.has_spe_words(self.question, self.isnum_lst):
            self.question_type = "NUMBER"
            ques_type = "NUMBER"
        elif self.has_spe_words(self.question, self.iscolor_lst):
            self.question_type = "COLOR"
            ques_type = "COLOR"
        elif self.has_spe_words(self.question, self.istime_lst):
            self.question_type = "TIME"
            ques_type = "TIME"
        elif self.has_spe_words(self.question, self.islocation_lst):
            self.question_type = "LOCATION"
            ques_type = "LOCATION"
        elif self.has_spe_words(self.question, self.isperson_lst):
            self.question_type = "PERSON"
            ques_type = "PERSON"
        elif self.has_spe_words(self.question, self.isorganization_lst):
            self.question_type = "ORGANIZATION"
            ques_type = "ORGANIZATION"

        # 去掉候选答案句中的空白字符
        for i in range(len(self.sentences)):
            self.sentences[i] = ''.join(self.sentences[i].split())
        # 首先得到最有可能包含答案的句子
        ans_sentences = self.sort_sentences()

        # 然后根据问题类型,在这五个句子中进行答案抽取
        if len(ans_sentences) == 0:
            return "没有找到相关内容"
        # 取最可能包含答案的句子,进入下一步
        ans = ans_sentences[0]

        # 基于问题分类器的分类和规则补充的分类,采取不同的抽取策略
        if self.question_type == "PERSON":
            final_anses = self.get_all_NER(ans, 'Nh')
            temp_lst = list()
            # 去除出现在问句中的人物实体
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                # 返回和问题关键词最接近的实体
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'LOCATION':
            final_anses = self.get_all_NER(ans, 'Ns')
            temp_lst = list()
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'ORGANIZATION':
            final_anses = self.get_all_NER(ans, 'Ni')
            temp_lst = list()
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'NUMBER':
            for sentence in ans_sentences:
                for num_word in self.isnum_lst:
                    if self.question.find(num_word) > -1:
                        pattern = re.compile(
                            "([\d|零|一|二|三|四|五|六|七|八|九|十|百|千|万|亿]+){unit}".
                            format(unit=num_word[-1]))
                        final_anses = pattern.findall(ans)
                        if final_anses:
                            return self.get_final_result(
                                self.calc_dis_ner_with_dict(
                                    ques_kw_dic, final_anses, ans))

                num_lst = self.get_pos_lst(sentence, 'm')
                if len(num_lst) == 0:
                    return self.gen_short_ans(ques_kw_lst, ans)
                else:
                    return self.get_final_result(
                        self.calc_dis_ner_with_dict(ques_kw_dic, num_lst, ans))
            return ans_sentences[0]  # 没有找到 返回排名最高的句子,注:这句话没用
        elif self.question_type == 'TIME':
            for sentence in ans_sentences:
                time_lst = self.get_pos_lst(sentence, 'nt')
                if len(time_lst) == 0:
                    return self.gen_short_ans(ques_kw_lst, ans)
                else:
                    return self.get_final_result(
                        self.calc_dis_ner_with_dict(ques_kw_dic, time_lst,
                                                    ans))
            return ans_sentences[0]  # 没有找到  返回排名最高的句子,注:这句话没用
        elif self.question_type == 'NEXT_SENTENCE':
            type = self.get_context_type(ques)
            pattern1 = re.compile('“(.*?)”')
            pattern2 = re.compile('"(.*?)"')
            shici_sent_lst = pattern1.findall(ques)
            shici_sent_lst.extend(pattern2.findall(ques))
            if len(shici_sent_lst) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            shici_sent = shici_sent_lst[-1]
            # 寻找合适的答案
            for sent in ans_sentences:
                if sent.find(shici_sent) > -1:
                    ans = sent
                    break

            punc_lst = [
                ',', '.', '?', ',', '。', '?', '!', '!', '「', '」', '"', '“',
                '”', "'", "‘", "’"
            ]
            start_index = -1
            end_index = -1
            if type == '下文':
                index = ans.find(shici_sent)
                for i in range(index, len(ans)):
                    if ans[i] in punc_lst and start_index == -1:
                        start_index = i
                    elif ans[i] in punc_lst and end_index == -1:
                        end_index = i
                        break
                return ans[start_index + 1:end_index][0:20]
            else:
                index = ans.find(shici_sent)
                start_index = -1
                end_index = -1
                for i in range(index, -1, -1):
                    if ans[i] in punc_lst:
                        end_index = i
                        break
                for i in range(end_index - 1, -1, -1):
                    if ans[i] in punc_lst:
                        start_index = i
                        break
                return ans[start_index + 1:end_index][0:20]
        elif self.question_type == 'COLOR':  #对颜色进行提取
            ans_words = list(self.segmentor.segment(ans))
            ans_postags = list(self.postagger.postag(ans_words))
            final_anses = list()
            i = 0
            for tag in ans_postags:
                if tag == 'a' and ans_words[i].find("色") > -1 and len(
                        ans_words[i]) > 1:
                    final_anses.append(ans_words[i])
                i = i + 1
            if final_anses:
                return '、'.join(final_anses)
            else:
                return self.gen_short_ans(ques_kw_lst, ans)
        elif self.question_type == 'AFFIRMATION':  # 认同关系 是否
            kw_lst = tfidf(self.question)[0:5]
            score = 0
            for kw in kw_lst:
                if ans.find(kw) > -1:
                    score = score + 1
            if score > len(kw_lst) / 2:
                return "是"
            else:
                return "否"
        else:  #通用解决方法,针对一般类型的问题
            # 对问题和候选答案进行关键词提取
            ques_kw_lst = tfidf(ques)
            ans_kw_lst = tfidf(ans)

            # 去掉出现在问题的关键词
            temp_lst = list()
            for kw in ans_kw_lst:
                if ques.find(kw) == -1:
                    temp_lst.append(kw)
            ans_kw_lst = temp_lst

            # 对答案关键词 作词性标注,保留名词性质的词
            ans_kw_postags = self.postagger.postag(ans_kw_lst)
            temp_lst = list()
            index = 0
            for postag in ans_kw_postags:
                if postag in ['n', 'nd', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz']:
                    temp_lst.append(ans_kw_lst[index])
                index = index + 1
            ans_kw_lst = temp_lst

            # 计算关键词的位置
            # 问题关键词
            ques_kw_dic = dict()
            for kw in ques_kw_lst:
                lst = self.get_index_list(ques, kw)
                if lst:
                    ques_kw_dic[kw] = lst
            # 答案关键词
            ans_kw_dic = dict()
            for kw in ans_kw_lst:
                lst = self.get_index_list(ans, kw)
                if lst:
                    ans_kw_dic[kw] = lst

            # 得到 距离排序后,最小的那一个词作为答案
            ans_dis = self.get_final_result(
                self.cal_dis_with_dict(ques_kw_dic, ans_kw_dic))
            return ans_dis

    # compute the similarity between each candidate answer sentence and the question, and return the sentences sorted by similarity
    def sort_sentences(self):
        # 首先得到问句的c&r词集
        question_cr_words = self.get_centrial_and_rela_words(self.question)
        sims = []
        i = 0
        for sentence in self.sentences:
            try:
                sim = 0.1 * self.calc_similarity(
                    sentence, question_cr_words) + 0.9 * self.cal_sim(
                        sentence, self.question)
            except Exception as err:
                print(err)
                sim = self.cal_sim(sentence, self.question)  # 相似度算法有可能发生除零错误
            sims.append((i, sim))
            i = i + 1
        sims.sort(key=lambda item: item[1], reverse=True)
        ans_sentences = []
        for i in range(0, len(self.sentences)):
            ans_sentences.append(self.sentences[sims[i][0]])
        return ans_sentences

    # dependency parsing: get the head (core) word of a sentence and the set of words attached to it
    def get_centrial_and_rela_words(self, sentence):
        words = self.segmentor.segment(sentence)  # segmentation
        postags = self.postagger.postag(words)  # POS tagging
        arcs = self.parser.parse(words, postags)  # dependency parsing
        i = 1  # index of the current node in the parse tree
        layer_2 = []  # indices of second-layer nodes
        layer_3 = []  # indices of third-layer nodes
        for arc in arcs:
            # Additional NLP scheme: rule-based refinements to the parse
            """
            A few candidate rules:
            1. For time expressions, drop the dependencies between consecutive time words and treat them as one unit.
            2. For auxiliary words such as 的, 地 and 得, remove the auxiliary and attach the word that depended on it
               directly to the word the auxiliary itself depended on, keeping the arc direction,
               i.e. link the two remaining words directly;
               if more than one word depends on the auxiliary, the rule is not applied.
            3. For function words, remove those irrelevant to the sentence meaning according to the POS tags,
               following the same rule as above.
            """
            # These rules are a bit involved, so they are not implemented here.
            # HED marks the head (core) word of the sentence
            if arc.relation == "HED":
                centrial_word = i  # head word
            i = i + 1

        i = 1
        for arc in arcs:
            if arc.head == centrial_word:
                layer_2.append(i)  # 找到第2层节点
            i = i + 1
        i = 1
        for arc in arcs:
            if arc.head in layer_2:
                layer_3.append(i)  # 找到第三层结点
            i = i + 1
        # collect the head word and the words attached to it;
        # besides the head word, all second- and third-layer words count as attached to it
        rela_words = []
        # convention: the first element of rela_words is the head word
        rela_words.append(words[centrial_word - 1])
        # the remaining elements are the second- and third-layer words attached to the head
        for j in layer_2:
            rela_words.append(words[j - 1])
        for j in layer_3:
            rela_words.append(words[j - 1])
        return rela_words

    # Semantic distance between two words:
    # Dist(A, B) = min{dist(m, n)} over all code pairs (m, n)
    # dist(m, n) = 2 * (7 - first_diff)
    # where first_diff is the position of the first character at which the two codes differ
    def calc_Dist(self, codes1, codes2):
        dist = 14
        for code1 in codes1:
            for code2 in codes2:
                first_diff = 7
                for i in range(0, 7):
                    if code1[i] != code2[i]:
                        first_diff = i
                        break
                tmp_dist = 2 * (7 - first_diff)
                if tmp_dist < dist:
                    dist = tmp_dist
        return dist
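    # Worked example with hypothetical codes: "Aa01A01" and "Aa01B02" first differ at
    # index 4, so dist = 2 * (7 - 4) = 6 and the similarity used in calc_similarity()
    # becomes 7 / (7 + 6) ≈ 0.54; identical codes give dist = 0 (treated as similarity 1).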

    def cal_sim(self, sentence, ques):
        '''
        Keyword-based similarity; no syntax is involved
        :param sentence:
        :param ques:
        :return:
        '''
        tfidf = analyse.extract_tags
        ques_keywords = tfidf(ques)
        score = 0
        for kw in ques_keywords:
            if sentence.find(kw) > -1:
                score = score + 1
        return score

    # Similarity between a sentence and the question.
    # Words the thesaurus does not know are most likely proper nouns,
    # so they are all mapped to the code for "谜、谜语" (riddle);
    # proper nouns are thus treated as similar to one another.
    def calc_similarity(self, sentence, question_cr_words):
        # 对句子进行句法分析,得到c&r词集
        cr_words = self.get_centrial_and_rela_words(sentence)
        # 计算核心词相似度
        if question_cr_words[0] in self.sim_word_code:
            question_c_codes = self.sim_word_code[question_cr_words[0]]
        else:
            question_c_codes = ["Dk06D01"]  # 注:这个码为 谜、谜语。。。
        if cr_words[0] in self.sim_word_code:
            c_codes = self.sim_word_code[cr_words[0]]
        else:
            c_codes = ["Dk06D01"]
        c_Dist = self.calc_Dist(question_c_codes, c_codes)
        if c_Dist == 0:
            c_sim = 1
        else:
            c_sim = 7 / (7 + c_Dist)

        # 计算非核心词相似度
        question_r_codes = [0] * (len(question_cr_words) - 1)
        for i in range(1, len(question_cr_words)):
            if question_cr_words[i] in self.sim_word_code:
                question_r_codes[i -
                                 1] = self.sim_word_code[question_cr_words[i]]
            else:
                question_r_codes[i - 1] = ["Dk06D01"]
        r_codes = [0] * (len(cr_words) - 1)
        for i in range(1, len(cr_words)):
            if cr_words[i] in self.sim_word_code:
                r_codes[i - 1] = self.sim_word_code[cr_words[i]]
            else:
                r_codes[i - 1] = ["Dk06D01"]
        # 这个略麻烦一点
        q_s_sims = [0] * (len(question_cr_words) - 1)
        q_s_sim = 0
        for i in range(0, len(question_r_codes)):
            for j in range(0, len(r_codes)):
                tmp_Dist = self.calc_Dist(question_r_codes[i], r_codes[j])
                if tmp_Dist == 0:
                    q_s_sims[i] = 1
                else:
                    tmp_sim = 7 / (7 + tmp_Dist)
                    if tmp_sim > q_s_sims[i]:
                        q_s_sims[i] = tmp_sim
            q_s_sim += q_s_sims[i]
        q_s_sim = q_s_sim / len(question_r_codes)

        s_q_sims = [0] * (len(cr_words) - 1)
        s_q_sim = 0
        for i in range(0, len(r_codes)):
            for j in range(0, len(question_r_codes)):
                tmp_Dist = self.calc_Dist(r_codes[i], question_r_codes[j])
                if tmp_Dist == 0:
                    s_q_sims[i] = 1
                else:
                    tmp_sim = 7 / (7 + tmp_Dist)
                    if tmp_sim > s_q_sims[i]:
                        s_q_sims[i] = tmp_sim
            s_q_sim += s_q_sims[i]
        s_q_sim = s_q_sim / len(r_codes)

        res = self.a * c_sim + self.b * ((q_s_sim + s_q_sim) / 2)
        return res
Example #30
    def add_feature(self):
        def getshape(word):
            r = ''
            for w in word:
                if w.isupper():
                    r = r + 'A'
                elif w.islower():
                    r = r + 'a'
                elif w.isdigit():
                    r = r + '0'
                elif w in self.pos:
                    r = r + 'p'
                elif w in ['”','、','“','。',';',',','?','!','','']:
                    r = r + 'b'
                else:
                    r = r + 'c'
            return r
        def path_cal(begin_idx, end_idx, arcs):
            # print(begin_idx, end_idx)
            begin_path_index = []
            flag = False
            while arcs[begin_idx].head != 0:
                if begin_idx == end_idx:
                    flag = True
                    break
                begin_path_index.append(begin_idx)
                begin_idx = arcs[begin_idx].head - 1
            begin_path_index.append(begin_idx)
            if flag:
                # print(begin_path_index)
                return begin_path_index
            else:
                end_path_index = []
                while arcs[end_idx].head != 0:
                    if end_idx == begin_idx:
                        flag = True
                        break
                    end_path_index.append(end_idx)
                    end_idx = arcs[end_idx].head - 1
                end_path_index.append(end_idx)
                if flag:
                    # print(end_path_index)
                    return end_path_index
                else:
                    end_path_index.reverse()
                    path_index = begin_path_index + end_path_index[1:]
                    # print(path_index)
                    return path_index
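        # path_cal walks from begin_idx up toward the root (arc.head is 1-based, 0 = root);
        # if it meets end_idx it returns that chain, otherwise it also walks up from end_idx
        # and joins the two chains through the root to form the dependency path.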
        def get_answer_pos(l, answer):
            r = [0 for n in range(len(l))]
            r_str = ''
            i = 0
            while r_str != answer[1:-2] and i < len(l):
                if l[i] in answer:
                    r_str += l[i]
                    r[i] = 1
                else:
                    r_str = ''
                    for j in range(i):
                        r[j] = 0
                i += 1
            if sum(r) == 0:
                # print(l)
                # print(answer)
                for j in range(len(r)):
                    # print(answer[1:-2],l[j])
                    if answer[1:-2] in l[j]:
                        r[j] = 1
                if sum(r) == 0:
                    i = 0
                    while r_str != answer[1:-2].replace(' ','') and i < len(l):
                        if l[i] in answer or l[i] in answer.replace(' ',''):
                            r_str += l[i]
                            r[i] = 1
                        else:
                            r_str = ''
                            for j in range(i):
                                r[j] = 0
                        i += 1
            if sum(r) == 1:
                for i in range(len(r)):
                    if r[i] == 1:
                        r[i] = 'S'
                        # r[i] = 'B'
                    else:
                        r[i] = 'O'
            else:
                for i in range(len(r)):
                    if r[i] == 1:
                        r[i] = 'I'
                    else:
                        r[i] = 'O'
                for i in range(len(r)):
                    if r[i] == 'I':
                        r[i] = 'B'
                        break
                for i in range(len(r)-1, 0, -1):
                    if r[i] == 'I':
                        r[i] = 'E'
                        break
            return r
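        # Hypothetical example: get_answer_pos(['法国', '巴黎'], '"巴黎",') yields ['O', 'S'];
        # multi-token answers are tagged 'B' ... 'E' with 'I' in between, and 'O' elsewhere.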
        def adjust_list(l,words):
            l.insert(0,'"')
            n = len(l)
            cut_list = []
            cut_list.append(l[0])
            for i in range(1,n):
                a = ''
                b = l[i]
                if cut_list[-1]+l[i] in words:
                    cut_list[-1] = cut_list[-1]+l[i]
                    continue
                else:
                    cut_list.append(l[i])
            cut_list = cut_list[1:]
            return cut_list
        segmentor = Segmentor()
        segmentor.load('cws.model')
        postagger = Postagger()  # create instance
        postagger.load('pos.model')  # load model
        recognizer = NamedEntityRecognizer()  # create instance
        recognizer.load('ner.model')  # load model
        parser = Parser()
        parser.load('parser.model')
        labeller = SementicRoleLabeller()  # create instance
        labeller.load('pisrl.model')  # load model

        fti = open('AC/tf-idf.txt', 'r')
        all_word = []
        for line in fti:
            k = line[:-1].split('\t')[0]
            all_word.append(k)
        all_word = set(all_word)
        nj = 0
        word_feature = []
        word_group = []
        word_all = []
        for k in self.data.keys():
            q_list = []
            q_pos_list = []
            q_sbv = []
            q_vob = []
            q_v = []
            q_att1 = []
            q_att2 = []
            cut_line = '\t'.join(segmentor.segment(self.data[k][0]))
            word_list = cut_line.split('\t')  # 分词
            # print(word_list)
            for i in word_list:
                if i not in self.stop_word:
                    q_list.append(i)
            # q_list = adjust_list(q_list, all_word)
            postags = postagger.postag(q_list)  # 词性标注
            pos_line = '\t'.join(postags)
            q_pos_list = pos_line.split('\t')
            netags = recognizer.recognize(q_list, postags)  # 命名实体识别
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')
            q_ner = []
            ner_str = ''
            for nr in range(len(ner_list)):
                if ner_list[nr][0] != 'O':
                    if ner_list[nr][0] == 'S' or ner_list[nr][0] == 'E':
                        ner_str += q_list[nr]
                        q_ner.append(ner_str)
                        ner_str = ''
                    else:
                        ner_str += q_list[nr]
            q_arcs = parser.parse(q_list, q_pos_list)  # 句法分析
            arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in q_arcs)
            arcs_list = arcs_line.split('\t')
            # print(q_list)
            # for i in range(len(arcs_list)):
            #     # print(arcs_list[i].split(' '))
            #     if int(arcs_list[i].split(' ')[0]) == 0:
            #         q_arcs.append( 'root_' +  q_list[i]  + '_' + arcs_list[i].split(' ')[1])
            #     else:
            #         q_arcs.append(q_list[int(arcs_list[i].split(' ')[0])-1]+ '_' +  q_list[i]  + '_' + arcs_list[i].split(' ')[1])
            # print(q_arcs)
            # roles = labeller.label(q_list, postags, arcs)
            # print(q_list)
            for n in range(len(arcs_list)):
                # print(q_list[int(arcs_list[n].split()[0])-1],q_list[n],arcs_list[n].split()[1])
                if arcs_list[n].split()[1] == 'SBV':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_sbv.append(q_list[n])
                elif arcs_list[n].split()[1] == 'VOB':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_vob.append(q_list[n])
                elif arcs_list[n].split()[1] == 'IOB':
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_vob.append(q_list[n])
                elif arcs_list[n].split()[1] == 'FOB':
                    # FOB (fronted object): the dependent is the object and its head is
                    # the verb, kept consistent with the SBV/VOB/IOB branches above
                    q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_vob.append(q_list[n])
                elif arcs_list[n].split()[1] == 'ATT':
                    q_att1.append(q_list[int(arcs_list[n].split()[0]) - 1])
                    q_att2.append(q_list[n])
                # print(q_list[int(arcs_list[n].split()[0]) - 1], q_list[n], arcs_list[n].split()[1])
            # print(self.data[k][0])
            # print('sbv',q_sbv)
            # print('v',q_v)
            # print('vob',q_vob)
            # print('att1',q_att1)
            # print('att2',q_att2)
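            # At this point q_sbv holds the question's subject words, q_vob its object
            # words, q_v their governing verbs, and q_att1/q_att2 the head and modifier
            # of each ATT (attribute) arc; they feed the in_sbv/in_qv/in_qvob/in_att1/
            # in_att2 features computed over the answer tokens below.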
            q_key = []
            q_key_l = 0.0
            for i in range(len(q_list)):
                # if q_pos_list[i][0] == 'n' or q_pos_list[i][0] == 'a' or q_pos_list[i][0] == 'v':
                if q_list[i] not in self.stop_word1 and q_pos_list[i] != 'r':
                    q_key.append(q_list[i])
            q_w = ''
            for q in q_list:
                if q in self.question_word:
                    q_w = q
            if q_w == '':
                q_w = q_list[-3]
            # print('q_k',q_key)
            a_list = []
            a_pos_list = []
            cut_line = '\t'.join(segmentor.segment(self.data[k][1]))
            word_list = cut_line.split('\t')  # word segmentation
            # print(word_list)
            for i in word_list:
                if i not in self.stop_word:
                    a_list.append(i)
            # a_list = adjust_list(a_list, all_word)
            postags = postagger.postag(a_list)  # POS tagging
            pos_line = '\t'.join(postags)
            a_pos_list = pos_line.split('\t')
            netags = recognizer.recognize(a_list, postags)  # named entity recognition
            ner_line = '\t'.join(netags)
            ner_list = ner_line.split('\t')
            arcs = parser.parse(a_list, a_pos_list)  # dependency parsing
            arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in arcs)
            arcs_list = arcs_line.split('\t')
            a_arcs = []
            aci = 0  # index of the HED (root) token of the answer parse
            for i in range(len(arcs_list)):
                if arcs_list[i].split(' ')[1] == 'HED':
                    aci = i
                    break
                # print(arcs_list[i].split(' '))
                # if int(arcs_list[i].split(' ')[0]) == 0:
                #     a_arcs.append('root_' + a_list[i] + '_' + arcs_list[i].split(' ')[1])
                # else:
                #     a_arcs.append(a_list[int(arcs_list[i].split(' ')[0]) - 1] + '_' + a_list[i] + '_' + arcs_list[i].split(' ')[
                #             1])
            # print(a_arcs)
            a_key = []
            a_key_l = 0.0
            for i in range(len(a_list)):
                # if a_pos_list[i][0] == 'n' or a_pos_list[i][0] == 'a' or a_pos_list[i][0] == 'v':
                if a_list[i] not in self.stop_word1 and a_list[i] in q_key:
                    a_key.append(a_list[i])
            if a_key == []:
                a_key_l = 5.0
            else:
                for qkw in a_key:
                    # print(path_cal(q_list.index(q_w),q_list.index(qkw),q_arcs))
                    q_key_l += len(path_cal(q_list.index(q_w),q_list.index(qkw),q_arcs))
                q_key_l /= len(a_key)
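            # q_key_l is the average length of the dependency path (presumably the node
            # sequence returned by path_cal over q_arcs) from the question word q_w to
            # each overlapping keyword; a_key_l below is the same quantity per answer
            # token, and the feature written out is their difference.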
            r_pos = get_answer_pos(a_list,self.data[k][2])
            # print(a_list)
            # print(r_pos)
            for i in range(len(a_list)):
                str_f = a_list[i]
                w_l = 0.0
                a_l = 0.0
                for j in range(len(a_list)):
                    if a_list[j] in q_key:
                        w_l += 1 / (math.fabs(i-j) + 1)
                if w_l == 0.0:
                    w_l = 5.0
                # for j in range(len(a_list)):
                #     print(a_list[i],arcs_list[i])
                # print(a_l)
                if a_list[i] in set(q_list):
                    str_f += '\tin_q'
                else:
                    str_f += '\tnot_in_q'
                if a_list[i] in set(q_ner):
                    str_f += '\tin_qner'
                else:
                    str_f += '\tnot_in_qner'
                if a_list[i] in set(q_sbv):
                    str_f += '\tin_sbv'
                else:
                    str_f += '\tnot_in_sbv'
                if a_list[i] in set(q_v):
                    str_f += '\tin_qv'
                else:
                    str_f += '\tnot_in_qv'
                if a_list[i] in set(q_vob):
                    str_f += '\tin_qvob'
                else:
                    str_f += '\tnot_in_qvob'
                if a_list[i] in set(q_att1):
                    str_f += '\tin_att1'
                else:
                    str_f += '\tnot_in_att1'
                if a_list[i] in set(q_att2):
                    str_f += '\tin_att2'
                else:
                    str_f += '\tnot_in_att2'
                if a_list[i] in set(self.pos):
                    str_f += '\t' + a_list[i]
                else:
                    str_f += '\tnot_in_pos'
                str_f += '\t'+self.data[k][-1]+ '_' + a_pos_list[i]
                str_f += '\t' + a_pos_list[i]
                str_f += '\t' + str(round(w_l,1))
                # print(a_key)
                if a_key == []:
                    a_key_l = 5.0
                else:
                    a_key_l = 0.0  # reset per token; otherwise the value accumulates across tokens
                    for qkw in a_key:
                        a_key_l += len(path_cal(i, a_list.index(qkw), arcs))
                    a_key_l /= len(a_key)
                    a_key_l -= q_key_l
                # print(a_key_l)
                str_f += '\t' + str(round(a_key_l,1))
                str_f += '\t' + str(len(a_list[i]))
                str_f += '\t' + getshape(a_list[i])
                if a_list[i] in self.syn_dict.keys():
                    str_f += '\t' + self.syn_dict[a_list[i]]
                else:
                    str_f += '\t' + 'N-syn'
                str_f += '\t' + str(arcs_list[i].split(' ')[1])
                # str_f += '\t' + str(i)
                str_f += '\t' + str(math.fabs(aci-i))
                str_f += '\t' + str(r_pos[i])
                # i_key = list(set(q_key) | set(a_key))
                # str_f += '\t' + str(get_l(a_list[i],a_arcs,i_key))
                # str_f += '\t' + str(get_l(a_list[i], a_arcs, a_key))
                word_feature.append(str_f)
            word_group.append(str(len(a_list)))
            nj += 1
            # if nj == 5:
            #     break
            if nj % 1000 == 0:
                print(nj)
        with open('AS/train.txt','w') as f1:
            for wf in word_feature[:]:
                f1.write(wf)
                f1.write('\n')
        with open('AS/group.txt','w') as f3:
            for wg in word_group:
                f3.write(str(wg))
                f3.write('\n')
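The example above writes one tab-separated feature line per answer token to AS/train.txt (with the B/I/E/O answer label from get_answer_pos in the last column) and one answer length per question to AS/group.txt. A minimal sketch of how those two files could be read back into per-question groups, assuming exactly that layout — the helper name load_feature_groups is illustrative and nothing is implied about the downstream model:

def load_feature_groups(feature_path='AS/train.txt', group_path='AS/group.txt'):
    """Read the token feature lines and regroup them per answer sentence (sketch)."""
    with open(feature_path, encoding='utf-8') as f:
        rows = [line.rstrip('\n').split('\t') for line in f if line.strip()]
    with open(group_path, encoding='utf-8') as f:
        sizes = [int(line.strip()) for line in f if line.strip()]
    groups, offset = [], 0
    for size in sizes:
        chunk = rows[offset:offset + size]
        # last column is the B/I/E/O label, the rest are features
        groups.append(([r[:-1] for r in chunk], [r[-1] for r in chunk]))
        offset += size
    return groups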
Пример #31
0
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)

for role in roles:
    print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
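The snippet above begins after the text has already been segmented: postagger, parser, recognizer and labeller are loaded here, but MODELDIR, words and the segmentor released at the end are assumed to exist. A minimal preamble it relies on might look like this (the model directory and sample sentence are illustrative, the sentence being the one shown in the commented postag call):

# -*- coding: utf-8 -*-
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

MODELDIR = "ltp_data"  # assumed location of the LTP model files
sentence = "中国进出口银行与中国银行加强合作"

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()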
Пример #32
0
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-strings parameter is supported since 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

for role in roles:
    print role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
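Примеры #31 and #32 show the two generations of the pyltp semantic role labelling interface side by side: the newer one loads a single pisrl.model file and labels with (words, postags, arcs), while the older one loads an srl/ model directory and additionally requires the NE tags. In short (paths as in the examples above):

# newer interface (single model file, no NE tags), as in Пример #31
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)

# older interface (model directory, NE tags required), as in Пример #32
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)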