def semantic_role_label(self):
    # Dependency parsing
    parser = Parser()
    parser.load('ltp_data/parser.model')
    arcs = parser.parse(self.words, self.postags)
    parser.release()

    labeller = SementicRoleLabeller()
    labeller.load('ltp_data/srl')
    roles = labeller.label(self.words, self.postags, self.netags, arcs)

    Label_AX = []  # arguments labelled A0 (agent) or A1 (patient)
    for role in roles:
        Label_AX.extend([
            arg for arg in role.arguments
            if arg.name == "A0" or arg.name == "A1"
        ])
    for label in Label_AX:
        # skip A0/A1 spans of abnormal length
        if 0 < label.range.end - label.range.start < 10:
            for i in range(label.range.start, label.range.end + 1):
                # keep the common nouns, place names, person names and
                # organization names inside the agent/patient span as entities
                if self.postags[i] in ("n", "ns", "nh", "ni"):
                    self.entity.append(self.words[i])
    labeller.release()
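# Sketch: the A0/A1 entity filter above as a standalone helper, assuming
# `roles` comes from SementicRoleLabeller.label and `postags` uses the LTP
# tag set (n/ns/nh/ni are the noun-like tags the method keeps):
NOUN_TAGS = {'n', 'ns', 'nh', 'ni'}

def extract_entities(roles, words, postags, max_span=10):
    entities = []
    for role in roles:
        for arg in role.arguments:
            if arg.name in ('A0', 'A1'):
                span = arg.range.end - arg.range.start
                if 0 < span < max_span:
                    entities.extend(
                        words[i]
                        for i in range(arg.range.start, arg.range.end + 1)
                        if postags[i] in NOUN_TAGS)
    return entities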
class LTP_word():
    """Wrapper around the pyltp pipeline.

    deal():    process a text and return five values: words, POS tags,
               dependency arcs, semantic roles and named entities.
    release(): free the loaded models.
    """

    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # word segmentation
        self.segmentor.load_with_lexicon(
            path.join(self.model_path, 'cws.model'),
            path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # POS tagging
        self.postagger.load(path.join(self.model_path, 'pos.model'))
        self.recognizer = NamedEntityRecognizer()  # named-entity recognition
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # dependency parsing
        self.parser.load(path.join(self.model_path, 'parser.model'))
        self.labeller = SementicRoleLabeller()  # semantic role labelling
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):  # run the whole pipeline on one text
        words = self.segmentor.segment(text)                # segmentation
        postags = self.postagger.postag(words)              # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)            # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # SRL
        return words, postags, arcs, roles, netags

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
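# Hypothetical usage of the LTP_word wrapper above (the model directory and
# the sample sentence are assumptions, not from the original code):
ltp = LTP_word('./ltp_data')
words, postags, arcs, roles, netags = ltp.deal('中国进出口银行与中国银行加强合作。')
for word, tag, netag in zip(words, postags, netags):
    print(word, tag, netag)
ltp.release()  # free the underlying model memory when done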
class ModelLoader:
    __instance = None

    def __new__(cls):
        if cls.__instance is None:
            cls.__instance = super(ModelLoader, cls).__new__(cls)
            cls.__instance.__initialized = False
        return cls.__instance

    def __init__(self):
        if self.__initialized:
            return
        self.__initialized = True
        LTP_DIR = "./ltp_data"
        # custom segmentation lexicon; POS tags are post-processed afterwards
        self.segmentor = Segmentor()
        self.segmentor.load_with_lexicon(
            os.path.join(LTP_DIR, "cws.model"),
            os.path.join(LTP_DIR, 'customized.txt'))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        self.sentenceSplitter = SentenceSplitter()
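# The point of the __new__/__initialized dance above: constructing
# ModelLoader twice returns the same object, so the heavy LTP models load
# only once per process. A quick check:
a = ModelLoader()
b = ModelLoader()
assert a is b
assert a.labeller is b.labeller  # same loaded model, no second load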
def role_label(self, words, postags, arcs):
    """Semantic role labelling.

    :param words: segmented words
    :param postags: POS tags
    :param arcs: dependency parse result
    :return: the labelled roles
    """
    srl_model = os.path.join(self.MODEL_PATH, 'pisrl_win.model')
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model)           # load the model
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    for role in roles:
        print(
            role.index, "".join([
                "{0}:({1},{2})".format(arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))
    labeller.release()
    return roles
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release model resources."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
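# __del__ runs at an undefined time, and destruction order at interpreter
# shutdown is not guaranteed, so a deterministic wrapper can be safer.
# A sketch, reusing the release calls from the class above:
from contextlib import contextmanager

@contextmanager
def ltp_session():
    ltp = LTP()
    try:
        yield ltp
    finally:
        ltp.segmentor.release()
        ltp.postagger.release()
        ltp.parser.release()
        ltp.recognizer.release()
        ltp.labeller.release()

# with ltp_session() as ltp:
#     ... use ltp.parser, ltp.labeller, ...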
def role(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    # labeller.load('/usr/local/src/ltp_data/srl')  # load the model
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    """
    # arg.name        the semantic role
    # arg.range.start index of the first word of the argument
    # arg.range.end   index of the last word of the argument
    roletype = {'C-A0': '施事', 'A0': '施事', 'A1': '受事', 'A2': '间接对象',
                'A3': '直接目标', 'A4': '直接方法', 'A5': '其它', 'ADV': '附词',
                'BNE': '受益人', 'CND': '条件', 'DIR': '方向', 'DGR': '程度',
                'EXT': '扩展', 'FRQ': '频率', 'LOC': '地点', 'MNR': '方式',
                'PRP': '目的或原因', 'TMP': '时间', 'TPC': '主题', 'CRD': '并列',
                'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有', 'DIS': '转折'}
    postype = {'A0': '施事', 'A1': '受事', 'A2': '间接对象', 'A3': '直接目标',
               'A4': '直接方法', 'A5': '其它', 'ADV': '附词', 'BNE': '受益人',
               'CND': '条件', 'DIR': '方向', 'DGR': '程度', 'EXT': '扩展',
               'FRQ': '频率', 'LOC': '地点', 'MNR': '方式', 'PRP': '目的或原因',
               'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词',
               'PSR': '持有者', 'PSE': '被持有'}
    for role in roles:
        outstr = ""
        for arg in role.arguments:
            block = ''
            for num in range(arg.range.start, arg.range.end + 1):
                block = block + words[num] + '[%d-%s]' % (num, postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print('%d-%s' % (role.index, words[role.index]) + ":" + outstr)
    """
    labeller.release()  # release the model
    return roles
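# Sketch: render one role in readable form, using a reduced version of the
# roletype mapping documented in the comment block above (full table there):
ROLE_NAMES = {'A0': '施事', 'A1': '受事', 'TMP': '时间', 'LOC': '地点'}

def describe_role(role, words):
    parts = []
    for arg in role.arguments:
        span = ''.join(words[arg.range.start:arg.range.end + 1])
        parts.append('%s=%s' % (ROLE_NAMES.get(arg.name, arg.name), span))
    return '%s(%s)' % (words[role.index], ', '.join(parts))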
class LtpParser:
    def __init__(self):
        LTP_DIR = "./ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

    def format_labelrole(self, words, postags):
        """Semantic role labelling."""
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def build_parse_child_dict(self, words, postags, arcs):
        """Dependency parsing: for every word, keep a dict of its dependents."""
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation not in child_dict:
                        child_dict[arcs[arc_index].relation] = []
                    child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]       # head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            # note: for the root (head id 0) the index -1 picks the last word
            a = [relation[i], words[i], i, postags[i], heads[i],
                 rely_id[i] - 1, postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    def parser_main(self, sentence):
        """Main entry point."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
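# Hypothetical driver for the LtpParser class above (the sentence is an
# example; the model directory comes from the class itself):
ltp = LtpParser()
words, postags, child_dict_list, roles_dict, format_parse_list = \
    ltp.parser_main('李克强总理今天来我家了。')
for item in format_parse_list:
    print(item)       # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
print(roles_dict)     # {predicate_index: {role: [role, start, end], ...}}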
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model_path)      # load the model
    # netags is unused here: the single-file pisrl-generation models label
    # from the dependency parse only
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
    #          for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # release the model
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # release the model
def yuyijuese(words, postags, netags, arcs):
    """Semantic role labelling."""
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "srl/"))
    roles = labeller.label(words, postags, netags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # release the model
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    # Path to the semantic role labelling model. Newer LTP releases ship a
    # single model file; on Windows the file is pisrl_win.model.
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model_path)      # load the model
    # arcs comes from the dependency parser
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    labeller.release()  # release the model
    return roles
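# Sketch: choose the SRL model name by platform instead of hard-coding the
# Windows file; pyltp has historically shipped pisrl_win.model for Windows
# and pisrl.model elsewhere (worth verifying against your pyltp release):
import os
import sys

def srl_model_for_platform(ltp_data_dir):
    name = 'pisrl_win.model' if sys.platform.startswith('win') else 'pisrl.model'
    return os.path.join(ltp_data_dir, name)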
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load('../ltp_data/srl')   # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
    #          for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(os.path.join(LTP_DATA_DIR, 'srl'))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # create an instance
    # segmentor.load(cws_model_path)  # load the model without a user lexicon
    segmentor.load_with_lexicon(cws_model_path,
                                'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # segmentation
    k = 1
    for word in words:
        print(word + str(k) + ' ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    wordslist = list(words)  # materialize before releasing the model
    segmentor.release()      # release the model

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path,
                                'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = postagger.postag(wordslist)
    print('\t'.join(postags))
    postagger.release()
    # wordslist = ['人力资源社会保障局', '主管', '医疗保险', '工作']
    # postags = ['n', 'v', 'n', 'v']

    from pyltp import Parser
    parser = Parser()  # create an instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model_path)  # load the model
    # arcs comes from the dependency parse above
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labelling
    for role in roles:  # print the results
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
    # semantic role labelling model; the file is named 'pisrl.model'
    srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(words_list, postags_list, arcs_list)
    labeller.release()
    # attempts to force memory release (`del labeller` plus gc.collect())
    # did not help
    roles_list = list(roles)
    return roles_list
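# Sketch: go one step further than list(roles) above and copy the result
# into plain Python tuples, so nothing keeps referencing the wrapped
# objects after release():
def roles_to_tuples(roles):
    return [
        (role.index,
         [(arg.name, arg.range.start, arg.range.end) for arg in role.arguments])
        for role in roles
    ]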
def get_role_list(self, words, postags):
    parser = Parser()
    parser.load(Dependency.par_model)
    rolelabel = SementicRoleLabeller()
    rolelabel.load(Dependency.pisrl_model)
    try:
        parsers = parser.parse(words, postags)
        roles = rolelabel.label(words, postags, parsers)
    except Exception:
        roles = [[]]
    finally:
        parser.release()
        rolelabel.release()
    return roles
def srl(words, postags, arcs):
    global labeller
    if labeller is None:
        # Path to the SRL model directory `srl`; for this model generation
        # the path is a directory, not a single file.
        srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')
        labeller = SementicRoleLabeller()  # create an instance
        labeller.load(srl_model_path)      # load the model only once
    # arcs comes from the dependency parser; note that directory-style `srl`
    # models historically also required netags, so verify the signature
    # against your pyltp version
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    role_list = []
    for role in roles:
        for arg in role.arguments:
            args = (role.index, arg.name, arg.range.start, arg.range.end)
            role_list.append(args)
    return role_list
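# The lazy global above means only the first call pays the model-load cost.
# A hypothetical timing check (words/postags/arcs as produced by the earlier
# pipeline steps, and `labeller = None` defined at module level):
import time

t0 = time.time(); srl(words, postags, arcs); t1 = time.time()
srl(words, postags, arcs); t2 = time.time()
print('first call %.2fs, second call %.2fs' % (t1 - t0, t2 - t1))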
def get_srl(sentence):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model_path)      # load the model
    words = list(pyltp_cut(sentence))         # segmentation
    postags = list(postagger.postag(words))   # POS tagging
    arcs = get_parsing(sentence)              # dependency parse of the sentence
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    for role in roles:  # print the results
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def get_srl(self, words):
    """Semantic role labelling."""
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(self.srl_model_path)  # load the model
    postags = self.get_postags(words)
    arcs = self.get_dependency(words)  # dependency parse
    roles = labeller.label(words, postags, arcs)  # semantic role labelling
    for role in roles:  # print the results
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
    return roles
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc:
        semantic role labelling
    Args:
        word_tag (dict): word -> POS tag mapping
        arcs: dependency parse result
        srl_model_path: path to the SRL model
    '''
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    # note: keying by word means duplicate tokens collapse; parallel lists of
    # words and tags would preserve the sentence faithfully
    roles = labeller.label(list(word_tag.keys()), list(word_tag.values()), arcs)
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()
def sentence_label(parse_result):
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load(srl_model_path)      # load the model
    i = 0
    final_result = []
    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            print('pausing briefly')
            time.sleep(5)
        words = value[0]
        postags = value[1]
        arcs = value[2]
        roles = labeller.label(words, postags, arcs)
        final_result.append(list(roles))  # collect this sentence's roles
        print('done')
    print(final_result)
    labeller.release()
class LtpParser(object):
    def __init__(self, data_dir: str):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {}
        for role in roles:
            srlabels[role.index] = {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end,
                }
                for arg in role.arguments
            }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
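# Hypothetical usage of the LtpParser class above (the model directory and
# the sample sentence are assumptions):
ltp = LtpParser('./ltp_data_v3.4.0')
result = ltp.parse('欧盟与美国谈判失败。')
print(result['tokens'])    # segmented words
print(result['srlabels'])  # {predicate_index: {role: {'start': ..., 'end': ...}}}
ltp.release()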
def ltp(t_str):
    # note: this reloads every model on each call; consider loading once at
    # module level if the function is called repeatedly
    segmentor = Segmentor()
    segmentor.load('cws.model')
    postagger = Postagger()  # create an instance
    postagger.load('pos.model')  # load the model
    recognizer = NamedEntityRecognizer()  # create an instance
    recognizer.load('ner.model')  # load the model
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load('pisrl.model')  # load the model
    parser = Parser()
    parser.load('parser.model')

    cut_line = '\t'.join(segmentor.segment(t_str))
    words_list = cut_line.split('\t')  # segmentation
    postags = postagger.postag(words_list)  # POS tagging
    pos_line = '\t'.join(postags)
    pos_list = pos_line.split('\t')
    netags = recognizer.recognize(words_list, pos_list)  # named-entity recognition
    ner_line = '\t'.join(netags)
    ner_list = ner_line.split('\t')
    arcs = parser.parse(words_list, pos_list)  # dependency parsing
    arcs_line = "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
    arcs_list = arcs_line.split('\t')
    i = 0
    for word, arc in zip(words_list, arcs):
        i = i + 1
        print(str(i) + '/' + word + '/' + str(arc.head) + '/' + str(arc.relation))
    # roles = labeller.label(words_list, pos_list, arcs)  # semantic role labelling
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
    #          for arg in role.arguments]))
    words_list = merge_base(words_list, pos_list, ner_list, arcs_list)
    print(words_list)
class Ltp_parser:
    def __init__(self):
        self.segmentor = Segmentor()
        self.segmentor.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/cws.model')
        self.postagger = Postagger()
        self.postagger.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/pos.model')
        self.parser = Parser()
        self.parser.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/parser.model')
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/ner.model')
        self.labeller = SementicRoleLabeller()
        self.labeller.load('/home/student/project/project-01/Four-Little-Frogs/ltp_data_v3.4.0/pisrl.model')

    def get_parser(self, words, postags):
        """Dependency parsing."""
        arcs = self.parser.parse(words, postags)
        # arcs = ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arcs)
        return arcs

    def get_name_entity(self, words, postags):
        """Named-entity recognition."""
        netags = self.recognizer.recognize(words, postags)
        netags = list(netags)
        return netags

    def ltp_release(self):
        """Release the LTP models."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def parser_main(self, sentence):
        """Main entry point."""
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.get_parser(words, postags)
        netags = self.get_name_entity(words, postags)
        return words, postags, arcs, netags
class LtpParser:
    def __init__(self):
        LTP_DIR = r'D:\LTP\MODEL\ltp_data'  # LTP model directory
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))   # segmentation model `cws.model`
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))   # POS tagging model `pos.model`
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))   # dependency parsing model `parser.model`
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # NER model `ner.model`
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))  # SRL model (single file, Windows build)

    def ner(self, words, postags):
        netags = self.recognizer.recognize(words, postags)  # named-entity recognition
        # for word, ntag in zip(words, netags):
        #     print(word + '/' + ntag)
        return netags
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
# labeller.load("/home/yjliu/ltp/model/srl/")
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print(role.index, "".join([
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ]))
class AnsExtractor(object):
    # __init__ does three things:
    #   1. load the pyltp models
    #   2. load the synonym thesaurus (Tongyici Cilin)
    #   3. load the rule lists that supplement question classification
    # All other parameters go to the main entry point do_ans_extract.
    def __init__(self):
        self.segmentor = Segmentor()         # create an instance
        self.segmentor.load(cws_model_path)  # load the model
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(srl_model_path)
        # rule lists used to supplement question classification
        self.istime_lst = ['年份是', "时间是", "哪一年", "何时", "什么时候",
                           "什么时间", "哪一月", "哪一日"]
        self.iscolor_lst = ['什么颜色', "哪种颜色", "哪个颜色", "颜色是"]
        self.unit_lst = [
            "回", "对", "山", "只", "刀", "群", "江", "条", "个", "打", "尾",
            "手", "双", "张", "溪", "挑", "坡", "首", "令", "网", "辆", "座",
            "阵", "队", "顶", "匹", "担", "墙", "壳", "炮", "场", "扎", "棵",
            "支", "颗", "钟", "单", "曲", "客", "罗", "岭", "阙", "捆", "丘",
            "腔", "贯", "袭", "砣", "窠", "岁", "倍", "枚", "次"
        ]
        self.islocation_lst = ['哪个城市', "哪个国家", '国籍是', "什么国籍",
                               "哪个省", "哪座城市", "县份是", "地址在哪里",
                               "哪里", "何处", "何地", "哪儿", "什么地方", "什么地点"]
        self.isorganization_lst = ['哪个组织', "组织是", "哪个机构", "什么组织", "什么机构"]
        self.isperson_lst = ['哪个皇帝', "是谁", "什么名字", "者是", "身份是",
                             "学家是", "什么人", "哪个人"]
        self.isnum_lst = list()
        for unit in self.unit_lst:
            self.isnum_lst.append("多少" + unit)
        self.stop_words = []     # stop words; not used yet
        self.sim_word_code = {}  # word -> list of thesaurus codes (possibly several)
        self.get_sim_cloud()     # read the synonym thesaurus

    def get_sim_cloud(self):
        """Fill sim_word_code from the thesaurus file.

        Words in the thesaurus stand in one of three relations: synonymous,
        related (not necessarily synonymous) or independent. For similarity
        computation it is acceptable that related words share a code, so the
        relation type is ignored and only the codes are read.
        """
        sim_file = open("similarity.txt", 'r', encoding="utf-8")
        lines = sim_file.readlines(1000000)
        for line in lines:  # parse each line of the fixed format
            code = line[0:7]
            the_type = line[7]
            words = line[9:]
            words = words.split(' ')
            for word in words:  # store the parsed line
                if word in self.sim_word_code:
                    self.sim_word_code[word].append(code)
                else:
                    self.sim_word_code[word] = []
                    self.sim_word_code[word].append(code)
        sim_file.close()

    def get_all_NER(self, ans_sentence, type):
        """Collect all named entities of one type from the answer sentence.

        :param ans_sentence: the answer sentence
        :param type: the entity type (e.g. 'Nh', 'Ns', 'Ni')
        :return: list of entities of that type
        """
        words = self.segmentor.segment(ans_sentence)        # segmentation
        postags = self.postagger.postag(words)              # POS tagging
        netags = self.recognizer.recognize(words, postags)  # NER
        ner_lst = list()
        temp_str = ''
        for i in range(len(netags)):
            if netags[i] == 'S-' + type:
                ner_lst.append(words[i])
            elif netags[i] == 'B-' + type:
                temp_str = words[i]
            elif netags[i] == 'I-' + type:
                temp_str += words[i]
            elif netags[i] == 'E-' + type:
                temp_str += words[i]
                ner_lst.append(temp_str)
        return ner_lst

    def get_pos_lst(self, sentence, type):
        """Collect maximal runs of words carrying the given POS tag.

        :return: list of concatenated word runs
        """
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        temp_tag = ''
        postag_lst = list()
        for i in range(len(postags)):
            if postags[i] == type:
                temp_tag += words[i]
            else:
                if temp_tag != '':
                    postag_lst.append(temp_tag)
                    temp_tag = ''
        if temp_tag != '':  # flush a run that ends at the sentence boundary
            postag_lst.append(temp_tag)
        return postag_lst

    def get_context_type(self, ques):
        """Decide whether the question asks for the previous or the next line.

        :return: '上文' (previous) or '下文' (next)
        """
        next_word = ['下句', '下一句', '下文', '后文']
        for word in next_word:
            if ques.find(word) != -1:
                return '下文'
        return '上文'

    def get_parse_oneclass(self, sent):
        """Get the second layer of the dependency tree.

        :return: the head word and a dict of the words depending on it
        """
        words = list(self.segmentor.segment(sent))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        result_arc = dict()
        # dependency indices start at 1; 0 denotes the root
        i = 0
        for arc in arcs:
            if arc.head == 0:
                root = {'word': words[i], 'rel': arc.relation,
                        'rel_index': i + 1, 'tag': postags[i]}
                head = words[i]
            i = i + 1
        i = 0
        for arc in arcs:
            if arc.head == root['rel_index']:
                result_arc[words[i]] = {'rel': arc.relation, 'tag': postags[i]}
            i = i + 1
        return head, result_arc

    def list_has_intersection(self, lsta, lstb):
        """Return a word of lsta that contains some word of lstb as a substring."""
        for wa in lsta:
            for wb in lstb:
                if wa.find(wb) > -1:
                    return wa
        return None

    def get_arc_by_index(self, arcs, index):
        """Get an arc by index (1-based, consistent with the dependency tree)."""
        i = 1
        for arc in arcs:
            if i == index:
                return arc
            i = i + 1

    def has_spe_words(self, text, lst):
        """Check whether the text contains any word from the list."""
        for word in lst:
            if text.find(word) > -1:
                return True
        return False

    def get_core_rel(self, arcs, words, word):
        index = words.index(word) + 1
        arc = self.get_arc_by_index(arcs, index)
        while arc.relation == 'ATT':
            arc = self.get_arc_by_index(arcs, arc.head)
        arc = self.get_arc_by_index(arcs, arc.head)
        return arc

    def get_index_list(self, str, word):
        """Get the list of indices at which word occurs in str.

        :return: a list of indices, empty if word does not occur
        """
        start = 0
        lst = list()
        while start < len(str):
            index = str.find(word, start)
            if index == -1:
                break
            else:
                lst.append(index)
                start = index + 1
        return lst

    def cal_dis_with_dict(self, ques_kw_dic, ans_kw_dic):
        """Distance between question-keyword and answer-keyword positions.

        :return: (keyword, distance) tuples sorted ascending by distance
        """
        result_dic = dict()
        for ans_kw, ans_kw_index_lst in ans_kw_dic.items():
            temp = 99999
            for ans_kw_index in ans_kw_index_lst:
                ans_kw_dis = 0
                for ques_kw, ques_kw_index_lst in ques_kw_dic.items():
                    temp_dis = 9999
                    for ques_kw_index in ques_kw_index_lst:
                        if abs(ques_kw_index - ans_kw_index) < temp_dis:
                            temp_dis = abs(ques_kw_index - ans_kw_index)
                    ans_kw_dis += temp_dis
                if ans_kw_dis < temp:
                    temp = ans_kw_dis
            result_dic[ans_kw] = temp
        # sort ascending by distance
        result_tup = sorted(result_dic.items(), key=lambda item: item[1])
        return result_tup

    def calc_dis_ner_with_dict(self, ques_kw_dic, ner_lst, ans):
        """For a set of entities, rank them by distance to the question keywords.

        :return: (entity, distance) tuples
        """
        ner_dic = dict()
        for ner in ner_lst:
            temp = self.get_index_list(ans, ner)
            if temp:
                ner_dic[ner] = temp
        return self.cal_dis_with_dict(ques_kw_dic, ner_dic)

    def get_final_result(self, result_lst):
        """Wrap the final result; report "未找到准确答案" when empty."""
        if len(result_lst) > 0:
            return result_lst[0][0]
        else:
            return "未找到准确答案"

    def gen_short_ans(self, ques_kw_lst, ans):
        """Truncate answers longer than 20 characters.

        :return: the (possibly truncated) answer
        """
        if len(ans) <= 20:
            return ans
        pattern = r"[,,。\.!!?\?]"
        lst = re.split(pattern, ans)
        result_dic = dict()
        for senten in lst:
            score = 0
            for kw in ques_kw_lst:
                if senten.find(kw) > -1:
                    score += 1
            result_dic[senten] = score
        result_dic = sorted(result_dic.items(), key=lambda item: item[1],
                            reverse=True)
        if len(result_dic) > 0:
            return result_dic[0][0][0:20]
        else:
            return "未找到精确答案"

    def do_ans_extract(self, sents, key_words, ques_type, ques, a, b):
        """Main answer-extraction entry point.

        :params: candidate answer sentences, keyword set (unused), question
                 type, the question, and two similarity parameters
        :return: the final answer, usually a single word; otherwise a
                 sentence of at most 20 characters; "未找到准确答案" if
                 neither can be found, and "没有找到相关内容" if there are no
                 suitable candidate sentences
        """
        self.sentences = sents          # candidate answer sentences
        self.key_words = key_words      # keyword set
        self.question_type = ques_type  # question type
        self.question = ques            # the question
        self.a = a
        self.b = b  # the two parameters of the syntactic similarity measure
        tfidf = analyse.extract_tags
        # positions of the question keywords
        ques_kw_lst = tfidf(ques)
        ques_kw_dic = dict()
        for kw in ques_kw_lst:
            lst = self.get_index_list(ques, kw)
            if lst:
                ques_kw_dic[kw] = lst
        # supplementary rule-based question classification
        if self.has_spe_words(self.question, self.isnum_lst):
            self.question_type = "NUMBER"
            ques_type = "NUMBER"
        elif self.has_spe_words(self.question, self.iscolor_lst):
            self.question_type = "COLOR"
            ques_type = "COLOR"
        elif self.has_spe_words(self.question, self.istime_lst):
            self.question_type = "TIME"
            ques_type = "TIME"
        elif self.has_spe_words(self.question, self.islocation_lst):
            self.question_type = "LOCATION"
            ques_type = "LOCATION"
        elif self.has_spe_words(self.question, self.isperson_lst):
            self.question_type = "PERSON"
            ques_type = "PERSON"
        elif self.has_spe_words(self.question, self.isorganization_lst):
            self.question_type = "ORGANIZATION"
            ques_type = "ORGANIZATION"
        # strip whitespace from the candidate sentences
        for i in range(len(self.sentences)):
            self.sentences[i] = ''.join(self.sentences[i].split())
        # first rank the sentences most likely to contain the answer
        ans_sentences = self.sort_sentences()
        if len(ans_sentences) == 0:
            return "没有找到相关内容"
        # take the most likely sentence into the next step
        ans = ans_sentences[0]
        # extraction strategy depends on the (classifier- or rule-assigned) type
        if self.question_type == "PERSON":
            final_anses = self.get_all_NER(ans, 'Nh')
            # drop person entities that already occur in the question
            temp_lst = list()
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                # return the entity closest to the question keywords
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'LOCATION':
            final_anses = self.get_all_NER(ans, 'Ns')
            temp_lst = list()
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'ORGANIZATION':
            final_anses = self.get_all_NER(ans, 'Ni')
            temp_lst = list()
            for ner in final_anses:
                if ques.find(ner) == -1:
                    temp_lst.append(ner)
            final_anses = temp_lst
            if len(final_anses) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            else:
                return self.get_final_result(
                    self.calc_dis_ner_with_dict(ques_kw_dic, final_anses, ans))
        elif self.question_type == 'NUMBER':
            for sentence in ans_sentences:
                for num_word in self.isnum_lst:
                    if self.question.find(num_word) > -1:
                        pattern = re.compile(
                            "([\d|零|一|二|三|四|五|六|七|八|九|十|百|千|万|亿]+){unit}".format(
                                unit=num_word[-1]))
                        final_anses = pattern.findall(ans)
                        if final_anses:
                            return self.get_final_result(
                                self.calc_dis_ner_with_dict(
                                    ques_kw_dic, final_anses, ans))
                num_lst = self.get_pos_lst(sentence, 'm')
                if len(num_lst) == 0:
                    return self.gen_short_ans(ques_kw_lst, ans)
                else:
                    return self.get_final_result(
                        self.calc_dis_ner_with_dict(ques_kw_dic, num_lst, ans))
            return ans_sentences[0]  # fallback kept from the original; unreachable
        elif self.question_type == 'TIME':
            for sentence in ans_sentences:
                time_lst = self.get_pos_lst(sentence, 'nt')
                if len(time_lst) == 0:
                    return self.gen_short_ans(ques_kw_lst, ans)
                else:
                    return self.get_final_result(
                        self.calc_dis_ner_with_dict(ques_kw_dic, time_lst, ans))
            return ans_sentences[0]  # fallback kept from the original; unreachable
        elif self.question_type == 'NEXT_SENTENCE':
            type = self.get_context_type(ques)
            pattern1 = re.compile('“(.*?)”')
            pattern2 = re.compile('"(.*?)"')
            shici_sent_lst = pattern1.findall(ques)
            shici_sent_lst.extend(pattern2.findall(ques))
            if len(shici_sent_lst) == 0:
                return self.gen_short_ans(ques_kw_lst, ans)
            shici_sent = shici_sent_lst[-1]
            # look for a candidate sentence containing the quoted line
            for sent in ans_sentences:
                if sent.find(shici_sent) > -1:
                    ans = sent
                    break
            punc_lst = [',', '.', '?', ',', '。', '?', '!', '!', '「', '」',
                        '"', '“', '”', "'", "‘", "’"]
            start_index = -1
            end_index = -1
            if type == '下文':
                index = ans.find(shici_sent)
                for i in range(index, len(ans)):
                    if ans[i] in punc_lst and start_index == -1:
                        start_index = i
                    elif ans[i] in punc_lst and end_index == -1:
                        end_index = i
                        break
                return ans[start_index + 1:end_index][0:20]
            else:
                index = ans.find(shici_sent)
                start_index = -1
                end_index = -1
                for i in range(index, -1, -1):
                    if ans[i] in punc_lst:
                        end_index = i
                        break
                for i in range(end_index - 1, -1, -1):
                    if ans[i] in punc_lst:
                        start_index = i
                        break
                return ans[start_index + 1:end_index][0:20]
        elif self.question_type == 'COLOR':
            # extract colour words
            ans_words = list(self.segmentor.segment(ans))
            ans_postags = list(self.postagger.postag(ans_words))
            final_anses = list()
            i = 0
            for tag in ans_postags:
                if tag == 'a' and ans_words[i].find("色") > -1 and len(ans_words[i]) > 1:
                    final_anses.append(ans_words[i])
                i = i + 1
            if final_anses:
                return '、'.join(final_anses)
            else:
                return self.gen_short_ans(ques_kw_lst, ans)
        elif self.question_type == 'AFFIRMATION':
            # yes/no question
            kw_lst = tfidf(self.question)[0:5]
            score = 0
            for kw in kw_lst:
                if ans.find(kw) > -1:
                    score = score + 1
            if score > len(kw_lst) / 2:
                return "是"
            else:
                return "否"
        else:
            # generic fallback for ordinary questions:
            # extract keywords from the question and the candidate answer
            ques_kw_lst = tfidf(ques)
            ans_kw_lst = tfidf(ans)
            # drop answer keywords that occur in the question
            temp_lst = list()
            for kw in ans_kw_lst:
                if ques.find(kw) == -1:
                    temp_lst.append(kw)
            ans_kw_lst = temp_lst
            # POS-tag the answer keywords and keep the noun-like ones
            ans_kw_postags = self.postagger.postag(ans_kw_lst)
            temp_lst = list()
            index = 0
            for postag in ans_kw_postags:
                if postag in ['n', 'nd', 'nh', 'ni', 'nl', 'ns', 'nt', 'nz']:
                    temp_lst.append(ans_kw_lst[index])
                index = index + 1
            ans_kw_lst = temp_lst
            # keyword positions in the question
            ques_kw_dic = dict()
            for kw in ques_kw_lst:
                lst = self.get_index_list(ques, kw)
                if lst:
                    ques_kw_dic[kw] = lst
            # keyword positions in the answer
            ans_kw_dic = dict()
            for kw in ans_kw_lst:
                lst = self.get_index_list(ans, kw)
                if lst:
                    ans_kw_dic[kw] = lst
            # after sorting by distance, take the closest word as the answer
            ans_dis = self.get_final_result(
                self.cal_dis_with_dict(ques_kw_dic, ans_kw_dic))
            return ans_dis

    def sort_sentences(self):
        """Rank the candidate sentences by similarity to the question."""
        # first get the centre-and-related word set of the question
        question_cr_words = self.get_centrial_and_rela_words(self.question)
        sims = []
        i = 0
        for sentence in self.sentences:
            try:
                sim = 0.1 * self.calc_similarity(sentence, question_cr_words) \
                      + 0.9 * self.cal_sim(sentence, self.question)
            except Exception as err:
                print(err)
                # the similarity measure can divide by zero
                sim = self.cal_sim(sentence, self.question)
            sims.append((i, sim))
            i = i + 1
        sims.sort(key=lambda item: item[1], reverse=True)
        ans_sentences = []
        for i in range(0, len(self.sentences)):
            ans_sentences.append(self.sentences[sims[i][0]])
        return ans_sentences

    def get_centrial_and_rela_words(self, sentence):
        """Dependency parse: the head word plus the words attached to it."""
        words = self.segmentor.segment(sentence)   # segmentation
        postags = self.postagger.postag(words)     # POS tagging
        arcs = self.parser.parse(words, postags)   # dependency parsing
        i = 1          # index of the current node in the parse tree
        layer_2 = []   # node indices of the second tree layer
        layer_3 = []   # node indices of the third tree layer
        for arc in arcs:
            # Possible rule-based refinements of the parse (not implemented):
            # 1. merge consecutive time words into one unit, dropping the
            #    dependencies between them;
            # 2. remove particles such as 的/地/得 and reattach their
            #    dependents directly to the particles' heads, keeping arc
            #    direction, unless a particle has several dependents;
            # 3. remove function words irrelevant to the sentence meaning,
            #    based on the POS tags, by the same rule as above.
            # HED marks the head word of the sentence
            if arc.relation == "HED":
                centrial_word = i  # the head word
            i = i + 1
        i = 1
        for arc in arcs:
            if arc.head == centrial_word:
                layer_2.append(i)  # second-layer nodes
            i = i + 1
        i = 1
        for arc in arcs:
            if arc.head in layer_2:
                layer_3.append(i)  # third-layer nodes
            i = i + 1
        # besides the head word itself, treat all second- and third-layer
        # words as attached to it; rela_words[0] is the head word
        rela_words = []
        rela_words.append(words[centrial_word - 1])
        for j in layer_2:
            rela_words.append(words[j - 1])
        for j in layer_3:
            rela_words.append(words[j - 1])
        return rela_words

    def calc_Dist(self, codes1, codes2):
        """Semantic distance between two words:
        Dist(A, B) = min{dist(m, n)}, with dist(m, n) = 2 * (7 - first_diff),
        where first_diff is the position of the first differing character of
        the two codes."""
        dist = 14
        for code1 in codes1:
            for code2 in codes2:
                first_diff = 7
                for i in range(0, 7):
                    if code1[i] != code2[i]:
                        first_diff = i
                        break
                tmp_dist = 2 * (7 - first_diff)
                if tmp_dist < dist:
                    dist = tmp_dist
        return dist

    def cal_sim(self, sentence, ques):
        """Keyword-based similarity; no syntax involved."""
        tfidf = analyse.extract_tags
        ques_keywords = tfidf(ques)
        score = 0
        for kw in ques_keywords:
            if sentence.find(kw) > -1:
                score = score + 1
        return score

    def calc_similarity(self, sentence, question_cr_words):
        """Similarity between a sentence and the question.

        Words missing from the thesaurus are most likely proper nouns; they
        are all mapped to the code for 谜/谜语, so unknown proper nouns are
        treated as similar to one another.
        """
        # parse the sentence and get its centre-and-related word set
        cr_words = self.get_centrial_and_rela_words(sentence)
        # similarity of the head words
        if question_cr_words[0] in self.sim_word_code:
            question_c_codes = self.sim_word_code[question_cr_words[0]]
        else:
            question_c_codes = ["Dk06D01"]  # the code for 谜/谜语
        if cr_words[0] in self.sim_word_code:
            c_codes = self.sim_word_code[cr_words[0]]
        else:
            c_codes = ["Dk06D01"]
        c_Dist = self.calc_Dist(question_c_codes, c_codes)
        if c_Dist == 0:
            c_sim = 1
        else:
            c_sim = 7 / (7 + c_Dist)
        # similarity of the non-head words
        question_r_codes = [0] * (len(question_cr_words) - 1)
        for i in range(1, len(question_cr_words)):
            if question_cr_words[i] in self.sim_word_code:
                question_r_codes[i - 1] = self.sim_word_code[question_cr_words[i]]
            else:
                question_r_codes[i - 1] = ["Dk06D01"]
        r_codes = [0] * (len(cr_words) - 1)
        for i in range(1, len(cr_words)):
            if cr_words[i] in self.sim_word_code:
                r_codes[i - 1] = self.sim_word_code[cr_words[i]]
            else:
                r_codes[i - 1] = ["Dk06D01"]
        # question-to-sentence direction
        q_s_sims = [0] * (len(question_cr_words) - 1)
        q_s_sim = 0
        for i in range(0, len(question_r_codes)):
            for j in range(0, len(r_codes)):
                tmp_Dist = self.calc_Dist(question_r_codes[i], r_codes[j])
                if tmp_Dist == 0:
                    q_s_sims[i] = 1
                else:
                    tmp_sim = 7 / (7 + tmp_Dist)
                    if tmp_sim > q_s_sims[i]:
                        q_s_sims[i] = tmp_sim
            q_s_sim += q_s_sims[i]
        q_s_sim = q_s_sim / len(question_r_codes)
        # sentence-to-question direction
        s_q_sims = [0] * (len(cr_words) - 1)
        s_q_sim = 0
        for i in range(0, len(r_codes)):
            for j in range(0, len(question_r_codes)):
                tmp_Dist = self.calc_Dist(r_codes[i], question_r_codes[j])
                if tmp_Dist == 0:
                    s_q_sims[i] = 1
                else:
                    tmp_sim = 7 / (7 + tmp_Dist)
                    if tmp_sim > s_q_sims[i]:
                        s_q_sims[i] = tmp_sim
            s_q_sim += s_q_sims[i]
        s_q_sim = s_q_sim / len(r_codes)
        res = self.a * c_sim + self.b * ((q_s_sim + s_q_sim) / 2)
        return res
def add_feature(self):
    def getshape(word):
        """Map each character to a shape class: A/a/0 for upper/lower/digit,
        p for punctuation in self.pos, b for common Chinese punctuation,
        c for anything else."""
        r = ''
        for w in word:
            if w.isupper():
                r = r + 'A'
            elif w.islower():
                r = r + 'a'
            elif w.isdigit():
                r = r + '0'
            elif w in self.pos:
                r = r + 'p'
            elif w in ['”', '、', '“', '。', ';', ',', '?', '!', '', '']:
                r = r + 'b'
            else:
                r = r + 'c'
        return r

    def path_cal(begin_idx, end_idx, arcs):
        """Path between two nodes of the dependency tree: climb from each
        index toward the root and join the two paths."""
        begin_path_index = []
        flag = False
        while arcs[begin_idx].head != 0:
            if begin_idx == end_idx:
                flag = True
                break
            begin_path_index.append(begin_idx)
            begin_idx = arcs[begin_idx].head - 1
        begin_path_index.append(begin_idx)
        if flag:
            return begin_path_index
        else:
            end_path_index = []
            while arcs[end_idx].head != 0:
                if end_idx == begin_idx:
                    flag = True
                    break
                end_path_index.append(end_idx)
                end_idx = arcs[end_idx].head - 1
            end_path_index.append(end_idx)
            if flag:
                return end_path_index
            else:
                end_path_index.reverse()
                path_index = begin_path_index + end_path_index[1:]
                return path_index

    def get_answer_pos(l, answer):
        """BIOES-style labels marking where the answer occurs in token list l."""
        r = [0 for n in range(len(l))]
        r_str = ''
        i = 0
        while r_str != answer[1:-2] and i < len(l):
            if l[i] in answer:
                r_str += l[i]
                r[i] = 1
            else:
                r_str = ''
                for j in range(i):
                    r[j] = 0
            i += 1
        if sum(r) == 0:
            for j in range(len(r)):
                if answer[1:-2] in l[j]:
                    r[j] = 1
        if sum(r) == 0:
            i = 0
            while r_str != answer[1:-2].replace(' ', '') and i < len(l):
                if l[i] in answer or l[i] in answer.replace(' ', ''):
                    r_str += l[i]
                    r[i] = 1
                else:
                    r_str = ''
                    for j in range(i):
                        r[j] = 0
                i += 1
        if sum(r) == 1:
            for i in range(len(r)):
                if r[i] == 1:
                    r[i] = 'S'  # alternatively 'B'
                else:
                    r[i] = 'O'
        else:
            for i in range(len(r)):
                if r[i] == 1:
                    r[i] = 'I'
                else:
                    r[i] = 'O'
            for i in range(len(r)):
                if r[i] == 'I':
                    r[i] = 'B'
                    break
            for i in range(len(r) - 1, 0, -1):
                if r[i] == 'I':
                    r[i] = 'E'
                    break
        return r

    def adjust_list(l, words):
        """Re-merge adjacent tokens of l that occur in `words` as one unit."""
        l.insert(0, '"')
        n = len(l)
        cut_list = []
        cut_list.append(l[0])
        for i in range(1, n):
            if cut_list[-1] + l[i] in words:
                cut_list[-1] = cut_list[-1] + l[i]
                continue
            else:
                cut_list.append(l[i])
        cut_list = cut_list[1:]
        return cut_list

    segmentor = Segmentor()
    segmentor.load('cws.model')
    postagger = Postagger()  # create an instance
    postagger.load('pos.model')  # load the model
    recognizer = NamedEntityRecognizer()  # create an instance
    recognizer.load('ner.model')  # load the model
    parser = Parser()
    parser.load('parser.model')
    labeller = SementicRoleLabeller()  # create an instance
    labeller.load('pisrl.model')  # load the model

    fti = open('AC/tf-idf.txt', 'r')
    all_word = []
    for line in fti:
        k = line[:-1].split('\t')[0]
        all_word.append(k)
    all_word = set(all_word)

    nj = 0
    word_feature = []
    word_group = []
    for k in self.data.keys():
        q_list = []
        q_pos_list = []
        q_sbv = []
        q_vob = []
        q_v = []
        q_att1 = []
        q_att2 = []
        cut_line = '\t'.join(segmentor.segment(self.data[k][0]))
        word_list = cut_line.split('\t')  # segmentation of the question
        for i in word_list:
            if i not in self.stop_word:
                q_list.append(i)
        # q_list = adjust_list(q_list, all_word)
        postags = postagger.postag(q_list)  # POS tagging
        pos_line = '\t'.join(postags)
        q_pos_list = pos_line.split('\t')
        netags = recognizer.recognize(q_list, postags)  # NER
        ner_line = '\t'.join(netags)
        ner_list = ner_line.split('\t')
        q_ner = []
        ner_str = ''
        for nr in range(len(ner_list)):
            if ner_list[nr][0] != 'O':
                if ner_list[nr][0] == 'S' or ner_list[nr][0] == 'E':
                    ner_str += q_list[nr]
                    q_ner.append(ner_str)
                    ner_str = ''
                else:
                    ner_str += q_list[nr]
        q_arcs = parser.parse(q_list, q_pos_list)  # dependency parsing
        arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in q_arcs)
        arcs_list = arcs_line.split('\t')
        # collect subject/object/attribute words from the question parse
        for n in range(len(arcs_list)):
            if arcs_list[n].split()[1] == 'SBV':
                q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                q_sbv.append(q_list[n])
            elif arcs_list[n].split()[1] == 'VOB':
                q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                q_vob.append(q_list[n])
            elif arcs_list[n].split()[1] == 'IOB':
                q_v.append(q_list[int(arcs_list[n].split()[0]) - 1])
                q_vob.append(q_list[n])
            elif arcs_list[n].split()[1] == 'FOB':
                q_vob.append(q_list[int(arcs_list[n].split()[0]) - 1])
                q_v.append(q_list[n])
            elif arcs_list[n].split()[1] == 'ATT':
                q_att1.append(q_list[int(arcs_list[n].split()[0]) - 1])
                q_att2.append(q_list[n])
        q_key = []
        q_key_l = 0.0
        for i in range(len(q_list)):
            if q_list[i] not in self.stop_word1 and q_pos_list[i] != 'r':
                q_key.append(q_list[i])
        q_w = ''
        for q in q_list:
            if q in self.question_word:
                q_w = q
        if q_w == '':
            q_w = q_list[-3]
        a_list = []
        a_pos_list = []
        cut_line = '\t'.join(segmentor.segment(self.data[k][1]))
        word_list = cut_line.split('\t')  # segmentation of the answer
        for i in word_list:
            if i not in self.stop_word:
                a_list.append(i)
        # a_list = adjust_list(a_list, all_word)
        postags = postagger.postag(a_list)  # POS tagging
        pos_line = '\t'.join(postags)
        a_pos_list = pos_line.split('\t')
        netags = recognizer.recognize(a_list, postags)  # NER
        ner_line = '\t'.join(netags)
        ner_list = ner_line.split('\t')
        arcs = parser.parse(a_list, a_pos_list)  # dependency parsing
        arcs_line = "\t".join("%d %s" % (arc.head, arc.relation) for arc in arcs)
        arcs_list = arcs_line.split('\t')
        aci = 0  # index of the HED (head) word of the answer
        for i in range(len(arcs_list)):
            if arcs_list[i].split(' ')[1] == 'HED':
                aci = i
                break
        a_key = []
        a_key_l = 0.0
        for i in range(len(a_list)):
            if a_list[i] not in self.stop_word1 and a_list[i] in q_key:
                a_key.append(a_list[i])
        if a_key == []:
            a_key_l = 5.0
        else:
            for qkw in a_key:
                q_key_l += len(path_cal(q_list.index(q_w), q_list.index(qkw), q_arcs))
            q_key_l /= len(a_key)
        r_pos = get_answer_pos(a_list, self.data[k][2])
        for i in range(len(a_list)):
            str_f = a_list[i]
            w_l = 0.0
            for j in range(len(a_list)):
                if a_list[j] in q_key:
                    w_l += 1 / (math.fabs(i - j) + 1)
            if w_l == 0.0:
                w_l = 5.0
            if a_list[i] in set(q_list):
                str_f += '\tin_q'
            else:
                str_f += '\tnot_in_q'
            if a_list[i] in set(q_ner):
                str_f += '\tin_qner'
            else:
                str_f += '\tnot_in_qner'
            if a_list[i] in set(q_sbv):
                str_f += '\tin_sbv'
            else:
                str_f += '\tnot_in_sbv'
            if a_list[i] in set(q_v):
                str_f += '\tin_qv'
            else:
                str_f += '\tnot_in_qv'
            if a_list[i] in set(q_vob):
                str_f += '\tin_qvob'
            else:
                str_f += '\tnot_in_qvob'
            if a_list[i] in set(q_att1):
                str_f += '\tin_att1'
            else:
                str_f += '\tnot_in_att1'
            if a_list[i] in set(q_att2):
                str_f += '\tin_att2'
            else:
                str_f += '\tnot_in_att2'
            if a_list[i] in set(self.pos):
                str_f += '\t' + a_list[i]
            else:
                str_f += '\tnot_in_pos'
            str_f += '\t' + self.data[k][-1] + '_' + a_pos_list[i]
            str_f += '\t' + a_pos_list[i]
            str_f += '\t' + str(round(w_l, 1))
            if a_key == []:
                a_key_l = 5.0
            else:
                for qkw in a_key:
                    a_key_l += len(path_cal(i, a_list.index(qkw), arcs))
                a_key_l /= len(a_key)
                a_key_l -= q_key_l
            str_f += '\t' + str(round(a_key_l, 1))
            str_f += '\t' + str(len(a_list[i]))
            str_f += '\t' + getshape(a_list[i])
            if a_list[i] in self.syn_dict.keys():
                str_f += '\t' + self.syn_dict[a_list[i]]
            else:
                str_f += '\t' + 'N-syn'
            str_f += '\t' + str(arcs_list[i].split(' ')[1])
            str_f += '\t' + str(math.fabs(aci - i))
            str_f += '\t' + str(r_pos[i])
            word_feature.append(str_f)
        word_group.append(str(len(a_list)))
        nj += 1
        if nj % 1000 == 0:
            print(nj)
    with open('AS/train.txt', 'w') as f1:
        for wf in word_feature[:]:
            f1.write(wf)
            f1.write('\n')
    with open('AS/group.txt', 'w') as f3:
        for wg in word_group:
            f3.write(str(wg))
            f3.write('\n')
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "pisrl.model"))
roles = labeller.label(words, postags, arcs)
for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
         for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
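# The pipeline above and the variant below differ in more than style: the
# newer single-file pisrl model is called without NE tags, while the older
# srl/ directory model expects them. A side-by-side sketch (worth verifying
# against your installed pyltp version):
roles = labeller.label(words, postags, arcs)            # pyltp >= 3.4, pisrl.model / pisrl_win.model
# roles = labeller.label(words, postags, netags, arcs)  # pyltp 3.3.x, srl/ directory model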
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
         for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()