def role_label(self, words, postags, arcs):
    """Semantic role labeling.

    :param words: segmented words
    :param postags: POS tags
    :param arcs: dependency parsing result
    :return: the labelled roles
    """
    srl_model = os.path.join(self.MODEL_PATH, 'pisrl_win.model')
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model)  # load the model
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join([
            "{0}:({1},{2})".format(arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()
    return roles
class LTP_word():
    """docstring for parser_word
    deal: process a text and return five values: word list, POS tags,
    dependency arcs, semantic roles and named entities
    release: free the loaded models"""
    def __init__(self, model_path):
        self.model_path = model_path
        self.segmentor = Segmentor()  # initialize the segmenter
        self.segmentor.load_with_lexicon(
            path.join(self.model_path, 'cws.model'),
            path.join(self.model_path, 'dictionary_kfc.txt'))
        self.postagger = Postagger()  # initialize the POS tagger
        self.postagger.load(path.join(self.model_path, 'pos.model'))  # load the model
        self.recognizer = NamedEntityRecognizer()  # initialize the NER instance
        self.recognizer.load(path.join(self.model_path, 'ner.model'))
        self.parser = Parser()  # initialize the dependency parser
        self.parser.load(path.join(self.model_path, 'parser.model'))  # load the model
        self.labeller = SementicRoleLabeller()  # initialize the semantic role labeller
        self.labeller.load(path.join(self.model_path, 'srl'))

    def deal(self, text):  # run every component and collect all the results
        words = self.segmentor.segment(text)  # word segmentation
        postags = self.postagger.postag(words)  # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entities
        arcs = self.parser.parse(words, postags)  # dependency parsing
        roles = self.labeller.label(words, postags, netags, arcs)  # semantic role labeling
        return words, postags, arcs, roles, netags

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
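# A minimal usage sketch of LTP_word (not from the source), assuming pyltp is
# installed and the model directory contains cws.model, pos.model, ner.model,
# parser.model, the srl model and the custom lexicon dictionary_kfc.txt used
# above; the path below is a placeholder.
ltp = LTP_word('/path/to/ltp_data')
words, postags, arcs, roles, netags = ltp.deal('李克强总理今天考察企业。')
print(list(words), list(postags), list(netags))
ltp.release()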
class LTP:
    def __init__(self):
        self.segmentor = Segmentor()  # word segmenter
        self.segmentor.load_with_lexicon(
            Config.SEGMENTOR_PATH, Config.PERSONAL_SEGMENTOR_PATH)  # load the model
        self.postagger = Postagger()  # POS tagger
        self.postagger.load(Config.POSTAGGER_PATH)  # load the model
        self.parser = Parser()  # dependency parser
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(Config.NAMED_ENTITY_RECONGNTION_PATH)
        self.parser.load(Config.PARSER_PATH)  # load the model
        self.labeller = SementicRoleLabeller()  # semantic role labeller
        self.labeller.load(Config.LABELLER_PATH)  # load the model
        self.negative_list = get_negative_list()
        self.no_list = get_no_list()
        self.limit_list = get_limit_list()
        self.special_list = get_special_list()
        self.key_sentences = []

    def __del__(self):
        """Release the loaded models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
def role(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    # labeller.load('/usr/local/src/ltp_data/srl')  # load the model
    labeller.load(srl_model_path)  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    """
    # arg.name        the semantic role type
    # arg.range.start the index of the argument's first word
    # arg.range.end   the index of the argument's last word
    roletype = {'C-A0': '施事', 'A0': '施事', 'A1': '受事', 'A2': '间接对象',
                'A3': '直接目标', 'A4': '直接方法', 'A5': '其它', 'ADV': '附词',
                'BNE': '受益人', 'CND': '条件', 'DIR': '方向', 'DGR': '程度',
                'EXT': '扩展', 'FRQ': '频率', 'LOC': '地点', 'MNR': '方式',
                'PRP': '目的或原因', 'TMP': '时间', 'TPC': '主题', 'CRD': '并列',
                'PRD': '谓词', 'PSR': '持有者', 'PSE': '被持有', 'DIS': '转折'}
    postype = {'A0': '施事', 'A1': '受事', 'A2': '间接对象', 'A3': '直接目标',
               'A4': '直接方法', 'A5': '其它', 'ADV': '附词', 'BNE': '受益人',
               'CND': '条件', 'DIR': '方向', 'DGR': '程度', 'EXT': '扩展',
               'FRQ': '频率', 'LOC': '地点', 'MNR': '方式', 'PRP': '目的或原因',
               'TMP': '时间', 'TPC': '主题', 'CRD': '并列', 'PRD': '谓词',
               'PSR': '持有者', 'PSE': '被持有'}
    for role in roles:
        # print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        outstr = ""
        for arg in role.arguments:
            block = ''
            for num in range(arg.range.start, arg.range.end + 1):
                block = block + words[num] + '[%d-%s]' % (num, postags[num])
            outstr = outstr + roletype[arg.name] + "(%s);" % block
        print('%d-%s' % (role.index, words[role.index]) + ":" + outstr)
    """
    labeller.release()  # release the model
    return roles
def semantic_role_label(self):
    # dependency parsing
    parser = Parser()
    parser.load('ltp_data/parser.model')
    arcs = parser.parse(self.words, self.postags)
    parser.release()
    labeller = SementicRoleLabeller()
    labeller.load('ltp_data/srl')
    roles = labeller.label(self.words, self.postags, self.netags, arcs)
    Label_AX = []  # arguments labelled A0 or A1
    for role in roles:
        Label_AX.extend([
            arg for arg in role.arguments
            if arg.name == "A0" or arg.name == "A1"
        ])
    for label in Label_AX:
        # skip A0/A1 agents or patients whose span length is abnormal
        if 0 < label.range.end - label.range.start < 10:
            for i in range(label.range.start, label.range.end + 1):
                # keep common nouns, place, person and organization names
                # inside the agent/patient span as entities
                if self.postags[i] in ("n", "ns", "nh", "ni"):
                    self.entity.append(self.words[i])
    labeller.release()
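# A self-contained sketch of the same A0/A1 filtering idea, decoupled from the
# class above; the helper name extract_entities_from_roles is ours, and the
# roles are assumed to come from a pyltp SementicRoleLabeller as in the source.
def extract_entities_from_roles(words, postags, roles):
    entities = []
    for role in roles:
        for arg in role.arguments:
            # same constraints as above: A0/A1 spans of sane length
            if arg.name in ("A0", "A1") and 0 < arg.range.end - arg.range.start < 10:
                for i in range(arg.range.start, arg.range.end + 1):
                    if postags[i] in ("n", "ns", "nh", "ni"):
                        entities.append(words[i])
    return entities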
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('/Users/chenming/Spyder/3.3.1/ltp_data/srl/')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # release the model
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    # netags is unused by this pisrl-style label() API
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    # for role in roles:
    #     print(role.index, "".join(["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    labeller.release()  # release the model
    return roles
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    model = "srl"
    labeller.load(os.path.join(modelPath, model))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
             for arg in role.arguments]))
    labeller.release()  # release the model
def ltp_sementic_role_labeller(LTP_DATA_DIR, words, postags, arcs):
    # Path of the semantic role labeling model. Older LTP releases shipped the
    # model as the `srl` directory; here a model file inside it is loaded.
    # On Windows, use the pisrl_win.model variant.
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl/pisrl_win.model')
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    labeller.release()  # release the model
    return roles
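# The snippets in this section mix two pyltp SRL generations: the old `srl/`
# directory model, whose label() also takes NER tags, and the newer
# pisrl*.model file, whose label() takes only words, postags and arcs. A
# hedged dispatch sketch (the is_old_srl flag and function name are ours):
def label_roles(labeller, words, postags, netags, arcs, is_old_srl=False):
    if is_old_srl:
        # old API, used with the srl/ directory model
        return labeller.label(words, postags, netags, arcs)
    # newer API, used with pisrl.model / pisrl_win.model
    return labeller.label(words, postags, arcs)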
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('../ltp_data/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    # print('----------------')
    # for role in roles:
    #     print(role.index, "".join(
    #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    # print('----------------')
    labeller.release()  # release the model
    return roles
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(os.path.join(LTP_DATA_DIR, 'srl'))  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def role_label(words, postags, netags, arcs):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load('/Users/zhangqinyuan/Downloads/ltp_data_v3.4.0/srl')  # load the model
    roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def SrlFunction(contents):
    from pyltp import Segmentor
    segmentor = Segmentor()  # initialize the instance
    # segmentor.load(cws_model_path)  # load the model
    segmentor.load_with_lexicon(cws_model_path, 'E:\\ltp_data_v3.4.0\\personal_seg.txt')
    words = segmentor.segment(contents)  # word segmentation
    k = 1
    for word in words:
        print(word + str(k) + ' ', end='')
        k = k + 1
    print('\n')
    # print('\t'.join(words))
    wordslist = list(words)  # copy before releasing the model
    segmentor.release()  # release the model

    from pyltp import Postagger
    postagger = Postagger()
    # postagger.load(pos_model_path)
    postagger.load_with_lexicon(pos_model_path, 'D:\\ltp_data_v3.4.0\\personal_pos.txt')
    postags = list(postagger.postag(wordslist))  # copy before releasing the model
    print('\t'.join(postags))
    postagger.release()
    # wordslist = ['人力资源社会保障局', '主管', '医疗保险', '工作']
    # postags = ['n', 'v', 'n', 'v']

    from pyltp import Parser
    parser = Parser()  # initialize the instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(wordslist, postags)  # dependency parsing
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

    from pyltp import SementicRoleLabeller
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    # arcs is the dependency parsing result
    roles = labeller.label(wordslist, postags, arcs)  # semantic role labeling
    # print the results
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def get_role_list(self, words, postags):
    parser = Parser()
    parser.load(Dependency.par_model)
    rolelabel = SementicRoleLabeller()
    rolelabel.load(Dependency.pisrl_model)
    try:
        parsers = parser.parse(words, postags)
        roles = rolelabel.label(words, postags, parsers)
    except Exception as e:
        roles = [[]]
    finally:
        parser.release()
        rolelabel.release()
    return roles
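# get_role_list above wraps label() in try/finally, so both models are
# released even when SRL throws, and it returns [[]] on failure instead of
# raising. A usage sketch (srl_helper is a hypothetical instance of the
# enclosing class; words/postags come from the segmenter and tagger):
roles = srl_helper.get_role_list(words, postags)
for role in roles:
    # empty-list entries come from the failure path and have no arguments
    for arg in getattr(role, 'arguments', []):
        print(role.index, arg.name, arg.range.start, arg.range.end)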
def get_roles_by_pyltp(self, words_list, postags_list, arcs_list):
    roles_list = list()
    # path of the semantic role labeling model, file name 'pisrl.model'
    srl_model_path = os.path.join(self.ltp_dir_path, "pisrl.model")
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(words_list, postags_list, arcs_list)
    roles_list = list(roles)  # copy the result before releasing the model
    labeller.release()
    # attempts to free the memory explicitly
    # import gc
    # del labeller
    # gc.collect()  # this did not help
    return roles_list
def get_srl(sentence):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    words = list(pyltp_cut(sentence))  # pyltp segmentation
    postags = list(postagger.postag(words))  # POS tagging
    arcs = get_parsing(sentence)  # arcs is the dependency parsing result
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    # print the results
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
def get_srl(self, words):
    # semantic role labeling
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(self.srl_model_path)  # load the model
    # arcs is the dependency parsing result
    postags = self.get_postags(words)
    arcs = self.get_dependency(words)
    roles = labeller.label(words, postags, arcs)  # semantic role labeling
    # print the results
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()  # release the model
    return roles
def labeller(word_tag, arcs, srl_model_path):
    '''
    Desc: semantic role labeling
    Args:
        word_tag(dict)  word -> POS tag dictionary
        arcs            dependency parsing result
        srl_model_path  semantic role labeling model
    '''
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(list(word_tag.keys()), list(word_tag.values()), arcs)
    for role in roles:
        print(role.index, "".join([
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments
        ]))
    labeller.release()
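# Caveat (ours, not from the source): word_tag is keyed by the word itself, so
# a sentence with repeated tokens loses entries, and the keys()/values() lists
# stop lining up with arcs. A variant that takes the parallel lists directly
# avoids this (labeller_from_lists is our name; same model assumption):
def labeller_from_lists(words, postags, arcs, srl_model_path):
    labeller = SementicRoleLabeller()
    labeller.load(srl_model_path)
    roles = labeller.label(list(words), list(postags), arcs)
    for role in roles:
        print(role.index, "".join(
            "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
            for arg in role.arguments))
    labeller.release()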
def sentence_label(parse_result):
    labeller = SementicRoleLabeller()  # initialize the instance
    labeller.load(srl_model_path)  # load the model
    i = 0
    final_result = []
    for key, value in parse_result.items():
        i += 1
        if i % 50 == 0:
            print('taking a short break')
            time.sleep(5)
        words = value[0]
        postags = value[1]
        arcs = value[2]
        roles = labeller.label(words, postags, arcs)
        final_result.append((key, roles))  # collect the result for this sentence
        print('done')
    print(final_result)
    labeller.release()
class LtpParser(object):
    def __init__(self, data_dir: str):
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(data_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(data_dir, "pos.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(data_dir, "ner.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(data_dir, "parser.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(data_dir, "pisrl.model"))

    def parse(self, text: str) -> dict:
        tokens = self.segmentor.segment(text)
        postags = self.postagger.postag(tokens)
        netags = self.recognizer.recognize(tokens, postags)
        arcs = self.parser.parse(tokens, postags)
        roles = self.labeller.label(tokens, postags, arcs)
        srlabels = {}
        for role in roles:
            srlabels[role.index] = {
                arg.name: {
                    "start": arg.range.start,
                    "end": arg.range.end
                }
                for arg in role.arguments
            }
        return {
            "tokens": list(tokens),
            "postags": list(postags),
            "netags": list(netags),
            "srlabels": srlabels,
        }

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()
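# Usage sketch for LtpParser, assuming a pyltp 0.2.x data directory with the
# five models listed above (the path is a placeholder):
ltp_parser = LtpParser("/path/to/ltp_data_v3.4.0")
result = ltp_parser.parse("李克强总理今天考察企业。")
print(result["tokens"])
print(result["srlabels"])  # {predicate index: {role name: {"start": ..., "end": ...}}}
ltp_parser.release()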
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# passing a list of strings is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
         for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
class Extractor():
    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1
        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i, x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        relations = [i for i, x in enumerate(arcs)
                     if x.head == root and x.relation == 'COO']
        relations.insert(0, root)
        prev_e1 = None
        e1 = None
        for rel in relations:
            left_arc = [i for i, x in enumerate(arcs)
                        if x.head == rel and x.relation == 'SBV']
            if len(left_arc) > 1:
                pass  # raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)],
                            offset + leftmost)
            prev_e1 = e1
            right_arc = [i for i, x in enumerate(arcs)
                         if x.head == rel and x.relation == 'VOB']
            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])
                items = [i for i, x in enumerate(arcs)
                         if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items
                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)
                    e2 = None
                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)],
                                    offset + leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])
                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
class SentenceParser:
    def __init__(self):
        # LTP_DIR = './ltp_data_v3.4.0'
        print("loading models from", LTP_DIR)
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print("models loaded")

    '''Dependency parsing --- maintain, for every word, a dict of its dependency children (the word's out-edges)'''
    '''
    In a dependency parse every word has (at most) one incoming arc but possibly several outgoing arcs.
    To present the analysis in a structured way, and to make information extraction easier,
    we build a children dict for every word:
    1) if the word's out-degree is 0, the dict is empty
    2) if the word's out-degree is n, the dict holds n entries
    '''
    def build_parse_child_dict(self, words, postags, arcs):
        """
        Format the dependency parsing result.
        :param words: segmentation result
        :param postags: POS tagging result
        :param arcs: dependency parsing result
        :return: child_dict_list, format_parse_list
        """
        '''
        arcs is a list with one element per word; each arc carries arc.head and
        arc.relation, where head is the 1-based index of the word's parent and
        relation is the dependency relation between them.
        *** Since every word has exactly one incoming arc, the arc fully describes
        that head link; the LTP parser output thus gives each word's single parent.
        Returned values:
        child_dict_list: the out-edge (children) information of every word
        format_parse_list: one record per word: relation to parent, word, word
        index, word POS, parent word, parent index, parent POS
        '''
        child_dict_list = []
        format_parse_list = []
        # build the children info of every word
        for index in range(len(words)):
            child_dict = dict()
            # scan the arcs for children of this word
            for arc_index in range(len(arcs)):
                # if an arc points at this word, record the child in child_dict
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        # build the per-word record:
        # [relation to parent, word, index, POS, parent word, parent index, parent POS]
        # NER tags could be added here as well
        rely_id = [arc.head for arc in arcs]  # parent id of every word (0 is Root)
        relation = [arc.relation for arc in arcs]  # dependency relation of every word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent words
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                 postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''Semantic role labeling'''
    '''
    Only the predicates of the sentence are analysed: for each predicate its
    arguments are extracted and the argument-predicate relations are labelled.
    '''
    def format_labelrole(self, words, postags):
        """
        Format the semantic role labeling result.
        :param words:
        :param postags:
        :return:
        """
        arcs = self.parser.parse(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        roles_dict = {}
        '''
        roles holds one role object per predicate in the sentence:
        role.index is the predicate index;
        role.arguments are the semantic arguments of that predicate
        (an argument may cover more than one word);
        arg.name is the semantic role type;
        arg.range.start is the index of the argument's first word (0-based);
        arg.range.end is the index of its last word.
        Overall shape:
        roles = {role: {arg: {'name': role type,
                              'range': {'start': first word index,
                                        'end': last word index}},
                        ...},
                 ...}
        '''
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        return roles_dict

    def close(self):
        """Close and release the models."""
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()
        self.labeller.release()

    '''main entry of the parser'''
    '''
    Post-process the model outputs for downstream use.
    Model outputs: words, postags, ners, arcs, roles
    Derived data:
    child_dict_list: dependency parse, children info of every word
    format_parse_list: dependency parse, each word's record plus its (unique) parent info
    roles_dict: semantic roles per predicate
    '''
    def parser_main(self, sentence):
        '''words, postags, ners, arcs are the LTP model outputs'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        ners = list(self.recognizer.recognize(words, postags))
        arcs = self.parser.parse(words, postags)
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        """
        arcs holds one arc per word:
        arc.head is the index of the parent word; ROOT has index 0 and the
        words are numbered 1, 2, 3, ...
        arc.relation is the dependency relation.
        Note: at most one arc points at any word (in-degree 1), but a word may
        point at several words (out-degree n).
        """
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, ners, child_dict_list, format_parse_list, roles_dict
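# Usage sketch for SentenceParser, assuming LTP_DIR points at the model
# directory named in __init__ (the sentence is a placeholder):
sp = SentenceParser()
words, postags, ners, child_dict_list, format_parse_list, roles_dict = \
    sp.parser_main("李克强总理今天考察企业。")
print(roles_dict)  # {predicate index: {role name: [name, start, end]}}
sp.close()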
class RequestHandler():
    def __init__(self):
        self.intents = [
            'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel',
            'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message',
            'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule',
            'riddle', 'email', 'contacts', 'bus', 'website', 'datetime',
            'poetry', 'lottery', 'chat', 'epg', 'telephone'
        ]
        self.segmentor = Segmentor()  # initialize the CWS instance
        self.segmentor.load(configs.cws_path)  # load the model
        self.postagger = Postagger()  # initialize the POS tagger instance
        self.postagger.load(configs.pos_path)  # load the model
        self.labeller = SementicRoleLabeller()  # initialize the SRL instance
        self.labeller.load(configs.srl_path)  # load the model
        self.parser = Parser()  # initialize the parser instance
        self.parser.load(configs.parser_path)  # load the model
        self.ac = ACAutomatons()
        self.clf_31 = NBSVM()
        self.char_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-ch.pkl')
        self.word_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-wd.pkl')
        self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl')
        self.ch2_ = joblib.load(configs.models_path + '/nbsvm-feature_selector.pkl')
        self.word_vectorizer_tv = joblib.load(configs.models_path + '/vocab-wd_epg-tvchannel.pkl')
        self.char_vectorizer_tv = joblib.load(configs.models_path + '/vocab-ch_epg-tvchannel.pkl')
        self.clf_tv = joblib.load(configs.models_path + '/svm_epg-tvchannel.pkl')
        self.word_vectorizer_movie = joblib.load(configs.models_path + '/vocab-wd_video-cinemas.pkl')
        self.char_vectorizer_movie = joblib.load(configs.models_path + '/vocab-ch_video-cinemas.pkl')
        self.clf_movie = joblib.load(configs.models_path + '/svm_video-cinemas.pkl')
        self.char_vectorizer_internet = joblib.load(configs.models_path + '/vocab-ch_website-app.pkl')
        self.word_vectorizer_internet = joblib.load(configs.models_path + '/vocab-wd_website-app.pkl')
        self.clf_internet = joblib.load(configs.models_path + '/svm_website-app.pkl')
        self.char_vectorizer_star = joblib.load(configs.models_path + '/vocab-ch_video-music.pkl')
        self.clf_star = joblib.load(configs.models_path + '/svm_video-music.pkl')
        self.word_vectorizer_star = joblib.load(configs.models_path + '/vocab-wd_video-music.pkl')
        self.char_vectorizer_video = joblib.load(configs.models_path + '/vocab-ch_video-epg.pkl')
        self.word_vectorizer_video = joblib.load(configs.models_path + '/vocab-wd_video-epg.pkl')
        self.clf_video = joblib.load(configs.models_path + '/svm_video-epg.pkl')

    def getResult(self, sentence):
        """1. Complete the classification in this function.

        Args:
            sentence: A sentence string.

        Returns:
            classification: A string holding the classification result.
        """
        processed = self.preprocess(sentence)
        return self.pipeline(processed)

    def getBatchResults(self, sentencesList):
        """2. You can also complete the classification in this function,
        if you want to classify the sentences in batch.

        Args:
            sentencesList: A list of dictionaries of ids and sentences, like:
                [{'id': 331, 'content': '帮我打电话给张三'},
                 {'id': 332, 'content': '帮我订一张机票!'},
                 ...]

        Returns:
            resultsList: A list of dictionaries of ids and results.
                The order of the list must be the same as the input list, like:
                [{'id': 331, 'result': 'telephone'},
                 {'id': 332, 'result': 'flight'},
                 ...]
        """
        resultsList = []
        for sentence in sentencesList:
            resultDict = {}
            resultDict['id'] = sentence['id']
            resultDict['result'] = self.getResult(sentence['content'])
            resultsList.append(resultDict)
        return resultsList

    def pattern_match(self, sample):
        srl_res = self.sRLMatch(sample)
        if srl_res is not None:
            return srl_res
        rul_res = self.ruleMatch(sample)
        if rul_res is not None:
            return rul_res
        return None

    def ruleMatch(self, sample):
        domains = get_rule(sample['query'], self.ac)
        if len(domains) < 1:
            return None
        sorted_domains = aggregate_domains(domains)
        for each in sorted_domains:
            if each[0] == 'datetime':
                nouns = get_nouns(sample['query'], 'festival', self.ac)
                if len(nouns) > 0:
                    return 'datetime'
                else:
                    continue
            elif each[0] == 'email':
                if len(set(sample['word']) &
                       set(['写', '回复', '转发', '打开', '查收', '查看', '答复'])) > 0:
                    return 'email'
                else:
                    continue
            else:
                return None

    def sRLMatch(self, sample):
        srl_res = getSRL(sample['query'], self.segmentor, self.postagger,
                         self.parser, self.labeller)
        if len(srl_res) == 0:  # no predicate in the query, or a single entity
            return None
        for res in srl_res:
            predicate_domains = get_predicate(res[0], self.ac)
            if len(predicate_domains) < 1:
                continue  # no such predicate in the database
            sorted_domains = aggregate_domains(predicate_domains)
            for each in sorted_domains:
                if each[0] == 'app':
                    nouns = get_nouns(res[1], 'app', self.ac)
                    if len(nouns) > 0:
                        return 'app'
                    else:
                        continue
                elif each[0] == 'cinemas':
                    nouns = get_nouns(res[1], 'film', self.ac)
                    if len(nouns) > 0:
                        return 'Movie_stuff'
                    else:
                        continue
                elif each[0] == 'contacts':
                    # 'nr' from the POS tagger indicates a person's name
                    if 'nr' in sample['tag']:
                        return 'contacts'
                    else:
                        continue
                elif each[0] == 'cookbook':
                    nouns = get_nouns(res[1], 'food', self.ac)
                    if len(nouns) > 0:  # any food noun hit routes to cookbook
                        return 'cookbook'
                    else:
                        continue
                elif each[0] == 'tvchannel':
                    nouns = get_nouns(res[1], 'tvchannel', self.ac)
                    if len(nouns) > 0:
                        return 'TV_stuff'
                    else:
                        continue
                elif each[0] == 'video':
                    nouns = get_nouns(res[1], 'video', self.ac)
                    if len(nouns) > 0:
                        return 'Video_stuff'
                    else:
                        continue
                elif each[0] == 'health':
                    nouns = get_nouns(res[1], 'disease', self.ac)
                    nouns.extend(get_nouns(res[1], 'drug', self.ac))
                    if len(nouns) > 0:
                        return 'health'
                    else:
                        continue
                elif each[0] == 'music':
                    nouns_song = get_nouns(res[1], 'song', self.ac)
                    nouns_singer = get_nouns(res[1], 'singer', self.ac)
                    if len(nouns_song) > 0:
                        return 'music'
                    elif len(nouns_singer) > 0:
                        return 'Star_stuff'
                    else:
                        continue
                elif each[0] == 'novel':
                    nouns = get_nouns(res[1], 'novel', self.ac)
                    if '小说' in res[1] or len(nouns) > 0:
                        return 'novel'
                    else:
                        continue
                elif each[0] == 'poetry':
                    nouns = get_nouns(res[1], 'poet', self.ac)
                    if len(nouns) > 0:
                        return 'poetry'
                    else:
                        continue
                elif each[0] == 'radio':
                    if len(get_nouns(res[1], 'radio', self.ac)) > 0:
                        return 'radio'
                    else:
                        continue
                elif each[0] == 'stock':
                    nouns = get_nouns(res[1], 'stock', self.ac)
                    if len(nouns) > 0:
                        return 'stock'
                    else:
                        continue
                elif each[0] == 'website':
                    nouns = get_nouns(res[1], 'website', self.ac)
                    if len(nouns) > 0:
                        return 'Internet_stuff'
                    else:
                        continue

    def retrieval(self, sample):
        """
        Find proper nouns to handle a single entity in a query.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        pn_res = doRetrieval(sample['query'], self.ac)  # look up a single instance
        sorted_domains = aggregate_domains(pn_res)
        if len(sorted_domains) == 1:  # one instance
            domain = sorted_domains[0][0]
            if len(max(sorted_domains[0][1], key=len)) > len(sample['query']) / 2:
                if domain == 'airline':
                    return 'flight'
                if domain in ['railwaystation', 'airport']:
                    return 'map'
                if domain == 'app':
                    return 'app'
                if domain == 'contacts':
                    return 'contacts'
                if domain in ['drug', 'disease']:
                    return 'health'
                if domain == 'festival':
                    return 'datetime'
                if domain in ['moviestar', 'film', 'video']:
                    return 'video'
                if domain == 'food':
                    return 'cookbook'
                if domain == 'novel':
                    return 'novel'
                if domain == 'place':
                    return 'map'
                if domain == 'poet':
                    return 'poetry'
                if domain == 'radio':
                    return 'radio'
                if domain in ['singer', 'song']:
                    return 'music'
                if domain == 'sports':
                    return 'match'
                if domain == 'stock':
                    return 'stock'
                if domain == 'tvchannel':
                    return 'tvchannel'
                if domain == 'website':
                    return 'website'
                return None
            else:
                return None

    def classifyAllIntents(self, sample):
        """
        A classifier over all 31 intents, chitchat included.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_31.transform(text)
        test_wd = self.word_vectorizer_31.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        test_vec = self.ch2_.transform(test_vec)
        pred = self.clf_31.predict(test_vec)
        return pred.tolist()[0]

    def epgOrTvchannel(self, sample):
        """
        A classifier to label an instance with 'epg' or 'tvchannel'.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_tv.transform(text)
        test_wd = self.word_vectorizer_tv.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_tv.predict(test_vec)
        return pred.tolist()[0]

    def videoOrCinemas(self, sample):
        """
        A classifier to label an instance with 'video' or 'cinemas'.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_movie.transform(text)
        test_wd = self.word_vectorizer_movie.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_movie.predict(test_vec)
        return pred.tolist()[0]

    def websiteOrApp(self, sample):
        """
        A classifier to label an instance with 'website' or 'app'.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_internet.transform(text)
        test_wd = self.word_vectorizer_internet.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_internet.predict(test_vec)
        return pred.tolist()[0]

    def videoOrMusic(self, sample):
        """
        A classifier to label an instance with 'video' or 'music'.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_star.transform(text)
        test_wd = self.word_vectorizer_star.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_star.predict(test_vec)
        return pred.tolist()[0]

    def videoOrEpg(self, sample):
        """
        A classifier to label an instance with 'epg' or 'video'.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        raw_query = sample['query']
        text = [''.join([w for w in jieba.cut(raw_query)])]
        test_ch = self.char_vectorizer_video.transform(text)
        test_wd = self.word_vectorizer_video.transform(text)
        test_vec = hstack([test_ch, test_wd])
        test_vec = csr_matrix(test_vec)
        pred = self.clf_video.predict(test_vec)
        return pred.tolist()[0]

    def pipeline(self, sample, use_pse=True, use_retrieval=False):
        """
        A pipeline to label an instance with one of the 31 possible intents.
        :param sample: a dict with a query and its POS tags
        :return: a string naming one intent
        """
        if use_pse:
            ps_res = prettySureExpression(sample['query'], self.ac)
            if len(list(set([_[1][0] for _ in ps_res]))) == 1:
                return ps_res[0][1][0]
        pm_res = self.pattern_match(sample)
        if pm_res == 'TV_stuff':
            clf_res = self.classifyAllIntents(sample)  # ML classifier over the 31 intents
            if clf_res in ['epg', 'tvchannel']:
                return clf_res
            return self.epgOrTvchannel(sample)  # ML classifier: epg vs tvchannel
        elif pm_res == 'Movie_stuff':
            clf_res = self.classifyAllIntents(sample)
            if clf_res in ['video', 'cinemas']:
                return clf_res
            return self.videoOrCinemas(sample)
        elif pm_res == 'Internet_stuff':
            clf_res = self.classifyAllIntents(sample)
            if clf_res in ['website', 'app']:
                return clf_res
            return self.websiteOrApp(sample)
        elif pm_res == 'Star_stuff':
            clf_res = self.classifyAllIntents(sample)
            if clf_res in ['video', 'music']:
                return clf_res
            return self.videoOrMusic(sample)
        elif pm_res == 'Video_stuff':
            clf_res = self.classifyAllIntents(sample)
            if clf_res in ['video', 'epg']:
                return clf_res
            return self.videoOrEpg(sample)
        elif pm_res is None:
            if use_retrieval:
                ret_res = self.retrieval(sample)
                if ret_res is None:
                    # no pattern matched; fall back to the ML classifier
                    return self.classifyAllIntents(sample)
                return ret_res
            return self.classifyAllIntents(sample)
        else:
            return pm_res

    def preprocess(self, raw_query):
        """
        Segment a raw user query into words and POS-tag it.
        :param raw_query: a string typed by a user
        :return: a dict with the segmented query, the raw query and its POS tags
        """
        tmp = pseg.cut(raw_query)
        words = []
        pos = []
        for word, flag in tmp:
            words.append(word)
            pos.append(flag)
        inst = {}
        inst['tag'] = pos
        inst['word'] = words
        del words
        del pos
        inst['query'] = raw_query
        return inst

    def close(self):
        """Release the loaded models."""
        self.postagger.release()  # release the model
        self.segmentor.release()  # release the model
        self.labeller.release()  # release the model
        self.parser.release()  # release the model
        del self.ac
        gc.collect()
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        self.exword_path = exword_path  # e.g. '/data1/research/matt/ltp/exwords.txt'
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:
            # no extra lexicon supplied
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labeling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labeling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start, arg.range.end)
                           for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        self.output['role'] = self.ltp_labeller(self.words, self.postags, self.arcs)
        # fill the output dict
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
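# Usage sketch for ltp_api, assuming MODELDIR holds the pyltp 0.2.x models
# (pisrl.model etc.); leave exword_path as None unless you have an extra lexicon.
api = ltp_api('/path/to/ltp_data')
api.get_result('李克强总理今天考察企业。')
print(api.output['words'])
print(api.output['role'])  # [[(predicate index, role name, start, end), ...], ...]
api.release()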
class myLTP:
    def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'):
        self.LTP_DATA_DIR = LTP_DATA_DIR
        self.ne_pattern = self._read_ne_pattern(pattern_dir)

    def _read_ne_pattern(self, filename):
        ne_pattern = []
        with open(filename, encoding='utf8') as filein:
            for line in filein:
                if line[0] != '#':
                    np = line.split()[:2]
                    ne_pattern.append(np)
        return ne_pattern

    def find_ne_by_pattern(self, text):
        ne_dic = defaultdict(list)
        for ne_type, pattern in self.ne_pattern:
            nes = re.findall(pattern, text)
            text = re.sub(pattern, ne_type, text)
            ne_dic[ne_type].extend(nes)
        return text, ne_dic

    def load(self, index=[1, 1, 1, 1, 1]):
        """Load the segmentation, POS tagging, NER, parsing and SRL models."""
        if index[0]:
            cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')
            self.segmentor = Segmentor()
            self.segmentor.load(cws_model_path)
        if index[1]:
            pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')
            self.postagger = Postagger()
            self.postagger.load(pos_model_path)
        if index[2]:
            ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')
            self.recognizer = NamedEntityRecognizer()
            self.recognizer.load(ner_model_path)
        if index[3]:
            par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model')
            self.parser = Parser()
            self.parser.load(par_model_path)
        if index[4]:
            srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model')
            self.labeller = SementicRoleLabeller()
            self.labeller.load(srl_model_path)

    def release(self):
        # release whichever components were actually loaded
        for component in ('segmentor', 'postagger', 'recognizer', 'parser', 'labeller'):
            try:
                getattr(self, component).release()
            except Exception:
                pass

    def split_sentence(self, text):
        """Split text into sentences."""
        return SentenceSplitter.split(text)

    def word_segment(self, sentence):
        """Segment with jieba."""
        # words = self.segmentor.segment(sentence)
        words = jieba.cut(sentence)
        return list(words)

    def pos_tag(self, words):
        """POS tagging."""
        postags = self.postagger.postag(words)
        return postags

    def named_entity_recognize(self, words, postags):
        """Named entity recognition."""
        netags = self.recognizer.recognize(words, postags)
        return netags

    def parse(self, words, postags):
        """Dependency parsing."""
        arcs = self.parser.parse(words, postags)  # (arc.head, arc.relation)
        return arcs

    def sementic_role_label(self, words, postags, arcs):
        """Semantic role labeling."""
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def _get_ne_for_sentence(self, sentence):
        """Collect entities, including those defined via regex patterns."""
        sentence, ne_dic = self.find_ne_by_pattern(sentence)
        words = list(self.word_segment(sentence))
        postags = self.postagger.postag(words)
        ners = self.named_entity_recognize(words, postags)
        res = {}
        res['words'] = words
        res['ners'] = []
        for index, ner in enumerate(ners):
            if ner != 'O':
                if ner[0] in ('S', 'B'):
                    res['ners'].append([ner[2:], index, index + 1])
                else:
                    res['ners'][-1][-1] += 1
        for ner_type, v in ne_dic.items():
            v = iter(v)
            if v:
                for index, word in enumerate(words):
                    if word == ner_type:
                        words[index] = next(v)
                        res['ners'].append([ner_type, index, index + 1])
        return res

    def _get_dne_for_sentence(self, sentence):
        res = []
        s = self._get_ne_for_sentence(sentence)
        ners = s['ners']
        words = s['words']
        for entity1, entity2 in combinations(ners, 2):
            res.append((entity1, entity2, words))
        return res

    def get_dne(self, text):
        """Collect entity pairs: person (Nh), place (Ns), organization (Ni)."""
        res = []
        sentences = self.split_sentence(text)
        for sentence in sentences:
            r = self._get_dne_for_sentence(sentence)
            res.extend(r)
        return res
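# Usage sketch for myLTP, assuming pattern.txt holds "TYPE regex" lines and the
# data directory contains the Windows SRL model named above (paths are placeholders):
my_ltp = myLTP('/path/to/ltp_data', pattern_dir='pattern.txt')
my_ltp.load(index=[1, 1, 1, 1, 1])  # load all five components
pairs = my_ltp.get_dne('李克强在北京会见了企业代表。')
for entity1, entity2, words in pairs:
    print(entity1, entity2)
my_ltp.release()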
class OpinionExtractor(object):
    def __init__(self):
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__parser = Parser()  # initialize the instance
        self.__labeller = SementicRoleLabeller()  # initialize the instance
        self.__segmentor.load_with_lexicon(
            os.path.join(LTP_MODEL_DIR, "cws.model"),
            os.path.join(DICTIONARY_DIR, "custom_lexicon.model"))
        self.__postagger.load(os.path.join(LTP_MODEL_DIR, "pos.model"))
        self.__parser.load(os.path.join(LTP_MODEL_DIR, "parser.model"))  # load the model
        self.__labeller.load(os.path.join(LTP_MODEL_DIR, "pisrl.model"))  # load the model
        self.__adv_dict_list = self.__load_adverb_dictionary()
        self.__adv_list = self.__adv_dict_list.get("范围副词") + self.__adv_dict_list.get("频率副词") \
            + self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("时间副词") \
            + self.__adv_dict_list.get("肯否副词") + self.__adv_dict_list.get("语气副词") \
            + self.__adv_dict_list.get("情态副词")
        self.__pronoun_list = self.__load_pronoun_words()
        self.__vi_list = self.__load_intransitive_verb()
        self.__auxiliary_dict_list = self.__load_auxiliary_dictionary()
        self.__auxiliary_list = self.__auxiliary_dict_list.get("语气助词") \
            + self.__auxiliary_dict_list.get("结构助词") \
            + self.__auxiliary_dict_list.get("时态助词")
        self.__special_prefix_list = self.__load_special_prefix_words()
        self.__stopwords_list = self.__load_stopwords("之前", "是因为", "已经")

    def release(self):
        self.__labeller.release()
        self.__parser.release()
        self.__postagger.release()
        self.__segmentor.release()

    @classmethod
    def __load_stopwords(cls, *self_define_stopwords):
        """
        Get the stopword list.
        :param self_define_stopwords: extra user-defined stopwords to append
        :return: stopwords_list
        """
        stopwords_list = [
            word.strip() for word in
            open(os.path.join(DICTIONARY_DIR, "stopwords.txt"), "r").readlines()
        ]
        for stopword in self_define_stopwords:
            stopwords_list.append(stopword)
        return stopwords_list

    @classmethod
    def __load_special_prefix_words(cls):
        """Load the special prefix words."""
        special_prefix_words = []
        with open(os.path.join(DICTIONARY_DIR, "special_prefix.txt"), "r") as sp_file:
            for word in sp_file.readlines():
                special_prefix_words.append(word.strip())
        return special_prefix_words

    @classmethod
    def __load_intransitive_verb(cls):
        """Load the intransitive verbs."""
        intransitive_verb = []
        with open(os.path.join(DICTIONARY_DIR, "intransitive_verb.txt"), "r") as vi_file:
            for word in vi_file.readlines():
                intransitive_verb.append(word.strip())
        return intransitive_verb

    @classmethod
    def __load_pronoun_words(cls):
        """Load the pronouns."""
        pronoun_words = []
        with open(os.path.join(DICTIONARY_DIR, "pronoun.txt"), "r") as pronoun_file:
            for word in pronoun_file.readlines():
                pronoun_words.append(word.strip())
        return pronoun_words

    @classmethod
    def __load_adverb_dictionary(cls):
        """Load the adverb dictionary."""
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "adv.txt"), "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __load_auxiliary_dictionary(cls):
        """Load the auxiliary-word dictionary."""
        dictionary = {}
        with open(os.path.join(DICTIONARY_DIR, "auxiliary.txt"), "r") as adv_file:
            for line in adv_file.readlines():
                index = line.index(":")
                key = line[0:index].strip()
                value = line[index + 1:].strip()
                dictionary.update({key: value.split(" ")})
        return dictionary

    @classmethod
    def __smart_split_sentence(cls, comment):
        """
        Split a comment into sub-sentences.
        :param comment:
        :return:
        """
        # replace whitespace with ","
        comment = re.sub(re.compile(r"(\s+)", re.S), ",", comment.strip())
        # split on the separators [。|!|,|、|?|.|!|,|?]
        subcomments = re.split(r'[。|!|,|、|?|\.|!|,|\?]', comment)
        return subcomments

    def sentence_segment_add_space(self, comment, stopwords_list={}):
        """
        Segment with spaces between the words, e.g. 我们 喜欢 吃 冰激凌
        :param comment: one comment from the corpus
        :param stopwords_list: stopword list
        :return:
        """
        segment = self.__segmentor.segment(self.__remove_special_word(comment))
        return segment, " ".join(segment)

    def __word_self_attention(self, parent_pos, parent_word, current_arc_relation,
                              current_arc_pos, current_word):
        """
        Decide whether a POS / dependency-relation combination is valid
        (a word-level attention mechanism).
        :param parent_pos: POS of the parent node
        :param parent_word: word of the parent node
        :param current_arc_relation: dependency relation of the current node
        :param current_arc_pos: POS of the current node
        :param current_word: word of the current node
        :return:
        """
        if parent_pos == Pos.v.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:
                return True
            if current_arc_relation == Dependency.FOB.value:
                return True
            if current_arc_relation == Dependency.ADV.value:
                if current_arc_pos == Pos.d.value:
                    if current_word in self.__adv_dict_list.get("肯否副词"):
                        return True
                if current_arc_pos == Pos.p.value and current_word in ["由", "用"]:
                    # e.g. 由关晓彤代言
                    return True
                if current_arc_pos == Pos.v.value:
                    return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_relation == Dependency.CMP.value:
                return True
            # if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("时态助词"):
            if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_list:
                return True
        elif parent_pos == Pos.a.value:
            if current_arc_relation == Dependency.SBV.value and current_word not in self.__pronoun_list:
                # e.g. 材料新鲜 / 它很方便
                return True
            if current_arc_relation == Dependency.ADV.value and (
                    current_word not in self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("范围副词")
                    or (current_arc_pos == Pos.p.value and current_word in ["比"])):
                # e.g. 比别家好
                return True
            if current_arc_relation == Dependency.ATT.value:
                return True
            if current_arc_pos == Pos.u.value and current_word not in \
                    self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("结构助词"):
                return True
        elif parent_pos in [Pos.n.value, Pos.nd.value, Pos.nh.value, Pos.ni.value,
                            Pos.nl.value, Pos.ns.value, Pos.nt.value, Pos.nz.value]:
            if current_arc_relation == Dependency.ADV.value:
                return True
            if current_arc_relation == Dependency.ATT.value:  # attributes modify the noun
                return True
            if current_arc_pos == Pos.u.value and current_word not in \
                    self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("结构助词"):
                # e.g. 美丽的
                return True
        elif parent_pos == Pos.p.value:
            if current_arc_relation == Dependency.SBV.value:  # e.g. 他给我感觉
                return True
            if current_arc_relation == Dependency.VOB.value:  # e.g. 给我感觉
                return True
            if current_arc_relation == Dependency.POB.value:  # e.g. 比别家好
                return True
        elif parent_pos == Pos.d.value:
            if current_arc_relation == Dependency.SBV.value:
                return True
            if current_arc_relation == Dependency.VOB.value:  # e.g. 没有|d 过于|d 甜腻
                return True
        elif parent_pos in [Pos.i.value, Pos.r.value, Pos.q.value] \
                or current_arc_relation == Dependency.CMP.value:
            return True
        return False

    def __parse_opinion(self, core_word_index, arcs, words, postags):
        """
        :param core_word_index:
        :param arcs:
        :param words:
        :param postags:
        :return: opinion_word_list
        """
        has_vob = False
        sbv_word = ()
        sbv_att_word_list = []
        available_word_idx_list = [core_word_index]
        opinion_word_list = []

        def word_root_index(core_word_idx, index):
            """Find the root index of a word."""
            arc = arcs[index]
            idx = index if arc.relation == Dependency.HED.value else arc.head - 1
            if idx == core_word_idx or idx == index:
                return idx
            else:
                return word_root_index(core_word_idx, idx)

        def do_parse_opinion(core_word_idx):
            """
            Extract a verb-centred opinion; the structures collected are
            subject-verb (SBV), verb-object (VOB), adverbial (ADV),
            complement (CMP) and preposition-object (POB).
            """
            nonlocal has_vob
            nonlocal sbv_word
            nonlocal sbv_att_word_list
            nonlocal available_word_idx_list
            for m, arc in enumerate(arcs):
                # tuple layout: (index, dependency relation, POS, word)
                current_word_tuple = (m, arc.relation, postags[m], words[m])
                parent_word_index = arc.head - 1
                parent_word_tuple = (parent_word_index,
                                     arcs[parent_word_index].relation,
                                     postags[parent_word_index],
                                     words[parent_word_index])
                if arc.head == core_word_idx + 1 \
                        and (current_word_tuple[2] not in [Pos.wp.value, Pos.o.value, Pos.c.value, Pos.r.value, Pos.e.value]
                             or (current_word_tuple[2] == Pos.r.value and current_word_tuple[3] not in self.__pronoun_list)) \
                        and self.__word_self_attention(parent_word_tuple[2], parent_word_tuple[3],
                                                       current_word_tuple[1], current_word_tuple[2],
                                                       current_word_tuple[3]):
                    # check whether this word's root is the core word
                    root_core_index = word_root_index(core_word_index, m)
                    if root_core_index == core_word_index:
                        if arc.relation == Dependency.VOB.value or (
                                arc.relation == Dependency.CMP.value
                                and postags[current_word_tuple[0]] == Pos.a.value):
                            has_vob = True
                            available_word_idx_list.append(m)
                            opinion_word_list.append(current_word_tuple)
                        else:
                            if arc.head - 1 in available_word_idx_list:
                                available_word_idx_list.append(m)
                                # SBV structures are held back for now instead of
                                # being added to the opinion word list
                                if arc.relation == Dependency.SBV.value:
                                    if len(sbv_word) == 0:
                                        sbv_word = current_word_tuple
                                else:
                                    # check whether this word's root is the SBV key word
                                    sbv_index = sbv_word[0] if len(sbv_word) > 0 else -1
                                    root_sbv_index = word_root_index(sbv_index, current_word_tuple[0])
                                    if root_sbv_index == sbv_index:
                                        # attribute words of the SBV structure are
                                        # stashed in the SBV attribute list
                                        sbv_att_word_list.append(current_word_tuple)
                                    else:
                                        opinion_word_list.append(current_word_tuple)
                        do_parse_opinion(m)

        do_parse_opinion(core_word_index)

        def need_sbv():
            """Decide whether the SBV part is needed."""
            # for triples: only attach the SBV part when a VOB structure was found
            if has_vob:
                return True
            # an adjective core can take the SBV directly
            if postags[core_word_index] == Pos.a.value:
                return True
            # a core word from the intransitive-verb list can take the SBV directly
            if words[core_word_index] in self.__vi_list:
                return True
            return False

        if need_sbv() and len(sbv_word) > 0:
            opinion_word_list.append(sbv_word)
            opinion_word_list += sbv_att_word_list
        return opinion_word_list

    def extract_opinion(self, comment, distinct_opinion=True, show_core_word=False, show_detail=False):
        """
        Extract opinions.
        :param comment:
        :param distinct_opinion: whether to deduplicate opinions
        :param show_core_word: whether to include the opinion core word
        :param show_detail: whether to log segmentation and other details
        :return:
        """
        subcomments = self.__smart_split_sentence(comment)
        opinion_list = []
        for subcomment in subcomments:
            words, sentence_with_space = self.sentence_segment_add_space(subcomment)
            opinions = self.__parse_segment(words, show_detail)
            if len(opinions) > 0:
                opinion_list += opinions
        if distinct_opinion:
            opinion_list = self.__distinct_opinion(opinion_list)
        if not show_core_word:
            opinion_list = [opinion[2] for opinion in opinion_list]
        return opinion_list

    @classmethod
    def __distinct_opinion(cls, opinions):
        """
        Deduplicate opinions.
        :param opinions:
        :return:
        """
        index = 2
        distinct_opinion_list = []
        for n in range(1, len(opinions)):
            for m in range(n, 0, -1):
                opi_1 = opinions[m][index]
                opi_2 = opinions[m - 1][index]
                if len(opi_1) > len(opi_2):
                    tmp = opinions[m - 1]
                    opinions[m - 1] = opinions[m]
                    opinions[m] = tmp
        for opinion in opinions:
            opi = opinion[index]
            if len(distinct_opinion_list) == 0:
                distinct_opinion_list.append(opinion)
            else:
                include = False
                for idx in range(0, len(distinct_opinion_list)):
                    try:
                        include |= distinct_opinion_list[idx][index].index(opi) > -1
                    except ValueError:
                        pass
                if not include:
                    distinct_opinion_list.append(opinion)
        return distinct_opinion_list

    def __parse_segment(self, words, show_detail=False):
        postags = self.__postagger.postag(words)
        word_tag_tuple_list = []
        for i in range(len(words)):
            word_tag_tuple_list.append((str(i), words[i], postags[i]))
        arcs = self.__parser.parse(words, postags)  # dependency parsing result
        labels = self.__labeller.label(words, postags, arcs)  # semantic role labeling
        if show_detail:
            logger.info("|".join(words))
            logger.info(" ".join('|'.join(tpl) for tpl in word_tag_tuple_list))
            logger.info(" ".join("%d|%d:%s" % (n, arc.head, arc.relation)
                                 for n, arc in enumerate(arcs)))
            for label in labels:
                logger.info(str(label.index) + ":" + ",".join([
                    "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                    for arg in label.arguments
                ]))
        # opinions = self.__parse_main_opinion(arcs, words, postags)
        opinions = self.__parse_opinions(arcs, words, postags)
        return opinions

    def __parse_opinions(self, arcs, words, postags):
        """
        Extract a phrase opinion for every word whose POS is one of the core POS tags.
        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        opinions = []
        for n, arc in enumerate(arcs):
            postag = postags[n]
            word = words[n]
            if postag in [Pos.v.value, Pos.a.value, Pos.i.value] or \
                    (postag == Pos.a.value and word not in self.__adv_list) or \
                    (arc.relation in [Dependency.HED.value, Dependency.COO.value]
                     and postag not in [Pos.v.value, Pos.a.value, Pos.i.value, Pos.m.value, Pos.c.value]):
                opinion_word_list = self.__parse_opinion(n, arcs, words, postags)
                if self.__check_opinion(postag, word, opinion_word_list):
                    opinion_str = self.__opinion_to_str(n, words, opinion_word_list)
                    opinions.append((postag, words[n], opinion_str))
        return opinions

    def __parse_main_opinion(self, arcs, words, postags):
        """
        :param arcs:
        :param words:
        :param postags:
        :return:
        """
        for n, arc in enumerate(arcs):
            if arc.relation == Dependency.HED.value:
                core_index = n
                core_pos = postags[core_index]
                opinion_word_list = self.__parse_opinion(core_index, arcs, words, postags)
                return core_pos, words[core_index], self.__opinion_to_str(
                    core_index, words, opinion_word_list)

    @classmethod
    def __check_opinion(cls, core_word_pos, core_word, opinion_word_list):
        """
        Check the validity of an opinion.
        :param core_word_pos:
        :param core_word:
        :param opinion_word_list:
        :return:
        """
        if len(opinion_word_list) > 0:
            return True
        if len(opinion_word_list) == 0 and core_word_pos not in [Pos.v.value, Pos.d.value]:
            return True
        if len(opinion_word_list) == 0 and core_word_pos == Pos.v.value and len(core_word) > 1:
            # e.g. 入口即化|v
            return True
        return False

    def __opinion_to_str(self, core_word_index, words, opinion_word_list):
        """
        Render an opinion as a string.
        :param core_word_index:
        :param words:
        :param opinion_word_list:
        :return:
        """
        index_list = [core_word_index]
        if self.__remove_core_word(words[core_word_index]):
            index_list = []
        for opinion_word in opinion_word_list:
            index = opinion_word[0]
            index_list.append(index)
        index_list.sort()
        opinion = ""
        for index in index_list:
            opinion += words[index]
        return self.__remove_special_word(opinion)

    @classmethod
    def __remove_core_word(cls, word):
        if word == "是":
            return True
        return False

    def __remove_special_word(self, opinion):
        new_opinion = opinion
        for sp_word in self.__special_prefix_list:
            if opinion.rfind(sp_word) == 0:
                new_opinion = opinion[len(sp_word):]
                return self.__remove_special_word(new_opinion)
        return new_opinion
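# Usage sketch for OpinionExtractor, assuming LTP_MODEL_DIR and DICTIONARY_DIR
# are configured and the dictionary files listed above exist:
extractor = OpinionExtractor()
opinions = extractor.extract_opinion("材料很新鲜,服务也特别周到", show_detail=False)
print(opinions)  # deduplicated opinion phrases
extractor.release()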
class Semantic_Parser(object):
    def __init__(self):
        self.cws_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/cws.model'
        self.pos_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/pos.model'
        self.parser_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/parser.model'
        self.ner_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/ner.model'
        self.srl_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/srl/'

    def load(self):
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)
        self.parser = Parser()
        self.parser.load(self.parser_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(self.srl_model_path)

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_cws(self, sentence):
        try:
            cws = self.segmentor.segment(sentence)
        except:
            cws = self.segmentor.segment(sentence.decode('utf8'))
        print(" ".join(cws))
        return cws

    def get_pos(self, cws):
        postags = self.postagger.postag(cws)
        print(" ".join(postags))
        return postags

    def get_arcs(self, cws, postags):
        arcs = self.parser.parse(cws, postags)
        label = " ".join("%s:%d:%s" % (word, arc.head, arc.relation)
                         for word, arc in zip(cws, arcs))
        print(label)
        return arcs

    def get_role(self, cws, postags, arcs):
        netags = self.recognizer.recognize(cws, postags)
        roles = self.labeller.label(cws, postags, netags, arcs)
        for role in roles:
            print(role.index, "".join([
                "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
                for arg in role.arguments
            ]))

    def get_query(self, cws, arcs):
        '''
        Parse the question and extract its core parts:
        take the HED word first, then its SBV and VOB children.
        :param cws:
        :param arcs:
        :return:
        '''
        words = [word for word in cws]
        head = [arc.head for arc in arcs]
        relation = [arc.relation for arc in arcs]
        print(words)
        print(head)
        print(relation)
        # index() is assumed to be a helper defined elsewhere that returns the
        # positions of a value within a list
        hed_index = index(head, 0)[0] + 1
        import_index = index(head, hed_index)
        print(import_index)
        sbv = [words[i] for i in import_index if relation[i] == 'SBV']
        vob = [words[i] for i in import_index if relation[i] == 'VOB']
        print(''.join(sbv))
        print(''.join(vob))
        return ''.join(sbv), ''.join(vob)
print('\t'.join(postags))
postagger.release()  # release the model

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer()  # initialize the instance
recognizer.load('/home/curtank/Documents/ltp_data/ner.model')  # load the model
netags = recognizer.recognize(words, postags)  # named entity recognition
print('\t'.join(netags))
recognizer.release()  # release the model

from pyltp import Parser
parser = Parser()
parser.load('/home/curtank/Documents/ltp_data/parser.model')
arcs = parser.parse(words, postags)  # dependency parsing
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
parser.release()  # release the model

from pyltp import SementicRoleLabeller
labeller = SementicRoleLabeller()  # initialize the instance
labeller.load('/home/curtank/Documents/ltp_data/srl')  # load the model
roles = labeller.label(words, postags, netags, arcs)  # semantic role labeling
for role in roles:
    print(role.index, " ".join([
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ]))
labeller.release()  # release the model
class ltpTools():
    def __init__(self):
        # initialize every LTP tool
        LTP_DIR = "/home/demo1/support_ltp"
        # segmenter
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))
        # named entity recognition
        # self.recognizer = NamedEntityRecognizer()
        # self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
        # semantic role labeling module
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
        print('all models loaded')

    def __del__(self):
        self.segmentor.release()
        self.labeller.release()
        self.postagger.release()
        self.parser.release()
        print('all models released')

    def segANDpos(self, sen):
        '''
        Segment and POS-tag in one call; the two returned lists align one to one.
        '''
        words = self.segmentor.segment(sen)
        postags = self.postagger.postag(words)
        return list(words), list(postags)

    '''semantic role labeling'''
    def format_labelrole(self, words, postags):
        # dependency parsing builds on the POS tags
        arcs = self.parser.parse(words, postags)
        # label semantic roles on top of the dependency parse
        roles = self.labeller.label(words, postags, arcs)
        # store as a nested dict keyed by predicate index, with the semantic
        # role type as the second-level key
        roles_dict = {}
        for role in roles:
            roles_dict[role.index] = {
                arg.name: [arg.name, arg.range.start, arg.range.end]
                for arg in role.arguments
            }
        print(roles_dict)
        return roles_dict

    '''dependency parsing --- maintain, for every word, a dict of its dependency children'''
    def build_parse_child_dict(self, words, postags, arcs):
        # child_dict_list: one dict per word; each key is a relation name and
        # each value lists the child indices with that relation, i.e. every
        # head's children grouped by relation
        child_dict_list = []
        # format_parse_list: the dependency record of every word
        format_parse_list = []
        # outer loop: one pass per word
        for index in range(len(words)):
            child_dict = dict()
            # inner loop: check every arc for children of this word
            for arc_index in range(len(arcs)):
                # register the word as a child if its head is this word
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = []
                        child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]  # parent id of every word
        relation = [arc.relation for arc in arcs]  # dependency relation of every word
        heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # parent words
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                 postags[rely_id[i] - 1]]
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''main entry of the parser'''
    def parser_main(self, sentence):
        '''the main entry of this class'''
        words = list(self.segmentor.segment(sentence))
        postags = list(self.postagger.postag(words))
        arcs = self.parser.parse(words, postags)
        child_dict_list, format_parse_list = self.build_parse_child_dict(
            words, postags, arcs)
        roles_dict = self.format_labelrole(words, postags)
        return words, postags, child_dict_list, roles_dict, format_parse_list
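# Usage sketch for ltpTools, assuming the support_ltp directory above holds the
# cws/pos/parser models and pisrl.model:
tools = ltpTools()
words, postags, child_dict_list, roles_dict, format_parse_list = \
    tools.parser_main('李克强总理今天考察企业。')
print(words)
print(roles_dict)  # {predicate index: {role name: [name, start, end]}}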