def depend_analysis(a, words_all, words, postags_a, sentences, X):
    """Find a (speaker, speech) pair in sentence `a` via SBV dependency arcs."""
    # path of the dependency parsing model; the model file is `parser.model`
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser = Parser()                # initialize the parser
    parser.load(par_model_path)      # load the model
    try:
        arcs = parser.parse(words, postags_a)              # dependency parsing
        relation = [arc.relation for arc in arcs]          # relation of each word
        mixed = [word for word in words if word in say]    # "say"-like verbs present here
        name = ''
        stack = []
        c = a + 1
        d = a - 1
        for k, v in enumerate(arcs):
            # remember the most recent person/organization/place noun
            if postags_a[k] in ['nh', 'ni', 'ns']:
                stack.append(words[k])
            if v.relation == 'SBV' and words[v.head - 1] in mixed:  # first subject-verb pair
                name = get_name(words[k], words[v.head - 1], words, relation)
                saying = get_saying(words, relation, [i.head for i in arcs], v.head)
                print(name)
                if not saying:
                    # fall back to quoted text in the neighbouring sentences
                    saying = ''
                    if '“' in words_all[a - 1] and '”' in words_all[a - 1]:
                        saying = sentences[a - 1].strip()
                    if '“' in words_all[a + 1] and '”' in words_all[a + 1]:
                        saying += sentences[a + 1].strip()
                if not saying:
                    p = text_similarity(a - 1, X)   # compare with the previous sentence
                    z = text_similarity(a, X)       # compare with the next sentence
                    if p < z:
                        saying = sentences[a - 1].strip()
                        return "在第{}句话中 {} {}".format(
                            a, name, words[v.head - 1]) + ":{}".format(saying)
                    saying = re.sub(r'[^\w]', '', sentences[a + 1].strip())
                    # greedily extend the quotation forwards while sentences stay similar
                    for _ in range(min(len(sentences) - c - 1, 3)):
                        sim = text_similarity(c, X)
                        print(sim)
                        if sim <= 0.9:
                            saying += sentences[c + 1]
                            c += 1
                        else:
                            break
                    # ... and backwards
                    for _ in range(min(d, 3)):
                        z = text_similarity_up(d, X)
                        print("up{}".format(z))
                        if z <= 0.9:
                            saying = sentences[d] + saying
                            d -= 1
                        else:
                            break
                return "在第{}句话中 {} {}".format(
                    a, name, words[v.head - 1]) + ":{}".format(saying)
            # a colon means everything after it must be the quotation
            if words[k] == ':':
                name = stack.pop()
                saying = ''.join(words[k + 1:])
                return name, saying
        return False
    finally:
        parser.release()  # always release the model
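# A minimal, self-contained sketch of the core idea above: finding a speech
# verb through an SBV (subject-verb) arc with pyltp. The model directory and
# the tiny `say_words` list are illustrative assumptions, not part of the
# original code.
def demo_sbv_speaker(sentence, model_dir='ltp_data'):
    say_words = ['说', '表示', '认为']
    segmentor = Segmentor()
    segmentor.load(os.path.join(model_dir, 'cws.model'))
    postagger = Postagger()
    postagger.load(os.path.join(model_dir, 'pos.model'))
    parser = Parser()
    parser.load(os.path.join(model_dir, 'parser.model'))
    try:
        words = list(segmentor.segment(sentence))
        postags = list(postagger.postag(words))
        arcs = parser.parse(words, postags)
        for k, arc in enumerate(arcs):
            # arc.head is 1-based, so words[arc.head - 1] is the governing verb
            if arc.relation == 'SBV' and words[arc.head - 1] in say_words:
                return words[k], words[arc.head - 1]
        return None
    finally:
        segmentor.release()
        postagger.release()
        parser.release()

# e.g. demo_sbv_speaker('小明说今天会下雨。') -> ('小明', '说')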
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        # optional extra lexicon, e.g. 'E:\LTP\ltp_data_v3.4.0\exwords.txt'
        self.exword_path = exword_path
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:  # load the extra lexicon only when one is given
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition (required by get_result)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labelling (required by get_result)
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(self.MODELDIR, "pisrl.model"))

    # sentence splitting
    def ltp_sentence_splitter(self, paragraph):
        sentences = SentenceSplitter.split(paragraph)  # list of sentences
        return sentences

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words  # list of words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags  # list of POS tags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labelling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start, arg.range.end)
                           for arg in role.arguments])
        return output

    # run the whole pipeline and collect every result
    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)                 # sentence -> words
        self.postags = self.ltp_postagger(self.words)             # POS tagging
        self.arcs = self.ltp_parser(self.words, self.postags)     # dependency parsing
        self.netags = self.ltp_recognizer(self.words, self.postags)  # named entities
        # write all results into `output`, keyed by analysis type
        self.output['role'] = self.ltp_labeller(self.words, self.postags, self.arcs)
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
        return self.output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()
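# Hedged usage sketch for the wrapper above; the model directory is an
# assumption — point it at your own ltp_data_v3.4.0 folder.
ltp = ltp_api('/path/to/ltp_data_v3.4.0')
result = ltp.get_result('李克强总理今天来我家了。')
print(result['words'])   # segmented words
print(result['arcs'])    # (head, relation) pairs; heads are 1-based, 0 is Root
ltp.release()            # free the underlying C++ models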
class Model:
    def __init__(self):
        # may be filled either from sentence_process() or from single_sentence()
        self.name_says = defaultdict(list)
        # `path` and the *_model_path variables are module-level globals
        self.model = Word2Vec.load(path)
        self.word_total_count = self.model.corpus_total_words
        self.word_dict = self.model.wv.vocab
        self.dim = 256
        self.postagger = Postagger()          # initialize the tagger
        self.postagger.load(pos_model_path)   # load the POS model
        # verbs that behave like "say"
        self.say_sim = [
            '诊断', '交代', '说', '说道', '指出', '报道', '报道说', '称', '警告', '所说',
            '告诉', '声称', '表示', '时说', '地说', '却说', '问道', '写道', '答道', '感叹',
            '谈到', '说出', '认为', '提到', '强调', '宣称', '表明', '明确指出', '所言', '所述',
            '所称', '所指', '常说', '断言', '名言', '告知', '询问', '知道', '得知', '质问',
            '问', '告诫', '坚称', '辩称', '否认', '还称', '指责', '透露', '坦言', '表达',
            '中说', '中称', '他称', '地问', '地称', '地用', '地指', '脱口而出', '一脸', '直说',
            '说好', '反问', '责怪', '放过', '慨叹', '问起', '喊道', '写到', '如是说', '何况',
            '答', '叹道', '岂能', '感慨', '叹', '赞叹', '叹息', '自叹', '自言', '谈及',
            '谈起', '谈论', '特别强调', '提及', '坦白', '相信', '看来', '觉得', '并不认为',
            '确信', '提过', '引用', '详细描述', '详述', '重申', '阐述', '阐释', '承认',
            '说明', '证实', '揭示', '自述', '直言', '深信', '断定', '获知', '知悉', '得悉',
            '透漏', '追问', '明白', '知晓', '发觉', '察觉到', '察觉', '怒斥', '斥责', '痛斥',
            '指摘', '回答', '请问', '坚信', '一再强调', '矢口否认', '反指', '坦承', '指证',
            '供称', '驳斥', '反驳', '指控', '澄清', '谴责', '批评', '抨击', '严厉批评',
            '诋毁', '责难', '忍不住', '大骂', '痛骂', '问及', '阐明'
        ]
        self.valid_sentence = []
        self.parser = Parser()
        self.parser.load(par_model_path)
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)

    def get_count(self, word):
        """O(1) lookup of a word's corpus frequency and embedding."""
        if word in self.word_dict:
            wf = self.word_dict[word].count
            wv = self.model.wv[word]
        else:
            wf = 1
            wv = np.zeros(self.dim)
        return wf / self.word_total_count, wv

    # sentence vector
    # TODO: computing P(w) could be optimized
    def sentence_embedding(self, sentence):
        # following the SIF paper: Vs = 1/|s| * sum(a / (a + p(w)) * Vw)
        sentences = self.process_content(sentence).replace(' ', '')
        a = 1e-3  # 0.001
        words = self.pyltp_cut(sentences)
        sum_vector = np.zeros(self.dim)
        count = 0
        for w in words:
            wf, wv = self.get_count(w)
            sum_vector += a / (a + wf) * wv
            count += 1
        return sum_vector / max(count, 1)

    # Euclidean similarity
    def euclidSimilar(self, inA, inB):
        return 1.0 / (1.0 + la.norm(inA - inB))

    # Pearson correlation
    def pearsonSimilar(self, inA, inB):
        if len(inA) != len(inB):
            return 0.0
        if len(inA) < 3:
            return 1.0
        return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=0)[0][1]

    # cosine similarity
    def cosSimilar(self, inA, inB):
        inA = np.mat(inA)
        inB = np.mat(inB)
        num = float(inA * inB.T)
        denom = la.norm(inA) * la.norm(inB)
        return 0.5 + 0.5 * (num / denom)

    # dependency parse of one sentence
    def parsing(self, sentence):
        words = self.pyltp_cut(sentence)          # pyltp segmentation
        postags = self.postagger.postag(words)    # POS tagging
        arcs = self.parser.parse(words, postags)  # dependency parsing
        return arcs

    # named entities
    def get_name_entity(self, strs):
        sentence = ''.join(strs)
        words = self.pyltp_cut(sentence)           # pyltp segmentation is more reliable here
        postags = self.postagger.postag(words)     # POS tagging
        netags = self.recognizer.recognize(words, postags)  # named entity recognition
        return netags

    # process the sentence list of a single paragraph
    def valid_sentences_(self, sentences, res):
        expect = 0.76
        tmp = ""  # the previous quotation
        while sentences:
            curr = sentences.pop(0)
            if curr[0] == '“':
                # the quote may precede the speaker: “...” said X
                print(curr)
                people = re.search('”(.+)“|”(.+)', curr)  # the segment holding the speaker
                if people:
                    people = [i for i in people.groups() if i][0]
                elif res:
                    res[-1][1] += '。' + curr
                    continue
                else:
                    continue
                saying = curr.replace(people, '')  # the rest is assumed to be the quotation
                if res and self.judge_pronoun(people):
                    res[-1][1] += '。' + saying
                else:
                    comb = self.single_sentence(people)
                    if comb:
                        saying += comb[1] if comb[1] else ''
                        res.append([comb[0], saying])
                continue
            # try to extract (speaker, quotation) from the current sentence
            combi = self.single_sentence(curr)
            # no speaker: the sentence either continues the previous quotation or is no quote
            if not combi:
                if res and tmp and self.compare_sentence(tmp, curr) > expect:  # similarity check
                    print('{} - {} : {}'.format(tmp, curr, self.compare_sentence(tmp, curr)))
                    res[-1][1] += '。' + curr
                    tmp = curr
                continue
            # speaker found: record speaker and quotation
            name, saying = combi
            if res and self.judge_pronoun(curr) and saying:
                res[-1][1] += '。' + saying
            elif saying:
                res.append([name, saying])
            tmp = saying
        return res

    @functools.lru_cache()
    def single_sentence(self, sentence, just_name=False, ws=False):
        sentence = ','.join([x for x in sentence.split(',') if x])
        cuts = list(self.pyltp_cut(sentence))  # pyltp segmentation is more reliable here
        # does the sentence contain a "say"-like verb at all?
        mixed = [word for word in cuts if word in self.say_sim]
        if not mixed:
            return False
        ne = self.get_name_entity(tuple(sentence))  # named entities
        wp = self.parsing(sentence)                 # dependency parse
        wp_relation = [w.relation for w in wp]
        postags = list(self.postagger.postag(cuts))
        name = ''
        stack = []
        for k, v in enumerate(wp):
            # remember the most recent person/organization/place noun
            if postags[k] in ['nh', 'ni', 'ns']:
                stack.append(cuts[k])
            if v.relation == 'SBV' and cuts[v.head - 1] in mixed:  # first subject-verb pair
                name = self.get_name(cuts[k], cuts[v.head - 1], cuts, wp_relation, ne)
                if just_name:
                    return name  # only the speaker was requested
                says = self.get_says(cuts, wp_relation, [i.head for i in wp], v.head)
                if not says:
                    quotations = re.findall(r'“(.+?)”', sentence)
                    if quotations:
                        says = quotations[-1]
                return name, says
            # a colon means everything after it must be the quotation
            if cuts[k] == ':':
                name = stack.pop()
                says = ''.join(cuts[k + 1:])
                return name, says
        return False

    # given the first word of the subject, the predicate, the word list and the
    # relation list, recover the full subject phrase
    def get_name(self, name, predic, words, property, ne):
        index = words.index(name)
        cut_property = property[index + 1:]  # relations of the words after `name`
        pre = words[:index]      # words before the subject head
        pos = words[index + 1:]  # words after it
        # prepend attributive modifiers of the subject
        while pre:
            w = pre.pop(-1)
            w_index = words.index(w)
            if property[w_index] == 'ADV':
                continue
            if property[w_index] in ['WP', 'ATT', 'SVB'] and w not in [
                    ',', '。', '、', ')', '(']:
                name = w + name
            else:
                break
        # append coordinated parts of the subject
        while pos:
            w = pos.pop(0)
            p = cut_property.pop(0)
            if p in ['WP', 'LAD', 'COO', 'RAD'] and w != predic and w not in [
                    ',', '。', '、', ')', '(']:
                name = name + w
            else:
                return name  # stop as soon as the pattern breaks
        return name

    # the quotation that follows the predicate
    def get_says(self, sentence, property, heads, pos):
        if ':' in sentence:
            return ''.join(sentence[sentence.index(':') + 1:])
        while pos < len(sentence):
            w = sentence[pos]
            p = property[pos]
            h = heads[pos]
            # the predicate phrase has not ended yet
            if p in ['DBL', 'CMP', 'RAD']:
                pos += 1
                continue
            # attributive
            if p == 'ATT' and property[h - 1] != 'SBV':
                pos = h
                continue
            # object
            if p == 'VOB':
                pos += 1
                continue
            if w == ',':
                return ''.join(sentence[pos + 1:])
            else:
                return ''.join(sentence[pos:])

    # parse a whole document and return speaker -> sayings for the API
    def sentence_process(self, sentence):
        # article  --> drop empty lines
        # article  --> split on sentence ends; if a following sentence contains a
        #              "say" verb, parse it on its own, otherwise use similarity to
        #              decide whether it continues the previous quotation
        # sentence --> find subject/predicate via dependency parsing and NER; the
        #              first "say"-like verb is the predicate, the named entity
        #              before it is the speaker, the rest of the clause is the quote
        # entities --> an S-NE tag is a complete entity; B-NE/I-NE/E-NE tags are
        #              concatenated into one entity
        self.name_says = defaultdict(list)
        sentence = sentence.replace('\r\n', '\n')
        sections = sentence.split('\n')  # split into paragraphs first
        sections = [s for s in sections if s.strip()]
        res = []
        for sec in sections:  # per paragraph
            sentence_list = split(sec)
            sentence_list = [s.strip() for s in sentence_list if s.strip()]
            self.cut_sententce_for_name = [s for s in sentence_list if s]
            res += self.valid_sentences_(sentence_list, [])
        if res:
            self.name_says = {}
            for name, saying in res:
                if name and saying:
                    self.name_says[name] = self.name_says.get(name, '') + saying + ' | '
        return self.name_says

    # is this a pronoun-subject sentence such as “他认为..., 他表示....”?
    def judge_pronoun(self, sentence):
        subsentence = re.search('(.+)“|”(.+)', sentence)
        if subsentence:
            sentence = [g for g in subsentence.groups() if g][0]
        cuts = list(self.pyltp_cut(sentence))  # segmentation
        wp = self.parsing(sentence)            # dependency parse
        postags = list(self.postagger.postag(cuts))
        for k, v in enumerate(wp):
            if v.relation == 'SBV' and postags[k] == 'r':  # first subject is a pronoun
                return True
        return False

    # Pearson similarity between two sentences
    def compare_sentence(self, inA, inB):
        inC = self.sentence_embedding(inA)
        inD = self.sentence_embedding(inB)
        return self.pearsonSimilar(inC, inD)

    # pyltp word segmentation
    def pyltp_cut(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # jieba POS tagging
    def jieba_pseg(self, sentence):
        return pseg.cut(sentence)

    def document_frequency(self, word, document):
        return sum(1 for n in document if word in n)

    def idf(self, word, content, document):
        """Inverse document frequency."""
        return math.log10(len(content) / self.document_frequency(word, document))

    def tf(self, word, document):
        """Term frequency of `word` in `document`."""
        words = document.split()
        return sum(1 for w in words if w == word)

    def process_content(self, content):
        content = re.sub('[+——() ? 【】“”!,:。?、~@#¥%……&*()《 》]+', '', content)
        content = ' '.join(jieba.cut(content))
        return content

    def release_all(self):
        self.segmentor.release()
        self.recognizer.release()
        self.parser.release()
        self.postagger.release()
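# Hedged usage sketch for Model: `path`, `pos_model_path` and the other model
# paths are module-level globals above this class and must point at a trained
# Word2Vec file and the LTP models; `split()` is the sentence splitter used in
# sentence_process(). The printed result is illustrative only.
m = Model()
news = '新华社北京6月1日电,李克强表示,“中国经济韧性强。”'
print(m.sentence_process(news))   # e.g. {'李克强': '中国经济韧性强。 | '}
m.release_all()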
# Fragment of the classic pyltp pipeline example (originally Python 2); the
# segmentation step that defines `segmentor` and `words` precedes this excerpt.
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# a list-of-string parameter is supported since 0.1.5
# postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print(role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
         for arg in role.arguments]))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
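# Every loaded LTP model must be released, and the flat script above leaks all
# of them if any step raises. A hedged sketch (not part of the original
# example) of a context manager that guarantees cleanup:
from contextlib import contextmanager

@contextmanager
def loaded(model, *load_args):
    model.load(*load_args)
    try:
        yield model
    finally:
        model.release()

# with loaded(Parser(), os.path.join(MODELDIR, "parser.model")) as parser:
#     arcs = parser.parse(words, postags)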
class DSFN:
    """NLP pipeline: segmentation, POS tagging, NER and dependency parsing.

    Attributes:
        default_user_dict_dir: str, directory of the user-defined dictionary
        default_model_dir: str, directory of the LTP model files
    """
    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    # LTP model directory
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'
    location_entity = [
        "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿",
        "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫",
        "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清", "万泉河",
        "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河", "景山前街",
        "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫", "琉璃瓦", "湘江",
        "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝", "武备", "全国重点文物保护单位",
        "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林", "北河沿大街", "岳阳楼", "丽景轩",
        "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城", "神乐署", "瀛洲", "码头",
        "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛", "御沟", "井亭", "古柏林", "石坊",
        "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城", "暖温带", "沥粉贴金", "安定路", "北齐长城",
        "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山", "温带季风气候", "宫古", "东直门", "美国国务卿",
        "北海", "中华梦石城", "东门站", "天坛公园", "江山", "谐趣园", "修宅", "苏堤", "玉泉", "牌坊",
        "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼", "波斯", "西内", "老龙头", "阴阳石",
        "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川", "护城河", "文华殿", "静宜园", "乐峰", "永和宫",
        "金砖", "清漪园", "安定门", "宫殿", "梵华楼", "龙井", "水街", "东华门", "歇山式顶", "斋宫",
        "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街", "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋",
        "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门", "经幢", "泰山", "后三宫", "天桥商场", "维新派",
        "拙政园", "北京十六景", "南湖岛", "山寨", "东海", "寺庙", "图书馆", "西山", "延禧宫", "九土",
        "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石", "动物园", "西湖", "齐长城遗址", "京畿",
        "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔", "磨砖对缝", "湖心亭", "崇楼", "五谷丰登",
        "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿", "永定路", "世界遗产", "古柏", "郡王府",
        "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿", "皇家园林", "东四十条", "京西", "黄花镇", "通惠河",
        "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪", "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫",
        "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼", "敌台", "乾清宫", "巴县", "藕香榭", "斗拱",
        "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方", "智慧海", "八小部洲", "拱券", "门楣", "太和殿",
        "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛", "皇陵", "筒瓦", "天地坛", "张古", "建筑史",
        "武英殿", "北长街", "天坛", "云山", "大石桥", "北平", "宫殿建筑", "山东", "博物馆", "昆明池",
        "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头", "养心殿", "御道", "百花园", "翊坤宫",
        "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁", "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂",
        "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园", "天桥", "北京旅游网", "祈年殿",
        "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿", "太和门", "南郊", "健翔桥", "瓮山",
        "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿", "寄畅园", "珍妃井", "德和园大戏楼", "正房",
        "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜", "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街",
        "端门", "代王府", "万寿亭", "景阳宫", "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫",
        "流杯亭", "行宫", "法华寺", "圜丘坛", "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画",
        "二龙戏珠", "北四环", "玉龙", "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶",
        "苏式彩画", "皇宫", "寿康宫"
    ]

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # load the LTP models
        self.segmentor_user = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag_user = self.segmentor_user.load_with_lexicon(
            os.path.join(default_model_dir, 'cws.model'), user_dict)
        self.segmentor = Segmentor()
        segmentor_flag = self.segmentor.load(
            os.path.join(default_model_dir, 'cws.model'))
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # NER model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))
        # NOTE: this check may be wrong — pyltp's load() returns None on success,
        # so the flags are only truthy if a wrapper returned an error value
        if segmentor_flag or postag_flag or ner_flag or parser_flag or segmentor_flag_user:
            print('load model failed')

    def segment(self, sentence, segmentor, entity_postag=dict()):
        words = segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : list, segmentation result

        Returns
        -------
        words : WordUnit list, holding segmentation and POS tags
        """
        words = []
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # a WordUnit stores the word plus its POS tag; IDs start at 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        return words

    def get_postag(self, word):
        """POS tag of a single word.

        Args:
            word: str
        Returns:
            pos_tag: str, the word's POS tag
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """Run NER and merge entity spans into single word units.

        Parameters:
            words: WordUnit list with segmentation and POS tags
        Returns:
            words_netag: WordUnit list with NER results merged in
        """
        lemmas = []   # the words
        postags = []  # their POS tags
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """Dependency-parse the tagged (and optionally NER-merged) words.

        Args:
            words: WordUnit list
        Returns:
            SentenceUnit for the sentence
        """
        lemmas = []
        postags = []
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """Release all models."""
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def splitSentence(self, text):
        pattern = r'。|!|?|;|='
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        return result_list

    def splitSentenceByComma(self, text):
        pattern = r','
        result_list = re.split(pattern, text)
        result_list = list(filter(self.not_empty, result_list))
        final_list = []
        for sentence in result_list:
            if len(sentence) <= 40:
                final_list.append(sentence)
        return final_list

    def not_empty(self, s):
        return s and "".join(s.split())

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        """Check whether the two entities match the DSFN1 pattern and emit triples."""
        allTripes = []
        # constraint 2 has been removed
        if item1.dependency == "ATT":
            AttWord = item1.head_word
            AttWordDict = dict()
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                if AttWord.dependency == "ATT":
                    AttWord = AttWord.head_word
                else:
                    break
            if AttWord.ID == item2.ID:
                # grow the attributive span until it stops changing
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word is not None and item.head_word.ID in AttList \
                                and item.dependency == "ATT":
                            AttWordDict[item.ID] = item.lemma
                    flag = len1 != len(AttWordDict)
                AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                # triple: (item1, AttWordStr, item2)
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """Count entities between two verbs.

        Parameters:
            verb1, verb2: WordUnit, the two verbs
        Returns:
            num: int, number of entities between them
        """
        if verb1.ID > verb2.ID:
            verb1, verb2 = verb2, verb1
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """Is this word unit an entity?

        Args:
            entry: WordUnit
        Returns:
            bool, True if the POS tag is a candidate entity tag
        """
        # candidate entity POS tags
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm']
        return entry.postag in entity_postags

    def dsfnAttCOO(self, sentence, item1, item2):
        # climb the ATT chains of both entities and retry the basic patterns
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word
        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe is None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe is None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        # item1 is a coordinated entity: analyse its COO head instead
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1

    def dsfn6COO(self, sentence, item1, item2):
        # item2 is a coordinated entity: analyse its COO head instead
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        # both entities are coordinated: analyse both COO heads
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStart(self, rawSentence, segmentor, entity1, entity2, all_entity):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = self.segment(rawSentence, segmentor)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        Rawitem1 = None
        Rawitem2 = None
        item1 = None
        item2 = None
        Rawitem1Index = -1
        Rawitem2Index = -1
        for item in sentence.words:
            if item.lemma == entity1:
                Rawitem1 = item
            if item.lemma == entity2:
                Rawitem2 = item
            if Rawitem1 is not None and Rawitem2 is not None and (
                    Rawitem1.ID != Rawitem1Index or Rawitem2.ID != Rawitem2Index):
                Rawitem1Index = Rawitem1.ID
                Rawitem2Index = Rawitem2.ID
                item1 = Rawitem1
                item2 = Rawitem2
                if item1.ID > item2.ID:
                    item1, item2 = item2, item1
                itemCopy1 = item1
                itemCopy2 = item2
                if self.dsfnConstraints2(sentence, item1, item2, all_entity) == False:
                    continue
                allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                if allTripes is None or len(allTripes) == 0:
                    # retry with the ATT heads of both entities
                    while item1.dependency == "ATT":
                        item1 = item1.head_word
                    while item2.dependency == "ATT":
                        item2 = item2.head_word
                    allTripes = self.dsfnStartCOO2(sentence, item1, item2)
                    if len(allTripes) != 0:
                        for tripe in allTripes:
                            if tripe[1] != "":
                                # glue the original entity back onto its ATT head,
                                # in sentence order
                                if tripe[0] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[0] = item1.lemma + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[0] = itemCopy1.lemma + item1.lemma
                                    else:
                                        tripe[0] = itemCopy1.lemma
                                elif tripe[2] == item1.lemma:
                                    if item1.ID < itemCopy1.ID:
                                        tripe[2] = item1.lemma + itemCopy1.lemma
                                    elif item1.ID > itemCopy1.ID:
                                        tripe[2] = itemCopy1.lemma + item1.lemma
                                    else:
                                        tripe[2] = itemCopy1.lemma
                                if tripe[0] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[0] = item2.lemma + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[0] = itemCopy2.lemma + item2.lemma
                                    else:
                                        tripe[0] = itemCopy2.lemma
                                elif tripe[2] == item2.lemma:
                                    if item2.ID < itemCopy2.ID:
                                        tripe[2] = item2.lemma + itemCopy2.lemma
                                    elif item2.ID > itemCopy2.ID:
                                        tripe[2] = itemCopy2.lemma + item2.lemma
                                    else:
                                        tripe[2] = itemCopy2.lemma
                                resultList.append(tripe)
                else:
                    for tripe in allTripes:
                        if tripe[1] != "":
                            resultList.append(tripe)
        if item1 is None or item2 is None:
            return None
        if len(resultList) > 0:
            return resultList

    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        # handles nouns linked by ATT, e.g. 李克强[ATT] <----- 总理[SBV]
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes is None or len(allTripes) == 0:
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes is None or len(allTripes) == 0:
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
        if allTripes is not None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # second pass: handle coordinated verbs, but only when the coordinated
        # verb has no subject of its own,
        # e.g. 习近平主席视察厦门,李克强总理访问香港
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word
                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo is None or (subForCoo is not None
                                                 and subForCoo.ID == word.ID):
                            word.head_word = item
                            allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
                            if len(allTripes) == 0:
                                allTripes = self.dsfn5COO(sentence, item1, item2)
                                if allTripes is None or len(allTripes) == 0:
                                    allTripes = self.dsfn6COO(sentence, item1, item2)
                                    if allTripes is None or len(allTripes) == 0:
                                        allTripes = self.dsfn5and6COO(sentence, item1, item2)
                            if allTripes is not None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        return resultList

    def dsfnConstraints1(self, rawSentence, maxLength):
        """
        :param rawSentence: the original sentence
        :param maxLength: maximum allowed sentence length
        :return: clauses no longer than maxLength
        """
        newSentence = []
        if len(rawSentence) <= maxLength:
            newSentence.append(rawSentence)
            return newSentence
        return self.splitSentenceByComma(rawSentence)

    def dsfnConstraints2(self, sentence, item1, item2, allEntities):
        # reject pairs separated by more than 3 entities or 12 characters
        countEntity = 0
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
            if word.lemma in allEntities:
                countEntity += 1
        if countEntity > 3:
            return False
        elif countChar > 12:
            return False
        return True

    def dsfnConstraints3(self, sentence, item1, item2):
        countChar = 0
        for index in range(item1.ID + 1, item2.ID):
            word = sentence.get_word_by_id(index)
            countChar += len(word.lemma)
        return countChar <= 5

    def getSPO(self, sentence, segmentor):
        all_result = []
        raw_sentence = []
        RawSentence = sentence
        lemmas = self.segment(sentence, segmentor)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        for itemWord in sentence.words:
            # find a verb that is the HED of the sentence, COO with the HED,
            # or (as written) any other verb
            if (itemWord.head_word is None and itemWord.postag == "v") \
                    or (itemWord.postag == "v" and itemWord.dependency == "COO"
                        and itemWord.head_word.head_word is None) \
                    or (itemWord.postag == "v"):
                relation_verb = itemWord  # use this verb as the relation verb
                relationString = relation_verb.lemma
                if itemWord.head_word is None:
                    verbId = itemWord.ID  # ID of the relation verb
                    verbId2 = None
                elif itemWord.head_word.head_word is None:
                    verbId = itemWord.ID
                    if itemWord.dependency == "COO" or self.get_entity_num_between(
                            itemWord, itemWord.head_word, sentence) == 0:
                        verbId2 = itemWord.head_word.ID  # the HED verb, used to find the subject
                    else:
                        verbId2 = None
                else:
                    verbId = itemWord.ID
                    verbId2 = None
                O_dict = dict()     # all objects
                S_dict = dict()     # all subjects
                verb_dict = dict()  # all verbs, mainly for cases like 习近平主席在北京大学发表演讲
                OBJ = None
                SUB = None
                DSFN3 = dict()
                for item in sentence.words:
                    if item.dependency == "SBV" and item.head_word.ID == verbId:
                        # subject of the relation verb
                        SUB = item
                        S_dict[SUB.ID] = SUB.lemma
                    if (item.dependency == "VOB" and item.head_word.ID == verbId
                            and item.postag != "v"):
                        # direct object
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB" and item.head_word.postag == "p"
                            and item.head_word.dependency == "CMP"
                            and item.head_word.head_word.ID == verbId):
                        # prepositional object: POB -> preposition (p) --CMP--> verb
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        relationString = relation_verb.lemma + item.head_word.lemma
                        verb_dict[OBJ.ID] = relationString
                    if (item.dependency == "POB"
                            and (item.head_word.postag == "p" or item.head_word.postag == 'd')
                            and item.head_word.dependency == "ADV"
                            and item.head_word.head_word.ID == verbId
                            and item.postag != 'v'):
                        # prepositional object: POB -> preposition (p) --ADV--> verb
                        OBJ = item
                        O_dict[OBJ.ID] = OBJ.lemma
                        verbObj = None
                        DSFN3[OBJ.ID] = True
                        objectDict = dict()
                        relationString = relation_verb.lemma
                        for eachWord in sentence.words:
                            if eachWord.dependency == "VOB" \
                                    and eachWord.head_word.ID == relation_verb.ID:
                                verbObj = eachWord
                                objectDict[verbObj.ID] = verbObj
                        if verbObj is not None:
                            for word in sentence.words:
                                if word.head_word is not None and word.dependency == "ATT" \
                                        and word.head_word.ID == verbObj.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(), key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            relationString = relation_verb.lemma + objectStr
                        else:
                            for eachWord in sentence.words:
                                if eachWord.dependency == "POB" \
                                        and eachWord.head_word.dependency == "CMP" \
                                        and eachWord.head_word.head_word.ID == relation_verb.ID:
                                    relationString = (relation_verb.lemma
                                                      + eachWord.head_word.lemma
                                                      + eachWord.lemma)
                        verb_dict[OBJ.ID] = relationString
                if SUB is None:
                    # no subject found: borrow the subject of the coordinated verb verbId2
                    for item in sentence.words:
                        if item.dependency == "SBV" and item.head_word.ID == verbId2:
                            SUB = item
                            S_dict[SUB.ID] = SUB.lemma
                if OBJ is None:
                    verb_coo = None
                    for item in sentence.words:
                        if item.dependency == "COO" and item.head_word.ID == verbId \
                                and item.ID > verbId:
                            verb_coo = item
                            break
                    flag = True
                    if verb_coo is not None and self.get_entity_num_between(
                            relation_verb, verb_coo, sentence) == 0:
                        for item in sentence.words:
                            if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID:
                                flag = False
                        if flag != False:
                            for item in sentence.words:
                                if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID) \
                                        or (item.dependency == "POB"
                                            and item.head_word.postag == "p"
                                            and item.head_word.dependency == "CMP"
                                            and item.head_word.head_word.ID == verb_coo.ID) \
                                        or (item.dependency == "POB"
                                            and item.head_word.postag == "p"
                                            and item.head_word.dependency == "ADV"
                                            and item.head_word.head_word.ID == verb_coo.ID):
                                    OBJ = item
                                    O_dict[OBJ.ID] = OBJ.lemma
                SUB_COO = None
                OBJ_COO = None
                for item in sentence.words:
                    if item.head_word is not None:
                        if SUB is not None and item.dependency == "COO" \
                                and item.head_word.ID in S_dict:
                            # coordinated subjects
                            SUB_COO = item
                            S_dict[SUB_COO.ID] = SUB_COO.lemma
                    if item.head_word is not None and OBJ is not None:
                        if item.dependency == "COO" and item.head_word.ID in O_dict:
                            # coordinated objects
                            OBJ_COO = item
                            O_dict[OBJ_COO.ID] = OBJ_COO.lemma
                S_new = []
                for sub in S_dict:
                    S_dict2 = dict()  # attributive words of this subject
                    S_dict2[sub] = S_dict[sub]
                    flag = True
                    while flag:
                        len1 = len(S_dict2)
                        for item in sentence.words:
                            if item.head_word is not None:
                                SUBList = S_dict2.keys()
                                if item.head_word.ID in SUBList and (
                                        item.dependency == "ATT" or item.dependency == "ADV"):
                                    SUBATT = item
                                    S_dict2[SUBATT.ID] = SUBATT.lemma
                        flag = len(S_dict2) != len1
                    S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0])
                    Subject = ""
                    for i in S_dict2:
                        Subject += i[1]
                    S_new.append(Subject)
                O_new = []
                V_new = []
                for obj in O_dict:
                    O_dict2 = dict()  # attributive words of this object
                    O_dict2[obj] = O_dict[obj]
                    if verb_dict is not None and obj in verb_dict:
                        relationString2 = verb_dict[obj]
                    else:
                        relationString2 = relation_verb.lemma
                    V_new.append(relationString2)
                    flag = True
                    while flag:
                        len2 = len(O_dict2)
                        for item in sentence.words:
                            if item.head_word is not None:
                                OBJList = O_dict2.keys()
                                if item.head_word.ID in OBJList and (
                                        item.dependency == "ADV" or item.dependency == "ATT"
                                        or item.dependency == "VOB"
                                        or (item.dependency == "COO"
                                            and item.head_word.ID != obj)):
                                    if item.dependency == "ATT" and item.postag == "v":
                                        if self.get_entity_num_between(
                                                item, sentence.get_word_by_id(obj),
                                                sentence) > 0:
                                            continue
                                        else:
                                            OBJATT = item
                                            O_dict2[OBJATT.ID] = OBJATT.lemma
                                    else:
                                        OBJATT = item
                                        O_dict2[OBJATT.ID] = OBJATT.lemma
                        flag = len(O_dict2) != len2  # loop until no new modifier is found
                    O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0])
                    Object = ""
                    for i in O_dict2:
                        Object += i[1]
                    O_new.append(Object)
                for sub in S_new:
                    for i in range(0, len(O_new)):
                        obj = O_new[i]
                        relationWord = V_new[i]
                        if obj != "":
                            all_result.append([sub, relationWord, obj])
                            raw_sentence.append(RawSentence)
        return all_result, raw_sentence

    def hasEntity(self, word, allEntity):
        for entity in allEntity:
            if entity in word:
                return True
        return False

    def PostProcessSPO(self, rawSentence, allTripes, allEntity):
        # keep only triples whose subject and object both contain a known entity
        output_list = []
        for i in range(0, len(allTripes)):
            tripe = allTripes[i]
            sub = tripe[0]
            obj = tripe[2]
            if self.hasEntity(sub, allEntity) and self.hasEntity(obj, allEntity):
                output_list.append(tripe)
        return output_list
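# Hedged usage sketch for DSFN; the model paths, the entity JSON and the
# WordUnit/SentenceUnit/EntityCombine helpers come from the surrounding
# project, so treat this as illustrative only.
dsfn = DSFN()
triples, sources = dsfn.getSPO('习近平主席在北京大学发表演讲', dsfn.segmentor)
print(triples)   # e.g. [['习近平主席', '发表', '演讲']]
dsfn.close()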
class ltp_api(object):
    def __init__(self, MODELDIR, exword_path=None):
        self.MODELDIR = MODELDIR
        self.output = {}
        self.words = None
        self.postags = None
        self.netags = None
        self.arcs = None
        # optional extra lexicon, e.g. '/data1/research/matt/ltp/exwords.txt'
        self.exword_path = exword_path
        # word segmentation
        self.segmentor = Segmentor()
        if not self.exword_path:  # load the extra lexicon only when one is given
            self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
        else:
            self.segmentor.load_with_lexicon(
                os.path.join(self.MODELDIR, "cws.model"), self.exword_path)
        # POS tagging
        self.postagger = Postagger()
        self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
        # dependency parsing
        self.parser = Parser()
        self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
        # named entity recognition
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
        # semantic role labelling
        self.labeller = SementicRoleLabeller()
        self.labeller.load(os.path.join(MODELDIR, "pisrl.model"))

    # word segmentation
    def ltp_segmentor(self, sentence):
        words = self.segmentor.segment(sentence)
        return words

    # POS tagging
    def ltp_postagger(self, words):
        postags = self.postagger.postag(words)
        return postags

    # dependency parsing
    def ltp_parser(self, words, postags):
        arcs = self.parser.parse(words, postags)
        return arcs

    # named entity recognition
    def ltp_recognizer(self, words, postags):
        netags = self.recognizer.recognize(words, postags)
        return netags

    # semantic role labelling
    def ltp_labeller(self, words, postags, arcs):
        output = []
        roles = self.labeller.label(words, postags, arcs)
        for role in roles:
            output.append([(role.index, arg.name, arg.range.start, arg.range.end)
                           for arg in role.arguments])
        return output

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        self.recognizer.release()
        self.labeller.release()

    def get_result(self, sentence):
        self.words = self.ltp_segmentor(sentence)
        self.postags = self.ltp_postagger(self.words)
        self.arcs = self.ltp_parser(self.words, self.postags)
        self.netags = self.ltp_recognizer(self.words, self.postags)
        # collect every result into `output`
        self.output['role'] = self.ltp_labeller(self.words, self.postags, self.arcs)
        self.output['words'] = list(self.words)
        self.output['postags'] = list(self.postags)
        self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs]
        self.output['netags'] = list(self.netags)
        return self.output
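# Neither ltp_api variant guarantees release() when an analysis step raises.
# A hedged sketch (not part of the original code) of a context-manager wrapper
# that fixes this:
class ltp_session:
    def __init__(self, modeldir, exword_path=None):
        self.api = ltp_api(modeldir, exword_path)

    def __enter__(self):
        return self.api

    def __exit__(self, exc_type, exc, tb):
        # always free the underlying C++ models, even on error
        self.api.release()

# with ltp_session('/path/to/ltp_data_v3.4.0') as ltp:
#     print(ltp.get_result('李克强总理今天来我家了。')['arcs'])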
class Extractor():
    def __init__(self):
        self.__clause_list = []
        self.__subclause_dict = {}
        self.__triple_list = []
        self.__segmentor = Segmentor()
        self.__postagger = Postagger()
        self.__recognizer = NamedEntityRecognizer()
        self.__parser = Parser()
        self.__labeller = SementicRoleLabeller()
        self.__words_full_list = []
        self.__netags_full_list = []

    @property
    def clause_list(self):
        return self.__clause_list

    @property
    def triple_list(self):
        return self.__triple_list

    def split(self, words, postags):
        # cut the word sequence into clauses at commas and full stops
        start = 0
        for j, w in enumerate(words):
            if w == ',' or w == ',' or w == '。':
                clause = Clause(start, j - 1)
                self.__clause_list.append(clause)
                start = j + 1
        for clause in self.__clause_list:
            clause.split(postags)
            for subclause in clause.sub_clause_list:
                self.add_inverted_idx(subclause)

    def add_inverted_idx(self, subclause):
        for i in range(subclause.start_idx, subclause.end_idx):
            self.__subclause_dict[i] = subclause

    def load(self):
        self.__segmentor.load('ltp_data/cws.model')
        self.__postagger.load('ltp_data/pos.model')
        self.__recognizer.load('ltp_data/ner.model')
        self.__parser.load('ltp_data/parser.model')
        self.__labeller.load('ltp_data/srl')

    def release(self):
        self.__segmentor.release()
        self.__postagger.release()
        self.__recognizer.release()
        self.__parser.release()
        self.__labeller.release()

    def clear(self):
        self.__triple_list = []
        self.__words_full_list = []
        self.__netags_full_list = []

    def resolve_conference(self, entity):
        # resolve 他/她 to the nearest preceding person entity
        try:
            e_str = entity.get_content_as_str()
        except Exception:
            return '?'
        ref = e_str
        if e_str == '他' or e_str == '她':
            for i in range(entity.loc, -1, -1):
                if self.__netags_full_list[i].lower().endswith('nh'):
                    ref = self.__words_full_list[i]
                    break
        return ref

    def resolve_all_conference(self):
        for t in self.triple_list:
            e_str = self.resolve_conference(t.entity_1)
            try:
                t.entity_1.content = e_str.split()
            except Exception:
                pass

    def chunk_str(self, data):
        sents = SentenceSplitter.split(data)
        offset = 0
        for sent in sents:
            try:
                words = self.__segmentor.segment(sent)
                postags = self.__postagger.postag(words)
                netags = self.__recognizer.recognize(words, postags)
                arcs = self.__parser.parse(words, postags)
                roles = self.__labeller.label(words, postags, netags, arcs)
                self.chunk_sent(list(words), list(postags), list(arcs), offset)
                offset += len(list(words))
                self.__words_full_list.extend(list(words))
                self.__netags_full_list.extend(list(netags))
            except Exception as e:
                print(str(e))

    def chunk_sent(self, words, postags, arcs, offset):
        root = [i for i, x in enumerate(arcs) if x.relation == 'HED']
        if len(root) > 1:
            raise Exception('More than 1 HEAD arc is detected!')
        root = root[0]
        # the root verb plus every verb coordinated with it
        relations = [i for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO']
        relations.insert(0, root)
        prev_e1 = None
        e1 = None
        for rel in relations:
            # subject (entity 1): the SBV arc of this verb, extended leftwards
            # over its attributives
            left_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV']
            if len(left_arc) > 1:
                pass  # raise Exception('More than 1 left arc is detected!')
            elif len(left_arc) == 0:
                e1 = prev_e1  # reuse the subject of the previous coordinated verb
            elif len(left_arc) == 1:
                left_arc = left_arc[0]
                leftmost = find_farthest_att(arcs, left_arc)
                e1 = Entity(1, [words[i] for i in range(leftmost, left_arc + 1)],
                            offset + leftmost)
            prev_e1 = e1
            # object (entity 2): the VOB arc, extended over VOB chains and COO objects
            right_arc = [i for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB']
            e2_list = []
            if not right_arc:
                e2 = Entity(2, None)
                e2_list.append(e2)
            else:
                right_ext = find_farthest_vob(arcs, right_arc[0])
                items = [i for i, x in enumerate(arcs)
                         if x.head == right_ext and x.relation == 'COO']
                items = right_arc + items
                count = 0
                for item in items:
                    leftmost = find_farthest_att(arcs, item)
                    e2 = None
                    if count == 0:
                        e2 = Entity(2, [words[i] for i in range(leftmost, right_ext + 1)],
                                    offset + leftmost)
                    else:
                        p1 = range(leftmost, right_arc[0])
                        p2 = range(item, find_farthest_vob(arcs, item) + 1)
                        e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)])
                    e2_list.append(e2)
                    r = Relation(words[rel])
                    t = Triple(e1, e2, r)
                    self.__triple_list.append(t)
                    count += 1
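# Hedged usage sketch for Extractor; Clause/Entity/Relation/Triple and the
# find_farthest_att/find_farthest_vob helpers come from the surrounding
# project, and the model paths are those hard-coded in load().
extractor = Extractor()
extractor.load()
extractor.chunk_str('李白创作了《静夜思》,他还写了《将进酒》。')
extractor.resolve_all_conference()   # resolve 他/她 to the nearest person entity
for triple in extractor.triple_list:
    print(triple)
extractor.release()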
def parse(s, isGraph=False):
    """Dependency-parse a sentence and return its feature vector."""
    tmp_ner_dict = {}
    num_lst = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']

    # replace company codes with special aliases so that segmentation and POS
    # tagging stay correct
    for i, ner in enumerate(list(set(re.findall(r'(ner\_\d\d\d\d\_)', s)))):
        try:
            tmp_ner_dict[num_lst[i] + '号企业'] = ner
        except IndexError:
            # TODO: define proper handling for more than ten companies
            num_lst.append(str(i))
            tmp_ner_dict[num_lst[i] + '号企业'] = ner
        s = s.replace(ner, num_lst[i] + '号企业')

    words = segmentor.segment(s)
    tags = postagger.postag(words)
    parser = Parser()  # initialize the parser
    parse_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser.load(parse_model_path)
    arcs = parser.parse(words, tags)  # dependency parsing
    arcs_lst = list(map(list, zip(*[[arc.head, arc.relation] for arc in arcs])))

    # tabulate the parse: word, POS tag, head id, relation
    parse_result = pd.DataFrame(
        [[a, b, c, d] for a, b, c, d in
         zip(list(words), list(tags), arcs_lst[0], arcs_lst[1])],
        index=range(1, len(words) + 1))
    parser.release()  # release the model

    result = []

    # dependency relation class of the second entity
    rely_id = [arc.head for arc in arcs]        # head id of each word
    relation = [arc.relation for arc in arcs]   # relation of each word
    heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word of each word

    company_list = list(tmp_ner_dict.keys())
    str_enti_1 = "一号企业"
    str_enti_2 = "二号企业"
    l_w = list(words)
    is_two_company = str_enti_1 in l_w and str_enti_2 in l_w
    if is_two_company:
        second_entity_index = l_w.index(str_enti_2)
        entity_sentence_type = parse_result.iloc[second_entity_index, -1]
        if entity_sentence_type in SEN_TAGS:
            result.append(SEN_TAGS.index(entity_sentence_type))
        else:
            result.append(-1)
    else:
        result.append(-1)

    if isGraph:
        g = Digraph('测试图片')
        g.node(name='Root')
        for word in words:
            g.node(name=word, fontname="SimHei")
        for i in range(len(words)):
            if relation[i] not in ['HED']:
                g.edge(words[i], heads[i], label=relation[i], fontname="SimHei")
            else:
                if heads[i] == 'Root':
                    g.edge(words[i], 'Root', label=relation[i], fontname="SimHei")
                else:
                    g.edge(heads[i], 'Root', label=relation[i], fontname="SimHei")
        g.view()

    # syntactic distance between the two company entities
    distance_e_jufa = 0
    if is_two_company:
        distance_e_jufa = shortest_path(parse_result, list(words), str_enti_1,
                                        str_enti_2, isGraph=False)
    result.append(distance_e_jufa)

    # token distance between the two company entities
    distance_entity = 0
    if is_two_company:
        distance_entity = np.abs(l_w.index(str_enti_1) - l_w.index(str_enti_2))
    result.append(distance_entity)

    # distance of each entity from the key trigger word
    key_words = [
        "收购", "竞拍", "转让", "扩张", "并购", "注资", "整合", "并入", "竞购",
        "竞买", "支付", "收购价", "收购价格", "承购", "购得", "购进", "购入",
        "买进", "买入", "赎买", "购销", "议购", "函购", "函售", "抛售", "售卖",
        "销售", "转售"
    ]
    # TODO: extract features from key words and their dependency relations
    k_w = None
    for w in words:
        if w in key_words:
            k_w = w
            break
    dis_key_e_1 = -1
    dis_key_e_2 = -1
    if k_w is not None and is_two_company:
        k_w = str(k_w)
        l_w = list(words)
        # dis_key_e_1 = shortest_path(parse_result, l_w, str_enti_1, k_w)
        # dis_key_e_2 = shortest_path(parse_result, l_w, str_enti_2, k_w)
        dis_key_e_1 = np.abs(l_w.index(str_enti_1) - l_w.index(k_w))
        dis_key_e_2 = np.abs(l_w.index(str_enti_2) - l_w.index(k_w))
    result.append(dis_key_e_1)
    result.append(dis_key_e_2)
    return result
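# Hedged usage sketch: `segmentor`, `postagger`, `LTP_DATA_DIR` and `SEN_TAGS`
# are module-level globals assumed by parse(), and shortest_path() is defined
# elsewhere in the project. The input string is illustrative only.
features = parse('ner_1001_拟收购ner_1002_的全部股权。')
# features = [relation-type index of entity 2, syntactic distance,
#             token distance, entity-1/key-word distance,
#             entity-2/key-word distance]
print(features)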
class OpinionExtractor(object): def __init__(self): self.__segmentor = Segmentor() self.__postagger = Postagger() self.__parser = Parser() # 初始化实例 self.__labeller = SementicRoleLabeller() # 初始化实例 self.__segmentor.load_with_lexicon( os.path.join(LTP_MODEL_DIR, "cws.model"), os.path.join(DICTIONARY_DIR, "custom_lexicon.model")) self.__postagger.load(os.path.join(LTP_MODEL_DIR, "pos.model")) self.__parser.load(os.path.join(LTP_MODEL_DIR, "parser.model")) # 加载模型 self.__labeller.load(os.path.join(LTP_MODEL_DIR, "pisrl.model")) # 加载模型 self.__adv_dict_list = self.__load_adverb_dictionary() self.__adv_list = self.__adv_dict_list.get("范围副词") + self.__adv_dict_list.get("频率副词") \ + self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("时间副词") \ + self.__adv_dict_list.get("肯否副词") + self.__adv_dict_list.get("语气副词") \ + self.__adv_dict_list.get("情态副词") self.__pronoun_list = self.__load_pronoun_words() self.__vi_list = self.__load_intransitive_verb() self.__auxiliary_dict_list = self.__load_auxiliary_dictionary() self.__auxiliary_list = self.__auxiliary_dict_list.get( "语气助词") + self.__auxiliary_dict_list.get( "结构助词") + self.__auxiliary_dict_list.get("时态助词") self.__special_prefix_list = self.__load_special_prefix_words() self.__stopwords_list = self.__load_stopwords("之前", "是因为", "已经") def release(self): self.__labeller.release() self.__parser.release() self.__postagger.release() self.__segmentor.release() @classmethod def __load_stopwords(cls, *self_define_stopwords): """ get stopwords list :param self_define_stopwords: add self define stop word to stopwords list :return: stopwords_list """ stopwords_list = [ word.strip() for word in open(os.path.join(DICTIONARY_DIR, "stopwords.txt"), "r").readlines() ] for stopword in self_define_stopwords: stopwords_list.append(stopword) return stopwords_list @classmethod def __load_special_prefix_words(cls): """ 加载特别开始词 :return: """ special_prefix_words = [] with open(os.path.join(DICTIONARY_DIR, "special_prefix.txt"), "r") as sp_file: for word in sp_file.readlines(): special_prefix_words.append(word.strip()) return special_prefix_words @classmethod def __load_intransitive_verb(cls): """ 加载不及物动词 :return: """ intransitive_verb = [] with open(os.path.join(DICTIONARY_DIR, "intransitive_verb.txt"), "r") as vi_file: for word in vi_file.readlines(): intransitive_verb.append(word.strip()) return intransitive_verb @classmethod def __load_pronoun_words(cls): """ 加载代词 :return: """ pronoun_words = [] with open(os.path.join(DICTIONARY_DIR, "pronoun.txt"), "r") as pronoun_file: for word in pronoun_file.readlines(): pronoun_words.append(word.strip()) return pronoun_words @classmethod def __load_adverb_dictionary(cls): """ 加载副词 :return: """ dictionary = {} with open(os.path.join(DICTIONARY_DIR, "adv.txt"), "r") as adv_file: for line in adv_file.readlines(): index = line.index(":") key = line[0:index].strip() value = line[index + 1:].strip() dictionary.update({key: value.split(" ")}) return dictionary @classmethod def __load_auxiliary_dictionary(cls): """ 加载助词 :return: """ dictionary = {} with open(os.path.join(DICTIONARY_DIR, "auxiliary.txt"), "r") as adv_file: for line in adv_file.readlines(): index = line.index(":") key = line[0:index].strip() value = line[index + 1:].strip() dictionary.update({key: value.split(" ")}) return dictionary @classmethod def __smart_split_sentence(cls, comment): """ 拆分句子 :param comment: :return: """ # 替换空格为"," comment = re.sub(re.compile(r"(\s+)", re.S), ",", comment.strip()) # 句子按分隔[。|!|,|、|?|.|!|,|?]符分出多个子句 subcomments = 
re.split(r'[。|!|,|、|?|\.|!|,|\?]', comment) return subcomments def sentence_segment_add_space(self, comment, stopwords_list={}): """ 使用空格间隔分词 如: 我们 喜欢 吃 冰激凌 :param comment: 一条语料 :param stopwords_list: 停用词列表 :return: """ self.__segmentor segment = self.__segmentor.segment(self.__remove_special_word(comment)) return segment, " ".join(segment) def __word_self_attention(self, parent_pos, parent_word, current_arc_relation, current_arc_pos, current_word): """ 判断词性与依存关系组合的有效性 词注意力机制 :param parent_pos: 父节点的词性 :param parent_word: 父节点的词 :param current_arc_relation: 当前节点的依存关系 :param current_arc_pos: 当前节点的词词性 :param current_word: 当前节点的词 :return: """ if parent_pos == Pos.v.value: if current_arc_relation == Dependency.SBV.value: return True if current_arc_relation == Dependency.VOB.value: return True if current_arc_relation == Dependency.FOB.value: return True if current_arc_relation == Dependency.ADV.value: if current_arc_pos == Pos.d.value: if current_word in self.__adv_dict_list.get("肯否副词"): return True if current_arc_pos == Pos.p.value and current_word in [ "由", "用" ]: # 由关晓彤代言 return True if current_arc_pos == Pos.v.value: return True if current_arc_relation == Dependency.ATT.value: return True if current_arc_relation == Dependency.CMP.value: return True # if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get("语气助词") + self.__auxiliary_dict_list.get("时态助词"): if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_list: return True elif parent_pos == Pos.a.value: if current_arc_relation == Dependency.SBV.value and current_word not in self.__pronoun_list: # e.g.:材料新鲜 它很方便 return True if current_arc_relation == Dependency.ADV.value and ( current_word not in self.__adv_dict_list.get("程度副词") + self.__adv_dict_list.get("范围副词") or (current_arc_pos == Pos.p.value and current_word in ["比"])): # 比别家好 return True if current_arc_relation == Dependency.ATT.value: return True if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get( "语气助词") + self.__auxiliary_dict_list.get("结构助词"): return True elif parent_pos in [ Pos.n.value, Pos.nd.value, Pos.nh.value, Pos.ni.value, Pos.nl.value, Pos.ns.value, Pos.nt.value, Pos.nz.value ]: if current_arc_relation == Dependency.ADV.value: return True if current_arc_relation == Dependency.ATT.value: # 属性语义修饰名词 return True if current_arc_pos == Pos.u.value and current_word not in self.__auxiliary_dict_list.get( "语气助词") + self.__auxiliary_dict_list.get("结构助词"): # 美丽的 return True elif parent_pos == Pos.p.value: if current_arc_relation == Dependency.SBV.value: # 他给我感觉 return True if current_arc_relation == Dependency.VOB.value: # 给我感觉 return True if current_arc_relation == Dependency.POB.value: # 比别家好 return True elif parent_pos == Pos.d.value: if current_arc_relation == Dependency.SBV.value: return True if current_arc_relation == Dependency.VOB.value: # 没有|d 4|过于|d 5|甜腻 return True elif parent_pos in [Pos.i.value, Pos.r.value, Pos.q.value ] or current_arc_relation == Dependency.CMP.value: return True return False def __parse_opinion(self, core_word_index, arcs, words, postags): """ :param core_word_index: :param arcs: :param words: :param postags: :return: opinion_word_list """ has_vob = False sbv_word = () sbv_att_word_list = [] available_word_idx_list = [core_word_index] opinion_word_list = [] def word_root_index(core_word_idx, index): """ 查找词的root index :return: """ arc = arcs[index] idx = index if arc.relation == Dependency.HED.value else arc.head - 1 if idx == core_word_idx or idx == index: 
return idx else: return word_root_index(core_word_idx, idx) def do_parse_opinion(core_word_idx): """ 提取以动词为核心的观点,提取的主要结构主谓结构(SBV)、动宾结构(VOB)、状中结构(ADV)、动补结构(CMP)、介宾结构(POB) :return: """ nonlocal has_vob nonlocal sbv_word nonlocal sbv_att_word_list nonlocal available_word_idx_list for m, arc in enumerate(arcs): # tuple格式:(index, 句法依存关系, 词性, 词) current_word_tuple = (m, arc.relation, postags[m], words[m]) parent_word_index = arc.head - 1 parent_word_tuple = (parent_word_index, arcs[parent_word_index].relation, postags[parent_word_index], words[parent_word_index]) if arc.head == core_word_idx + 1 \ and (current_word_tuple[2] not in [Pos.wp.value, Pos.o.value, Pos.c.value, Pos.r.value, Pos.e.value] or (current_word_tuple[2] == Pos.r.value and current_word_tuple[3] not in self.__pronoun_list)) \ and self.__word_self_attention(parent_word_tuple[2], parent_word_tuple[3], current_word_tuple[1], current_word_tuple[2], current_word_tuple[3]): # 计算词的root词是否等于关键词 root_core_index = word_root_index(core_word_index, m) if root_core_index == core_word_index: if arc.relation == Dependency.VOB.value or ( arc.relation == Dependency.CMP.value and postags[current_word_tuple[0]] == Pos.a.value): has_vob = True available_word_idx_list.append(m) opinion_word_list.append(current_word_tuple) else: if arc.head - 1 in available_word_idx_list: available_word_idx_list.append(m) # 若是主谓结构先暂存,不加入观点词list if arc.relation == Dependency.SBV.value: if len(sbv_word) == 0: sbv_word = current_word_tuple else: # 计算词的root词是否等于sbv关键词 sbv_index = sbv_word[0] if len( sbv_word) > 0 else -1 root_sbv_index = word_root_index( sbv_index, current_word_tuple[0]) if root_sbv_index == sbv_index: # 若是主谓结构的其他属性词,暂存在主谓属性词列表 sbv_att_word_list.append( current_word_tuple) else: opinion_word_list.append( current_word_tuple) do_parse_opinion(m) do_parse_opinion(core_word_index) def need_sbv(): """ 判断是否需要主谓结构 :return: """ # 三元组判断,只有包含了动宾结构才把主谓结构加入 if has_vob: return True # 及物动词可以直接加sbv if postags[core_word_index] == Pos.a.value: return True # 形容词句意可以直接在sbv if words[core_word_index] in self.__vi_list: return True return False if need_sbv() and len(sbv_word) > 0: opinion_word_list.append(sbv_word) opinion_word_list += sbv_att_word_list return opinion_word_list def extract_opinion(self, comment, distinct_opinion=True, show_core_word=False, show_detail=False): """ 抽取观点 :param comment: :param distinct_opinion: 是否去重观点 :param show_core_word: 是否展示观点核心词 :param show_detail: 是否展示分词等详细信息 :return: """ subcomments = self.__smart_split_sentence(comment) opinion_list = [] for subcomment in subcomments: words, sentence_with_space = self.sentence_segment_add_space( subcomment) opinions = self.__parse_segment(words, show_detail) if len(opinions) > 0: opinion_list += opinions if distinct_opinion: opinion_list = self.__distinct_opinion(opinion_list) if not show_core_word: opinion_list = [opinion[2] for opinion in opinion_list] return opinion_list @classmethod def __distinct_opinion(cls, opinions): """ 观点去重 :param opinions: :return: """ index = 2 distinct_opinion_list = [] for n in range(1, len(opinions)): for m in range(n, 0, -1): opi_1 = opinions[m][index] opi_2 = opinions[m - 1][index] if len(opi_1) > len(opi_2): tmp = opinions[m - 1] opinions[m - 1] = opinions[m] opinions[m] = tmp for opinion in opinions: opi = opinion[index] if len(distinct_opinion_list) == 0: distinct_opinion_list.append(opinion) else: include = False for idx in range(0, len(distinct_opinion_list)): try: include |= distinct_opinion_list[idx][index].index( opi) > -1 except ValueError: pass if not include: 
distinct_opinion_list.append(opinion) return distinct_opinion_list def __parse_segment(self, words, show_detail=False): postags = self.__postagger.postag(words) word_tag_tuple_list = [] for i in range(len(words)): word_tag_tuple_list.append((str(i), words[i], postags[i])) arcs = self.__parser.parse(words, postags) # arcs 使用依存句法分析的结果 labels = self.__labeller.label(words, postags, arcs) # 语义角色标注 if show_detail: logger.info("|".join(words)) logger.info(" ".join('|'.join(tpl) for tpl in word_tag_tuple_list)) logger.info(" ".join("%d|%d:%s" % (n, arc.head, arc.relation) for n, arc in enumerate(arcs))) for label in labels: logger.info( str(label.index) + ":" + ",".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in label.arguments ])) # opinions = self.__parse_main_opinion(arcs, words, postags) opinions = self.__parse_opinions(arcs, words, postags) return opinions def __parse_opinions(self, arcs, words, postags): """ 给出核心词性,解释所有该词性的短语观点 :param arcs: :param words: :param postags: :return: """ opinions = [] for n, arc in enumerate(arcs): postag = postags[n] word = words[n] if postag in [Pos.v.value, Pos.a.value, Pos.i.value] or \ (postag == Pos.a.value and word not in self.__adv_list) or \ (arc.relation in [Dependency.HED.value, Dependency.COO.value] and postag not in [Pos.v.value, Pos.a.value, Pos.i.value, Pos.m.value, Pos.c.value]): opinion_word_list = self.__parse_opinion( n, arcs, words, postags) if self.__check_opinion(postag, word, opinion_word_list): opinion_str = self.__opinion_to_str( n, words, opinion_word_list) opinions.append((postag, words[n], opinion_str)) return opinions def __parse_main_opinion(self, arcs, words, postags): """ :param arcs: :param words: :param postags: :return: """ for n, arc in enumerate(arcs): if arc.relation == Dependency.HED.value: core_index = n core_pos = postags[core_index] opinion_word_list = self.__parse_opinion(core_index, arcs, words, postags) return core_pos, words[core_index], self.__opinion_to_str( core_index, words, opinion_word_list) @classmethod def __check_opinion(cls, core_word_pos, core_word, opinion_word_list): """ 检测opinion有效性 :param core_word_pos: :param core_word: :param opinion_word_list: :return: """ if len(opinion_word_list) > 0: return True if len(opinion_word_list) == 0 and core_word_pos not in [ Pos.v.value, Pos.d.value ]: return True if len(opinion_word_list ) == 0 and core_word_pos == Pos.v.value and len( core_word) > 1: # 入口即化|v return True return False def __opinion_to_str(self, core_word_index, words, opinion_word_list): """ 输出观点字符串 :param core_word_index: :param words: :param opinion_word_list: :return: """ index_list = [core_word_index] if self.__remove_core_word(words[core_word_index]): index_list = [] for opinion_word in opinion_word_list: index = opinion_word[0] index_list.append(index) index_list.sort() opinion = "" for index in index_list: opinion += words[index] return self.__remove_special_word(opinion) @classmethod def __remove_core_word(cls, word): if word == "是": return True return False def __remove_special_word(self, opinion): new_opinion = opinion for sp_word in self.__special_prefix_list: if opinion.rfind(sp_word) == 0: new_opinion = opinion[len(sp_word):] return self.__remove_special_word(new_opinion) return new_opinion
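# The extractor above recursively walks LTP dependency arcs out from a core word,
# keeping SBV/VOB/ADV/CMP/POB structures. A minimal, self-contained sketch of the
# same idea (the model directory below is a placeholder): find the HED (root) word
# of a sentence and join its direct SBV/VOB/ADV dependents in word order.
import os

from pyltp import Parser, Postagger, Segmentor

LTP_DIR = '/path/to/ltp_data_v3.4.0'  # placeholder model directory


def simple_opinion(sentence):
    segmentor = Segmentor()
    segmentor.load(os.path.join(LTP_DIR, 'cws.model'))
    postagger = Postagger()
    postagger.load(os.path.join(LTP_DIR, 'pos.model'))
    parser = Parser()
    parser.load(os.path.join(LTP_DIR, 'parser.model'))
    try:
        words = list(segmentor.segment(sentence))
        postags = postagger.postag(words)
        arcs = parser.parse(words, postags)
        # arc.head is 1-based; head == 0 marks the HED (root) word
        roots = [i for i, arc in enumerate(arcs) if arc.head == 0]
        if not roots:
            return ''
        root = roots[0]
        keep = {root}
        for i, arc in enumerate(arcs):
            # direct subject / object / adverbial of the root word
            if arc.head == root + 1 and arc.relation in ('SBV', 'VOB', 'ADV'):
                keep.add(i)
        return ''.join(words[i] for i in sorted(keep))
    finally:
        segmentor.release()
        postagger.release()
        parser.release()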
def getRelation(paragraph):
    """
    paragraph: a list of strings, each string is a sentence
    return: a list of relations and a dict recording how often each DSNF pattern fired
    """
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    for iteration, sentence in enumerate(paragraph):
        print("evaluating the " + str(iteration + 1) + "-th sentence")
        sentence = SentenceSplitter.split(sentence)[0]
        words = segmentor.segment(sentence)
        # print("\t".join(words))
        postags = postagger.postag(words)
        # list-of-string parameter is supported in 0.1.5
        # postags = postagger.postag(["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"])
        arcs = parser.parse(words, postags)
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        netags = recognizer.recognize(words, postags)
        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        entityList = findEntities(netags)
        entities = []
        for entity_indices in entityList:
            entity = ''
            for j in entity_indices:
                entity += words[j]
            entities.append(entity)
        print("entities in " + str(iteration + 1) + "-th sentence:", entities)
        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        relation = []
        for r in DSNF1_ret:
            dict_DSNF['num_DSNF1'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF2_ret:
            dict_DSNF['num_DSNF2'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF3_ret:
            dict_DSNF['num_DSNF3'] += 1
            relation.append(r)
            relations.append(r)
        for r in DSNF7_ret:
            dict_DSNF['num_DSNF7'] += 1
            relation.append(r)
            relations.append(r)
        print("with entities relation: ", relation)
        print("--" * 30)
    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()
    return relations, dict_DSNF
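# `findEntities` is called above but not defined in this file. A plausible sketch,
# assuming LTP's NE tag scheme ('O' for non-entities, otherwise a B-/I-/E-/S-
# prefix plus a type such as Nh/Ni/Ns): group consecutive word indices that
# belong to one named entity.
def findEntities(netags):
    entity_list = []
    current = []
    for i, tag in enumerate(netags):
        if tag == 'O':
            continue
        prefix = tag[0]
        if prefix == 'S':        # single-word entity
            entity_list.append([i])
        elif prefix == 'B':      # entity begins
            current = [i]
        elif prefix == 'I':      # entity continues
            current.append(i)
        elif prefix == 'E':      # entity ends
            current.append(i)
            entity_list.append(current)
            current = []
    return entity_list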
class NLP: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir: str,用户自定义词典目录 default_model_dir: str,ltp模型文件目录 """ default_user_dict_dir = '../../resource/' # 默认的用户词典目录,清华大学法律词典 default_model_dir = '../../model/' # ltp模型文件目录 def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir): self.default_user_dict_dir = user_dict_dir self.default_model_dir = model_dir # 初始化分词器 # pynlpir.open() # 初始化分词器 # 添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快 files = os.listdir(user_dict_dir) for file in files: file_path = os.path.join(user_dict_dir, file) # 文件夹则跳过 if os.path.isdir(file): continue with open(file_path, 'r', encoding='utf-8') as f: line = f.readline() while line: word = line.strip('\n').strip() jieba.add_word(word) # print(c_char_p(word.encode())) # pynlpir.nlpir.AddUserWord(c_char_p(word.encode())) line = f.readline() # 加载ltp模型 # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model')) if postag_flag or ner_flag or parse_flag: print('load model failed!') def segment(self, sentence, entity_postag=dict()): """采用NLPIR进行分词处理 Args: sentence: string,句子 entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生 Returns: lemmas: list,分词结果 """ # 添加实体词典 if entity_postag: for entity in entity_postag: # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode())) jieba.add_word(entity) # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode())) # 单个用户词加入示例 # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode())) # 单个用户词加入示例 # 分词,不进行词性标注 # lemmas = pynlpir.segment(sentence, pos_tagging=False) lemmas = jieba.lcut(sentence) # pynlpir.close() # 释放 return lemmas def postag(self, lemmas): """对分词后的结果进行词性标注 Args: lemmas: list,分词后的结果 entity_dict: set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns: words: WordUnit list,包含分词与词性标注结果 """ words = [] # 存储句子处理后的词单元 # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i+1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() # 释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word: str,单词 Returns: post_tag: str,该单词的词性标注 """ post_tag = self.postagger.postag([word, ]) return post_tag[0] def netag(self, words): """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Args: words: WordUnit list,包含分词与词性标注结果 Returns: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标书结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) # print('\t'.join(netags)) # just for test words_netag = EntityCombine().combine(words, netags) # self.recognizer.release() # 释放 return words_netag def parse(self, words): """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 Returns: *: SentenceUnit,该句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation # self.parser.release() return SentenceUnit(words) def close(self): """关闭与释放nlp""" # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release()
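# A usage sketch for the NLP pipeline above (the default model and user-dict
# directories are assumed to exist; WordUnit/SentenceUnit/EntityCombine are
# project classes):
def demo_nlp_pipeline():
    nlp = NLP()
    lemmas = nlp.segment('李克强总理今天访问北京大学。')  # jieba-based segmentation
    words = nlp.postag(lemmas)           # WordUnit list with POS tags
    words_netag = nlp.netag(words)       # named entities merged into WordUnits
    sentence = nlp.parse(words_netag)    # SentenceUnit with heads and dependencies
    print(sentence.to_string())
    nlp.close()                          # release the LTP models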
class RequestHandler(): def __init__(self): self.intents = [ 'translation', 'app', 'calc', 'match', 'radio', 'health', 'novel', 'video', 'cinemas', 'music', 'stock', 'train', 'news', 'message', 'map', 'weather', 'cookbook', 'tvchannel', 'flight', 'schedule', 'riddle', 'email', 'contacts', 'bus', 'website', 'datetime', 'poetry', 'lottery', 'chat', 'epg', 'telephone' ] self.segmentor = Segmentor() # 初始化实例 CWS self.segmentor.load(configs.cws_path) # 加载模型 self.postagger = Postagger() # 初始化实例 POS Tagger self.postagger.load(configs.pos_path) # 加载模型 self.labeller = SementicRoleLabeller() # 初始化实例 SRLer self.labeller.load(configs.srl_path) # 加载模型 self.parser = Parser() # 初始化实例 Parser self.parser.load(configs.parser_path) # 加载模型 self.ac = ACAutomatons() self.clf_31 = NBSVM() self.char_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-ch.pkl') self.word_vectorizer_31 = joblib.load(configs.models_path + '/nbsvm-vocab-wd.pkl') self.clf_31 = joblib.load(configs.models_path + '/nbsvm_31.pkl') self.ch2_ = joblib.load(configs.models_path + '/nbsvm-feature_selector.pkl') self.word_vectorizer_tv = joblib.load(configs.models_path + '/vocab-wd_epg-tvchannel.pkl') self.char_vectorizer_tv = joblib.load(configs.models_path + '/vocab-ch_epg-tvchannel.pkl') self.clf_tv = joblib.load(configs.models_path + '/svm_epg-tvchannel.pkl') self.word_vectorizer_movie = joblib.load(configs.models_path + '/vocab-wd_video-cinemas.pkl') self.char_vectorizer_movie = joblib.load(configs.models_path + '/vocab-ch_video-cinemas.pkl') self.clf_movie = joblib.load(configs.models_path + '/svm_video-cinemas.pkl') self.char_vectorizer_internet = joblib.load( configs.models_path + '/vocab-ch_website-app.pkl') self.word_vectorizer_internet = joblib.load( configs.models_path + '/vocab-wd_website-app.pkl') self.clf_internet = joblib.load(configs.models_path + '/svm_website-app.pkl') self.char_vectorizer_star = joblib.load(configs.models_path + '/vocab-ch_video-music.pkl') self.clf_star = joblib.load(configs.models_path + '/svm_video-music.pkl') self.word_vectorizer_star = joblib.load(configs.models_path + '/vocab-wd_video-music.pkl') self.char_vectorizer_video = joblib.load(configs.models_path + '/vocab-ch_video-epg.pkl') self.word_vectorizer_video = joblib.load(configs.models_path + '/vocab-wd_video-epg.pkl') self.clf_video = joblib.load(configs.models_path + '/svm_video-epg.pkl') def getResult(self, sentence): """1. Complete the classification in this function. Args: sentence: A string of sentence. Returns: classification: A string of the result of classification. """ processed = self.preprocess(sentence) return self.pipeline(processed) def getBatchResults(self, sentencesList): """2. You can also complete the classification in this function, if you want to classify the sentences in batch. Args: sentencesList: A List of Dictionaries of ids and sentences, like: [{'id':331, 'content':'帮我打电话给张三' }, {'id':332, 'content':'帮我订一张机票!' }, ... ] Returns: resultsList: A List of Dictionaries of ids and results. The order of the list must be the same as the input list, like: [{'id':331, 'result':'telephone' }, {'id':332, 'result':'flight' }, ... 
] """ resultsList = [] for sentence in sentencesList: resultDict = {} resultDict['id'] = sentence['id'] resultDict['result'] = self.getResult(sentence['content']) resultsList.append(resultDict) return resultsList def pattern_match(self, sample): srl_res = self.sRLMatch(sample) if srl_res != None: return srl_res else: rul_res = self.ruleMatch(sample) if rul_res != None: return rul_res else: return None def ruleMatch(self, sample): domains = get_rule(sample['query'], self.ac) if len(domains) < 1: return None else: sorted_domains = aggregate_domains(domains) for each in sorted_domains: if each[0] == 'datetime': nouns = get_nouns(sample['query'], 'festival', self.ac) if len(nouns) > 0: return 'datetime' else: continue elif each[0] == 'email': if len( set(sample['word']) & set(['写', '回复', '转发', '打开', '查收', '查看', '答复']) ) > 0: return 'email' else: continue else: return None def sRLMatch(self, sample): srl_res = getSRL(sample['query'], self.segmentor, self.postagger, self.parser, self.labeller) if len(srl_res) == 0: #no any predicate in query or single entity return None else: for res in srl_res: predicate_domains = get_predicate(res[0], self.ac) if len(predicate_domains) < 1: continue #no such a predicate in database else: sorted_domains = aggregate_domains(predicate_domains) for each in sorted_domains: if each[0] == 'app': nouns = get_nouns(res[1], 'app', self.ac) if len(nouns) > 0: return 'app' else: continue elif each[0] == 'cinemas': nouns = get_nouns(res[1], 'film', self.ac) if len(nouns) > 0: return 'Movie_stuff' else: continue elif each[0] == 'contacts': # 'nr' by POS-tagger indicates a person's name if 'nr' in sample['tag']: return 'contacts' else: continue elif each[0] == 'cookbook': nouns = get_nouns(res[1], 'food', self.ac) if len(nouns) > 0: # 如果命中任何专有名词,则划分到意图app return 'cookbook' else: continue elif each[0] == 'tvchannel': nouns = get_nouns(res[1], 'tvchannel', self.ac) if len(nouns) > 0: return 'TV_stuff' else: continue elif each[0] == 'video': nouns = get_nouns(res[1], 'video', self.ac) if len(nouns) > 0: return 'Video_stuff' else: continue elif each[0] == 'health': nouns = get_nouns(res[1], 'disease', self.ac) nouns.extend(get_nouns(res[1], 'drug', self.ac)) if len(nouns) > 0: return 'health' else: continue elif each[0] == 'music': nouns_song = get_nouns(res[1], 'song', self.ac) nouns_singer = get_nouns(res[1], 'singer', self.ac) if len(nouns_song) > 0: return 'music' elif len(nouns_singer) > 0: return 'Star_stuff' else: continue elif each[0] == 'novel': nouns = get_nouns(res[1], 'novel', self.ac) if '小说' in res[1] or len(nouns) > 0: return 'novel' else: continue elif each[0] == 'poetry': nouns = get_nouns(res[1], 'poet', self.ac) if len(nouns) > 0: return 'poetry' else: continue elif each[0] == 'radio': if len(get_nouns(res[1], 'radio', self.ac)) > 0: return 'radio' else: continue elif each[0] == 'stock': nouns = get_nouns(res[1], 'stock', self.ac) if len(nouns) > 0: return 'stock' else: continue elif each[0] == 'website': nouns = get_nouns(res[1], 'website', self.ac) if len(nouns) > 0: return 'Internet_stuff' else: continue def retrieval(self, sample): """ To find proper nouns to handle single entity in a query :param sample: a dict indicates a query and its POS tag :return:a string indicates one certain intent """ pn_res = doRetrieval(sample['query'], self.ac) #look up single instance sorted_domains = aggregate_domains(pn_res) if len(sorted_domains) == 1: #one instance domain = sorted_domains[0][0] if len(max(sorted_domains[0][1], key=len)) > len(sample['query']) / 2: if 
domain == 'airline': return 'flight' if domain in ['railwaystation', 'airport']: return 'map' if domain == 'app': return 'app' if domain == 'contacts': return 'contacts' if domain in ['drug', 'disease']: return 'health' if domain == 'festival': return 'datetime' if domain in ['moviestar', 'film', 'video']: return 'video' if domain == 'food': return 'cookbook' if domain == 'novel': return 'novel' if domain == 'place': return 'map' if domain == 'poet': return 'poetry' if domain == 'radio': return 'radio' if domain in ['singer', 'song']: return 'music' if domain == 'sports': return 'match' if domain == 'stock': return 'stock' if domain == 'tvchannel': return 'tvchannel' if domain == 'website': return 'website' return None else: return None def classifyAllIntents(self, sample): """ A classifier for 31 intents including chitchat :param sample: a dict indicates a query and its POS tag :return:a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_31.transform(text) test_wd = self.word_vectorizer_31.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) test_vec = self.ch2_.transform(test_vec) pred = self.clf_31.predict(test_vec) return pred.tolist()[0] def epgOrTvchannel(self, sample): """ A classifier to label a instance with 'epg' or 'tvchannel' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_tv.transform(text) test_wd = self.word_vectorizer_tv.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_tv.predict(test_vec) return pred.tolist()[0] def videoOrCinemas(self, sample): """ A classifier to label a instance with 'video' or 'cinemas' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_movie.transform(text) test_wd = self.word_vectorizer_movie.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_movie.predict(test_vec) return pred.tolist()[0] def websiteOrApp(self, sample): """ A classifier to label a instance with 'website' or 'app' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_internet.transform(text) test_wd = self.word_vectorizer_internet.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_internet.predict(test_vec) return pred.tolist()[0] def videoOrMusic(self, sample): """ A classifier to label a instance with 'video' or 'music' :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_star.transform(text) test_wd = self.word_vectorizer_star.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_star.predict(test_vec) return pred.tolist()[0] def videoOrEpg(self, sample): """ A classifier to label a instance with 'epg' or 'video' :param sample: a dict indicates a query and its POS tag :return: a string indicates 
one certain intent """ raw_query = sample['query'] text = [''.join([w for w in jieba.cut(raw_query)])] test_ch = self.char_vectorizer_video.transform(text) test_wd = self.word_vectorizer_video.transform(text) test_vec = hstack([test_ch, test_wd]) test_vec = csr_matrix(test_vec) pred = self.clf_video.predict(test_vec) return pred.tolist()[0] def pipeline(self, sample, use_pse=True, use_retrieval=False): """ A pipeline to label a instance with one of 31 possible intents :param sample: a dict indicates a query and its POS tag :return: a string indicates one certain intent """ if use_pse: ps_res = prettySureExpression(sample['query'], self.ac) if len(list(set([_[1][0] for _ in ps_res]))) == 1: return ps_res[0][1][0] pm_res = self.pattern_match(sample) if pm_res == 'TV_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['epg', 'tvchannel']: return clf_res else: return self.epgOrTvchannel( sample) #a ML classifier to label epg or tvchannel elif pm_res == 'Movie_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'cinemas']: return clf_res else: return self.videoOrCinemas(sample) elif pm_res == 'Internet_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['website', 'app']: return clf_res else: return self.websiteOrApp(sample) elif pm_res == 'Star_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'music']: return clf_res else: return self.videoOrMusic(sample) elif pm_res == 'Video_stuff': clf_res = self.classifyAllIntents( sample) # a ML classifier to label 31 intentions if clf_res in ['video', 'epg']: return clf_res else: return self.videoOrEpg(sample) elif pm_res == None: if use_retrieval: ret_res = self.retrieval(sample, self.ac) if ret_res == None: return self.classifyAllIntents( sample ) # no pattern matched, so that classify it using ML else: return ret_res else: return self.classifyAllIntents(sample) else: return pm_res def preprocess(self, raw_query): """ To segment a raw user query into words and POS-tags it :param raw_query: a string generated by a user :return: a dict indicate the segmented query ,raw query and POS-tags """ tmp = pseg.cut(raw_query) words = [] pos = [] for word, flag in tmp: words.append(word) pos.append(flag) inst = {} inst['tag'] = pos inst['word'] = words del words del pos inst['query'] = raw_query return inst def close(self): """ To release relevant models """ self.postagger.release() # 释放模型 self.segmentor.release() # 释放模型 self.labeller.release() # 释放模型 self.parser.release() # 释放模型 del self.ac gc.collect()
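# The joblib artifacts loaded in __init__ (char/word vectorizers, chi2 feature
# selector, classifiers) must come from a training script that is not shown here.
# A minimal sketch of one plausible stack, with sklearn's LinearSVC standing in
# for the project's NBSVM; `texts` and `labels` are assumed to be parallel lists
# of raw queries and intent names.
import jieba
import joblib
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC


def train_intent_model(texts, labels, out_dir='.'):
    seg_texts = [' '.join(jieba.cut(t)) for t in texts]
    char_vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))
    word_vec = TfidfVectorizer(analyzer='word')
    # the same char+word feature layout the pipeline above rebuilds at predict time
    X = csr_matrix(hstack([char_vec.fit_transform(seg_texts),
                           word_vec.fit_transform(seg_texts)]))
    selector = SelectKBest(chi2, k=min(20000, X.shape[1])).fit(X, labels)
    clf = LinearSVC().fit(selector.transform(X), labels)
    joblib.dump(char_vec, out_dir + '/nbsvm-vocab-ch.pkl')
    joblib.dump(word_vec, out_dir + '/nbsvm-vocab-wd.pkl')
    joblib.dump(selector, out_dir + '/nbsvm-feature_selector.pkl')
    joblib.dump(clf, out_dir + '/nbsvm_31.pkl')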
    def split_sentence(self, sentence=None, say_word_list: List[str] = None,
                       cycle: bool = True, ratio: float = None) -> str:
        """
        Segment a sentence, locate the first "say"-like verb through its SBV arc,
        and return the speaker plus the speech content that follows it.
        :param sentence: raw sentence text
        :param say_word_list: verbs similar to "say"
        :param cycle: whether to run the extraction at all
        :param ratio: similarity threshold for appending the follow-up sentence
        :return: the extracted speech string
        """
        LTP_DATA_PATH = r'D:\pyltp-master\ltp_data_v3.4.0'
        cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model')
        ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model')
        par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model')
        postagger = Postagger()
        postagger.load(pos_model_path)
        print('Postagger loaded!')
        parser = Parser()
        parser.load(par_model_path)
        print('Parser loaded!')
        segment = Segmentor()
        segment.load(cws_model_path)
        print('CWS loaded!')
        if cycle is True:
            try:
                lines = sentence
                sentence = list(segment.segment(lines))
                # words similar to "say"
                find_say_word = [word for word in sentence if word in say_word_list]
                if len(find_say_word) == 0:
                    print('No "say"-like word found!')
                else:
                    post_word = list(postagger.postag(sentence))
                    parse_word = parser.parse(sentence, post_word)
                    parse_word = [(arc.head, arc.relation) for arc in parse_word]
                    counter_index = 0
                    location_part1 = ''
                    location_part2 = ''
                    location_part3 = ''
                    for index, word in enumerate(parse_word):
                        # first SBV arc: the subject plus its head verb
                        if word[-1] == 'SBV':
                            counter_index = word[0]
                            location_part1 += sentence[index]
                            location_part1 += sentence[word[0] - 1]
                            break
                    # After the SBV head, take either the double-quoted span or the
                    # spans ending at the next two full stops; the second span is
                    # kept only if it is similar enough to the first.
                    if sentence[counter_index] == '"':
                        for word_2 in sentence[counter_index + 1:]:
                            if word_2 == '"':
                                break
                            location_part2 += word_2
                    else:
                        for index_2, word_2 in enumerate(sentence[counter_index:]):
                            if word_2 == '。':
                                # words after the first full stop (absolute index)
                                for word_4 in sentence[counter_index + index_2 + 1:]:
                                    if word_4 == '。':
                                        break
                                    location_part3 += word_4
                                break
                            location_part2 += word_2
                    # similarity of the two sentences around the "say" verb
                    cal_ratio = difflib.SequenceMatcher(
                        None, location_part2, location_part3).ratio()
                    if cal_ratio > ratio:
                        result = location_part1 + location_part2 + location_part3
                    else:
                        result = location_part1 + location_part2
                    segment.release()
                    postagger.release()
                    parser.release()
                    return result.strip('\n')
            except Exception as e:
                print(e)
        elif cycle is False:
            print('cycle=False, nothing to do.')
        else:
            raise TypeError('invalid input type for cycle')
        print('POS tagging and context extraction finished')
        print('-' * 20, 'separator', '-' * 20)
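# How the SequenceMatcher test at the end of split_sentence behaves: ratio()
# returns a similarity in [0, 1], and location_part3 is kept only when that
# similarity exceeds the `ratio` threshold passed in.
import difflib

part2 = '他表示经济形势总体平稳'
part3 = '他认为经济形势稳中向好'
print(difflib.SequenceMatcher(None, part2, part3).ratio())  # a value in [0, 1]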
def extract_comment(self, article, say_words): """ 抽取言论 :param news_path: 新闻路径 :param say_words: similar to "say" :return:result:list[[person, say, comment],...] """ # ltp路径 LTP_DATA_PATH = '../ltp_data_v3.4.0' cws_model_path = os.path.join(LTP_DATA_PATH, 'cws.model') pos_model_path = os.path.join(LTP_DATA_PATH, 'pos.model') ner_model_path = os.path.join(LTP_DATA_PATH, 'ner.model') par_model_path = os.path.join(LTP_DATA_PATH, 'parser.model') postagger = Postagger() postagger.load(pos_model_path) print('Postagger loaded!') recognizer = NamedEntityRecognizer() recognizer.load(ner_model_path) print('NER loaded!') parser = Parser() parser.load(par_model_path) print('Parser loaded!') result = [] sentences = self.cut_sentence(self.token(article)) for s_index, sentence in enumerate(sentences): words = self.cut_word(sentence) pos = self.word_pos(sentence, postagger) ner_list = self.ner(words, pos, recognizer) parse_list = self.dependency_parse(words, pos, parser) if 'S-Nh' or 'S-Ni' or 'S-Ns' in ner_list: comment = '' for p_index, p in enumerate(parse_list): # p[0]-1:说的索引(words,parse_list中都是) # p_index:主语位置 if (p[1] == 'SBV') and words[p[0] - 1] in say_words: say = words[p[0] - 1] person = words[p_index] p_i = 1 while p_i <= p_index and parse_list[p_index - p_i][1] == 'ATT': person = words[p_index - p_i] + person p_i = p_i + 1 # 说后是。找前一句话的“” if words[p[0]] == '。': # print('说。') i = 1 last_sentence = sentences[s_index - i] last_words = self.cut_word(last_sentence) begin = self.find_str_index(last_words, 0, ['“']) end = self.find_str_index(last_words, 0, ['”']) if begin != -1 and end != -1 and begin < end: comment = ''.join(last_words[begin + 1:end]) else: while begin == -1 and end != -1: i = i + 1 last_sentence = sentences[s_index - i] last_words = self.cut_word(last_sentence) begin = self.find_str_index( last_words, 0, ['“']) while i > 0: comment = comment + sentences[s_index - i] i = i - 1 else: begin = self.find_str_index(words, p[0], ['“']) end = self.find_str_index(words, p[0], ['”']) if begin != -1 and end != -1 and parse_list[ end - 1][0] == 'WP': comment = ''.join(words[begin:end]) elif begin != -1 and end == -1: comment = ''.join(words[begin:]) i = 1 next_sentence = sentences[s_index + i] while end == -1: end = self.find_str_index( self.cut_word(next_sentence), 0, ['”']) i = i + 1 if len(sentences) > s_index + i: next_sentence = sentences[s_index + i] else: break comments = '' while i > 1 and len(sentences) > s_index + i: comments = sentences[s_index + i] + comments i = i - 1 comment = comment + comments else: # 说后面跟,或: if words[p[0]] == ',' or words[ p[0]] == ',' or words[p[0]] == ':': # print('说,') comment = ''.join(words[p[0] + 1:]) # end = self.find_str_index(words, p[0] + 1, ['。', '!']) # if end != -1: # comment = ''.join(words[p[0] + 1:end]) # 说后跟宾语 elif parse_list[ p[0]][1] == 'VOB' or parse_list[ p[0]][1] == 'IOB': print('告诉谁') i = 0 comment = ''.join(words[p[0] + 1:]) # while len(comment) == 0: # end = self.find_str_index(words, p[0] + i, [ '。', '!']) # if end != -1: # comment = ''.join(words[p[0] + i:end]) # i = i + 1 # 说后面直接跟内容 else: comment = ''.join(words[p[0]:]) # print('说内容') # end = self.find_str_index(words, p_index, [ '。', '!']) # if end != -1: # comment = ''.join(words[p[0]:end]) print(parse_list) # print(words[p[0]]) print(sentence) print('[{}] [{}] [{}]'.format(person, say, comment)) print('-' * 50) item = [] # item.append(person) # item.append(say) # item.append(comment) result.append([person, say, comment]) # result.append(item) postagger.release() 
        recognizer.release()
        parser.release()
        return result
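# The core test of extract_comment in isolation: given `words` and the
# (head, relation) pairs returned by dependency_parse, yield a
# (speaker, say_verb, tail) triple for every SBV arc whose head word is a
# "say"-like verb. A minimal sketch of that inner condition:
def find_speech_heads(words, parse_list, say_words):
    for i, (head, rel) in enumerate(parse_list):
        if rel == 'SBV' and words[head - 1] in say_words:
            speaker = words[i]            # subject of the "say" verb
            say = words[head - 1]         # the verb itself (head is 1-based)
            tail = ''.join(words[head:])  # everything after the verb
            yield speaker, say, tail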
class DSFN: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity("../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir # 加载ltp模型 # default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 self.segmentor = Segmentor() user_dict = "..\\source\\user.txt" segmentor_flag = self.segmentor.load_with_lexicon(os.path.join(default_model_dir, 'cws.model'), user_dict) # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model')) # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model')) if segmentor_flag or postag_flag or ner_flag or parser_flag: # 可能有错误 print('load model failed') def segment(self, sentence, entity_postag=dict()): words = self.segmentor.segment(sentence) lemmas = [] for lemma in words: lemmas.append(lemma) return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def splitSentence(self,text): pattern = r'。|!|?|;|=' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) # print(result_list) return result_list def splitSentenceByComma(self,text): pattern = r',' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) return result_list def not_empty(self,s): return s and "".join(s.split()) def dsfn1_2_3_4COO(self, sentence, item1, item2): allTripes = [] """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ if 
self.dsfnConstraints3(sentence,item1,item2) and (item1.dependency == "ATT"): AttWord = item1.head_word AttWordDict = dict() AttWordStr = "" while AttWord.ID < item2.ID: AttWordDict[AttWord.ID] = AttWord.lemma # AttWordStr += AttWord.lemma if (AttWord.dependency == "ATT"): AttWord = AttWord.head_word else: break if (AttWord.ID == item2.ID): flag = True while flag: len1 = len(AttWordDict) AttList = AttWordDict.keys() for id in range(item1.ID + 1, item2.ID): item = sentence.get_word_by_id(id) if item.head_word != None and item.head_word.ID in AttList and (item.dependency == "ATT"): AttWordDict[item.ID] = item.lemma if len1 == len(AttWordDict): flag = False else: flag = True AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0]) AttWordStr = "" for i in AttWordDict: AttWordStr += i[1] # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, AttWordStr, item2.lemma]) """ 考虑DSFN2的情况 """ if item1.dependency == "SBV": pred1 = item1.head_word predDict = dict() predDict[pred1.ID] = pred1.lemma if item2.dependency == "VOB": pred2 = item2.head_word predDict[pred2.ID] = pred2.lemma if (len(predDict) == 1): PredWordStr = "" for i in predDict: PredWordStr += predDict[i] # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, PredWordStr, item2.lemma]) """ 新加,为了考虑“习近平视察和访问上海”的情况 """ if len(predDict) ==2: num = self.get_entity_num_between(pred1,pred2,sentence) # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: # print("DSFN2三元组:(" + item1.lemma + "," + pred1.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred1.lemma, item2.lemma]) # print("DSFN2三元组:(" + item1.lemma + "," + pred2.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred2.lemma, item2.lemma]) """ DSFN3.0 """ pred = None if item1.dependency == "SBV" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word elif item1.dependency == "FOB" and item2.dependency == "POB": # 考虑介词为“被”的情况,如 “小王被小明所陷害” pred = item1.head_word prep = item2.head_word c = item1 item1 = item2 item2 = c if pred != None and prep != None: if prep.dependency == "ADV": if prep.head_word.ID == pred.ID: pred2 = None object = None objectForPred2 = None for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "VOB" and item.head_word.ID == pred.ID: object = item objectDict = dict() objectDict[object.ID] = object for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == object.ID: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma # print( # "DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + objectStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + objectStr, item2.lemma]) # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "" + object.lemma + "," + item2.lemma + ")") # allTripes.append([item1.lemma, pred.lemma + "" + object.lemma, item2.lemma]) if object == None: # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma , item2.lemma]) """ DSFN4 """ pred = None prep = None prep1 = None pred2 = None if item1.dependency == "SBV" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word if prep.dependency == "CMP": pred2 = prep.head_word if pred2.ID == 
pred.ID: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma]) else : num = self.get_entity_num_between(pred1, pred2, sentence) # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: for word in sentence.words: if word.dependency == "CMP" and word.head_word.ID == pred.ID: prep1 = word if prep1!=None: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma]) else: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred.lemma , item2.lemma]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma]) """ DSFN5 """ # self.dsfn5and6(rawSentence,sentence,item1,item2) return allTripes def get_entity_num_between(self,verb1,verb2,sentence): """ 获得两个动词之间的实体数量 Parameters ---------- entity1 : WordUnit,动词1 entity2 : WordUnit,动词2 Returns: num:int,两动词间的实体数量 """ if verb1.ID > verb2.ID: c = verb1 verb1 = verb2 verb2 = c num = 0 i = verb1.ID while i < verb2.ID-1: if self.is_entity(sentence.words[i]): num +=1 i +=1 return num def is_entity(self,entry): """判断词单元是否是实体 Args: entry:WordUnit,词单元 Returns: *:bool,实体(True),非实体(False) """ #候选实体词性列表 entity_postags = ['nh','ni','ns','nz','j','n','v'] # print(entry.lemma+" : "+entry.postag) if entry.postag in entity_postags: return True else: return False def dsfnAttCOO(self,sentence,item1,item2): item1Att = item1 item2Att = item2 while item1Att.dependency == "ATT": item1Att = item1Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2) if allTripe == None or len(allTripe) == 0: while item2Att.dependency == "ATT": item2Att = item2Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1,item2Att) if allTripe == None or len(allTripe) == 0: allTripe = self.dsfn1_2_3_4COO(sentence,item1Att,item2Att) for tripe in allTripe: if tripe[0] == item1Att.lemma: tripe[0] = item1.lemma if tripe[2] == item2Att.lemma: tripe[2] = item2.lemma return allTripe def dsfn5COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word allTripes1 = self.dsfn1_2_3_4COO(sentence,item1COO,item2) # print(allTripes1) for tripe in allTripes1: if tripe[0] == item1COO.lemma: tripe[0] = item1.lemma elif tripe[2] == item1COO.lemma: tripe[2] = item1.lemma return allTripes1 # print("allTripes1"+str(allTripes1)) def dsfn6COO(self,sentence,item1,item2): if item2.dependency == "COO": item2COO = item2.head_word allTripes2 = self.dsfn1_2_3_4COO(sentence,item1,item2COO) for tripe in allTripes2: if tripe[2] == item2COO.lemma: tripe[2] = item2.lemma elif tripe[0] == item2COO.lemma: tripe[0] = item2.lemma return allTripes2 def dsfn5and6COO(self,sentence,item1,item2): if item1.dependency == "COO": item1COO = item1.head_word if item2.dependency == "COO": item2COO = item2.head_word allTripe = self.dsfn1_2_3_4COO(sentence,item1COO,item2COO) for tripe in allTripe: if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma: tripe[0] = item1.lemma tripe[2] = item2.lemma if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma: tripe[2] = item1.lemma tripe[0] = 
item2.lemma return allTripe def dsfnStartCOO3(self, rawSentence, entity1, entity2,all_entity): nounRelatedWithPosition = ['主席','总理','教授','校长'] resultList = [] lemmas = dsfn.segment(rawSentence) words = dsfn.postag(lemmas) words_netag = dsfn.netag(words) sentence = dsfn.parse(words_netag) print(sentence.to_string()) Rawitem1 = None Rawitem2 = None item1 = None item2 = None Rawitem1Index = -1 Rawitem2Index = -1 indexList = [-1,-1] for item in sentence.words: if (item.lemma == entity1): Rawitem1 = item if (item.lemma == entity2): Rawitem2 = item if Rawitem1 != None and Rawitem2 != None and (Rawitem1.ID!=Rawitem1Index or Rawitem2.ID!=Rawitem2Index): Rawitem1Index = Rawitem1.ID Rawitem2Index = Rawitem2.ID # if item1 == None or item2 == None: # return None item1 = Rawitem1 item2 = Rawitem2 if item1.ID > item2.ID: c = item1 item1 = item2 item2 = c # print(str(item1.ID) + " " + str(item2.ID)) itemCopy1 = item1 itemCopy2 = item2 if self.dsfnConstraints2(sentence,item1,item2,all_entity) == False: continue allTripes = self.dsfnStartCOO2(sentence,item1,item2) # print("111"+item2.lemma) if allTripes!=None and len(allTripes) == 0: # return None # if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item1.dependency == "ATT": # item1 = item1.head_word while item1.dependency == "ATT": item1 = item1.head_word # if 'n' in item1.postag and item1.postag not in ['nh', 'ns', 'nz', 'ni']: # pass # else: # item1 = itemCopy1 # if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item2.dependency == "ATT": # item2 = item2.head_word while item2.dependency == "ATT": item2 = item2.head_word allTripes = self.dsfnStartCOO2(sentence, item1, item2) if len(allTripes) != 0: for tripe in allTripes: if tripe[1]!= "": if tripe[0] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[0] = item1.lemma+""+itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[0] = itemCopy1.lemma+""+item1.lemma else: tripe[0] = itemCopy1.lemma elif tripe[2] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[2] = item1.lemma+""+itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[2] = itemCopy1.lemma+""+item1.lemma else: tripe[2] = itemCopy1.lemma # tripe[2] = itemCopy1.lemma if tripe[0] == item2.lemma: if item2.ID < itemCopy2.ID: tripe[0] = item2.lemma + ""+ itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[0] = itemCopy2.lemma + ""+ item2.lemma else: tripe[0] = itemCopy2.lemma elif tripe[2] == item2.lemma: # print(item2.lemma) if item2.ID < itemCopy2.ID: tripe[2] = item2.lemma + ""+ itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[2] = itemCopy2.lemma + ""+ item2.lemma else: tripe[2] = itemCopy2.lemma # print("12345") resultList.append(tripe) else: for tripe in allTripes: if tripe[1]!="": resultList.append(tripe) # if len(resultList) > 0: # return np.array(set([tuple(t) for t in resultList])) if item1 == None or item2 == None: return None if len(resultList) > 0: return np.array(set([tuple(t) for t in resultList])) def dsfnStartCOO2(self, sentence, item1, item2): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] itemCopy1 = item1 itemCopy2 = item2 """ 来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV] """ # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if 
allTripes == None or len(allTripes) == 0: # print("44444444444") # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第一次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print("第二次") pred1 = None subForCoo = None for item in sentence.words: if item.postag == "v" and item.dependency == "COO": pred1 = item.head_word for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred1.ID: for phrase in sentence.words: if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID: subForCoo = phrase if subForCoo == None or ( subForCoo != None and subForCoo.ID == word.ID): # 处理动词COO的情况,必须要保证此并列动词没有额外主语。 # 考虑到:习近平主席视察厦门,李克强总理访问香港 word.head_word = item allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第二次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print(np.array(set([tuple(t) for t in resultList]))) return resultList def dsfnConstraints1(self,rawSentence,maxLength): """ :param rawSentence: 原句子 :param maxLength: 句子的最大长度 :return: 小于maxLength的长度 """ newSentence = [] if len(rawSentence) <= maxLength: newSentence.append(rawSentence) return newSentence else: newSentence = self.splitSentenceByComma(rawSentence) return newSentence def dsfnConstraints2(self,sentence,item1,item2,allEntities): countEntity = 0 countChar = 0 for index in range(item1.ID+1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if word.lemma in allEntities: countEntity +=1 if countEntity > 3: return False elif countChar > 12: return False else: return True def dsfnConstraints3(self,sentence,item1,item2): countChar = 0 for index in range(item1.ID+1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if countChar > 5: return False else: return True def getSPO(self,sentence_list): for sentence in sentence_list: RawSentence = sentence lemmas = self.segment(sentence) words = self.postag(lemmas) words_netag = self.netag(words) sentence = self.parse(words_netag) print(sentence.to_string()) for item in sentence.words: if (item.head_word == None and item.postag == "v" ) or (item.postag == "v" and item.dependency == "COO" and item.head_word.head_word == None): relation_verb = item if item.head_word==None: verbId = item.ID verbId2 = None elif item.head_word.head_word == None: verbId = item.ID verbId2 = item.head_word.ID O_dict = dict() S_dict = dict() OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if (item.dependency == "VOB" and item.head_word.ID == verbId) or (item.dependency == "POB" and item.head_word.ID == verbId)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verbId) or(item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId): OBJ = item O_dict[OBJ.ID] = OBJ.lemma # if 
item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" \ # and item.head_word.head_word.ID == verbId: # verb_p = item.head_word # O_dict[OBJ.lemma] = OBJ.ID if SUB == None: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if OBJ == None: for item in sentence.words: if item.dependency == "VOB" and item.head_word.ID == verbId2: OBJ = item O_dict[OBJ.ID] = OBJ.lemma OBJList = [] flag = True while flag == True: len1 = len(S_dict) len2 = len(O_dict) for item in sentence.words: if SUB !=None and item.head_word!=None: SUBList = S_dict.keys() if item.head_word.ID in SUBList and (item.dependency =="ATT" or item.dependency == "COO"): SUBATT = item S_dict[SUBATT.ID] = SUBATT.lemma if OBJ != None and item.head_word != None: OBJList = O_dict.keys() if item.head_word.ID in OBJList and (item.dependency == "ATT" or item.dependency == "COO") : OBJATT = item # if item.dependency!="COO": O_dict[OBJATT.ID] = OBJATT.lemma # else: # O_dict[OBJATT.ID] = OBJATT.lemma+" " if len(S_dict)!=len1 or len(O_dict)!=len2: flag = True else: flag = False O_dict = sorted(O_dict.items(), key=lambda item: item[0]) S_dict = sorted(S_dict.items(), key=lambda item: item[0]) Object = "" Subject = "" for i in O_dict: Object += i[1] for i in S_dict: Subject += i[1] if SUB != None : print(RawSentence) print((Subject, relation_verb.lemma, Object)) S_dict2 = dict() O_dict2 = dict() SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: # if SUB == None or SUB.lemma != entity: SUB_COO = item S_dict2[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ!=None: if item.dependency == "COO" and item.head_word.ID == OBJ.ID: OBJ_COO = item O_dict2[OBJ_COO.ID] = OBJ_COO.lemma flag = True while flag == True: len1 = len(S_dict2) len2 = len(O_dict2) for item in sentence.words: if SUB_COO != None and item.head_word != None: SUBList = S_dict2.keys() if item.head_word.ID in SUBList and item.dependency == "ATT": SUBATT = item S_dict2[SUBATT.ID] = SUBATT.lemma if OBJ_COO != None and item.head_word != None: OBJList = O_dict2.keys() if item.head_word.ID in OBJList and item.dependency == "ATT": OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma if len(S_dict2) != len1 or len(O_dict2) != len2: flag = True else: flag = False O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0]) S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0]) if len(O_dict2) or len(S_dict2): if len(O_dict2) == 0: O_dict2 = O_dict if len(S_dict2) == 0: S_dict2 = S_dict Object = "" Subject = "" for i in O_dict2: Object += i[1] for i in S_dict2: Subject += i[1] if SUB != None: print("11111111111111111111111"+RawSentence) print((Subject, relation_verb.lemma, Object)) def getSPO2(self,sentence_list): all_result = [] raw_sentence = [] for sentence in sentence_list: RawSentence = sentence lemmas = self.segment(sentence) words = self.postag(lemmas) words_netag = self.netag(words) sentence = self.parse(words_netag) # print(sentence.to_string()) for itemWord in sentence.words: #来找到一个动词,这个动词要么是一句话的HED,要么与一句话的HED是COO的依存关系 if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and itemWord.dependency == "COO" and itemWord.head_word.head_word == None): relation_verb = itemWord #将找到的这个动词,作为relation_verb relationString = relation_verb.lemma if itemWord.head_word==None: verbId = 
itemWord.ID #关系动词的ID verbId2 = None elif itemWord.head_word.head_word == None: verbId = itemWord.ID #该关系动词的ID verbId2 = itemWord.head_word.ID #这句话的HED,用来找SUB O_dict = dict() #存储所有的Object S_dict = dict() #存储所有的Subject verb_dict = dict() #存储所有的verb,主要考虑的情况为:习近平主席在北京大学发表演讲 OBJ = None SUB = None for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语 # if SUB == None or SUB.lemma != entity: SUB = item #找到主语 S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中 if (item.dependency == "VOB" and item.head_word.ID == verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verbId) : # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma + "" + item.head_word.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma for eachWord in sentence.words: if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID: relationString = relation_verb.lemma + "" + eachWord.lemma verb_dict[OBJ.ID] = relationString if SUB == None:#如果没找到主语,那么就找与该动词并列的verbId2的主语 for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma if OBJ == None: verb_coo = None for item in sentence.words: if item.dependency == "COO" and item.head_word.ID == verbId: verb_coo = item break flag = True if verb_coo != None and self.get_entity_num_between(relation_verb,verb_coo,sentence) == 0: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID: flag = False if flag!= False: for item in sentence.words: if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID): OBJ = item O_dict[OBJ.ID] = OBJ.lemma SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID == SUB.ID: #获得主语的COO SUB_COO = item S_dict[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ!=None: if item.dependency == "COO" and item.head_word.ID == OBJ.ID: #获得宾语的COO OBJ_COO = item O_dict[OBJ_COO.ID] = OBJ_COO.lemma S_new = [] for sub in S_dict: if sentence.get_word_by_id(sub).postag == 'r': continue S_dict2 = dict() # 存放主语ATT的列表 S_dict2[sub] = S_dict[sub] flag = True while flag == True: len1 = len(S_dict2) for item in sentence.words: if item.head_word != None: SUBList = S_dict2.keys() if item.head_word.ID in SUBList and (item.dependency == "ATT" or item.dependency == "ADV"): SUBATT = item S_dict2[SUBATT.ID] = SUBATT.lemma if len(S_dict2) != len1 : flag = True else: flag = False S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0]) Subject = "" for i in S_dict2: Subject += i[1] 
S_new.append(Subject) O_new = [] V_new = [] for obj in O_dict: if sentence.get_word_by_id(obj).postag == 'r': continue O_dict2 = dict() # 存放宾语ATT的列表 O_dict2[obj] = O_dict[obj] if verb_dict!=None: if obj in verb_dict: relationString2 = verb_dict[obj] else: relationString2 = relation_verb.lemma else: relationString2 = relation_verb.lemma V_new.append(relationString2) flag = True while flag == True: len2 = len(O_dict2) for item in sentence.words: if item.head_word != None: OBJList = O_dict2.keys() if item.head_word.ID in OBJList and (item.dependency == "ADV" or item.dependency == "ATT" or item.dependency == "VOB"): OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma if len(O_dict2) != len2: flag = True else: flag = False #一直循环,直到找不到新的修饰词 O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0]) Object = "" for i in O_dict2: Object += i[1] O_new.append(Object) for sub in S_new: for i in range(0,len(O_new)): obj = O_new[i] relationWord = V_new[i] if obj != "": # print(RawSentence) # print((sub, relationWord, obj)) all_result.append([sub,relationWord,obj]) raw_sentence.append(RawSentence) return all_result,raw_sentence def hasEntity(self,word,allEntity): for entity in allEntity: if entity in word: # print(entity) return True return False def PostProcessSPO(self,rawSentence,allTripes,allEntity): for i in range(0,len(allTripes)): tripe = allTripes[i] sub = tripe[0] obj = tripe[2] # print(sub) # print(obj) if self.hasEntity(sub,allEntity) and self.hasEntity(obj,allEntity): print(rawSentence[i]) print(tripe)
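# getSPO and getSPO2 above repeatedly grow S_dict/O_dict with ATT/ADV modifiers
# until nothing new is added. The same fixed-point loop as a standalone helper
# over the project's SentenceUnit/WordUnit objects (assumed interface:
# sentence.words, word.ID, word.lemma, word.head_word, word.dependency):
def expand_with_modifiers(sentence, seed_word, relations=('ATT', 'ADV')):
    collected = {seed_word.ID: seed_word.lemma}
    changed = True
    while changed:  # iterate until no new modifier attaches to the set
        changed = False
        for word in sentence.words:
            if (word.head_word is not None
                    and word.head_word.ID in collected
                    and word.dependency in relations
                    and word.ID not in collected):
                collected[word.ID] = word.lemma
                changed = True
    # concatenate in surface order (IDs are 1-based word positions)
    return ''.join(lemma for _, lemma in sorted(collected.items()))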
class Extractor(): def __init__(self): self.__triple_list = [] self.__segmentor = Segmentor() self.__postagger = Postagger() self.__recognizer = NamedEntityRecognizer() self.__parser = Parser() self.__words_full_list = [] self.__netags_full_list = [] self.load() @property def triple_list(self): return self.__triple_list def load(self): ltp_dir=conf.get('config','ltp_dir') self.__segmentor.load(ltp_dir+'cws.model') self.__postagger.load(ltp_dir+'pos.model') self.__recognizer.load(ltp_dir+'ner.model') self.__parser.load(ltp_dir+'parser.model') def release(self): self.__segmentor.release() self.__postagger.release() self.__recognizer.release() self.__parser.release() def clear(self): self.__triple_list = [] self.__words_full_list = [] self.__netags_full_list = [] def chunk_str(self, data): self.clear() sents = SentenceSplitter.split(data.strip()) offset = 0 for sent in sents: try: words = self.__segmentor.segment(sent) postags = self.__postagger.postag(words) netags = self.__recognizer.recognize(words, postags) arcs = self.__parser.parse(words, postags) self.__words_full_list.extend(list(words)) self.__netags_full_list.extend(list(netags)) self.chunk_sent(list(words), list(postags), list(arcs), offset) offset += len(list(words)) except Exception as e: print(str(e)) offset += len(list(words)) return [t.to_list() for t in self.__triple_list] def chunk_sent(self, words, postags, arcs, offset): root = [i+1 for i,x in enumerate(arcs) if x.relation == 'HED'] if len(root) > 1: raise Exception('More than 1 HEAD arc is detected!') root = root[0] relations = [i+1 for i, x in enumerate(arcs) if x.head == root and x.relation == 'COO'] relations.insert(0,root) for rel in relations: e1=None left_arc = [i+1 for i, x in enumerate(arcs) if x.head == rel and x.relation == 'SBV'] if len(left_arc) == 0: for i in range(rel-2,-1,-1): x=arcs[i] if x.head == rel: left_arc=[i+1] break if len(left_arc) > 0: left_arc = left_arc[-1] leftmost = find_farthest_att(arcs, left_arc) e1 = Entity(1, [words[i] for i in range(leftmost-1, left_arc)], offset + leftmost-1) right_arc = [i+1 for i, x in enumerate(arcs) if x.head == rel and x.relation == 'VOB'] e2_list = [] if not right_arc: e2 = None e2_list.append(e2) else: right_ext = find_farthest_vob(arcs, right_arc[0]) items = [i+1 for i, x in enumerate(arcs) if x.head == right_ext and x.relation == 'COO'] items = right_arc + items count = 0 for item in items: leftmost = find_farthest_att(arcs, item) e2 = None if count == 0: e2 = Entity(2, [words[i] for i in range(leftmost-1, right_ext)], offset+leftmost-1) else: p1 = range(leftmost-1, right_arc[0]-1) p2 = range(item-1, find_farthest_vob(arcs, item)) e2 = Entity(2, [words[i] for i in itertools.chain(p1, p2)]) e2_list.append(e2) count += 1 for e2 in e2_list: if e1==None: e1=Entity(1,[]) if e2==None: e2=Entity(2,[]) r=Relation(words[rel-1]) t=Triple(e1,e2,r) self.__triple_list.append(t)
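# `find_farthest_att` and `find_farthest_vob` are called in chunk_sent but
# defined elsewhere in the project. A plausible sketch using the same 1-based
# word indices as the class above: walk leftwards along ATT arcs to the start
# of an attribute chain, and rightwards along VOB arcs to the end of an object
# chain.
def find_farthest_att(arcs, index):
    """Leftmost word index of the ATT chain attached (transitively) to `index`."""
    children = [i + 1 for i, arc in enumerate(arcs)
                if arc.head == index and arc.relation == 'ATT' and i + 1 < index]
    if not children:
        return index
    return find_farthest_att(arcs, min(children))


def find_farthest_vob(arcs, index):
    """Rightmost word index reachable from `index` along VOB arcs."""
    children = [i + 1 for i, arc in enumerate(arcs)
                if arc.head == index and arc.relation == 'VOB' and i + 1 > index]
    if not children:
        return index
    return find_farthest_vob(arcs, max(children))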
class myLTP: def __init__(self, LTP_DATA_DIR, pattern_dir='pattern.txt'): self.LTP_DATA_DIR = LTP_DATA_DIR self.ne_pattern = self._read_ne_pattern(pattern_dir) def _read_ne_pattern(self, filename): ne_pattern = [] with open(filename, encoding='utf8') as filein: for line in filein: if line[0] != '#': np = line.split()[:2] ne_pattern.append(np) return ne_pattern def find_ne_by_pattern(self, text): ne_dic = defaultdict(list) for ne_type, pattern in self.ne_pattern: nes = re.findall(pattern, text) text = re.sub(pattern, ne_type, text) ne_dic[ne_type].extend(nes) return text, ne_dic def load(self, index=[1, 1, 1, 1, 1]): """分词 词性标注 命名实体识别 句法分析 语义角色分析""" if index[0]: cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() self.segmentor.load(cws_model_path) if index[1]: pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model') self.postagger = Postagger() self.postagger.load(pos_model_path) if index[2]: ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model') self.recognizer = NamedEntityRecognizer() self.recognizer.load(ner_model_path) if index[3]: par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model') self.parser = Parser() self.parser.load(par_model_path) if index[4]: srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl_win.model') self.labeller = SementicRoleLabeller() self.labeller.load(srl_model_path) def release(self): try: self.segmentor.release() except: pass try: self.postagger.release() except: pass try: self.recognizer.release() except: pass try: self.parser.release() except: pass try: self.labeller.release() except: pass def split_sentence(self, text): """分句""" return SentenceSplitter.split(text) def word_segment(self, sentence): """使用结巴分词""" # words = self.segmentor.segment(sentence) words = jieba.cut(sentence) return list(words) def pos_tag(self, words): """词性标注""" postags = self.postagger.postag(words) return postags def named_entity_recognize(self, words, postags): """命名实体识别""" netags = self.recognizer.recognize(words, postags) return netags def parse(self, words, postags): """句法分析""" arcs = self.parser.parse(words, postags) # (arc.head, arc.relation) return arcs def sementic_role_label(self, words, postags, arcs): """语义角色分析""" roles = self.labeller.label(words, postags, arcs) return roles def _get_ne_for_sentence(self, sentence): """获取实体,包括通过正则表达式定义的一些实体""" sentence, ne_dic = self.find_ne_by_pattern(sentence) words = list(self.word_segment(sentence)) postags = self.postagger.postag(words) ners = self.named_entity_recognize(words, postags) res = {} res['words'] = words res['ners'] = [] for index, ner in enumerate(ners): if ner != 'O': if ner[0] in ('S', 'B'): res['ners'].append([ner[2:], index, index + 1]) else: res['ners'][-1][-1] += 1 for ner_type, v in ne_dic.items(): v = iter(v) if v: for index, word in enumerate(words): if word == ner_type: words[index] = v.__next__() res['ners'].append([ner_type, index, index + 1]) return res def _get_dne_for_sentence(self, sentence): res = [] s = self._get_ne_for_sentence(sentence) ners = s['ners'] words = s['words'] for entity1, entity2 in combinations(ners, 2): res.append((entity1, entity2, words)) return res def get_dne(self, text): """获取实体对,人名(Nh)地名(Ns)机构名(Ni)""" res = [] sentences = self.split_sentence(text) for sentence in sentences: r = self._get_dne_for_sentence(sentence) res.extend(r) return res
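# A hedged sketch of how this myLTP is meant to be driven. The pattern file and
# its contents are illustrative: each non-comment line holds an entity type and
# a regular expression, e.g. "TIME \d{4}年\d{1,2}月\d{1,2}日". The call below
# skips the semantic-role model (index[4] = 0).
my_ltp = myLTP('./ltp_data', pattern_dir='pattern.txt')
my_ltp.load(index=[1, 1, 1, 1, 0])
for entity1, entity2, words in my_ltp.get_dne('昨天,张三在北京会见了李四。'):
    print(entity1, entity2)
my_ltp.release()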
class MyLtp: def __init__(self): self.postagger = Postagger() pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') self.postagger.load(pos_model_path) self.recognizer = NamedEntityRecognizer() ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model') self.recognizer.load(ner_model_path) self.parser = Parser() par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') self.parser.load(par_model_path) def clean(self): self.postagger.release() self.recognizer.release() self.parser.release() # 寻找依存树根节点编号 def get_dependtree_root_index(self, word_list): # 词性标注 postags = self.postagger.postag(word_list) # print(list(postags)) # 命名实体识别 netags = self.recognizer.recognize(word_list, postags) # print(list(netags)) # 句法依存关系 arcs = self.parser.parse(word_list, postags) # print(' '.join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) for i in range(len(arcs)): if arcs[i].head == 0: return i, postags, arcs # 同时返回词性及依存关系列表 return -1, postags, arcs # 寻找依存关系子节点 def get_child_index(self, ind, arcs): ret = [] for i in range(len(arcs)): if arcs[i].head == ind + 1: ret.append(i) return ret # 获取命名实体索引 def get_ne_index(self, postags, chd_list): ret = [] for i in chd_list: if postags[i] in ['n', 'nh', 'ni']: ret.append(i) return ret # 获取中心词之后的第一个符号的索引 def get_first_wp_after_index(self, postags, after): for i in range(after + 1, len(postags)): if postags[i] == 'wp': return i return 0 # 获取句号索引列表 def get_periods_index_after(self, word_list, after): ret = [] for i in range(after + 1, len(word_list)): if word_list[i] in ['。', '?', '!']: ret.append(i) return ret # 获取长句中的分句,为下面的句子向量分析作准备 def get_sent_list(self, word_list, start, periods): ret = [] if len(periods) == 0: ret.append(list(word_list[start + 1:])) for i, p in enumerate(periods): if i == 0: ret.append(list(word_list[start + 1:p + 1])) else: ret.append(list(word_list[periods[i - 1] + 1:p + 1])) return ret # # 获取语料库TF-IDF vectorizer # def get_tfidf_vectorizer(self, corpus_file): # corpus = [] # with open(corpus_file, 'r', encoding='utf-8') as f: # while True: # line = f.readline() # l = line.strip() # if l: # corpus.append(l) # else: # break # # vectorizer = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b') # 不过滤单汉字 # X = vectorizer.fit_transform(corpus) # return vectorizer # # 获取句子向量 # def get_sentence_vec(self, vectorizer, word_list): # trans = vectorizer.transform([' '.join(word_list)]) # return trans.toarray()[0] # words: 要识别的内容词语列表, talk_sims: “说”的近义词 def get_character_speech(self, words, talk_sims): # 获取中心词,词性列表,依存关系表 root_index, postags, arcs = self.get_dependtree_root_index(words) # print('index:', root_index) # print('len words:', len(words)) # print('root:', words[root_index]) # 中心词不在近义词列表,返回空值 if words[root_index] not in talk_sims: return '', '', [] wp_index = self.get_first_wp_after_index(postags, root_index) if wp_index == 0: wp_index = root_index # print('wp_index:', wp_index) sent_split_idx = self.get_periods_index_after(words, wp_index) # print('split:', sent_split_idx) # 分句 sents = self.get_sent_list(words, wp_index, sent_split_idx) # print('sents: ', sents) # for sen in sents: # print('sen: ', sen) # 获取完整命名实体,针对命名实体词被分割的情况 children = self.get_child_index(root_index, arcs) # print(children) ne_list = self.get_ne_index(postags, children) oth = [] for ne in ne_list: nechd = self.get_child_index(ne, arcs) oth.append(self.get_ne_index(postags, nechd)) # print('ne_list: ', ne_list) # print('oth: ', oth) if ne_list: for i, n in enumerate(ne_list): if oth[i]: ne = words[oth[i][0]] + words[n] # print(words[oth[i][0]] + words[n]) else: ne = words[n] 
# print(words[n]) return ne, words[root_index], sents else: return '', '', []
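# A hedged usage sketch for MyLtp.get_character_speech above. It assumes
# LTP_DATA_DIR is defined as in the surrounding snippets; the word list and the
# small list of "say"-type verbs are illustrative.
my_ltp = MyLtp()
sample_words = ['小明', '说', ':', '今天', '天气', '不错', '。']
name, verb, sents = my_ltp.get_character_speech(sample_words, ['说', '表示', '认为'])
print(name, verb, sents)
my_ltp.clean()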
class SentenceParser: def __init__(self): # LTP_DIR = './ltp_data_v3.4.0' print("加载模型路径", LTP_DIR) self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) print("加载完毕") '''句法分析---为句子中的每个词语维护一个依存句法依存儿子节点(词的出度)的字典''' ''' 句法分析中,每个只有一个入度(可能吧),可能有多个出度。 为了可以结构化的展示分析结果,或者说方便提取信息。 对每个词建立一个子节点的字典: 1) 若该词的出度为0,字典为NULL 2) 若该词的出度为n,那字典的元素个数为n ''' def build_parse_child_dict(self, words, postags, arcs): """ 格式化句法分析结果 :param words: 分词结果 :param postags: 词性标注结果 :param arcs: 句法分析结果 :return: child_dict_list, format_parse_list """ ''' arcs是一个列表: 列表元素当前单词,每个元素arc包含arc.head, arc.relation信息, head为指向该词(词的父节点)的下标(从1开始),relation为父节点和该词的句法关系 *** 因为每个词只有 一个入度, 这个arc信息就表示入度信息 LTP句法分析模型输出arcs:表示每个词的入度信息,父节点信息,只有一个 返回: child_dict_list:是表示每个词的出度信息,就是子节点信息 format_parse_list:每个词信息格式化: 与父节点句法关系,该词,该词下标,该词词性,父节点词,父词下标,父词词性 ''' child_dict_list = [] format_parse_list = [] # 对每个词建立子节点信息 for index in range(len(words)): child_dict = dict() ## 遍历寻找该词的子节点 for arc_index in range(len(arcs)): ## 如果有指向该词的子节点,则加入child_dict if arcs[arc_index].head == index + 1: if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) # 对每个词建立指定信息 ## 包含: [依存关系,词,下标,POS,父节点词,父节点下标,父节点POS] # 还可以加上词的NER信息 rely_id = [arc.head for arc in arcs] # 提取每个词依存父节点id(其中id为0的是Root) relation = [arc.relation for arc in arcs] # 提取每个词依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''语义角色标注''' ''' 只对句子中 谓词 进行论元分析,抽取论元以及标注论元和谓词的关系。 ''' def format_labelrole(self, words, postags): """ 格式化语义角色标注结果 :param words: :param postags: :return: """ arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} ''' roles中有多个role,每个role代表句子中的一个谓词 role.index 代表谓词的索引, role.arguments 代表关于该谓词的若干语义角色。(这里的论元可能不是简单的一个词) arg.name 表示语义角色类型, arg.range.start 表示该语义角色起始词位置的索引,(索引从0开始) arg.range.end 表示该语义角色结束词位置的索引。 roles={ 'r1':{ 'args1':{ 'name': 语义角色类型, 'range':{ 'start': 语义角色起始词位置的索引, 'end': 语义角色结束词位置的索引 } }, 'args2':{ 'name': 语义角色类型, 'range': { 'start': 语义角色起始词位置的索引, 'end': 语义角色结束词位置的索引 } }, ... }, 'r2':{ 'args1': { 'name': 语义角色类型, 'range': { 'start': 语义角色起始词位置的索引, 'end': 语义角色结束词位置的索引 } }, 'args2': { 'name': 语义角色类型, 'range': { 'start': 语义角色起始词位置的索引, 'end': 语义角色结束词位置的索引 } }, ... }, ... 
} ''' for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict def close(self): """关闭与释放模型""" self.segmentor.release() self.postagger.release() self.recognizer.release() self.parser.release() self.labeller.release() '''parser主函数''' ''' 将模型的输出进行处理,方便之后数据处理 模型输出:words, postags, ners, arcs, roles 处理后信息: child_dict_list:句法分析,每个词的子节点信息 format_parse_list:句法分析,每个词的信息和父节点信心(父节点唯一) roles_dic: ''' def parser_main(self, sentence): '''words, postags, ners, arcs 为LTP模型输出''' words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) ners = list(self.recognizer.recognize(words, postags)) arcs = self.parser.parse(words, postags) # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) """ arcs中有多个arc arc.head 表示依存弧的父节点词的索引。ROOT节点的索引是0,第一个词开始的索引依次为1、2、3… arc.relation 表示依存弧的关系。 注意:一个词最多只有一个弧指向它(即只有一个入度),但是一个词可以指向多个词(即有多个出度) """ child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, ners, child_dict_list, format_parse_list, roles_dict
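# A hedged usage sketch for SentenceParser.parser_main (assumes LTP_DIR points
# at an ltp_data_v3.4.0 directory that also contains pisrl.model).
sp = SentenceParser()
words, postags, ners, child_dict_list, format_parse_list, roles_dict = \
    sp.parser_main('李克强总理今天来我家了。')
for item in format_parse_list:
    print(item)
print(roles_dict)
sp.close()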
class EventInfoExtract():
    def __init__(self, modulePath, outfile):
        self.MODELDIR = modulePath
        # characters/words stripped from the text before attribute matching
        self.adict = {
            '·': '',
            '的': '',
            '了': '',
            '“': '',
            '”': '',
            '一次': ''
        }
        self.segmentor = None
        self.postagger = None
        self.parser = None
        self.recognizer = None
        self.out_file = outfile

    def multiple_replace(self, text):
        rx = re.compile('|'.join(map(re.escape, self.adict)))

        def one_xlat(match):
            return self.adict[match.group(0)]

        return rx.sub(one_xlat, text)

    def InitModule(self):
        # print("正在加载LTP模型... ...")
        self.segmentor = Segmentor()
        # print(os.path.join(self.MODELDIR, "cws.model"))
        self.segmentor.load("./3.3.0/ltp_data/cws.model")    # segmentation model, single file
        self.postagger = Postagger()
        self.postagger.load("./3.3.0/ltp_data/pos.model")    # POS-tagging model, single file
        self.parser = Parser()
        self.parser.load("./3.3.0/ltp_data/parser.model")    # dependency-parsing model, single file
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load("./3.3.0/ltp_data/ner.model")   # NER model, single file

    def release_module(self):
        '''release the models'''
        self.segmentor.release()
        self.segmentor = None
        self.postagger.release()
        self.postagger = None
        self.parser.release()
        self.parser = None
        self.recognizer.release()
        self.recognizer = None

    def Txtextraction_start(self, txt, out_file):
        """Driver for fact-triple extraction.

        Args:
            txt: the text to extract triples from
        """
        txt = txt.strip()
        out_file = open(self.out_file, 'a')
        self.fact_triple_extract(txt, out_file)
        out_file.flush()
        out_file.close()

    def addresssTime_extract(self, inputtxt):
        # Entity extraction: pull out the people, organizations, places and
        # times related to an attack event. Split into sentences first, then
        # segment each one.
        sentences = inputtxt.split('。')
        DataAndTime = []
        for sentence in sentences:
            if len(sentence) <= 1:
                continue
            words = self.segmentor.segment(sentence)
            postags = self.postagger.postag(words)
            netags = self.recognizer.recognize(words, postags)
            arcs = self.parser.parse(words, postags)
            Dt = {'date': '', 'address': ''}
            if ((("发生" in sentence or "遭" in sentence)
                 and ("爆炸" in sentence or "事件" in sentence or "袭击" in sentence))
                    or ("恐怖" in sentence) or ("袭击" in sentence)):
                Flag = False
                Addressbackups = []
                Address = ''
                for i in range(len(postags) - 1):
                    if Flag:
                        # ns: place name, nd: direction noun, n: common noun
                        if postags[i] in ('ns', 'nd', 'n'):
                            head = arcs[i].head
                            Address = Address + words[i]
                            if postags[head - 1] == "n":
                                Address += words[head - 1]
                                head = arcs[head - 1].head
                            if words[head - 1] in ("在", "发生", "袭击", "遭", "遭遇", "将"):
                                Dt['address'] = Address
                                break
                        else:
                            print("地址,", Address)
                            Addressbackups.append(Address)
                            Address = ''
                            Flag = False
                            continue
                    if postags[i] == 'ns' and not Flag:
                        # only reached for the first place word
                        head = arcs[i].head
                        Address = Address + words[i]
                        if words[head - 1] in ("在", "发生", "遭", "遭遇", "将"):
                            Dt['address'] = Address
                            break
                        Flag = True
            if ("月" in sentence or '日' in sentence) and ("发生" in sentence or "袭击" in sentence):
                Flag = False
                Date = ''
                Datebackup = []
                for i in range(len(postags) - 1):
                    if Flag:
                        if postags[i] == 'nt':
                            head = arcs[i].head
                            Date = Date + words[i]
                            if words[head - 1] == "发生" or words[head - 1] == "袭击":
                                Dt['date'] = Date
                                break
                        else:
                            Datebackup.append(Date)
                            Date = ''
                            Flag = False
                            continue
                    if postags[i] == 'nt' and not Flag:
                        Date = Date + words[i]
                        head = arcs[i].head
                        if words[head - 1] == "发生" or words[head - 1] == "袭击":
                            Dt['date'] = Date
                            break
                        if postags[i + 1] != 'nt':
                            Datebackup.append(Date)
                        Flag = True
                if Dt['date'] == '' and len(Datebackup):
                    Dt['date'] = Datebackup[-1]
            if Dt['date'] != '' or Dt['address'] != '':
                DataAndTime.append(Dt)
        if len(DataAndTime) > 1:
            for i in DataAndTime:
                if i['date'] == "当天":
                    DataAndTime.remove(i)
        if len(DataAndTime) == 0:
            Dt['date'] = ''
            Dt['address'] = ''
            DataAndTime.append(Dt)
        return DataAndTime

    def extraction_start(self, input_txt, out_file_name, begin_line=1, end_line=0):
        """Driver for fact-triple extraction over a block of text.

        Args:
            input_txt: the text to process, one passage per line
            out_file_name: name of the output file
            begin_line: first line to process
            end_line: last line to process (0 means no limit)
        """
        out_file = open(out_file_name, 'a')
        line_index = 1
        sentence_number = 0
        # process the input text line by line
        for text_line in input_txt.splitlines():
            # skip lines before the requested range
            if line_index < begin_line:
                line_index += 1
                continue
            if end_line != 0 and line_index > end_line:
                break
            sentence = text_line.strip()
            # skip empty lines and very long passages (over 1000 characters)
            if sentence == "" or len(sentence) > 1000:
                line_index += 1
                continue
            try:
                sentence_one = sentence.split(" ")  # "。"
                for num in range(len(sentence_one) - 1):
                    self.fact_triple_extract(sentence, out_file)
                    out_file.flush()
            except Exception:
                pass
            sentence_number += 1
            if sentence_number % 50 == 0:
                print("%d done" % sentence_number)
            line_index += 1
        out_file.close()

    def attribute_define0(self, text, keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)  # POS tagging
        if keywords in text:
            for index in range(len(words)):
                if words[index] == keywords:
                    for i in range(index):
                        if postags[index - i - 2][0] == 'n':
                            continue
                        else:
                            print("事件属性:", "".join(words[index - i - 1:index + 1]))
                            break

    def attribute_define1(self, text, keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)  # POS tagging
        if keywords in text:
            for index in range(len(words)):
                if words[index] == keywords:
                    for i in range(index):
                        if postags[index - i - 2][0] == 'n':
                            continue
                        else:
                            if i != 0:
                                print("事件属性:", "".join(words[index - i - 1:index + 1]))
                            break

    def num_define(self, text):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)  # POS tagging
        for index in range(len(words)):
            if postags[index] == 'm':  # return the first numeral word
                return words[index]

    def attribute_define2(self, text, keywords):
        words = self.segmentor.segment(text)
        if keywords in text:
            for index in range(len(words)):
                if words[index] == keywords:
                    for i in range(index):
                        # fixed: the original tested != ('发生' or '是'), which
                        # only ever compared against '发生'
                        if words[index - i - 1] not in ('发生', '是'):  # |(words[index-i-1]!='遭遇')
                            continue
                        else:
                            if i != 0:
                                attribute = "".join(words[index - i:index + 1])
                                # attribute = self.multiple_replace(attribute)
                                print('===========')
                                if attribute in '恐怖袭击事件':
                                    return
                                return attribute
                            else:
                                return

    def organization_define(self, text, keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)  # POS tagging
        if keywords in text:
            for index in range(len(words)):
                if words[index] == keywords:
                    for i in range(index):
                        if postags[index - i - 1][0] == 'n' and (index - i - 1 != 0):
                            continue
                        else:
                            if (words[index - 1] == '组织') and (postags[index - 2][0] != 'n'):
                                continue
                            if i != 0:
                                print("组织:", "".join(words[index - i:index]))
                                return "".join(words[index - i:index])

    def organization_define1(self, text, keywords):
        words = self.segmentor.segment(text)
        postags = self.postagger.postag(words)  # POS tagging
        if keywords in text:
            for index in range(len(words)):
                if words[index] == keywords:
                    for i in range(index):
                        if postags[index - i - 1][0] == 'n' and (index - i - 1 != 0):
                            continue
                        else:
                            if (words[index - 1] == '组织') and (postags[index - 2][0] != 'n'):
                                continue
                            if i != 0:
                                # print("组织:", "".join(words[index - i:index]))
                                return "".join(words[index - i:index])

    def fact_attribute_from_text(self, text):
        """Pick the longest attack-event attribute found in the text."""
        text = text.replace(',', '。')
        sentence_one = text.split("。")
        fact_attribute = []
        for num in range(len(sentence_one) - 1):
            if '袭击' in sentence_one[num]:
                sentence_temp = self.multiple_replace(sentence_one[num])
                if ('发生' in sentence_temp) | ('遭遇' in sentence_temp):
                    temp_atrribut1 = self.attribute_define2(sentence_temp, '事件')
                    if temp_atrribut1 is None:
                        temp_atrribut2 = self.attribute_define2(sentence_temp, '袭击')
                        if temp_atrribut2 is None:
                            return
                        fact_attribute.append(str(temp_atrribut2))
                    else:
                        fact_attribute.append(str(temp_atrribut1))
        if len(fact_attribute) == 0:
            # print('事件属性:unknown!')
            return 'None'
        else:
            # print('事件属性:', max(fact_attribute, key=len))
            return max(fact_attribute, key=len)

    def organization_from_text(self, text):
        """Pick the organization that claims responsibility in the text."""
        sentence_one = text.split("。")
        ogniz = []
        for num in range(len(sentence_one) - 1):
            if '负责' in sentence_one[num]:
                if '宣称' in sentence_one[num]:
                    sentence_temp = sentence_one[num].replace('“', '').replace('”', '')
                    temp_org = self.organization_define(sentence_temp, '宣称')
                    if temp_org is not None:
                        ogniz.append(temp_org)
                if len(ogniz) == 0:
                    if '宣称' in sentence_one[num]:
                        sentence_temp = sentence_one[num].replace('“', '').replace('”', '')
                        temp_org = self.organization_define1(sentence_temp, '宣称')
                        if temp_org is not None:
                            ogniz.append(temp_org)
        if len(ogniz) == 0:
            # print('组织:unknown!')
            return 'unknown'
        else:
            # print('组织:', max(ogniz, key=len))
            return max(ogniz, key=len)

    def death_num_from_text(self, text):
        """Extract death, injury and total-casualty counts from the text."""
        text = text.replace(',', '。')
        text = text.replace('、', '。')
        sentence_one = text.split("。")
        death_num = None
        hurt_num = None
        total_num = None
        for num in range(len(sentence_one) - 1):
            if ('死亡' in sentence_one[num]) | ('丧生' in sentence_one[num]):
                if death_num is None:
                    death_num = self.num_define(sentence_one[num])
            if '受伤' in sentence_one[num]:
                if hurt_num is None:
                    hurt_num = self.num_define(sentence_one[num])
            if '伤亡' in sentence_one[num]:
                if total_num is None:
                    total_num = self.num_define(sentence_one[num])
        return death_num, hurt_num, total_num

    def fact_triple_extract(self, sentence, out_file):
        """Extract fact triples from a single sentence.

        Args:
            sentence: the sentence to process
        """
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        netags = self.recognizer.recognize(words, postags)
        arcs = self.parser.parse(words, postags)
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        # collect place (Ns) and person (Nh) entities
        Entity_Address = []
        Entity_Name = []
        for index in range(len(postags)):
            e1 = ''
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                if 'Ns' in netags[index]:
                    ni = index
                    if netags[ni][0] == 'B':
                        while netags[ni][0] != 'E':
                            ni += 1
                        e1 = ''.join(words[index:ni + 1])
                    else:
                        e1 = words[ni]
                    Entity_Address.append(e1)
                if "Nh" in netags[index]:
                    ni = index
                    if netags[ni][0] == 'B':
                        while netags[ni][0] != 'E':
                            ni += 1
                        e1 = ''.join(words[index:ni + 1])
                    else:
                        e1 = words[ni]
                    Entity_Name.append(e1)
        Entity_Address = list(set(Entity_Address))
        Entity_Name = list(set(Entity_Name))
        for i in Entity_Name:
            print(i)
        AddressTp = []
        LocateAddress = []
        for index in range(len(postags)):
            # extract fact triples centred on a predicate
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # subject-verb-object
                Flag = False
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    out_file.write("主语谓语宾语关系\t(%s, %s, %s)\n" % (e1, r, e2))
                    for address in Entity_Address:
                        if address in e1 and (("袭击" in e1 or "袭击" in e2)
                                              or ("事件" in e2 or "事件" in e1)):
                            for name in Entity_Name:
                                if name in e1:
                                    Flag = False  # fixed: the original wrote `Flag == False`, a no-op comparison
                                    break
                                else:
                                    Flag = True
                    if Flag:
                        for i in Entity_Address:
                            if i in e1 or i in e2:
                                AddressTp.append(i)
                    out_file.flush()
                # postposed attributive, verb-object relation
                if arcs[index].relation == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            out_file.write("定语后置动宾关系\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
                # subject-verb-complement with an embedded prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict_list[cmp_index]['POB'][0])
                        out_file.write("介宾关系主谓动补\t(%s, %s, %s)\n" % (e1, r, e2))
                        out_file.flush()
            # try to extract triples involving named entities
            if netags[index][0] == 'S' or netags[index][0] == 'B':
                ni = index
                if netags[ni][0] == 'B':
                    while netags[ni][0] != 'E':
                        ni += 1
                    e1 = ''.join(words[index:ni + 1])
                else:
                    e1 = words[ni]
                if arcs[ni].relation == 'ATT' and postags[arcs[ni].head - 1] == 'n' \
                        and netags[arcs[ni].head - 1] == 'O':
                    r = self.complete_e(words, postags, child_dict_list, arcs[ni].head - 1)
                    if e1 in r:
                        r = r[(r.index(e1) + len(e1)):]
                    if arcs[arcs[ni].head - 1].relation == 'ATT' \
                            and netags[arcs[arcs[ni].head - 1].head - 1] != 'O':
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             arcs[arcs[ni].head - 1].head - 1)
                        mi = arcs[arcs[ni].head - 1].head - 1
                        li = mi
                        if netags[mi][0] == 'B':
                            while netags[mi][0] != 'E':
                                mi += 1
                            e = ''.join(words[li + 1:mi + 1])
                            e2 += e
                        if r in e2:
                            e2 = e2[(e2.index(r) + len(r)):]
                        if r + e2 in sentence:
                            out_file.write("人名//地名//机构\t(%s, %s, %s)\n" % (e1, r, e2))
                            out_file.flush()
        # deduplicate and keep only the longest place strings
        AddressTp = list(set(AddressTp))
        LocateAddress = AddressTp
        Tp = LocateAddress
        for i in LocateAddress:
            for k in AddressTp:
                if i != k and (i in k):
                    Tp.remove(i)
        address = ''
        for i in Tp:
            address += i
        print("地点:", address)

    def build_parse_child_dict(self, words, postags, arcs):
        """Maintain, for every word in the sentence, a dict of its dependency children.

        Args:
            words: segmented words
            postags: POS tags
            arcs: dependency arcs
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation in child_dict:
                        child_dict[arcs[arc_index].relation].append(arc_index)
                    else:
                        child_dict[arcs[arc_index].relation] = [arc_index]
            # if 'SBV' in child_dict:
            #     print(words[index], child_dict['SBV'])
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """Complete a partially recognized entity with its modifiers."""
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list, child_dict['ATT'][i])
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0]) + prefix
        return prefix + words[word_index] + postfix

    def fact_attribute(self, in_file_name, out_file_name, begin_line, end_line):
        """Driver for event-attribute extraction over a file.

        Args:
            in_file_name: name of the input file
            out_file_name: name of the output file
            begin_line: first line to read
            end_line: last line to read (0 means no limit)
        """
        in_file = open(in_file_name, 'r')
        out_file = open(out_file_name, 'a')
        line_index = 1
        sentence_number = 0
        text_line = in_file.readline()
        while text_line:
            # skip lines before the requested range
            if line_index < begin_line:
                text_line = in_file.readline()
                line_index += 1
                continue
            if end_line != 0 and line_index > end_line:
                break
            sentence = text_line.strip()
            # skip empty lines and long passages (over 1000 characters)
            if sentence == "" or len(sentence) > 1000:
                text_line = in_file.readline()
                line_index += 1
                continue
            sentence_one = sentence.split(" ")  # "。"
            for num in range(len(sentence_one) - 1):
                self.attribute_define0(sentence_one[num], '事件')
                self.attribute_define2(sentence_one[num], '袭击')
            sentence_number += 1
            if sentence_number % 50 == 0:
                print("%d done" % sentence_number)
            text_line = in_file.readline()
            line_index += 1
        in_file.close()
        out_file.close()
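# A hedged usage sketch for EventInfoExtract. The output file name and sample
# text are illustrative; the class itself loads its models from the hard-coded
# ./3.3.0/ltp_data/ directory.
extractor = EventInfoExtract('./3.3.0/ltp_data/', 'triples.txt')
extractor.InitModule()
sample = '3月1日,某地发生恐怖袭击事件,造成10人死亡,20人受伤。'
print(extractor.addresssTime_extract(sample))
print(extractor.death_num_from_text(sample))
extractor.release_module()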
class semantic_annotation: LTP_DATA_DIR = 'D:/LTP/ltp_data' ATT_ADV = ['ATT', 'ADV'] N = ['a', 'd', 'b'] dp_arcs = ['VOB', 'SBV', 'FOB'] def __init__(self): cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() self.segmentor.load_with_lexicon(cws_model_path, config.dic_path) pos_model_path = os.path.join(config.LTP_DATA_DIR, 'pos.model') self.postagger = Postagger() self.postagger.load(pos_model_path) par_model_path = os.path.join(config.LTP_DATA_DIR, 'parser.model') self.parser = Parser() self.parser.load(par_model_path) def Model_release(self): try: self.segmentor.release() self.postagger.release() self.parser.release() except Exception as e: s = "释放分词,词性标注,句法分析模型运行发生异常Model_releasen" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def set_sentence(self, sentence): try: words = self.segmentor.segment(sentence) words_list = list(words) Logger.log_DEBUG.debug('分词结果:' + str(words_list)) postags = self.postagger.postag(words) postags_list = list(postags) Logger.log_DEBUG.debug('词性标注结果:' + str(postags_list)) arcs = self.parser.parse(words, postags) arcs_list = list(arcs) s = '句法分析结果:' for a in arcs_list: s = s + str(a.head) + ":" + a.relation + ' ' Logger.log_DEBUG.debug(s) sen = sentence_class(words_list, postags_list, arcs_list) return sen except Exception as e: s = "设置句子属性发生异常set_sentence" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def semantic_annotation(self, sentence): try: sen = self.set_sentence(sentence) dic_r = {} ap_ID = self.find_ap(sen) if ap_ID >= 0: dic_r['ap'] = sen.class_word_list[ap_ID].name Logger.log_DEBUG.debug('动作属性:' + sen.class_word_list[ap_ID].name) else: dic_r['ap'] = '' Logger.log_DEBUG.debug('没有找到动作属性:') indiv_ID = self.find_indiv(sen) if indiv_ID >= 0: dic_r['indiv'] = sen.class_word_list[indiv_ID].name Logger.log_DEBUG.debug('个体:' + sen.class_word_list[indiv_ID].name) else: dic_r['indiv'] = '' Logger.log_DEBUG.debug('没有找到个体:') adv = self.find_AdvAdj(sen, ap_ID, 'adv') dic_r['adv'] = adv Logger.log_DEBUG.debug('状语:' + adv) dp_ID = self.find_dp(sen) if dp_ID >= 0: dic_r['dp'] = sen.class_word_list[dp_ID].name Logger.log_DEBUG.debug('数据属性:' + sen.class_word_list[dp_ID].name) else: dic_r['dp'] = '' Logger.log_DEBUG.debug('没有找到数据属性:') adj = self.find_AdvAdj(sen, dp_ID, 'adj') dic_r['adj'] = adj Logger.log_DEBUG.debug('定语:' + adj) other = '' for w in sen.class_word_list: if w.Semantic_markup == 'other': other = other + ',' + w.name dic_r['other'] = other Logger.log_DEBUG.debug('其他词:' + other) return dic_r except Exception as e: s = "语义标注主函数运行发生异常semantic_annotation" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def find_dp(self, sen): try: self.word_merge_A(sen) self.word_merge_COO(sen) self.word_merge_VOB(sen) cwl = sen.class_word_list dp = [] for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \ cwl[i].arcs_relation in semantic_annotation.dp_arcs and cwl[i].arcs_head == sen.ap_ID + 1: dp.append(i) if len(dp) <= 0: for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \ cwl[i].pos == 'n': dp.append(i) if len(dp) > 0: sen.dp_ID = dp[0] cwl[dp[0]].Semantic_markup = 'dp' return sen.dp_ID except Exception as e: s = "确定数属发生异常find_dp" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def word_merge_VOB(self, sen): try: cwl 
= sen.class_word_list VOB_num = 0 for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and \ cwl[i].arcs_relation == 'VOB' and cwl[i].arcs_head == sen.hed_ID + 1: VOB_num = VOB_num + 1 for n in range(VOB_num): for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and \ cwl[i].arcs_relation == 'VOB' and cwl[i].arcs_head == sen.hed_ID + 1: ioc = i + 1 for j in range(i, len(cwl)): if cwl[j].Semantic_markup == 'other' and \ cwl[j].arcs_relation == 'VOB' and cwl[j].arcs_head == sen.hed_ID + 1: if self.is_merge(sen, ioc, j + 1): Logger.log_DEBUG.debug('合并: ' + cwl[j].name + ' 和 ' + cwl[ioc - 1].name) cwl[ioc - 1].name = cwl[ioc - 1].name + cwl[j].name cwl[ioc - 1].arcs_relation = 'VOB' cwl[j].Semantic_markup = 'merge' except Exception as e: s = "合并核心词宾语发生异常word_merge_VOB" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def word_merge_COO(self, sen): try: cwl = sen.class_word_list COO_num = 0 for i in range(len(cwl)): if (cwl[i].Semantic_markup) == 'other' and (cwl[i].pos) not in semantic_annotation.N and \ cwl[i].arcs_relation == 'COO' and cwl[i].arcs_head == sen.ap_ID + 1: COO_num = COO_num + 1 for n in range(COO_num): for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \ cwl[i].arcs_relation == 'COO' and cwl[i].arcs_head == sen.ap_ID + 1: ioc = i + 1 for j in range(len(cwl)): if cwl[j].Semantic_markup == 'other' and cwl[j].arcs_head == ioc and \ cwl[j].arcs_relation == 'VOB': if self.is_merge(sen, ioc, j + 1): Logger.log_DEBUG.debug('合并: ' + cwl[j].name + ' 和 ' + cwl[ioc - 1].name) if ioc > j + 1: cwl[ioc - 1].name = cwl[j].name + cwl[ioc - 1].name else: cwl[ioc - 1].name = cwl[ioc - 1].name + cwl[j].name cwl[ioc - 1].arcs_relation = 'VOB' cwl[j].Semantic_markup = 'merge' except Exception as e: s = "合并与动属并列的动词与其宾语发生异常word_merge_COO" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def word_merge_A(self, sen): try: cwl = sen.class_word_list A_num = 0 for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \ cwl[i].arcs_relation in semantic_annotation.ATT_ADV: A_num = A_num + 1 for n in range(A_num): for i in range(len(cwl)): if cwl[i].Semantic_markup == 'other' and cwl[i].pos not in semantic_annotation.N and \ cwl[i].arcs_relation in semantic_annotation.ATT_ADV: Ioc = cwl[i].arcs_head if cwl[Ioc - 1].Semantic_markup == 'other' and self.is_merge(sen, i + 1, Ioc): s = '合并: ' + cwl[i].name + ' 和 ' + cwl[Ioc - 1].name Logger.log_DEBUG.debug(s) if i + 1 > Ioc: cwl[Ioc - 1].name = cwl[Ioc - 1].name + cwl[i].name else: cwl[Ioc - 1].name = cwl[i].name + cwl[Ioc - 1].name cwl[i].Semantic_markup = 'merge' break except Exception as e: s = "合并定中和状中关系发生异常word_merge_A" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def is_merge(self, sen, A, Ioc): try: markup_List = ['merge'] if abs(A - Ioc) == 1: return True if A > Ioc: for i in range(Ioc - 1, A): if sen.class_word_list[i].Semantic_markup in markup_List: return True else: for i in range(A - 1, Ioc): if sen.class_word_list[i].Semantic_markup in markup_List: return True return False except Exception as e: s = "判断两个词是否可以合并发生异常is_merge" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def find_AdvAdj(self, sen, ID, s_markup): try: advAdj = '' if ID < 0: return advAdj cwl = sen.class_word_list for i in range(len(cwl)): if cwl[i].arcs_head 
== ID + 1 and cwl[i].pos in semantic_annotation.N and \ cwl[i].Semantic_markup == 'other': advAdj = advAdj + cwl[i].name + "," cwl[i].Semantic_markup = s_markup if s_markup == 'adj': if cwl[i].pos in semantic_annotation.N and \ cwl[i].Semantic_markup == 'other' and cwl[i].name not in advAdj: advAdj = advAdj + cwl[i].name + "," cwl[i].Semantic_markup = s_markup return advAdj except Exception as e: s = "确定状语和定语发生异常find_AdvAdj" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def find_indiv(self, sen): try: if len(sen.class_word_list) <= 3: for i in range(len(sen.class_word_list)): if sen.class_word_list[i].Semantic_markup == 'other': sen.class_word_list[i].Semantic_markup = 'indiv' sen.indiv_ID = i return sen.indiv_ID flag_1 = -1 flag_2 = -1 for i in range(len(sen.class_word_list)): if sen.class_word_list[i].Semantic_markup == 'other': if flag_1 < 0: flag_1 = i continue elif flag_2 < 0: flag_2 = i break if flag_2 - flag_1 != 1: sen.class_word_list[flag_1].Semantic_markup = 'indiv' sen.indiv_ID = flag_1 return sen.indiv_ID wc0 = sen.class_word_list[flag_1] wc1 = sen.class_word_list[flag_2] if wc0.arcs_head == flag_2 + 1 and wc0.arcs_relation in semantic_annotation.ATT_ADV: if wc1.Semantic_markup == 'other' and wc1.pos == 'n': wc1.name = wc0.name + wc1.name wc1.Semantic_markup = 'indiv' wc0.Semantic_markup = 'indiv_ATT' sen.indiv_ID = flag_2 else: wc0.Semantic_markup = 'indiv' sen.indiv_ID = flag_1 else: wc0.Semantic_markup = 'indiv' sen.indiv_ID = flag_1 return sen.indiv_ID except Exception as e: s = "确定个体发生异常find_indiv" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def find_ap(self, sen): try: cwl = sen.class_word_list for i in range(len(cwl)): if cwl[i].pos == 'v': sen.V_ID = i break if sen.V_ID < 0: Logger.log_DEBUG.debug('该句子分词经过词性标注后没有动词') sen.ap_ID = -1 return sen.ap_ID for i in range(len(cwl)): if cwl[i].arcs_relation == 'HED': sen.hed_ID = i break if sen.hed_ID < 0: Logger.log_DEBUG.debug('经过句法分析没有核心词') sen.ap_ID = sen.V_ID return sen.ap_ID if sen.V_ID == sen.hed_ID: sen.ap_ID = sen.hed_ID cwl[sen.ap_ID].Semantic_markup = 'ap' return sen.ap_ID if cwl[sen.hed_ID].pos != 'v': new_hed = self.find_late(sen.hed_ID, sen.postags_list) if new_hed == sen.V_ID: sen.ap_ID = sen.V_ID else: if cwl[new_hed].arcs_head == sen.V_ID or cwl[sen.V_ID].arcs_head == new_hed: sen.ap_ID = sen.V_ID else: sen.ap_ID = new_hed else: if cwl[sen.V_ID].arcs_head == sen.hed_ID + 1: sen.ap_ID = sen.V_ID else: sen.ap_ID = sen.hed_ID cwl[sen.ap_ID].Semantic_markup = 'ap' cwl[sen.hed_ID].arcs_head = cwl[sen.ap_ID].arcs_head cwl[sen.hed_ID].arcs_relation = cwl[sen.ap_ID].arcs_relation return sen.ap_ID except Exception as e: s = "确定动作属性发生异常find_ap" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s) def find_late(self, flag_hed, postags_list): try: flag_L = -1 flag_R = -1 for i in range(0, flag_hed): if postags_list[i] == 'v': flag_L = i for i in range(flag_hed, len(postags_list)): if postags_list[i] == 'v': flag_R = i if flag_L > 0 and flag_R > 0: if abs(flag_L - flag_hed) <= abs(flag_R - flag_hed): return flag_L else: return flag_R else: if flag_L > 0: return flag_L else: return flag_R except Exception as e: s = "找最近动词发生异常find_late" + str(e) Logger.log_ERROR.error(s) Logger.log_ERROR.exception(sys.exc_info()) raise TypeError(s)
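# A hedged usage sketch for the semantic_annotation class above (assumes the
# config module with LTP_DATA_DIR/dic_path, the Logger wrapper and the
# sentence_class it relies on are all importable; the sentence is illustrative).
sa = semantic_annotation()
result = sa.semantic_annotation('电机的额定功率是10千瓦')
print(result)  # {'ap': ..., 'indiv': ..., 'adv': ..., 'dp': ..., 'adj': ..., 'other': ...}
sa.Model_release()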
class Semantic_Parser(object): def __init__(self): self.cws_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/cws.model' self.pos_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/pos.model' self.parser_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/parser.model' self.ner_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/ner.model' self.srl_model_path = '/home/irlab0/LTP/ltp-data/ltp_data/srl/' def load(self): self.segmentor = Segmentor() self.segmentor.load(self.cws_model_path) self.postagger = Postagger() self.postagger.load(self.pos_model_path) self.parser = Parser() self.parser.load(self.parser_model_path) self.recognizer = NamedEntityRecognizer() self.recognizer.load(self.ner_model_path) self.labeller = SementicRoleLabeller() self.labeller.load(self.srl_model_path) def release(self): self.segmentor.release() self.postagger.release() self.parser.release() self.recognizer.release() self.labeller.release() def get_cws(self, sentence): try: cws = self.segmentor.segment(sentence) except: cws = self.segmentor.segment(sentence.decode('utf8')) print(" ".join(cws)) return cws def get_pos(self, cws): postags = self.postagger.postag(cws) print(" ".join(postags)) return postags def get_arcs(self, cws, postags): arcs = self.parser.parse(cws, postags) label = " ".join("%s:%d:%s" % (word, arc.head, arc.relation) for word, arc in zip(cws, arcs)) print(label) return arcs def get_role(self, cws, postags, arcs): netags = self.recognizer.recognize(cws, postags) roles = self.labeller.label(cws, postags, netags, arcs) for role in roles: print( role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ])) def get_query(self, cws, arcs): ''' 对问句做句法分析后,提取其中的主干部分 先取HED,然后分别取SBV和VOB :param cws: :param arcs: :return: ''' words = [word for word in cws] head = [arc.head for arc in arcs] relation = [arc.relation for arc in arcs] print(words) print(head) print(relation) hed_index = index(head, 0)[0] + 1 import_index = index(head, hed_index) print(import_index) sbv = [words[i] for i in import_index if relation[i] == 'SBV'] vob = [words[i] for i in import_index if relation[i] == 'VOB'] print(''.join(sbv)) print(''.join(vob)) return ''.join(sbv), ''.join(vob)
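# A hedged usage sketch for Semantic_Parser.get_query, which keeps only the
# SBV and VOB arcs hanging off the HED word of a question. The helper
# index(lst, value) it calls is assumed to return every position of `value`
# in `lst`; the model paths are the hard-coded ones above.
parser = Semantic_Parser()
parser.load()
cws = parser.get_cws('姚明的身高是多少')
postags = parser.get_pos(cws)
arcs = parser.get_arcs(cws, postags)
subject, obj = parser.get_query(cws, arcs)
parser.release()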
def extract_opinion(document): saywords = load_saywords() LTP_DATA_DIR = r'../ltp_data/' # ltp模型目录的路径 cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` segmentor = Segmentor() # 初始化实例 segmentor.load(cws_model_path) # 加载模型 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` postagger = Postagger() # 初始化实例 postagger.load(pos_model_path) # 加载模型 ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load(ner_model_path) # 加载模型 par_model_path = os.path.join( LTP_DATA_DIR, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` parser = Parser() # 初始化实例 parser.load(par_model_path) # 加载模型 # cut to sentences sentences = SentenceSplitter.split(document) table = [] for sentence in sentences: # cut words words = segmentor.segment(sentence) # 分词 # postages postags = postagger.postag(words) # 词性标注 # ner netags = recognizer.recognize(words, postags) # 命名实体识别 # dependency parsing arcs = parser.parse(words, postags) # 句法分析 child_dict_list = build_parse_child_dict(words, postags, arcs) index = 0 for arc in arcs: if arc.relation == 'HED': break index += 1 # 谓语是说一类的词 predicate = words[index] child_dict = child_dict_list[index] if ('SBV' in child_dict) and ('VOB' in child_dict): e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0]) r = words[index] e2 = ''.join(words[index + 1:]) if r in saywords: table.append((e1, r, e2)) segmentor.release() # 释放模型 postagger.release() # 释放模型 recognizer.release() # 释放模型 parser.release() # 释放模型 return table
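# A hedged usage sketch for extract_opinion (assumes load_saywords,
# build_parse_child_dict and complete_e from elsewhere in this collection are
# importable, and that the models sit under ../ltp_data/).
table = extract_opinion('新华社报道,李教授表示,人工智能将改变教育行业。')
for speaker, say_verb, speech in table:
    print(speaker, say_verb, speech)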
def parser_ltp_arc(word_list, tag_list):
    parser = Parser()  # initialize the parser
    parser.load(par_model_path)  # par_model_path: path to parser.model, defined elsewhere
    arcs = parser.parse(word_list, tag_list)  # dependency-parse one comment
    parser.release()
    return arcs
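# parser_ltp_arc reloads parser.model on every call, which dominates the
# runtime when many comments are parsed. A hedged alternative sketch that
# loads the model once at import time (par_model_path as above):
from pyltp import Parser

_PARSER = Parser()
_PARSER.load(par_model_path)

def parser_ltp_arc_cached(word_list, tag_list):
    # reuse the already-loaded parser instead of loading per call
    return _PARSER.parse(word_list, tag_list)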
class Extraction():
    def __init__(self, cws_model_path: str, pos_model_path: str,
                 ner_model_path: str, parser_model_path: str,
                 spoken_words, word2vec):
        self.spoken_words = spoken_words
        self.truncate_index = 8
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.ner = NamedEntityRecognizer()
        self.parser = Parser()
        self.segmentor.load(str(cws_model_path))
        self.postagger.load(str(pos_model_path))
        self.ner.load(str(ner_model_path))
        self.parser.load(str(parser_model_path))
        self.word2vec = Word2Vec.load(word2vec)

    def release(self):
        self.segmentor.release()
        self.postagger.release()
        self.ner.release()
        self.parser.release()

    def get_next_sentence(self, news, index):
        stop1 = news[index + 1:].find("。")
        stop2 = news[index + 1:].find("!")
        stop3 = news[index + 1:].find("?")
        stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
        if not stop_list:
            # no sentence-ending punctuation left; fixed: the original tested
            # `stop_list is None`, which a list never is
            return None, len(news) + 1
        stop = min(stop_list)
        return news[index:index + stop + 2], index + stop + 2

    def cut(self, string):
        return " ".join(jieba.cut(string))

    def sentence_distance(self, sentence1, sentence2):
        # fixed: split the segmented strings back into word lists; the
        # original iterated over them character by character
        word_list1 = self.cut(sentence1).split(" ")
        word_list2 = self.cut(sentence2).split(" ")
        vec_1 = 0
        vec_2 = 0
        # get representation of sentence 1
        for i in range(len(word_list1)):
            if word_list1[i] in self.word2vec.wv.vocab:
                vec_1 += self.word2vec.wv[word_list1[i]]
        # get representation of sentence 2
        for i in range(len(word_list2)):
            if word_list2[i] in self.word2vec.wv.vocab:
                vec_2 += self.word2vec.wv[word_list2[i]]
        norm = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
        if norm == 0:
            # no word of either sentence is in the vocabulary
            return 0.0
        return np.dot(vec_1, vec_2) / norm

    def get_sentence(self, news, word_list, idx, postag_list, sub_id):
        # Recover the quoted speech and the object side of the SBV pair.
        # idx is the position (in words) of the "say" verb in the news text.
        index = len("".join(word_list[:idx + 1]))
        sub_index = len("".join(word_list[:sub_id]))
        if news[index] == "。" or news[index] == "!" or news[index] == "?":
            # the verb ends its sentence, so the speech precedes it
            stop1 = news[:index].rfind("。")
            stop2 = news[:index].rfind("!")
            stop3 = news[:index].rfind("?")
            stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
            if len(stop_list) == 0:
                stop = 0
            else:
                stop = max(stop_list) + 1
            begin = float("inf")
            end = float("inf")
            if "“" in news[:index] and "”" in news[:index]:
                begin = news[:index].rfind("“")
                end = news[:index].rfind("”")
            # case 1: a quotation closes just before the subject, so the
            # quoted span itself is the speech
            if sub_index - end < self.truncate_index:
                result = news[begin + 1:end]
            else:
                result = news[stop:sub_index]
        else:
            stop1 = news[index:].find("。")
            stop2 = news[index:].find("!")
            stop3 = news[index:].find("?")
            # check whether any sentence follows at all
            stop_list = [stop for stop in [stop1, stop2, stop3] if stop != -1]
            if len(stop_list) == 0:
                return False
            # take the first sentence after the verb
            stop = min(stop_list)
            sentence = news[index:stop + index + 1]
            if postag_list[idx + 1] == 'wp':
                sentence = sentence[1:]
            if postag_list[idx + 2] == 'wp':
                sentence = sentence[1:]
            result = sentence
            sim = 1
            next_id = stop + index + 2
            # Check whether the following sentences are also part of the
            # speech by sentence similarity: if similarity with the previous
            # sentence exceeds the threshold, the next sentence is speech too,
            # so append it and keep checking.
            while sim > 0.85 and next_id <= len(news):
                next_sentence_id = next_id
                if next_sentence_id <= len(news):
                    next_sentence, next_id = self.get_next_sentence(news, next_sentence_id)
                    if next_sentence is None:
                        break
                    sim = self.sentence_distance(sentence, next_sentence)
                    if sim > 0.85:
                        result += next_sentence
                        sentence = next_sentence
        return result

    def get_sub_and_view(self, idxs, news, word_list, postag_list):
        sub = []
        speech = []
        for sub_id, spoken_id in idxs:
            sub.append(word_list[sub_id])
            speech.append(self.get_sentence(news, word_list, spoken_id,
                                            postag_list, sub_id))
        return sub, speech

    def find_spoken_word_id_and_sub(self, spoken_words, word_list, ner_list, parser_list):
        # Find the subject/verb positions of every SBV arc whose subject is a
        # named entity and whose verb means "to say".
        id_list = []
        for sub_id, parse_relation in enumerate(parser_list):
            index, relation = parse_relation
            if relation == "SBV" and (ner_list[sub_id] == "S-Nh"
                                      or ner_list[sub_id] == "S-Ni"
                                      or ner_list[sub_id] == "S-Ns"):
                spoken_word = word_list[index - 1]
                if spoken_word in spoken_words:
                    word_id = index - 1
                    id_list.append((sub_id, word_id))
        return id_list

    def newsExtraction(self, news):
        word_list = list(self.segmentor.segment(news))
        postag_list = list(self.postagger.postag(word_list))
        ner_list = list(self.ner.recognize(word_list, postag_list))
        arcs = self.parser.parse(word_list, postag_list)
        parser_list = [(arc.head, arc.relation) for arc in arcs]
        idx = self.find_spoken_word_id_and_sub(self.spoken_words, word_list,
                                               ner_list, parser_list)
        sub, speech = self.get_sub_and_view(idx, news, word_list, postag_list)
        # for i in range(len(sub)):
        #     print(sub[i], speech[i])
        return sub, speech
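# A hedged usage sketch for the Extraction class above. All model paths and
# the word2vec file name are placeholders; spoken_words is a small sample of
# "say"-type verbs.
ext = Extraction('ltp_data/cws.model', 'ltp_data/pos.model',
                 'ltp_data/ner.model', 'ltp_data/parser.model',
                 spoken_words=['说', '表示', '认为'], word2vec='news_w2v.model')
subs, speeches = ext.newsExtraction('新华社说:今天天气很好。明天也不错。')
for s, sp in zip(subs, speeches):
    print(s, sp)
ext.release()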
class Senten_Parse: def __init__(self, flag=True): self.flag = flag # 是否使用自定义词典,True:使用,False:不使用,默认为True self.segmentor = Segmentor() # 分词模型初始化 self.postagger = Postagger() # 词性标注模型初始化 self.parser = Parser() # 句法分析模型初始化 if self.flag: if os.path.exists(config_TM.userdict_path): # 加载分词模型,使用自定义词典 self.segmentor.load_with_lexicon(config_TM.cws_model_path, config_TM.userdict_path) Logger.log_DEBUG.info('分词模型加载成功,使用自定义词典') else: Logger.log_ERROR.error('没找到自定义词典文件,请检查路径是否正确') else: # 加载分词模型,不使用自定义词典 self.segmentor.load(config_TM.cws_model_path) Logger.log_DEBUG.info('分词模型加载成功,不使用自定义词典') # 加载词性标注模型 self.postagger.load(config_TM.pos_model_path) Logger.log_DEBUG.info('词性标注模型加载成功') # 加载句法分析模型 self.parser.load(config_TM.par_model_path) Logger.log_DEBUG.info('句法分析模型加载成功') def sentence_parse(self, sentence): """ 语句分析 :param sentence: 待处理语句 :return: 返回分词结果(list)、词性标注结果(list)、句法分析结果(list-tuple) """ segmentor = self.segmentor postagger = self.postagger parser = self.parser # 分词结果列表 words_list = list(segmentor.segment(sentence)) # 词性标注 postags = postagger.postag(words_list) # 词性标注结果列表 pos_list = [pos for word, pos in zip(words_list, postags)] # 句法分析 arcs = parser.parse(words_list, postags) # 句法分析结果列表 arcs_list = [] temp_arcs_list = [(arc.head, arc.relation) for arc in arcs] arcslist_dic = dict((i, c) for i, c in enumerate(temp_arcs_list)) words_dic = dict((i, c) for i, c in enumerate(words_list)) for key in arcslist_dic: arcslist_dic_key = arcslist_dic[key] if arcslist_dic_key[1] == 'HED': temp_list = [words_dic[key], arcslist_dic_key[1], words_dic[key]] arcs_list.append(temp_list) else: temp_list = [words_dic[key], arcslist_dic_key[1], words_dic[arcslist_dic_key[0] - 1]] arcs_list.append(temp_list) Logger.log_DEBUG.info('语句分析完成!') return words_list, pos_list, arcs_list def __del__(self): """ 释放模型 :return: """ self.segmentor.release() # 分词模型释放 self.postagger.release() # 词性标注模型释放 self.parser.release() # 句法分析模型释放 print('-------') print('模型释放完成')
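# A hedged usage sketch for Senten_Parse (assumes config_TM provides the
# model and user-dictionary paths and the Logger wrapper is importable).
sp = Senten_Parse(flag=False)  # skip the user dictionary
words, pos, arcs = sp.sentence_parse('我爱自然语言处理')
print(words)
print(pos)
print(arcs)  # e.g. [['我', 'SBV', '爱'], ['爱', 'HED', '爱'], ...]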
    if postags[i] in exceptposttag:
        continue
    print(words[i])
print('\t'.join(postags))
postagger.release()  # release the POS model

from pyltp import NamedEntityRecognizer
recognizer = NamedEntityRecognizer()  # initialize the recognizer
recognizer.load('/home/curtank/Documents/ltp_data/ner.model')  # load the model
netags = recognizer.recognize(words, postags)  # named entity recognition
print('\t'.join(netags))
recognizer.release()  # release the model

from pyltp import Parser
parser = Parser()
parser.load('/home/curtank/Documents/ltp_data/parser.model')
arcs = parser.parse(words, postags)  # dependency parsing
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
parser.release()  # release the model

from pyltp import SementicRoleLabeller
labeller = SementicRoleLabeller()  # initialize the labeller
labeller.load('/home/curtank/Documents/ltp_data/srl')  # load the SRL model directory
roles = labeller.label(words, postags, netags, arcs)  # semantic role labelling
for role in roles:
    print(role.index, " ".join(
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments))
labeller.release()  # release the model
class ExtractViews(object):
    def __init__(self):
        self.sents = self.load()
        self.sent_embd = SentEmbedding()
        self.segmentor = Segmentor()
        self.postagger = Postagger()
        self.parser = Parser()

    def load(self, inpath='./data/news_content.json'):
        data = read_json(inpath)
        all_sents = []
        for news in data:
            for frag in news:
                sents = frag['sents']
                all_sents.append(sents)
        print('finished loading all sentences')
        return all_sents

    def prepare_data(self, sampled_sents=1000):
        print('start preparing items for sentences embedding')
        sentences_sampled = random.sample(self.sents, sampled_sents)
        self.sent_embd.prepare(sentences_sampled)
        print('sentence embedding data prepare finished')

    def prepare_nlp_parser(self):
        self.segmentor.load(r'./ltpmodels/cws.model')
        self.postagger.load(r'./ltpmodels/pos.model')
        self.parser.load(r'./ltpmodels/parser.model')

    def extract_news(self, content):
        content = content.strip()
        paras = content.split('\n')
        sentences = []
        for para in paras:
            sentences.append(cut_sent(para))
        views = self._extract_views(sentences)
        return views

    def _extract_views(self, all_sents):
        nums = len(all_sents)
        views_in_sents = []
        print('totally {} paragraphs needing processed'.format(nums))
        for i, sents in enumerate(all_sents):
            views_tmp = []
            if i % 100 == 0:
                print('processing paras : {}/{}'.format(i, nums))
            for j, sent in enumerate(sents):
                sent = sent.replace('\\n', '\n').strip()
                # ltp raises an error once a sentence approaches ~1000 characters
                if len(sent) == 0 or len(sent) > 500:
                    continue
                # words = list(jieba.cut(sent))
                words = list(self.segmentor.segment(sent))
                contains = contain_candidates(words)
                if len(contains) == 0:
                    continue
                tags = list(self.postagger.postag(words))
                arcs = list(self.parser.parse(words, tags))
                sbv, head = get_sbv_head(arcs, words, tags)
                if sbv[0] is None or head[0] is None or head[0] not in contains:
                    continue
                subj = sbv[0]
                view = clean_view(words[head[1] + 1:])
                views_tmp.append((subj, view, j))
            views_final = self._get_final_views(sents, views_tmp)
            if len(views_final) > 0:
                views_in_sents.extend(views_final)
        return views_in_sents

    def extract(self):
        all_sents = self.sents
        views_in_sents = self._extract_views(all_sents)
        return views_in_sents

    def _get_final_views(self, sents, views_tmp):
        def _entire_emb(emb: np.array, sents_no_ind):
            dim = emb.shape[1]
            for ind in sents_no_ind:
                # pad sentences that have no embedding with zeros
                emb = np.insert(emb, ind, np.zeros((1, dim)), axis=0)
            return emb

        # embed every sentence of the paragraph
        embeddings, sents_no_ind = self.sent_embd.sents_embedding(sents)
        embeddings = _entire_emb(embeddings, sents_no_ind)
        views_final = []
        for i, view in enumerate(views_tmp):
            start = view[2]
            stop = len(views_tmp)
            if i < len(views_tmp) - 1:
                stop = views_tmp[i + 1][2]
            end = self._get_view_end(embeddings, start, stop)
            views_final.append({
                'subj': view[0],
                'view': view[1] + ''.join(sents[start + 1:end])
            })
        return views_final

    def _get_view_end(self, embeddings, start, stop, sim_threshold=0.8):
        # stop early if the view sentence is the last one or has no embedding
        if start + 1 >= stop or np.sum(np.abs(embeddings[start])) == 0:
            return start
        end = start + 1
        for i in range(start + 1, stop):
            sent_emb = embeddings[i]
            # fixed: average over sentences (axis=0); the original np.mean
            # collapsed the slice to a single scalar
            curr_emb = np.mean(embeddings[start:i], axis=0)
            sim = self.sent_embd.cos_similarity(curr_emb, sent_emb)
            if sim < sim_threshold:
                break
            end += 1
        return end

    def release_nlp_parser(self):
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()

    def run(self):
        self.prepare_data()
        self.prepare_nlp_parser()
        views = self.extract()
        self.release_nlp_parser()
        write_json('./data/news_views_final.json', views)
        print('finished extract views')
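# A hedged usage sketch for ExtractViews (assumes ./data/news_content.json,
# the ./ltpmodels directory and the SentEmbedding/cut_sent/contain_candidates/
# get_sbv_head/clean_view helpers referenced above are available).
ev = ExtractViews()
ev.prepare_data(sampled_sents=1000)
ev.prepare_nlp_parser()
views = ev.extract_news('李教授表示,人工智能将改变教育行业。变化已经开始。')
print(views)
ev.release_nlp_parser()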
class myLTP(object):
    def __init__(self):
        LTP_DATA_DIR = 'ltp_data_v3.4.0'  # path of the LTP model directory
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, named `cws.model`
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
        # initialize and load each model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(ner_model_path)
        self.segmentor = Segmentor()
        self.segmentor.load(cws_model_path)
        self.postagger = Postagger()
        self.postagger.load(pos_model_path)
        self.parser = Parser()
        self.parser.load(par_model_path)

    def segment(self, text):
        words = list(self.segmentor.segment(text))
        return words

    def postag(self, words):
        postags = list(self.postagger.postag(words))
        return postags

    def arcs(self, words, postags):
        arcs = list(self.parser.parse(words, postags))  # dependency parsing
        return arcs

    def extract_info(self, text):
        words = self.segment(text)
        postags = self.postag(words)
        arcs = self.arcs(words, postags)

        # collect time expressions: runs of 'nt' tokens, allowing the
        # connectors '至'/'到' inside a span
        time_lst = []
        i = 0
        for tag, word in zip(postags, words):
            if tag == 'nt':
                j = i
                while j < len(postags) and (postags[j] == 'nt' or words[j] in ['至', '到']):
                    j += 1
                time_lst.append(''.join(words[i:j]))
            i += 1
        # drop time expressions that are substrings of a longer one
        remove_lst = []
        for i in time_lst:
            for j in time_lst:
                if i != j and i in j:
                    remove_lst.append(i)
        text_time_lst = []
        for item in time_lst:
            if item not in remove_lst:
                text_time_lst.append(item)

        # named entity recognition
        netags = list(self.recognizer.recognize(words, postags))
        entity_index = [i for i in range(len(netags)) if netags[i] != 'O']
        print(entity_index)
        entity_words = []
        # merge multi-token entities (B-... through E-... spans) into one word
        i = 0
        tags = []
        while i < len(entity_index):
            if netags[entity_index[i]][0] == "B":
                tags.append(netags[entity_index[i]][2:])
                begin = entity_index[i]
                end = entity_index[i]
                for j in range(begin + 1, len(netags)):
                    i = i + 1
                    if netags[j][0] == "E":
                        end = j
                        break
                new_word = "".join(words[begin:end + 1])
                entity_words.append(new_word)
            else:
                entity_words.append(words[entity_index[i]])
                tags.append(netags[entity_index[i]][2:])
            i += 1
        print(entity_words)

        # extract fact triples centred on each predicate
        relation_lst = []
        child_dict_list = self.build_parse_child_dict(words, postags, arcs)
        for index in range(len(postags)):
            if postags[index] == 'v':
                child_dict = child_dict_list[index]
                # subject-verb-object
                if 'SBV' in child_dict and 'VOB' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    r = words[index]
                    e2 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['VOB'][0])
                    if e1 not in entity_words:
                        entity_words.append(e1)
                        tags.append("Nr")  # regular node
                    if r not in entity_words:
                        entity_words.append(r)
                        tags.append("Nv")  # verb node
                    if e2 not in entity_words:
                        entity_words.append(e2)
                        tags.append("Nr")
                    relation_lst.append(Relation(e1, r, "SBV"))  # subject-verb
                    relation_lst.append(Relation(r, e2, "VOB"))  # verb-object
                    print("subject-verb-object\t(%s, %s, %s)\n" % (e1, r, e2))
                # postposed attributive with a verb-object relation
                if arcs[index].relation == 'ATT':
                    if 'VOB' in child_dict:
                        e1 = self.complete_e(words, postags, child_dict_list,
                                             arcs[index].head - 1)
                        r = words[index]
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict['VOB'][0])
                        temp_string = r + e2
                        if temp_string == e1[:len(temp_string)]:
                            e1 = e1[len(temp_string):]
                        if temp_string not in e1:
                            if e1 not in entity_words:
                                entity_words.append(e1)
                                tags.append("Nr")
                            if r not in entity_words:
                                entity_words.append(r)
                                tags.append("Nv")
                            if e2 not in entity_words:
                                entity_words.append(e2)
                                tags.append("Nr")
                            relation_lst.append(Relation(r, e2, "VOB"))
                            relation_lst.append(Relation(e1, r, "ATT"))
                            print("postposed-attributive verb-object\t(%s, %s, %s)\n" % (e1, r, e2))
                # subject-verb-complement containing a prepositional object
                if 'SBV' in child_dict and 'CMP' in child_dict:
                    e1 = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0])
                    cmp_index = child_dict['CMP'][0]
                    r = words[index] + words[cmp_index]
                    if 'POB' in child_dict_list[cmp_index]:
                        e2 = self.complete_e(words, postags, child_dict_list,
                                             child_dict_list[cmp_index]['POB'][0])
                        if e1 not in entity_words:
                            entity_words.append(e1)
                            tags.append("Nr")
                        if r not in entity_words:
                            entity_words.append(r)
                            tags.append("Nv")
                        if e2 not in entity_words:
                            entity_words.append(e2)
                            tags.append("Nr")
                        relation_lst.append(Relation(e1, r, "SBV"))  # subject-verb-complement
                        relation_lst.append(Relation(r, e2, "CMP"))
                        print("subject-verb-complement with prepositional object\t(%s, %s, %s)\n" % (e1, r, e2))
        return (entity_words, tags, relation_lst, text_time_lst)

    def build_parse_child_dict(self, words, postags, arcs):
        """For every word in the sentence, build a dict that maps each
        dependency relation to the indices of the word's child nodes.

        Args:
            words: segmented tokens
            postags: POS tags
            arcs: dependency arcs
        """
        child_dict_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:
                    if arcs[arc_index].relation not in child_dict:
                        child_dict[arcs[arc_index].relation] = []
                    child_dict[arcs[arc_index].relation].append(arc_index)
            child_dict_list.append(child_dict)
        return child_dict_list

    def complete_e(self, words, postags, child_dict_list, word_index):
        """Complete a partially recognised entity by pulling in its
        attributes (ATT) and, for verbs, its subject and object."""
        child_dict = child_dict_list[word_index]
        prefix = ''
        if 'ATT' in child_dict:
            for i in range(len(child_dict['ATT'])):
                prefix += self.complete_e(words, postags, child_dict_list,
                                          child_dict['ATT'][i])
        postfix = ''
        if postags[word_index] == 'v':
            if 'VOB' in child_dict:
                postfix += self.complete_e(words, postags, child_dict_list,
                                           child_dict['VOB'][0])
            if 'SBV' in child_dict:
                prefix = self.complete_e(words, postags, child_dict_list,
                                         child_dict['SBV'][0]) + prefix
        return prefix + words[word_index] + postfix

    def free_ltp(self):
        # release all models
        self.postagger.release()
        self.recognizer.release()
        self.segmentor.release()
        self.parser.release()
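# A minimal usage sketch for myLTP, assuming the Relation class used by
# extract_info is defined elsewhere in this project and the ltp_data_v3.4.0
# models are on disk. The sentence is made up for illustration.
def _demo_myltp():
    ltp = myLTP()
    text = '2019年5月1日至3日,李克强总理访问了上海。'
    entity_words, tags, relation_lst, time_lst = ltp.extract_info(text)
    print(entity_words)  # merged named entities plus triple arguments
    print(time_lst)      # expected to contain something like '2019年5月1日至3日'
    ltp.free_ltp()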
class DSFN:
    """NLP pipeline: segmentation, POS tagging, NER and dependency parsing.

    Attributes:
        default_user_dict_dir: str, directory of the user-defined dictionary
        default_model_dir: str, directory of the LTP model files
    """
    entity_verb_new = entity_verb_new()
    all_entity = entity_verb_new.readAllEntity(
        "../../entity_verb//entity_verb_result\\all_entity.json")
    default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\'  # LTP model directory

    def __init__(self, model_dir=default_model_dir, all_entity=all_entity):
        self.default_model_dir = model_dir
        # segmentation model, loaded with a user lexicon
        self.segmentor = Segmentor()
        user_dict = "..\\source\\user.txt"
        segmentor_flag = self.segmentor.load_with_lexicon(
            os.path.join(self.default_model_dir, 'cws.model'), user_dict)
        # POS tagging model
        self.postagger = Postagger()
        postag_flag = self.postagger.load(
            os.path.join(self.default_model_dir, 'pos.model'))
        # NER model
        self.recognizer = NamedEntityRecognizer()
        ner_flag = self.recognizer.load(
            os.path.join(self.default_model_dir, 'ner.model'))
        # dependency parsing model
        self.parser = Parser()
        parser_flag = self.parser.load(
            os.path.join(self.default_model_dir, 'parser.model'))
        # may be unreliable: pyltp's load() returns None rather than a status flag
        if segmentor_flag or postag_flag or ner_flag or parser_flag:
            print('load model failed')

    def segment(self, sentence, entity_postag=dict()):
        words = self.segmentor.segment(sentence)
        lemmas = []
        for lemma in words:
            lemmas.append(lemma)
        return lemmas

    def getPostag(self):
        return self.postagger

    def postag(self, lemmas):
        """
        Parameters
        ----------
        lemmas : list, segmentation result

        Returns
        -------
        words : list of WordUnit, tokens together with their POS tags
        """
        words = []
        postags = self.postagger.postag(lemmas)
        for i in range(len(lemmas)):
            # a WordUnit holds the token and its POS tag; IDs start from 1
            word = WordUnit(i + 1, lemmas[i], postags[i])
            words.append(word)
        return words

    def get_postag(self, word):
        """POS tag of a single word.

        Args:
            word: str, the word
        Returns:
            pos_tag: str, its POS tag
        """
        pos_tag = self.postagger.postag([word])
        return pos_tag[0]

    def netag(self, words):
        """Run NER and merge recognised entities into the segmented,
        POS-tagged tokens.

        Parameters
            words: list of WordUnit, segmentation and POS-tagging result
        Returns
            words_netag: list of WordUnit, with entities merged
        """
        lemmas = []
        postags = []
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        netags = self.recognizer.recognize(lemmas, postags)
        words_netag = EntityCombine().combine(words, netags)
        return words_netag

    def parse(self, words):
        """Dependency-parse the segmented, tagged (and optionally NER-merged)
        tokens.

        Args:
            words: list of WordUnit
        Returns:
            SentenceUnit for the whole sentence
        """
        lemmas = []
        postags = []
        for word in words:
            lemmas.append(word.lemma)
            postags.append(word.postag)
        arcs = self.parser.parse(lemmas, postags)
        for i in range(len(arcs)):
            words[i].head = arcs[i].head
            words[i].dependency = arcs[i].relation
        return SentenceUnit(words)

    def close(self):
        """Release the models."""
        self.postagger.release()
        self.recognizer.release()
        self.parser.release()

    def dsfn1_2_3_4COO(self, sentence, item1, item2):
        """Check the entity pair (item1, item2) against patterns DSFN1-DSFN4
        and collect the resulting (subject, predicate, object) triples."""
        allTripes = []

        # DSFN1: item1 modifies item2 through a chain of ATT dependencies
        if item1.dependency == "ATT":
            AttWord = item1.head_word
            AttWordDict = dict()
            while AttWord.ID < item2.ID:
                AttWordDict[AttWord.ID] = AttWord.lemma
                if AttWord.dependency == "ATT":
                    AttWord = AttWord.head_word
                else:
                    break
            if AttWord.ID == item2.ID:
                # expand the chain until no more ATT children fall inside the span
                flag = True
                while flag:
                    len1 = len(AttWordDict)
                    AttList = AttWordDict.keys()
                    for id in range(item1.ID + 1, item2.ID):
                        item = sentence.get_word_by_id(id)
                        if item.head_word != None and item.head_word.ID in AttList \
                                and item.dependency == "ATT":
                            AttWordDict[item.ID] = item.lemma
                    flag = len1 != len(AttWordDict)
                AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0])
                AttWordStr = ""
                for i in AttWordDict:
                    AttWordStr += i[1]
                print("triple: (" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")")
                allTripes.append([item1.lemma, AttWordStr, item2.lemma])

        # DSFN2: subject -> predicate -> object
        if item1.dependency == "SBV" and item1.head_word.postag == "v":
            pred1 = item1.head_word
            predDict = dict()
            predDict[pred1.ID] = pred1.lemma
            if item2.dependency == "VOB" and item2.head_word.postag == "v":
                pred2 = item2.head_word
                predDict[pred2.ID] = pred2.lemma
                if len(predDict) == 1:
                    PredWordStr = ""
                    for i in predDict:
                        PredWordStr += predDict[i]
                    print("DSFN2 triple: (" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, PredWordStr, item2.lemma])
                # added to handle coordinated predicates sharing one object,
                # e.g. "习近平视察和访问上海"
                if len(predDict) == 2:
                    num = self.get_entity_num_between(pred1, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred1.ID:
                            flagVOB = False
                    print("pred1:" + pred1.lemma + ",pred2:" + pred2.lemma + ",num:" + str(num))
                    if num == 0:
                        if flagVOB == True:
                            print("DSFN2 triple: (" + item1.lemma + "," + pred1.lemma + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred1.lemma, item2.lemma])
                        if flagSBV == True:
                            print("DSFN2 triple: (" + item1.lemma + "," + pred2.lemma + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred2.lemma, item2.lemma])

        # DSFN3: subject plus a prepositional phrase (ADV + POB)
        pred = None
        prep = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
        elif item1.dependency == "FOB" and item2.dependency == "POB":
            # passive preposition such as "被", e.g. "小王被小明所陷害"
            pred = item1.head_word
            prep = item2.head_word
            c = item1
            item1 = item2
            item2 = c
        if pred != None and prep != None:
            if prep.dependency == "ADV":
                if prep.head_word.ID == pred.ID:
                    object = None
                    for i in range(pred.ID + 1, len(sentence.words) + 1):
                        item = sentence.get_word_by_id(i)
                        if item.dependency == "VOB" and item.head_word.ID == pred.ID:
                            object = item
                            # gather the object together with its attributes
                            objectDict = dict()
                            objectDict[object.ID] = object
                            for word in sentence.words:
                                if word.head_word != None and word.dependency == "ATT" \
                                        and word.head_word.ID == object.ID:
                                    objectDict[word.ID] = word
                            objectDict = sorted(objectDict.items(), key=lambda item: item[0])
                            objectStr = ""
                            for objectItem in objectDict:
                                objectStr += objectItem[1].lemma
                            print("DSFN3 triple: (" + item1.lemma + "," + pred.lemma + objectStr + "," + item2.lemma + ")")
                            allTripes.append([item1.lemma, pred.lemma + objectStr, item2.lemma])
                    if object == None:
                        print("DSFN3 triple: (" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")")
                        allTripes.append([item1.lemma, pred.lemma, item2.lemma])

        # DSFN4: predicate with a CMP complement taking a prepositional object
        pred = None
        prep = None
        prep1 = None
        pred2 = None
        if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB":
            pred = item1.head_word
            prep = item2.head_word
            if prep.dependency == "CMP" and prep.head_word.postag == "v":
                pred2 = prep.head_word
                if pred2.ID == pred.ID:
                    print("DSFN4 triple: (" + item1.lemma + "," + pred.lemma + prep.lemma + "," + item2.lemma + ")")
                    allTripes.append([item1.lemma, pred.lemma + prep.lemma, item2.lemma])
                else:
                    num = self.get_entity_num_between(pred, pred2, sentence)
                    flagSBV = True
                    flagVOB = True
                    for word in sentence.words:
                        if word.dependency == "SBV" and word.head_word.ID == pred2.ID:
                            flagSBV = False
                        if word.dependency == "VOB" and word.head_word.ID == pred.ID:
                            flagVOB = False
                    if num == 0:
                        for word in sentence.words:
                            if word.dependency == "CMP" and word.head_word.ID == pred.ID:
                                prep1 = word
                        if prep1 != None:
                            if flagVOB == True:
                                allTripes.append([item1.lemma, pred.lemma + prep1.lemma, item2.lemma])
                            if flagSBV == True:
                                allTripes.append([item1.lemma, pred2.lemma + prep.lemma, item2.lemma])
                        else:
                            if flagVOB == True:
                                allTripes.append([item1.lemma, pred.lemma, item2.lemma])
                            if flagSBV == True:
                                allTripes.append([item1.lemma, pred2.lemma + prep.lemma, item2.lemma])

        # DSFN5/DSFN6 are handled by dsfn5COO / dsfn6COO below
        return allTripes

    def get_entity_num_between(self, verb1, verb2, sentence):
        """Count the entities between two predicates.

        Parameters
        ----------
        verb1 : WordUnit, the first predicate
        verb2 : WordUnit, the second predicate

        Returns:
            num: int, number of entities between the two predicates
        """
        if verb1.ID > verb2.ID:
            c = verb1
            verb1 = verb2
            verb2 = c
        num = 0
        i = verb1.ID
        while i < verb2.ID - 1:
            if self.is_entity(sentence.words[i]):
                num += 1
            i += 1
        return num

    def is_entity(self, entry):
        """Check whether a word unit is an entity.

        Args:
            entry: WordUnit, the word unit
        Returns:
            bool, True if the word is an entity
        """
        # POS tags of candidate entities
        entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'i']
        # print(entry.lemma + " : " + entry.postag)
        return entry.postag in entity_postags

    def dsfnAttCOO(self, sentence, item1, item2):
        """Climb ATT chains on either entity and retry DSFN1-4."""
        item1Att = item1
        item2Att = item2
        while item1Att.dependency == "ATT":
            item1Att = item1Att.head_word
        allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2)
        if allTripe == None or len(allTripe) == 0:
            while item2Att.dependency == "ATT":
                item2Att = item2Att.head_word
            allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att)
        if allTripe == None or len(allTripe) == 0:
            allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att)
        for tripe in allTripe:
            if tripe[0] == item1Att.lemma:
                tripe[0] = item1.lemma
            if tripe[2] == item2Att.lemma:
                tripe[2] = item2.lemma
        return allTripe

    def dsfn5COO(self, sentence, item1, item2):
        """DSFN5: item1 is coordinated (COO); retry with its head entity."""
        if item1.dependency == "COO":
            item1COO = item1.head_word
            allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2)
            for tripe in allTripes1:
                if tripe[0] == item1COO.lemma:
                    tripe[0] = item1.lemma
                elif tripe[2] == item1COO.lemma:
                    tripe[2] = item1.lemma
            return allTripes1

    def dsfn6COO(self, sentence, item1, item2):
        """DSFN6: item2 is coordinated (COO); retry with its head entity."""
        if item2.dependency == "COO":
            item2COO = item2.head_word
            allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO)
            for tripe in allTripes2:
                if tripe[2] == item2COO.lemma:
                    tripe[2] = item2.lemma
                elif tripe[0] == item2COO.lemma:
                    tripe[0] = item2.lemma
            return allTripes2

    def dsfn5and6COO(self, sentence, item1, item2):
        """Both entities are coordinated (COO); retry with both heads."""
        if item1.dependency == "COO":
            item1COO = item1.head_word
            if item2.dependency == "COO":
                item2COO = item2.head_word
                allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO)
                for tripe in allTripe:
                    if tripe[0] == item1COO.lemma and tripe[2] == item2COO.lemma:
                        tripe[0] = item1.lemma
                        tripe[2] = item2.lemma
                    if tripe[2] == item1COO.lemma and tripe[0] == item2COO.lemma:
                        tripe[2] = item1.lemma
                        tripe[0] = item2.lemma
                return allTripe

    def dsfnStartCOO3(self, rawSentence, entity1, entity2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        lemmas = self.segment(rawSentence)
        words = self.postag(lemmas)
        words_netag = self.netag(words)
        sentence = self.parse(words_netag)
        print(sentence.to_string())
        for item in sentence.words:
            if item.lemma == entity1:
                item1 = item
            if item.lemma == entity2:
                item2 = item
        if item1.ID > item2.ID:
            c = item1
            item1 = item2
            item2 = c
        itemCopy1 = item1
        itemCopy2 = item2
        allTripes = self.dsfnStartCOO2(sentence, item1, item2)
        if allTripes != None and len(allTripes) == 0:
            # fall back: climb ATT chains to a plain-noun head and retry
            if item1.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item1.dependency == "ATT":
                item1 = item1.head_word
                while item1.dependency == "ATT":
                    item1 = item1.head_word
                if 'n' in item1.postag and item1.postag not in ['nh', 'ns', 'nz', 'ni']:
                    pass
                else:
                    item1 = itemCopy1
            if item2.postag in ['n', 'nh', 'nl', 'ns', 'nz', 'ni'] and item2.dependency == "ATT":
                item2 = item2.head_word
                while item2.dependency == "ATT":
                    item2 = item2.head_word
                if ('n' in item2.postag or 'q' in item2.postag) \
                        and item2.postag not in ['nh', 'ns', 'nz', 'ni']:
                    pass
                else:
                    item2 = itemCopy2
            allTripes = self.dsfnStartCOO2(sentence, item1, item2)
            print("fallback result:")
            print(allTripes)
            if len(allTripes) != 0:
                # map the climbed heads back to the original entities
                for tripe in allTripes:
                    if tripe[0] == item1.lemma:
                        tripe[0] = itemCopy1.lemma
                    elif tripe[2] == item1.lemma:
                        tripe[2] = itemCopy1.lemma
                    if tripe[0] == item2.lemma:
                        tripe[0] = itemCopy2.lemma
                    elif tripe[2] == item2.lemma:
                        tripe[2] = itemCopy2.lemma
                    resultList.append(tripe)
                print("final result")
                print(np.array(set([tuple(t) for t in resultList])))
        else:
            print("final result")
            print(np.array(set([tuple(t) for t in allTripes])))

    def dsfnStartCOO2(self, sentence, item1, item2):
        nounRelatedWithPosition = ['主席', '总理', '教授', '校长']
        resultList = []
        itemCopy1 = item1
        itemCopy2 = item2
        # handles nouns hanging off an ATT dependency,
        # e.g. 李克强[ATT] <----- 总理[SBV]
        print(item1.lemma)
        print(item2.lemma)
        allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
        if len(allTripes) == 0:
            print("DSFN1-4 found nothing, trying DSFN5")
            allTripes = self.dsfn5COO(sentence, item1, item2)
            if allTripes == None or len(allTripes) == 0:
                print("trying DSFN6")
                allTripes = self.dsfn6COO(sentence, item1, item2)
                if allTripes == None or len(allTripes) == 0:
                    print("trying DSFN5+6")
                    allTripes = self.dsfn5and6COO(sentence, item1, item2)
        if allTripes != None and len(allTripes) != 0:
            for tripe in allTripes:
                resultList.append(tripe)
        # second pass: rewire coordinated (COO) verbs and retry
        print("second pass")
        pred1 = None
        subForCoo = None
        for item in sentence.words:
            if item.postag == "v" and item.dependency == "COO":
                pred1 = item.head_word
                for word in sentence.words:
                    if word.dependency == "SBV" and word.head_word.ID == pred1.ID:
                        for phrase in sentence.words:
                            if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID:
                                subForCoo = phrase
                        if subForCoo == None or (subForCoo != None and subForCoo.ID == word.ID):
                            # only rewire when the coordinated verb has no subject of
                            # its own; compare: 习近平主席视察厦门,李克强总理访问香港
                            word.head_word = item
                            print(sentence.to_string())
                            allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2)
                            if len(allTripes) == 0:
                                allTripes = self.dsfn5COO(sentence, item1, item2)
                                if allTripes == None or len(allTripes) == 0:
                                    allTripes = self.dsfn6COO(sentence, item1, item2)
                                    if allTripes == None or len(allTripes) == 0:
                                        print("trying DSFN5+6")
                                        allTripes = self.dsfn5and6COO(sentence, item1, item2)
                            if allTripes != None and len(allTripes) != 0:
                                for tripe in allTripes:
                                    resultList.append(tripe)
        print(np.array(set([tuple(t) for t in resultList])))
        return resultList
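# A minimal usage sketch for the DSFN extractor, assuming WordUnit,
# SentenceUnit, EntityCombine and entity_verb_new are importable as used
# above, and that the model/entity paths in the class attributes exist.
# The sentence and entity pair are made up; dsfnStartCOO3 prints any
# (subject, predicate, object) triples it finds between the two entities.
def _demo_dsfn():
    extractor = DSFN()
    sentence = '习近平主席视察和访问了上海。'
    extractor.dsfnStartCOO3(sentence, '习近平', '上海')  # coordinated predicates share one object
    extractor.close()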