def segmentation(filename, output_filename):
    print("segmenting '%s' to '%s'" % (filename, output_filename))
    with open(filename, "r") as f:
        lines = f.readlines()

    MODELDIR = "./ltp_data/"
    # segment
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # postag
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # parse and get SVO
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line.rstrip("\r\n")  # strip the trailing newline instead of blindly cutting the last character
        if not line:
            continue
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
    fner.close()
def __init__(self):
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # word segmentation model, file name `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, file name `ner.model`

    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = segmentor.segment(data)
    # print("|".join(words))
    segmentor.release()

    postagger = Postagger()                       # initialize the instance
    postagger.load(self.pos_model_path)           # load the model
    self.postags = postagger.postag(self.words)   # POS tagging
    # print('\t'.join(postags))
    postagger.release()                           # release the model

    recognizer = NamedEntityRecognizer()          # initialize the instance
    recognizer.load(self.ner_model_path)          # load the model
    self.netags = recognizer.recognize(self.words, self.postags)  # named entity recognition
    # print('\t'.join(netags))
    recognizer.release()                          # release the model
def mingming_shiti(words, postags):
    """Named entity recognition: organizations (Ni), persons (Nh), places (Ns)."""
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
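# A minimal usage sketch for mingming_shiti (not part of the original snippet): it assumes the
# LTP models live under the same MODELDIR and feeds the segmentation / POS output into the function.
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

words = segmentor.segment("李克强总理今天访问北京大学。")
postags = postagger.postag(words)
mingming_shiti(words, postags)   # prints one NE tag per word (O / S-Nh / B-Ni / ...)

segmentor.release()
postagger.release()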
postagger = Postagger()
# load the model
postagger.load(pos_model_path)
# segmented words
words = ['元芳', '你', '怎么', '看']
# POS tagging
postags = postagger.postag(words)
print('\t'.join(postags))
# release the model
postagger.release()

# 4. named entity recognition
# initialize the instance
recognizer = NamedEntityRecognizer()
# load the model
recognizer.load(ner_model_path)
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
# named entity recognition
netags = recognizer.recognize(words, postags)
print('\t'.join(netags))
# release the model
recognizer.release()

# 5. dependency parsing
# initialize the instance
parser = Parser()
# load the model
parser.load(par_model_path)
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
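# The snippet above stops right after loading the parser; a minimal sketch of the remaining part of
# step 5 (dependency parsing), following the same pyltp pattern as steps 3-4:
arcs = parser.parse(words, postags)   # one arc (head index, relation) per word
print('\t'.join('%d:%s' % (arc.head, arc.relation) for arc in arcs))
# release the model
parser.release()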
# comma_punc = re.compile(r"[,:: ]".decode("utf8"))
comma_punc = re.compile(r"[,:: ]")
# period_punc = re.compile(r"[。?!;?!;]".decode("utf8"))
period_punc = re.compile(r"[。?!;?!;]")
# del_punc = re.compile(r"[‘’“” ]".decode("utf8"))
del_punc = re.compile(r"[‘’“” ]")
# sub_punc = re.compile(r"[,]".decode("utf8"))
sub_punc = re.compile(r"[,]")

### load models
segmentor = Segmentor()
segmentor.load(model_dir + "cws.model")
postagger = Postagger()
postagger.load(model_dir + "pos.model")
recognizer = NamedEntityRecognizer()
recognizer.load(model_dir + "ner.model")
parser = Parser()
parser.load(model_dir + "parser.model")
labeller = SementicRoleLabeller()
labeller.load(model_dir + "srl")


def parse(sent):
    # this function detects the structure of a sentence
    if len(sent) < 12:
        return "el"
    if len(sent) > 60:
        return "el"
    words = segmentor.segment(sent.strip())          # word segmentation
    postags = postagger.postag(words)                # POS tagging
    netags = recognizer.recognize(words, postags)    # entity recognition
class Char_Feature_Pipeline(): def __init__(self, char_voca, freq_dict, pyltp_path): self.char_voca = char_voca self.freq_dict = freq_dict self.pyltp_path = pyltp_path PYLTP_PATH = self.pyltp_path self.segmentor = Segmentor() self.segmentor.load(PYLTP_PATH + '/cws.model') self.postagger = Postagger() # 初始化实例 self.postagger.load(PYLTP_PATH + '/pos.model') # 加载模型 self.recognizer = NamedEntityRecognizer() # 初始化实例 self.recognizer.load(PYLTP_PATH + '/ner.model') # 加载模型 def pipeline(self, raw_iter, label_bool=False): for Id, sample in enumerate(raw_iter): dct = self.gene_pyltp_feature( sample, Id) # use pyltp to get new seg, pos and ner dct = self.replace_raw_text(dct) dct = self.gene_ner_feature(dct) # generate the NER feature if label_bool: dct = self.gene_ner_label(dct) # gene_ner_label dct = self.gene_ner_weight(dct) # gene_label_weight yield dct def release_pyltp_model(self): self.segmentor.release() self.postagger.release() self.recognizer.release() @add_to_input def gene_pyltp_feature(self, sample, Id): words = self.segmentor.segment(sample['text']) postags = self.postagger.postag(words) netags = self.recognizer.recognize(words, postags) res = [{ 'word': word, 'postag': postag, 'netag': netag } for word, postag, netag in zip(words, postags, netags)] res = {'_id': sample.get('_id', Id), 'pyltp_tags': res} return res @add_to_input def replace_raw_text(self, sample): def mask_books(text): parts = [] last_tail = 0 for match in re.finditer('《[^》]*》', text): parts.append(text[last_tail:match.span()[0]]) last_tail = match.span()[1] parts.append(''.join( ['《', 'X' * (match.span()[1] - match.span()[0] - 2), '》'])) parts.append(text[last_tail:]) return ''.join(parts) text = sample['text'] new_text = re.sub('[A-Z]', 'B', text) # 大写英文字母统一变为B new_text = re.sub('[a-z]', 'b', new_text) # 小写英文字母统一变为b new_text = re.sub('[0-9]', 'x', new_text) # 数字统一变为x new_text = mask_books(new_text) # 中文书名号《》内统一变为X res = {'raw_text': text, 'text': new_text, '_id': sample['_id']} return res @add_to_input def gene_ner_feature(self, sample): char_index = [ self.char_voca.loadWord2idAndId2Word(char) for char in sample['text'] ] char_pos, char_bmes = postag2char_pos_bmes(sample['postag']) ltp_other, ltp_char_bmes = postag2char_pos_bmes( sample['pyltp_tags'], word_key='word', other_keys=['postag', 'netag'], check_length=True, text=sample['text']) pos_index = [POS_VOCA[pos] for pos in char_pos] bmes_index = [BMES_VOCA[k] for k in char_bmes] char_freq = [self.freq_dict[s] for s in sample['text']] ltp_bmes_index = [BMES_VOCA[k] for k in ltp_char_bmes] ltp_pos_index = [POS_VOCA[pos] for pos in ltp_other[0]] ltp_ner_index = [NER_VOCA[k] for k in ltp_other[1]] if not len(char_pos) == len(char_index): pos_index = ltp_pos_index bmes_index = ltp_bmes_index assert len(char_index) == len(pos_index) res = { '_id': sample['_id'], 'char_index': char_index, 'char_size': len(sample['text']), 'pos_index': pos_index, 'bmes_index': bmes_index, 'char_freq': char_freq, 'ltp_pos_index': ltp_pos_index, 'ltp_bmes_index': ltp_bmes_index, 'ltp_ner_index': ltp_ner_index } return res @add_to_input def gene_ner_label(self, sample): text = sample['text'] char_length = len(text) subjects = set([spo['subject'] for spo in sample['spo_list']]) objects = set([spo['object'] for spo in sample['spo_list']]) locates = np.zeros(char_length, dtype=int) for bject in subjects: for span in my_finditer(bject, text): locates[span[0]:span[1]] = 1 sub_locates = locates.tolist() locates = np.zeros(char_length, dtype=int) for bject in objects: for span in my_finditer(bject, 
text): locates[span[0]:span[1]] = 1 ob_locates = locates.tolist() res = { '_id': sample['_id'], 'sub_label': sub_locates, 'ob_label': ob_locates } return res @add_to_input def gene_ner_weight(self, sample): sub_weight = calculate_weight(sample['sub_label']) ob_weight = calculate_weight(sample['ob_label']) res = { '_id': sample['_id'], 'sub_weight': sub_weight, 'ob_weight': ob_weight } return res
class DSFN: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir:str,用户自定义词典目录 default_model_dir:str,ltp模型文件目录 """ entity_verb_new = entity_verb_new() all_entity = entity_verb_new.readAllEntity( "../../entity_verb//entity_verb_result\\all_entity.json") default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 location_entity = [ "中和殿", "太庙", "人文地理", "亚运村", "九龙壁", "圆明园", "古典建筑", "庑殿顶", "天井", "无量殿", "慈宁宫", "三希堂", "居庸关", "延寿寺", "排云殿", "东桥", "圜丘", "南天门", "垂花门", "西六宫", "配楼", "柳荫街", "中国四大名园", "午门", "乾东五所", "建筑管理", "世界博物馆", "西什库教堂", "晚清", "万泉河", "东暖阁", "储秀宫", "西华门", "院落", "地安门东大街", "御路", "知鱼桥", "清宁宫", "金水河", "景山前街", "司马台长城", "景山公园", "乐寿堂", "东六宫", "延陵", "宜芸馆", "芍药居", "承乾宫", "琉璃瓦", "湘江", "敌楼", "安定门外大街", "三音石", "崇文门", "天坛路", "台基", "东城区", "外朝", "武备", "全国重点文物保护单位", "房山石", "静园", "香山", "中东", "坤宁宫", "彩画", "江南园林", "北河沿大街", "岳阳楼", "丽景轩", "巴黎圣母院", "钟表馆", "戏楼", "白银", "红海", "中原", "明长城", "神乐署", "瀛洲", "码头", "百度地图", "旋子彩画", "乾西五所", "天圆地方", "琉璃厂文化街", "广岛", "御沟", "井亭", "古柏林", "石坊", "北京故宫", "宝云阁", "甬道", "熙和门", "乾清门", "北京城", "暖温带", "沥粉贴金", "安定路", "北齐长城", "减柱造", "宅园", "清华园", "天坛东门站", "西苑", "土山", "温带季风气候", "宫古", "东直门", "美国国务卿", "北海", "中华梦石城", "东门站", "天坛公园", "江山", "谐趣园", "修宅", "苏堤", "玉泉", "牌坊", "蓟镇", "高速公路", "钟粹宫", "无梁殿", "政治家", "牌楼", "波斯", "西内", "老龙头", "阴阳石", "三神山", "丹陛桥", "中国第一历史档案馆", "建筑艺术", "四川", "护城河", "文华殿", "静宜园", "乐峰", "永和宫", "金砖", "清漪园", "安定门", "宫殿", "梵华楼", "龙井", "水街", "东华门", "歇山式顶", "斋宫", "渤海镇", "仁和", "白浮村", "建筑风格", "买卖街", "藻鉴堂", "寿安宫", "奉先殿", "后海", "宋", "承德避暑山庄", "前门站", "寿安山", "八达岭", "棂星门", "经幢", "泰山", "后三宫", "天桥商场", "维新派", "拙政园", "北京十六景", "南湖岛", "山寨", "东海", "寺庙", "图书馆", "西山", "延禧宫", "九土", "十七孔桥", "鹊桥", "石鼓", "样式雷", "礼乐", "圆石", "动物园", "西湖", "齐长城遗址", "京畿", "正脊", "神武门", "洛神赋图", "绿地面积", "暖阁", "多宝塔", "磨砖对缝", "湖心亭", "崇楼", "五谷丰登", "养性殿", "关山", "砖雕", "北境", "凤凰墩", "金殿", "永定路", "世界遗产", "古柏", "郡王府", "慕田峪", "皇舆全览图", "古典园林", "坐北朝南", "皇极殿", "皇家园林", "东四十条", "京西", "黄花镇", "通惠河", "宁寿宫", "旅游局", "大角楼", "昆明湖", "后溪", "东堤", "汉白玉石", "皇史宬", "湖心岛", "长春宫", "玉澜堂", "紫檀", "玉泉山", "玉山", "茶楼", "敌台", "乾清宫", "巴县", "藕香榭", "斗拱", "苏州街", "紫禁城", "颐和轩", "皇穹宇", "南方", "智慧海", "八小部洲", "拱券", "门楣", "太和殿", "銮仪卫", "法门寺地宫", "清音阁", "龙王庙", "城岛", "皇陵", "筒瓦", "天地坛", "张古", "建筑史", "武英殿", "北长街", "天坛", "云山", "大石桥", "北平", "宫殿建筑", "山东", "博物馆", "昆明池", "交道口南大街", "平流村", "聊城", "三大殿", "清晏舫", "墀头", "养心殿", "御道", "百花园", "翊坤宫", "神道", "落地罩", "渔村", "丹陛", "歇山顶", "畅音阁", "漱芳斋", "黄鹤楼", "柱础", "嘉乐堂", "庆长", "档案", "保定", "上海", "佛香阁", "望柱", "德和园", "天桥", "北京旅游网", "祈年殿", "颐和园", "攒尖顶", "香岩宗印之阁", "分界线", "大杂院", "交泰殿", "太和门", "南郊", "健翔桥", "瓮山", "勤政殿", "云南", "景仁宫", "小山村", "金水桥", "保和殿", "寄畅园", "珍妃井", "德和园大戏楼", "正房", "第一批全国重点文物保护单位", "三合院", "万寿山", "厉家菜", "玉峰塔", "藻井", "恭王府花园", "文昌阁", "景山", "前门东大街", "端门", "代王府", "万寿亭", "景阳宫", "东四环", "景明楼", "祈谷坛", "大戏楼", "安佑宫", "石舫", "流杯亭", "行宫", "法华寺", "圜丘坛", "正义路", "居庸关长城", "箭扣长城", "石牌坊", "回音壁", "和玺彩画", "二龙戏珠", "北四环", "玉龙", "广州", "盛京", "四合院", "曲尺", "谷仓", "永定门", "宝顶", "苏式彩画", "皇宫", "寿康宫" ] def __init__(self, model_dir=default_model_dir, all_entity=all_entity): self.default_model_dir = model_dir # 加载ltp模型 # default_model_dir = 'D:\python-file\knowledge_extraction-master-tyz\\ltp_data_v3.4.0\\' # LTP模型文件目录 self.segmentor = Segmentor() user_dict = "..\\source\\user.txt" segmentor_flag = self.segmentor.load_with_lexicon( os.path.join(default_model_dir, 'cws.model'), user_dict) # self.segmentor2 = Segmentor() # segmentor_flag = self.segmentor.load(os.path.join(default_model_dir, 'cws.model')) # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load( os.path.join(self.default_model_dir, 
'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load( os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parser_flag = self.parser.load( os.path.join(self.default_model_dir, 'parser.model')) if segmentor_flag or postag_flag or ner_flag or parser_flag: # 可能有错误 print('load model failed') def segment(self, sentence, entity_postag=dict()): words = self.segmentor.segment(sentence) lemmas = [] for lemma in words: lemmas.append(lemma) return lemmas def getPostag(self): return self.postagger def postag(self, lemmas): """ Parameters ---------- lemmas : List,分词后的结果 entity_dict:Set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns ------- words:WordUnit List,包括分词与词性标注的结果 """ words = [] # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i + 1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() #释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word:str,单词 Returns: pos_tag:str,该单词的词性标注 """ pos_tag = self.postagger.postag([word]) return pos_tag[0] def netag(self, words): """ 命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Parameters words : WordUnit list,包括分词与词性标注结果 Returns words_netag:WordUnit list,包含分词,词性标注与命名实体识别的结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) words_netag = EntityCombine().combine(words, netags) return words_netag def parse(self, words): """ 对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag:WordUnit list,包含分词,词性标注与命名实体识别结果 Returns *:sentenceUnit 句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation return SentenceUnit(words) def close(self): """ 关闭与释放 """ # pynlpir.close() self.postagger.release() self.recognizer.release() self.parser.release() def splitSentence(self, text): pattern = r'。|!|?|;|=' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) # print(result_list) return result_list def splitSentenceByComma(self, text): pattern = r',' result_list = re.split(pattern, text) result_list = list(filter(self.not_empty, result_list)) final_list = [] for sentence in result_list: if len(sentence) <= 40: final_list.append(sentence) return final_list def not_empty(self, s): return s and "".join(s.split()) def dsfn1_2_3_4COO(self, sentence, item1, item2): allTripes = [] """ 判断两个实体是否属于DSFN1的情况,并输出三元组 """ location_position_list = [ '主席', '总统', '总理', '主任', '内', '东门', '西门', '南门', '北门', '大门', '外', '国家主席', '尚书' ] if self.dsfnConstraints3(sentence, item1, item2) and ( item1.dependency == "ATT" and item1.head_word.postag != 'v' and item1.head_word.postag != 'a'): AttWord = item1.head_word AttWordDict = dict() AttWordStr = "" while AttWord.ID < item2.ID: AttWordDict[AttWord.ID] = AttWord.lemma # AttWordStr += AttWord.lemma if (AttWord.dependency == "ATT" and AttWord.head_word.postag != 'v' and AttWord.head_word.postag != 'a'): AttWord = AttWord.head_word else: break if (AttWord.ID == item2.ID): flag = True while flag: len1 = len(AttWordDict) AttList = AttWordDict.keys() for id in range(item1.ID + 1, item2.ID): item = sentence.get_word_by_id(id) if item.head_word != None and item.head_word.ID in AttList and ( item.dependency == "ATT" and item.postag != 'v' 
and item.postag != 'a'): AttWordDict[item.ID] = item.lemma if len1 == len(AttWordDict): flag = False else: flag = True AttWordDict = sorted(AttWordDict.items(), key=lambda item: item[0]) AttWordStr = "" for i in AttWordDict: AttWordStr += i[1] # print("三元组:(" + item1.lemma + "," + AttWordStr + "," + item2.lemma + ")") if AttWordStr in location_position_list: allTripes.append([item1.lemma, AttWordStr, item2.lemma]) """ 考虑DSFN2的情况 """ if item1.dependency == "SBV" and item1.head_word.postag == "v": pred1 = item1.head_word predDict = dict() predDict[pred1.ID] = pred1.lemma if item2.dependency == "VOB" and item2.head_word.postag == "v": pred2 = item2.head_word predDict[pred2.ID] = pred2.lemma if (len(predDict) == 1): PredWordStr = "" for i in predDict: PredWordStr += predDict[i] # print("DSFN2三元组:(" + item1.lemma + "," + PredWordStr + "," + item2.lemma + ")") allTripes.append([item1.lemma, PredWordStr, item2.lemma]) """ 新加,为了考虑“习近平视察和访问上海”的情况 """ if len(predDict) == 2: num = self.get_entity_num_between(pred1, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred1.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred1.ID): flagVOB = False flagCMP = True if pred1 != None and pred1.dependency == "CMP" and pred1.head_word.ID == pred2.ID: flagCMP = False if pred2 != None and pred2.dependency == "CMP" and pred2.head_word.ID == pred1.ID: flagCMP = False flagCOO = True if pred1 != None and pred1.dependency == "COO" and pred1.head_word.ID == pred2.ID: flagCOO = False if pred2 != None and pred2.dependency == "COO" and pred2.head_word.ID == pred1.ID: flagCOO = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: if flagCMP == False: if flagVOB == True and flagSBV == True: allTripes.append([ item1.lemma, pred1.lemma + "" + pred2.lemma, item2.lemma ]) if flagCOO == False: if flagVOB == True and flagSBV == True: allTripes.append([ item1.lemma, pred1.lemma + "" + pred2.lemma, item2.lemma ]) else: if flagVOB == True: allTripes.append( [item1.lemma, pred1.lemma, item2.lemma]) if flagSBV == True: allTripes.append( [item1.lemma, pred2.lemma, item2.lemma]) """ DSFN3.0 """ pred = None if item1.dependency == "SBV" and item1.head_word.postag == "v" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word elif item1.dependency == "FOB" and item2.dependency == "POB": # 考虑介词为“被”的情况,如 “小王被小明所陷害” pred = item1.head_word prep = item2.head_word c = item1 item1 = item2 item2 = c if pred != None and prep != None: if prep.dependency == "ADV": if prep.head_word.ID == pred.ID: pred2 = None object = None objectForPred2 = None for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "VOB" and item.head_word.ID == pred.ID: object = item objectDict = dict() objectDict[object.ID] = object for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID in objectDict: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma allTripes.append([ item1.lemma, pred.lemma + "" + objectStr, item2.lemma ]) if object == None: hasPOB = False for i in range(pred.ID + 1, len(sentence.words) + 1): item = sentence.get_word_by_id(i) if item.dependency == "POB" and 
item.head_word.dependency == "CMP" and item.head_word.head_word.ID == pred.ID: hasPOB = True allTripes.append([ item1.lemma, pred.lemma + "" + item.head_word.lemma + "" + item.lemma, item2.lemma ]) # print("DSFN3三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") if hasPOB == False: allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) """ DSFN4 """ pred = None prep = None prep1 = None pred2 = None if item1.dependency == "SBV" and item2.dependency == "POB": pred = item1.head_word prep = item2.head_word if prep.dependency == "CMP": pred2 = prep.head_word if pred2.ID == pred.ID: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + prep.lemma, item2.lemma ]) else: num = self.get_entity_num_between(pred, pred2, sentence) flagSBV = True flagVOB = True for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred2.ID: flagSBV = False if (word.dependency == "VOB" and word.head_word.ID == pred.ID) or (word.dependency == "POB" \ and word.head_word.dependency == "ADV" and word.head_word.head_word.ID == pred.ID): flagVOB = False # print("pred1:"+pred1.lemma+",pred2:"+pred2.lemma+",num:"+str(num)) if num == 0: flag = True for word in sentence.words: if word.dependency == "CMP" and word.head_word.ID == pred.ID: prep1 = word if prep1 != None: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "" + prep1.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred.lemma + "" + prep1.lemma, item2.lemma ]) # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") if flagSBV == True: allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) else: if flagVOB == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred.lemma + "," + item2.lemma + ")") allTripes.append( [item1.lemma, pred.lemma, item2.lemma]) if flagSBV == True: # print("DSFN4三元组:(" + item1.lemma + "," + pred2.lemma + "" + prep.lemma + "," + item2.lemma + ")") allTripes.append([ item1.lemma, pred2.lemma + "" + prep.lemma, item2.lemma ]) """ DSFN5 """ # self.dsfn5and6(rawSentence,sentence,item1,item2) return allTripes def get_entity_num_between(self, verb1, verb2, sentence): """ 获得两个动词之间的实体数量 Parameters ---------- entity1 : WordUnit,动词1 entity2 : WordUnit,动词2 Returns: num:int,两动词间的实体数量 """ if verb1.ID > verb2.ID: c = verb1 verb1 = verb2 verb2 = c num = 0 i = verb1.ID while i < verb2.ID - 1: if self.is_entity(sentence.words[i]): num += 1 i += 1 return num def is_entity(self, entry): """判断词单元是否是实体 Args: entry:WordUnit,词单元 Returns: *:bool,实体(True),非实体(False) """ #候选实体词性列表 entity_postags = ['nh', 'ni', 'ns', 'nz', 'j', 'n', 'v', 'm'] # print(entry.lemma+" : "+entry.postag) if entry.postag in entity_postags: return True else: return False def dsfnAttCOO(self, sentence, item1, item2): item1Att = item1 item2Att = item2 while item1Att.dependency == "ATT": item1Att = item1Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2) if allTripe == None or len(allTripe) == 0: while item2Att.dependency == "ATT": item2Att = item2Att.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1, item2Att) if allTripe == None or len(allTripe) == 0: allTripe = self.dsfn1_2_3_4COO(sentence, item1Att, item2Att) for tripe in allTripe: if tripe[0] == item1Att.lemma: tripe[0] = item1.lemma if tripe[2] == item2Att.lemma: tripe[2] = item2.lemma return allTripe def dsfn5COO(self, sentence, item1, item2): if item1.dependency == "COO": 
item1COO = item1.head_word allTripes1 = self.dsfn1_2_3_4COO(sentence, item1COO, item2) # print(allTripes1) for tripe in allTripes1: if tripe[0] == item1COO.lemma: tripe[0] = item1.lemma elif tripe[2] == item1COO.lemma: tripe[2] = item1.lemma return allTripes1 # print("allTripes1"+str(allTripes1)) def dsfn6COO(self, sentence, item1, item2): if item2.dependency == "COO": item2COO = item2.head_word allTripes2 = self.dsfn1_2_3_4COO(sentence, item1, item2COO) for tripe in allTripes2: if tripe[2] == item2COO.lemma: tripe[2] = item2.lemma elif tripe[0] == item2COO.lemma: tripe[0] = item2.lemma return allTripes2 def dsfn5and6COO(self, sentence, item1, item2): if item1.dependency == "COO": item1COO = item1.head_word if item2.dependency == "COO": item2COO = item2.head_word allTripe = self.dsfn1_2_3_4COO(sentence, item1COO, item2COO) for tripe in allTripe: if tripe[0] == item1COO.lemma and tripe[ 2] == item2COO.lemma: tripe[0] = item1.lemma tripe[2] = item2.lemma if tripe[2] == item1COO.lemma and tripe[ 0] == item2COO.lemma: tripe[2] = item1.lemma tripe[0] = item2.lemma return allTripe def dsfnStart(self, rawSentence, entity1, entity2, all_entity): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] lemmas = dsfn.segment(rawSentence) words = dsfn.postag(lemmas) words_netag = dsfn.netag(words) sentence = dsfn.parse(words_netag) # print(sentence.to_string()) Rawitem1 = None Rawitem2 = None item1 = None item2 = None Rawitem1Index = -1 Rawitem2Index = -1 indexList = [-1, -1] for item in sentence.words: if (item.lemma == entity1): Rawitem1 = item if (item.lemma == entity2): Rawitem2 = item if Rawitem1 != None and Rawitem2 != None and ( Rawitem1.ID != Rawitem1Index or Rawitem2.ID != Rawitem2Index): Rawitem1Index = Rawitem1.ID Rawitem2Index = Rawitem2.ID # if item1 == None or item2 == None: # return None item1 = Rawitem1 item2 = Rawitem2 if item1.ID > item2.ID: c = item1 item1 = item2 item2 = c # print(str(item1.ID) + " " + str(item2.ID)) itemCopy1 = item1 itemCopy2 = item2 # print(item1.lemma) # print(item2.lemma) # print(self.dsfnConstraints2(sentence,item1,item2,all_entity)) if self.dsfnConstraints2(sentence, item1, item2, all_entity) == False: continue allTripes = self.dsfnStartCOO2(sentence, item1, item2) # print("111"+item2.lemma) # print(allTripes) if allTripes == None or (allTripes != None and len(allTripes) == 0): # print("我要走ATT的部分了") while item1.dependency == "ATT": item1 = item1.head_word while item2.dependency == "ATT": item2 = item2.head_word allTripes = self.dsfnStartCOO2(sentence, item1, item2) if len(allTripes) != 0: for tripe in allTripes: if tripe[1] != "": if tripe[0] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[ 0] = item1.lemma + "" + itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[ 0] = itemCopy1.lemma + "" + item1.lemma else: tripe[0] = itemCopy1.lemma elif tripe[2] == item1.lemma: if item1.ID < itemCopy1.ID: tripe[ 2] = item1.lemma + "" + itemCopy1.lemma elif item1.ID > itemCopy1.ID: tripe[ 2] = itemCopy1.lemma + "" + item1.lemma else: tripe[2] = itemCopy1.lemma # tripe[2] = itemCopy1.lemma if tripe[0] == item2.lemma: if item2.ID < itemCopy2.ID: tripe[ 0] = item2.lemma + "" + itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[ 0] = itemCopy2.lemma + "" + item2.lemma else: tripe[0] = itemCopy2.lemma elif tripe[2] == item2.lemma: # print(item2.lemma) if item2.ID < itemCopy2.ID: tripe[ 2] = item2.lemma + "" + itemCopy2.lemma elif item2.ID > itemCopy2.ID: tripe[ 2] = itemCopy2.lemma + "" + item2.lemma else: tripe[2] = itemCopy2.lemma # print("12345") 
resultList.append(tripe) else: for tripe in allTripes: if tripe[1] != "": resultList.append(tripe) # if len(resultList) > 0: # return np.array(set([tuple(t) for t in resultList])) if item1 == None or item2 == None: return None if len(resultList) > 0: # return np.array(set([tuple(t) for t in resultList])) # print("输出结果1"+str(resultList)) return resultList def dsfnStartCOO2(self, sentence, item1, item2): nounRelatedWithPosition = ['主席', '总理', '教授', '校长'] resultList = [] itemCopy1 = item1 itemCopy2 = item2 """ 来解决ATT依赖的名词,如 李克强[ATT] <----- 总理[SBV] """ # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO(sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO(sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO(sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # print("44444444444") # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第一次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print("第二次") pred1 = None subForCoo = None for item in sentence.words: if item.postag == "v" and item.dependency == "COO": pred1 = item.head_word for word in sentence.words: if word.dependency == "SBV" and word.head_word.ID == pred1.ID: for phrase in sentence.words: if phrase.dependency == "SBV" and phrase.head_word.ID == item.ID: subForCoo = phrase if subForCoo == None or ( subForCoo != None and subForCoo.ID == word.ID): # 处理动词COO的情况,必须要保证此并列动词没有额外主语。 # 考虑到:习近平主席视察厦门,李克强总理访问香港 word.head_word = item # print(sentence.to_string()) # print(item1.lemma) # print(item2.lemma) allTripes = self.dsfn1_2_3_4COO( sentence, item1, item2) if len(allTripes) == 0: # print("11111111") allTripes = self.dsfn5COO( sentence, item1, item2) if allTripes == None or len(allTripes) == 0: # print("2222222") allTripes = self.dsfn6COO( sentence, item1, item2) if allTripes == None or len( allTripes) == 0: # print("3333333") allTripes = self.dsfn5and6COO( sentence, item1, item2) # if allTripes == None or len(allTripes) == 0: # allTripes = self.dsfnAttCOO(sentence,item1,item2) # print("第二次"+str(allTripes)) if allTripes != None and len(allTripes) != 0: for tripe in allTripes: resultList.append(tripe) # print(np.array(set([tuple(t) for t in resultList]))) return resultList def dsfnConstraints1(self, rawSentence, maxLength): """ :param rawSentence: 原句子 :param maxLength: 句子的最大长度 :return: 小于maxLength的长度 """ newSentence = [] if len(rawSentence) <= maxLength: newSentence.append(rawSentence) return newSentence else: newSentence = self.splitSentenceByComma(rawSentence) return newSentence def dsfnConstraints2(self, sentence, item1, item2, allEntities): countEntity = 0 countChar = 0 for index in range(item1.ID + 1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if word.lemma in allEntities: countEntity += 1 # print(countEntity) # print(countChar) if countEntity > 3: return False elif countChar > 12: # print(countChar) return False else: return True def dsfnConstraints3(self, sentence, item1, item2): countChar = 0 for index in range(item1.ID + 1, item2.ID): word = sentence.get_word_by_id(index) countChar += len(word.lemma) if countChar > 5: return False else: return True def getSPO(self, sentence): all_result = [] raw_sentence = [] RawSentence = sentence lemmas = self.segment(sentence) words = 
self.postag(lemmas) words_netag = self.netag(words) sentence = self.parse(words_netag) # print(sentence.to_string()) for itemWord in sentence.words: #来找到一个动词,这个动词要么是一句话的HED,要么与一句话的HED是COO的依存关系 if (itemWord.head_word == None and itemWord.postag == "v" ) or (itemWord.postag == "v" and itemWord.dependency == "COO" and itemWord.head_word.head_word == None)\ or (itemWord.postag == "v") : relation_verb = itemWord #将找到的这个动词,作为relation_verb relationString = relation_verb.lemma # print(relationString) if itemWord.head_word == None: # print("1") verbId = itemWord.ID #关系动词的ID verbId2 = None elif itemWord.head_word.head_word == None: # print("2") verbId = itemWord.ID #该关系动词的ID if itemWord.dependency == "COO" or self.get_entity_num_between( itemWord, itemWord.head_word, sentence) == 0: verbId2 = itemWord.head_word.ID # 这句话的HED,用来找SUB else: verbId2 = None else: # print("3") verbId = itemWord.ID #该关系动词的ID verbId2 = None O_dict = dict() #存储所有的Object S_dict = dict() #存储所有的Subject verb_dict = dict() #存储所有的verb,主要考虑的情况为:习近平主席在北京大学发表演讲 OBJ = None SUB = None DSFN3 = dict() for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId: #寻找这个动词的主语 # if SUB == None or SUB.lemma != entity: SUB = item #找到主语 S_dict[SUB.ID] = SUB.lemma #将主语加入到字典中 if (item.dependency == "VOB" and item.head_word.ID == verbId and item.postag != "v"): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID == verbId): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma relationString = relation_verb.lemma + "" + item.head_word.lemma verb_dict[OBJ.ID] = relationString if (item.dependency == "POB" and (item.head_word.postag == "p" or item.head_word.postag == 'd')\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID == verbId \ and item.postag!='v'): # 找到这个动词的宾语,其中包括:直接宾语,介词宾语(该宾语依赖POB---->介词(词性为p)--ADV or CMP-->动词) OBJ = item O_dict[OBJ.ID] = OBJ.lemma verbObj = None DSFN3[OBJ.ID] = True objectDict = dict() relationString = relation_verb.lemma for eachWord in sentence.words: if eachWord.dependency == "VOB" and eachWord.head_word.ID == relation_verb.ID: # relationString = relation_verb.lemma + "" + eachWord.lemma verbObj = eachWord objectDict[verbObj.ID] = verbObj if verbObj != None: for word in sentence.words: if word.head_word != None and word.dependency == "ATT" and word.head_word.ID == verbObj.ID: objectDict[word.ID] = word objectDict = sorted(objectDict.items(), key=lambda item: item[0]) objectStr = "" for objectItem in objectDict: objectStr += objectItem[1].lemma relationString = relation_verb.lemma + "" + objectStr else: for eachWord in sentence.words: if eachWord.dependency == "POB" and eachWord.head_word.dependency == "CMP" and\ eachWord.head_word.head_word.ID == relation_verb.ID: relationString = relation_verb.lemma + "" + eachWord.head_word.lemma + "" + eachWord.lemma verb_dict[OBJ.ID] = relationString if SUB == None: #如果没找到主语,那么就找与该动词并列的verbId2的主语 for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verbId2: # if SUB == None or SUB.lemma != entity: SUB = item S_dict[SUB.ID] = SUB.lemma # print(verbId2) if OBJ == None: verb_coo = None for item in sentence.words: if item.dependency == "COO" and item.head_word.ID == verbId and item.ID > verbId: verb_coo = 
item break flag = True if verb_coo != None and self.get_entity_num_between( relation_verb, verb_coo, sentence) == 0: for item in sentence.words: if item.dependency == "SBV" and item.head_word.ID == verb_coo.ID: flag = False if flag != False: for item in sentence.words: if (item.dependency == "VOB" and item.head_word.ID == verb_coo.ID)\ or (item.dependency == "POB" and item.head_word.postag == "p" and item.head_word.dependency == "CMP" and item.head_word.head_word.ID== verb_coo.ID) or (item.dependency == "POB" and item.head_word.postag == "p"\ and item.head_word.dependency == "ADV" and item.head_word.head_word.ID== verb_coo.ID): OBJ = item O_dict[OBJ.ID] = OBJ.lemma # print(S_dict) # print(verb_dict) # print(O_dict) SUB_COO = None OBJ_COO = None for item in sentence.words: if item.head_word != None: if SUB != None and item.dependency == "COO" and item.head_word.ID in S_dict: #获得主语的COO SUB_COO = item S_dict[SUB_COO.ID] = SUB_COO.lemma if item.head_word != None and OBJ != None: if item.dependency == "COO" and item.head_word.ID in O_dict: #获得宾语的COO OBJ_COO = item O_dict[OBJ_COO.ID] = OBJ_COO.lemma S_new = [] for sub in S_dict: # if sentence.get_word_by_id(sub).postag == 'r': # continue S_dict2 = dict() # 存放主语ATT的列表 S_dict2[sub] = S_dict[sub] flag = True while flag == True: len1 = len(S_dict2) for item in sentence.words: if item.head_word != None: SUBList = S_dict2.keys() if item.head_word.ID in SUBList and ( item.dependency == "ATT" or item.dependency == "ADV"): SUBATT = item S_dict2[SUBATT.ID] = SUBATT.lemma if len(S_dict2) != len1: flag = True else: flag = False S_dict2 = sorted(S_dict2.items(), key=lambda item: item[0]) Subject = "" for i in S_dict2: Subject += i[1] S_new.append(Subject) O_new = [] V_new = [] for obj in O_dict: # if sentence.get_word_by_id(obj).postag == 'r': # continue O_dict2 = dict() # 存放宾语ATT的列表 O_dict2[obj] = O_dict[obj] if verb_dict != None: if obj in verb_dict: relationString2 = verb_dict[obj] else: relationString2 = relation_verb.lemma else: relationString2 = relation_verb.lemma V_new.append(relationString2) flag = True while flag == True: len2 = len(O_dict2) for item in sentence.words: if item.head_word != None: OBJList = O_dict2.keys() if item.head_word.ID in OBJList and ( item.dependency == "ADV" or item.dependency == "ATT" or item.dependency == "VOB" or (item.dependency == "COO" and item.head_word.ID != obj)): if item.dependency == "ATT" and item.postag == "v": if self.get_entity_num_between( item, sentence.get_word_by_id(obj), sentence) > 0: continue else: OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma else: OBJATT = item O_dict2[OBJATT.ID] = OBJATT.lemma # print(OBJATT.lemma) if len(O_dict2) != len2: flag = True else: flag = False #一直循环,直到找不到新的修饰词 O_dict2 = sorted(O_dict2.items(), key=lambda item: item[0]) Object = "" for i in O_dict2: Object += i[1] flag = False # if obj in DSFN3: # for location in self.location_entity: # if location in Object : # flag = True # if flag == True: # O_new.append(Object) # if flag == False: # O_new.append("") # else: O_new.append(Object) # print(O_dict) # print(O_new) for sub in S_new: for i in range(0, len(O_new)): obj = O_new[i] relationWord = V_new[i] if obj != "": # print(RawSentence) # print((sub, relationWord, obj)) all_result.append([sub, relationWord, obj]) raw_sentence.append(RawSentence) return all_result, raw_sentence def hasEntity(self, word, allEntity): for entity in allEntity: if entity in word: # print(entity) return True return False def PostProcessSPO(self, rawSentence, allTripes, allEntity): output_list = [] 
for i in range(0, len(allTripes)): tripe = allTripes[i] sub = tripe[0] obj = tripe[2] # print(sub) # print(obj) if self.hasEntity(sub, allEntity) and self.hasEntity( obj, allEntity): output_list.append(tripe) return output_list
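# A hedged usage sketch (not part of the original code): DSFN is meant to be instantiated once at
# module level (dsfnStart above refers to a global `dsfn`); getSPO() then runs the full
# segment -> postag -> netag -> parse pipeline and extracts (subject, relation, object) triples.
dsfn = DSFN()
triples, raw_sentences = dsfn.getSPO("习近平主席在北京大学发表演讲")
for triple, raw in zip(triples, raw_sentences):
    print(raw, triple)
dsfn.close()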
class LtpParser(): def __init__(self): LTP_DIR = "./ltp_data" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) '''ltp基本操作''' def basic_parser(self, words): postags = list(self.postagger.postag(words)) netags = self.recognizer.recognize(words, postags) return postags, netags '''ltp获取词性''' def get_postag(self, words): return list(self.postagger.postag(words)) '''基于实体识别结果,整理输出实体列表''' def format_entity(self, words, netags, postags): name_entity_dist = {} name_entity_list = [] place_entity_list = [] organization_entity_list = [] ntag_E_Nh = "" ntag_E_Ni = "" ntag_E_Ns = "" index = 0 for item in zip(words, netags): word = item[0] ntag = item[1] if ntag[0] != "O": if ntag[0] == "S": if ntag[-2:] == "Nh": name_entity_list.append(word+'_%s ' % index) elif ntag[-2:] == "Ni": organization_entity_list.append(word+'_%s ' % index) else: place_entity_list.append(word + '_%s ' % index) elif ntag[0] == "B": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index elif ntag[0] == "I": if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index else: if ntag[-2:] == "Nh": ntag_E_Nh = ntag_E_Nh + word + '_%s ' % index name_entity_list.append(ntag_E_Nh) ntag_E_Nh = "" elif ntag[-2:] == "Ni": ntag_E_Ni = ntag_E_Ni + word + '_%s ' % index organization_entity_list.append(ntag_E_Ni) ntag_E_Ni = "" else: ntag_E_Ns = ntag_E_Ns + word + '_%s ' % index place_entity_list.append(ntag_E_Ns) ntag_E_Ns = "" index += 1 name_entity_dist['nhs'] = self.modify_entity(name_entity_list, words, postags, 'nh') name_entity_dist['nis'] = self.modify_entity(organization_entity_list, words, postags, 'ni') name_entity_dist['nss'] = self.modify_entity(place_entity_list,words, postags, 'ns') return name_entity_dist '''entity修正,为rebuild_wordspostags做准备''' def modify_entity(self, entity_list, words, postags, tag): entity_modify = [] if entity_list: for entity in entity_list: entity_dict = {} subs = entity.split(' ')[:-1] start_index = subs[0].split('_')[1] end_index = subs[-1].split('_')[1] entity_dict['stat_index'] = start_index entity_dict['end_index'] = end_index if start_index == entity_dict['end_index']: consist = [words[int(start_index)] + '/' + postags[int(start_index)]] else: consist = [words[index] + '/' + postags[index] for index in range(int(start_index), int(end_index)+1)] entity_dict['consist'] = consist entity_dict['name'] = ''.join(tmp.split('_')[0] for tmp in subs) + '/' + tag entity_modify.append(entity_dict) return entity_modify '''基于命名实体识别,修正words,postags''' def rebuild_wordspostags(self, name_entity_dist, words, postags): pre = ' '.join([item[0] + '/' + item[1] for item in zip(words, postags)]) post = pre for et, infos in name_entity_dist.items(): if infos: for info in infos: post = post.replace(' '.join(info['consist']), info['name']) post = [word for word in post.split(' ') if len(word.split('/')) == 2 and word.split('/')[0]] words = [tmp.split('/')[0] for tmp in post] postags = [tmp.split('/')[1] for tmp in post] return words, postags '''依存关系格式化''' def 
syntax_parser(self, words, postags): arcs = self.parser.parse(words, postags) words = ['Root'] + words postags = ['w'] + postags tuples = list() for index in range(len(words)-1): arc_index = arcs[index].head arc_relation = arcs[index].relation tuples.append([index+1, words[index+1], postags[index+1], words[arc_index], postags[arc_index], arc_index, arc_relation]) return tuples '''为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, tuples): child_dict_list = list() for index, word in enumerate(words): child_dict = dict() for arc in tuples: if arc[3] == word: if arc[-1] in child_dict: child_dict[arc[-1]].append(arc) else: child_dict[arc[-1]] = [] child_dict[arc[-1]].append(arc) child_dict_list.append([word, postags[index], index, child_dict]) return child_dict_list '''parser主函数''' def parser_main(self, words, postags): tuples = self.syntax_parser(words, postags) child_dict_list = self.build_parse_child_dict(words, postags, tuples) return tuples, child_dict_list '''基础语言分析''' def basic_process(self, sentence): words = list(self.segmentor.segment(sentence)) postags, netags = self.basic_parser(words) name_entity_dist = self.format_entity(words, netags, postags) words, postags = self.rebuild_wordspostags(name_entity_dist, words, postags) return words, postags
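# A hedged usage sketch (not part of the original class): basic_process() merges NER spans back into
# the words/postags lists, and parser_main() returns the dependency tuples plus a per-word dict of
# syntactic children; it assumes the LTP models sit in ./ltp_data as in __init__ above.
ltp = LtpParser()
words, postags = ltp.basic_process('李克强总理今天访问北京大学。')
tuples, child_dict_list = ltp.parser_main(words, postags)
print(words)
print(tuples)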
class Model: def __init__(self): self.name_says = defaultdict( list) #定义成全局变量有可能从sentence_process()中写入,也可能从single_sentence()写入 self.model = Word2Vec.load(path) self.word_total_count = self.model.corpus_total_words self.word_dict = self.model.wv.vocab self.dim = 256 self.postagger = Postagger() # 初始化实例 self.postagger.load(pos_model_path) # 加载模型 self.say_sim = [ '诊断', '交代', '说', '说道', '指出', '报道', '报道说', '称', '警告', '所说', '告诉', '声称', '表示', '时说', '地说', '却说', '问道', '写道', '答道', '感叹', '谈到', '说出', '认为', '提到', '强调', '宣称', '表明', '明确指出', '所言', '所述', '所称', '所指', '常说', '断言', '名言', '告知', '询问', '知道', '得知', '质问', '问', '告诫', '坚称', '辩称', '否认', '还称', '指责', '透露', '坦言', '表达', '中说', '中称', '他称', '地问', '地称', '地用', '地指', '脱口而出', '一脸', '直说', '说好', '反问', '责怪', '放过', '慨叹', '问起', '喊道', '写到', '如是说', '何况', '答', '叹道', '岂能', '感慨', '叹', '赞叹', '叹息', '自叹', '自言', '谈及', '谈起', '谈论', '特别强调', '提及', '坦白', '相信', '看来', '觉得', '并不认为', '确信', '提过', '引用', '详细描述', '详述', '重申', '阐述', '阐释', '承认', '说明', '证实', '揭示', '自述', '直言', '深信', '断定', '获知', '知悉', '得悉', '透漏', '追问', '明白', '知晓', '发觉', '察觉到', '察觉', '怒斥', '斥责', '痛斥', '指摘', '回答', '请问', '坚信', '一再强调', '矢口否认', '反指', '坦承', '指证', '供称', '驳斥', '反驳', '指控', '澄清', '谴责', '批评', '抨击', '严厉批评', '诋毁', '责难', '忍不住', '大骂', '痛骂', '问及', '阐明' ] self.valid_sentence = [] self.parser = Parser() self.parser.load(par_model_path) self.segmentor = Segmentor() self.segmentor.load(cws_model_path) self.recognizer = NamedEntityRecognizer() self.recognizer.load(ner_model_path) # @functools.lru_cache() # @fn_timer def get_count(self, word): """ O(1) """ # word_count = 0 #定义默认值 vector = np.zeros(1) #定义默认值 if word in self.word_dict: wf = self.word_dict[word].count wv = self.model.wv[word] else: wf = 1 wv = np.zeros(self.dim) return wf / self.word_total_count, wv # keys = self.model.wv.vocab.keys() # 获取词频及词向量 # total_words_count = sum([v.count for k,v in self.model.wv.vocab.items()]) #单词总数 # if word in keys: # word_count = self.model.wv.vocab[word].count # vector = self.model.wv[word] # 单词词语数量 # word_frequency=word_count/total_words_count # 词频 # return word_frequency,vector #获取句子向量 #TODO: 计算P(w)的过程可以优化 def sentence_embedding(self, sentence): # 按照论文算法Vs=1/|s|*∑a/(a+p(w))*Vw sentences = self.process_content(sentence).replace(' ', '') a = 1e-3 #0.001 # words = list(self.pyltp_cut(sentences)) # sentence_length = len(words) #句子长度 # sum_vector = sum([a/(a+float(self.get_count(w)[0]))*self.get_count(w)[1] for w in words]) words = self.pyltp_cut(sentences) sum_vector = np.zeros(self.dim) for i, w in enumerate(words): wf, wv = self.get_count(w) sum_vector += a / (a + wf) * wv # sentence_vector = sum_vector/sentence_length return sum_vector / (i + 1) # 欧式距离 def euclidSimilar(self, inA, inB): return 1.0 / (1.0 + la.norm(inA - inB)) # 皮尔逊相关系数 def pearsonSimilar(self, inA, inB): if len(inA) != len(inB): return 0.0 if len(inA) < 3: return 1.0 return 0.5 + 0.5 * np.corrcoef(inA, inB, rowvar=0)[0][1] # 余弦相似度 def cosSimilar(self, inA, inB): inA = np.mat(inA) inB = np.mat(inB) num = float(inA * inB.T) denom = la.norm(inA) * la.norm(inB) return 0.5 + 0.5 * (num / denom) # 句子依存分析 def parsing(self, sentence): words = self.pyltp_cut(sentence) # pyltp分词 # words=list(jieba.cut(sentence)) #结巴分词 postags = self.postagger.postag(words) # 词性标注 # tmp=[str(k+1)+'-'+v for k,v in enumerate(words)] # print('\t'.join(tmp)) # parser = Parser() # 初始化实例 # parser.load(par_model_path) # 加载模型 arcs = self.parser.parse(words, postags) # 句法分析 # parser.release() # 释放模型 return arcs # 命名实体 # @functools.lru_cache() def get_name_entity(self, strs): sentence = ''.join(strs) # 
recognizer = NamedEntityRecognizer() # 初始化实例 # recognizer.load(ner_model_path) # 加载模型 # words = list(jieba.cut(sentence)) # 结巴分词 words = self.pyltp_cut(sentence) #pyltp分词更合理 postags = self.postagger.postag(words) # 词性标注 netags = self.recognizer.recognize(words, postags) # 命名实体识别 # tmp=[str(k+1)+'-'+v for k,v in enumerate(netags)] # print('\t'.join(tmp)) # recognizer.release() # 释放模型 return netags # 输入单个段落句子数组 def valid_sentences_(self, sentences, res): expect = 0.76 tmp = "" # 储存前一个言论 while sentences: curr = sentences.pop(0) if curr[0] == '“': # 当前句子或为 “言论在发言人前的直接引用”。 print(curr) people = re.search('”(.+)“|”(.+)', curr) # 提取发言人所在句段 if people: people = [i for i in people.groups() if i][0] elif res: res[-1][1] += '。' + curr continue else: continue saying = curr.replace(people, '') # 剩余部分被假设为“言论” if res and self.judge_pronoun(people): res[-1][1] += '。' + saying else: comb = self.single_sentence(people) if comb: saying += comb[1] if comb[1] else '' res.append([comb[0], saying]) continue # 尝试提取新闻 发言人,言论内容 combi = self.single_sentence(curr) # 无发言人: 当前句子属于上一个发言人的言论 或 不属于言论 if not combi: if res and tmp and self.compare_sentence( tmp, curr) > expect: #基于句子相似度判断 print('{} - {} : {}'.format( tmp, curr, self.compare_sentence(tmp, curr))) res[-1][1] += '。' + curr tmp = curr continue # 有发言人: 提取 发言人 和 言论。 name, saying = combi if res and self.judge_pronoun(curr) and saying: res[-1][1] += '。' + saying elif saying: res.append([name, saying]) tmp = saying return res # 输入单个段落句子数组(deprecated) #TODO: deprecated #def valid_sentences(self, sentences): # expect = 0.75 #近似语句期望系数,本人根据测试估算值 # # n_s=defaultdict(list) #用于返回人物:言论 # first = '' #第一个句子 # if len(sentences) == 1: # if self.single_sentence(sentences[0]): # self.name_says[self.single_sentence(sentences[0])[0]].append(self.single_sentence(sentences[0])[1]) # return self.name_says # while sentences: # if len(sentences) == 1: # second = sentences.pop(0) # 第二个句子 # else: # first = first + ',' + sentences.pop(0) # 第一个句子与上一个叠加 # second = sentences.pop(0) # 第二个句子 # if self.compare_sentence(first, second) > expect or (self.judge_pronoun(second) and self.single_sentence(second)) or (re.findall(r'^“(.+?)$”', second) and self.single_sentence(first)): #语句近似或者second为代词表达的句子 # first = first+','+second # elif self.single_sentence(second) and self.single_sentence(first): # self.name_says[self.single_sentence(first)[0]].append(self.single_sentence(first)[1]) #将第一个语句到此,解析后存入字典中 # first=second #第二语句赋值到第一语句 # else: # first = first + ',' + second # if self.single_sentence(first):#while循环后遗留的first句子 # self.name_says[self.single_sentence(first)[0]].append(self.single_sentence(first)[1]) # return self.name_says # 输入一个句子,若为包含‘说’或近似词则提取人物、言论,否则返回空 # just_name:仅进行返回名字操作 ws:整句分析不进行多个“说判断” @functools.lru_cache() def single_sentence(self, sentence, just_name=False, ws=False): sentence = ','.join([x for x in sentence.split(',') if x]) cuts = list(self.pyltp_cut(sentence)) # pyltp分词更合理 # mixed = list(set(self.pyltp_cut(sentence)) & set(self.say_sim)) # mixed.sort(key=cuts.index) # if not mixed: return False # 判断是否有‘说’相关词: mixed = [word for word in cuts if word in self.say_sim] if not mixed: return False ne = self.get_name_entity(tuple(sentence)) #命名实体 wp = self.parsing(sentence) #依存分析 wp_relation = [w.relation for w in wp] postags = list(self.postagger.postag(cuts)) name = '' stack = [] for k, v in enumerate(wp): # save the most recent Noun if postags[k] in ['nh', 'ni', 'ns']: stack.append(cuts[k]) if v.relation == 'SBV' and (cuts[v.head - 1] in mixed): #确定第一个主谓句 name = self.get_name(cuts[k], 
cuts[v.head - 1], cuts, wp_relation, ne) if just_name == True: return name #仅返回名字 says = self.get_says(cuts, wp_relation, [i.head for i in wp], v.head) if not says: quotations = re.findall(r'“(.+?)”', sentence) if quotations: says = quotations[-1] return name, says # 若找到‘:’后面必定为言论。 if cuts[k] == ':': name = stack.pop() says = ''.join(cuts[k + 1:]) return name, says return False # 输入主语第一个词语、谓语、词语数组、词性数组,查找完整主语 def get_name(self, name, predic, words, property, ne): index = words.index(name) cut_property = property[index + 1:] #截取到name后第一个词语 pre = words[:index] #前半部分 pos = words[index + 1:] #后半部分 #向前拼接主语的定语 while pre: w = pre.pop(-1) w_index = words.index(w) if property[w_index] == 'ADV': continue if property[w_index] in ['WP', 'ATT', 'SVB'] and (w not in [ ',', '。', '、', ')', '(' ]): name = w + name else: pre = False while pos: w = pos.pop(0) p = cut_property.pop(0) if p in ['WP', 'LAD', 'COO', 'RAD'] and w != predic and (w not in [ ',', '。', '、', ')', '(' ]): name = name + w # 向后拼接 else: #中断拼接直接返回 return name return name # 获取谓语之后的言论 def get_says(self, sentence, property, heads, pos): # word = sentence.pop(0) #谓语 if ':' in sentence: return ''.join(sentence[sentence.index(':') + 1:]) while pos < len(sentence): w = sentence[pos] p = property[pos] h = heads[pos] # 谓语尚未结束 if p in ['DBL', 'CMP', 'RAD']: pos += 1 continue # 定语 if p == 'ATT' and property[h - 1] != 'SBV': pos = h continue # 宾语 if p == 'VOB': pos += 1 continue # if p in ['ATT', 'VOB', 'DBL', 'CMP']: # 遇到此性质代表谓语未结束,continue # continue else: if w == ',': return ''.join(sentence[pos + 1:]) else: return ''.join(sentence[pos:]) #解析处理语句并返回给接口 def sentence_process(self, sentence): # 文章 -->清除空行 # 文章 -->句号分割:如果句号分割A.B, 若B存在‘说’,对B独立解析,否则判断A | B是否相似,确定A是否抛弃B句。 # 句子 -->确定主谓宾: 依存分析、命名实体识别 -->首先要找到宾语,然后确定宾语是否与说近似,若存在多个与‘说’近似,确定第一个为陈述。在说前找命名实体,说后面到本句结尾为宾语 # 命名实体 -->通过命名实体识别,若S - NE, NE = S - NE。若B - NE / I - NE / E - NE,NE = B - NE + I - NE + E - NE self.name_says = defaultdict(list) sentence = sentence.replace('\r\n', '\n') sections = sentence.split('\n') #首先切割成段落 sections = [s for s in sections if s.strip()] valids = '' res = [] for sec in sections: #段落 # sec = sec.replace('。”', '”。') #当做纠正语法错误... 
# sentence_list = sec.split('。') # 段落拆分成句子 sentence_list = split(sec) sentence_list = [s.strip() for s in sentence_list if s.strip()] self.cut_sententce_for_name = [s for s in sentence_list if s] # valids = self.valid_sentences(sentence_list) res += self.valid_sentences_(sentence_list, []) # print(valids) # print("*****************") # print(self.cut_sententce_for_name) # print(self.valid_sentences) # print(self.valid_sentence) # print("%%%%%%%%%%%%%") if res: self.name_says = defaultdict() for name, saying in res: if name and saying: self.name_says[name] = self.name_says.get( name, '') + saying + ' | ' return self.name_says # 判断是否为代词结构句子“他认为...,他表示....” #@fn_timer def judge_pronoun(self, sentence): subsentence = re.search('(.+)“|”(.+)', sentence) if subsentence: sentence = subsentence.group(1) cuts = list(self.pyltp_cut(sentence)) # 确定分词 wp = self.parsing(sentence) # 依存分析 postags = list(self.postagger.postag(cuts)) for k, v in enumerate(wp): if v.relation == 'SBV' and postags[k] == 'r': # 确定第一个主谓句 return True return False # #获取人物及人物观点中的命名实体 # def get_name_saywords(self,content): # name_says=self.sentence_process(content) # result=[] # says_list=[] # if name_says: # for name,says in name_says.items(): # print(name) # print(says) # says_str = ''.join([''.join(s) for s in says]) # # name,says=name_says[0],name_says[1] # name_entity=self.get_name_entity(tuple(says_str)) # name_entity=' '.join(name_entity) # result.append((name,name_entity)) # else: # return None #获取整个新闻文章中的命名实体 #TODO: This function hasn't been used. # def get_news_ne(self,sentence): # self.name_says = defaultdict(list) # sections=sentence.split('\r\n') #首先切割成段落 # sections = [s for s in sections if s.strip()] # ne_list = [] # for sec in sections: #段落 # words = list(self.pyltp_cut(sentence)) # nes = self.get_name_entity(tuple(sec)) # for k, v in enumerate(nes): # if v != 'O': # ne_list.append(words[k]) # ne_list=list(set(ne_list)) # return ' '.join(ne_list) # #获取文章中关键词 # def get_news_keywords(self,news,totalnews): # print(news) # print("*******************") # print(totalnews) #句子比对皮尔逊系数 def compare_sentence(self, inA, inB): inC = self.sentence_embedding(inA) inD = self.sentence_embedding(inB) return self.pearsonSimilar(inC, inD) #皮尔逊 # print(self.euclidSimilar(inC,inD)) # print(self.pearsonSimilar(inC,inD)) # print(self.cosSimilar(inC,inD)) # print('------------------------') #pyltp中文分词 #@fn_timer # @functools.lru_cache() def pyltp_cut(self, sentence): # segmentor = Segmentor() # 初始化实例 # segmentor.load(cws_model_path) # 加载模型 words = self.segmentor.segment(sentence) # 分 # segmentor.release() # 释放模型 return words #结巴词性标注 def jieba_pseg(self, sentence): return pseg.cut(sentence) #结巴与哈理工词性标注比较 #TODO: function hasn't been used # def jieba_compare_pyltp(self,sentence): # sentence = sentence.replace('\r\n', '\n') # sections = sentence.split('\n') # 首先切割成段落 # sections = [s for s in sections if s.strip()] # for sec in sections: # 段落 # sentence_list = sec.split('。') # 段落拆分成句子 # sentence_list = [s for s in sentence_list if s] # for sl in sentence_list: # jieba_cut = list(jieba.cut(sl)) # jieba_pseg = list(self.jieba_pseg(sl)) # print("pyltp 分词:") # pyltp=list(self.pyltp_cut(sl)) #pyltp分词 # print(pyltp) # print("结巴分词:") # print(jieba_cut) # print("pyltp词性标注:") # pyltp_pseg=list(self.postagger.postag(jieba_cut)) # print(pyltp_pseg) # print("结巴词性标注:") # print(jieba_pseg) # parsed=[(x.head,x.relation) for x in list(self.parsing(sl))] # print(parsed) def document_frequency(self, word, document): if sum(1 for n in document if word in n) == 0: 
print(word) print(type(document)) print(len(document)) print(document[0]) return sum(1 for n in document if word in n) def idf(self, word, content, document): """Gets the inversed document frequency""" return math.log10( len(content) / self.document_frequency(word, document)) def tf(self, word, document): """ Gets the term frequemcy of a @word in a @document. """ words = document.split() return sum(1 for w in words if w == word) #TODO: The function hasn't been used # def get_keywords_of_a_ducment(self,content,document): # content=self.process_content(content) # documents=[self.process_content(x) for x in document] # words = set(content.split()) # tfidf = [(w, self.tf(w,content) * self.idf(w,content,documents)) for w in words] # tfidf = sorted(tfidf, key=lambda x: x[1], reverse=True) # tfidf=' '.join([w for w,t in tfidf[:5]]) #取前5为关键词 # return tfidf def process_content(self, content): # print(type(content)) # content=''.join(content) content = re.sub('[+——() ? 【】“”!,:。?、~@#¥%……&*()《 》]+', '', content) content = ' '.join(jieba.cut(content)) return content def release_all(self): self.segmentor.release() self.recognizer.release() self.parser.release() self.postagger.release()
class NERTagger(object):
    def __init__(self, model_dir_path, com_blacklist):
        # paths of the model files
        self.model_dir_path = model_dir_path
        self.cws_model_path = os.path.join(self.model_dir_path, 'cws.model')  # word segmentation model, file name `cws.model`
        self.pos_model_path = os.path.join(self.model_dir_path, 'pos.model')  # POS tagging model, file name `pos.model`
        self.ner_model_path = os.path.join(self.model_dir_path, 'ner.model')  # named entity recognition model, file name `ner.model`
        # initialize the segmentation model
        self.segmentor = Segmentor()
        self.segmentor.load(self.cws_model_path)
        # initialize the POS tagging model
        self.postagger = Postagger()
        self.postagger.load(self.pos_model_path)
        # initialize the NER model
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(self.ner_model_path)
        # initialize the company-name blacklist
        self.com_blacklist = set()
        with open(com_blacklist, 'r', encoding='UTF-8') as f_com_blacklist:
            for line in f_com_blacklist:
                if len(line.strip()) > 0:
                    self.com_blacklist.add(line.strip())

    def ner(self, text, entity_dict):
        words = self.segmentor.segment(text)  # word segmentation
        post_tags = self.postagger.postag(words)
        ner_tags = self.recognizer.recognize(words, post_tags)  # named entity recognition
        entity_list = []
        entity = ""
        for word, post_tag, ner_tag in zip(words, post_tags, ner_tags):
            tag = ner_tag[0]
            entity_type = ner_tag[2:]
            if tag == 'S':  # a single word forms an entity
                entity_list.append((word, entity_type))
            elif tag in 'BIE':  # beginning / inside / end of an entity
                entity += word
                if tag == 'E':
                    # check the company-name blacklist (loaded from the config file)
                    if entity in self.com_blacklist:
                        entity_list.append((entity, "n"))
                    else:
                        entity_list.append((entity, entity_type))
                    entity = ""
            elif tag == 'O':  # not part of a named entity
                if post_tag == 'nt':  # POS tag says organization
                    entity += word
                else:
                    if entity != "":  # the organization name missed by NER has been accumulated in `entity`
                        entity_list.append((entity, 'nt'))
                        entity = ""
                    # drop wrong number tags, e.g. "大宗"
                    if post_tag == 'm' and not re.match("[0-9]+.*", word):
                        post_tag = 'n'
                    # recognize percentages inside numbers
                    if post_tag == 'm' and re.match("[0-9.]+%", word):
                        post_tag = 'mp'
                    entity_list.append((word, post_tag))
        entity_list = self.ner_tag_by_dict(entity_dict, entity_list)  # pick up company abbreviations missed by NER (entity_dict may be empty)
        return NERTaggedText(text, entity_list)

    def ner_tag_by_dict(self, entity_dict, entity_list):
        i = 0
        while i < len(entity_list) - 1:
            has_entity = False
            for entity_len in range(4, 1, -1):
                segment = "".join([x[0] for x in entity_list[i:i + entity_len]])  # combine several adjacent words
                segment_uni = segment
                if segment_uni in entity_dict:  # check whether the combination is a known company abbreviation
                    has_entity = True
                    entity_list[i] = (segment, entity_dict[segment_uni])
                    del entity_list[i + 1:i + entity_len]
                    i = i + entity_len
                    break
            if not has_entity:
                i += 1
        return entity_list

    def __del__(self):
        self.segmentor.release()
        self.postagger.release()
        self.recognizer.release()
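# A hedged usage sketch (not from the original code): NERTaggedText and the model directory are
# assumed to exist elsewhere in the project; the paths below are hypothetical, and entity_dict maps
# company abbreviations to their tags and may simply be left empty.
tagger = NERTagger('./ltp_data', 'com_blacklist.txt')
tagged = tagger.ner('阿里巴巴今天发布了财报。', entity_dict={})
# `tagged` wraps the raw text plus the list of (token_or_entity, tag) pairs built in ner() above.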
cache = [] curid += 1 else: tmp = tmp[tmp.find("|||") + 4:] cache.append(tmp.split()) if cache: rs.append(" ".join([ "<qid_" + str(curid) + ">", "|||", getans(cache, frdt.readline(), mapd, postagger, recognizer) ])) cache = [] rs = "\n".join(rs) with open(rsf, "w") as fwrt: fwrt.write(rs.encode("utf-8")) if __name__ == "__main__": ltpdata = "/media/Storage/data/ltp_data/" postagger = Postagger() postagger.load(os.path.join(ltpdata, "pos.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(ltpdata, "ner.model")) if len(sys.argv) < 6: handle(sys.argv[1].decode("utf-8"), sys.argv[2].decode("utf-8"), sys.argv[3].decode("utf-8"), sys.argv[4].decode("utf-8")) else: handle(sys.argv[1].decode("utf-8"), sys.argv[2].decode("utf-8"), sys.argv[3].decode("utf-8"), sys.argv[4].decode("utf-8"), int(sys.argv[5].decode("utf-8")))
class Tools: def __init__(self): path = 'ltp_data_v3.4.0' self.par_model_path = os.path.join(path, 'parser.model') self.cws_model_path = os.path.join(path, 'cws.model') self.pos_model_path = os.path.join(path, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` self.ner_model_path = os.path.join(path, 'ner.model') self.srl_model_path = os.path.join(path, 'pisrl_win.model') self.recognizer = NamedEntityRecognizer() # 初始化实例 self.postagger = Postagger() # 初始化实例 self.segmentor = Segmentor() # 初始化实例 self.labeller = SementicRoleLabeller() # 初始化实例 self.parser = Parser() # 初始化实例 self.parser.load(self.par_model_path) # 加载模型 self.labeller.load(self.srl_model_path) # 加载模型 self.recognizer.load(self.ner_model_path) # 加载模型 self.postagger.load(self.pos_model_path) # 加载模型 self.segmentor.load(self.cws_model_path) # 加载模型 def __del__(self): self.parser.release() self.labeller.release() # self.recognizer.release() self.postagger.release() self.segmentor.release() def read_file_or_dir(self,path): if os.path.exists(path): pass else: print("路径不存在!") os._exit() if os.path.isdir(path): file_list = os.listdir(path) file_path_list = [path + "/" + file_name for file_name in file_list] return file_path_list else: try: with open(path,encoding="utf-8") as rd: content = rd.read() except UnicodeDecodeError: with open(path,encoding="gbk") as rd: content = rd.read() return content def nltk(self,txt):#传入单句 words = self.segmentor.segment(txt) postags = self.postagger.postag(words) arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) return list(words),list(postags),roles def deal_with_pos_str(self,pos_str): reg_pattern = "n+" reg_pattern_2 = "ncn" reg_pattern_9 = "an" reg_pattern_3 = "un" reg_pattern_4 = "v+" reg_pattern_5 = "rn" reg_pattern_8 = "vcv" reg_pattern_6 = "pbnv" reg_pattern_7 = "pnv" reg_pattern_10 = "av" pos_str = re.sub(reg_pattern,"n",pos_str) pos_str = re.sub(reg_pattern_2,"n",pos_str) pos_str = re.sub(reg_pattern_4,"v",pos_str) pos_str = re.sub(reg_pattern_5,"n",pos_str) pos_str = re.sub(reg_pattern_9,"n",pos_str) pos_str = re.sub(reg_pattern_3,"n",pos_str) pos_str = re.sub(reg_pattern_8,"v",pos_str) pos_str = re.sub(reg_pattern_6,"v",pos_str) pos_str = re.sub(reg_pattern_7,"v",pos_str) pos_str = re.sub(reg_pattern_10,"v",pos_str) return pos_str
def cal_sentiment_NER(df_text): """ natural language processing on every row from the input. 1. for loop dataframe: 2. preprocess text in the df. 3. get entity using pyLTP 4. get sentiment, keywords, summary using SnowNLP. 5. append result to df Keyword Arguments: df_text -- """ # 词性标注 pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` postagger = Postagger() # 初始化实例 postagger.load(pos_model_path) # 加载模型 # 命名实体识别 ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load(ner_model_path) # 加载模型 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) if isinstance(df_text, gftIO.GftTable): df_text = df_text.as_mutable_column_tab() df_result = pd.DataFrame(columns=[ 'datetime', 'people', 'geography', 'organization', 'keyword', 'summary', 'score' ]) for item in df_text[:10].iterrows(): # print(item[1]['Conclusion']) logging.info(item[0]) text = item[1]['Conclusion'] datetime = item[1]['WritingDate'] if not pd.isnull(text): text_split = preprocessing.preprocess_string(text) # 词性标注 # postagger = Postagger() # 初始化实例 words = text_split.split() # 分词结果 postags = postagger.postag(words) # 词性标注 netags = recognizer.recognize(words, postags) # 命名实体识别 dict_netags = defaultdict(list) ls_netags = list(zip(netags, words)) for x, y in ls_netags: dict_netags[x].append(y) s = SnowNLP(text) score = s.sentiments * 2 # # 人名(Nh)、地名(Ns)、机构名(Ni。) # # B、I、E、S ls_organization = [ dict_netags[x] for x in ['S-Ni', 'B-Ni', 'E-Ni', 'I-Ni'] ] ls_people = [ dict_netags[x] for x in ['S-Nh', 'B-Nh', 'E-Nh', 'I-Nh'] ] ls_geography = [ dict_netags[x] for x in ['S-Ns', 'B-Ns', 'E-Ns', 'I-Ns'] ] try: df_result = df_result.append( { 'datetime': datetime, 'keyword': ','.join(s.keywords()), 'organization': list(itertools.chain.from_iterable(ls_organization)), 'people': list(itertools.chain.from_iterable(ls_people)), 'geography': list(itertools.chain.from_iterable(ls_geography)), 'summary': ';'.join(s.summary()), 'score': score # 'text': text, }, ignore_index=True) except: continue return df_result
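The SnowNLP calls used in cal_sentiment_NER above can be tried in isolation; a small sketch on an illustrative sentence (the function above rescales s.sentiments by 2 to obtain its score).

from snownlp import SnowNLP

s = SnowNLP('这家公司的季度业绩超出预期,股价大幅上涨。')
print(s.sentiments)    # sentiment score in [0, 1]
print(s.keywords(3))   # top keywords
print(s.summary(1))    # one-sentence summary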
def simlify(text):
    LTP_DATA_DIR = r'E:\anaconda\ltpmoxin\ltp_data'  # path to the ltp model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model, file name `cws.model`
    lexicon_path = os.path.join(LTP_DATA_DIR, 'lexicon')      # custom segmentation lexicon
    segmentor = Segmentor()  # init instance
    # segmentor.load(cws_model_path)  # load the model without a custom lexicon
    segmentor.load_with_lexicon(cws_model_path, lexicon_path)  # load the model with the custom lexicon
    words = segmentor.segment(text)  # segmentation
    # print('|'.join(words))  # print the segmentation result

    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, file name `pos.model`
    postagger = Postagger()  # init instance
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(words)  # POS tagging; `words` is the segmented word list
    # print(' | '.join(postags))
    postagger.release()  # release the model

    par_model_path = os.path.join(
        LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`
    parser = Parser()  # init instance
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    parser.release()  # release the model

    # extract and present the result
    rely_id = [arc.head for arc in arcs]  # dependency head ids
    relation = [arc.relation for arc in arcs]  # dependency relations
    heads = ['Root' if id == 0 else words[id - 1] for id in rely_id]  # head word of each token
    # for i in range(len(words)):
    #     print(relation[i] + '(' + words[i] + ', ' + heads[i] + ')')
    array = []
    for i in range(len(words)):
        dict = {}
        dict["dep"] = words[i]
        dict["gov"] = heads[i]
        dict["pos"] = relation[i]
        array.append(dict)
    return array

    # NOTE: everything below is unreachable because it follows the return above;
    # the NER step is never executed and its result is never used.
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()  # init instance
    recognizer.load(ner_model_path)  # load the model
    netags = recognizer.recognize(words, postags)  # named entity recognition
    # for word, ntag in zip(words, netags):
    #     print(word + '/' + ntag)
    recognizer.release()  # release the model
class LtpHelper(Component): """A new component""" name = "ltp" provides = [] requires = [] defaults = {} language_list = None def __init__(self, component_config: Dict[Text, Any] = None): super(LtpHelper, self).__init__(component_config) self.path = component_config['path'] self.lexicon = component_config['lexicon'] self.dimension = component_config['dimension'] ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) MODELDIR = os.path.join(ROOTDIR, self.path) self.segmentor = Segmentor() self.segmentor.load_with_lexicon(os.path.join(MODELDIR, "cws.model"), self.lexicon) self.postagger = Postagger() self.postagger.load(os.path.join(MODELDIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(MODELDIR, 'parser.model')) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(MODELDIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(MODELDIR, "pisrl.model")) def extract_tokens(self, message: Message): tokens = list(self.segmentor.segment(message.text)) segments = [] start = 0 for idx, token in enumerate(tokens): end = start + len(token) segments.append({'start': start, 'end': end}) start = end message.set("segments", segments) message.set("tokens", tokens) def extract_poses(self, message: Message): if not message.get("tokens", default=None): self.extract_tokens(message) message.set("poses", list(self.postagger.postag(message.get("tokens")))) def extract_tagseq(self, message: Message): """ 实体抽取, 这部分需要扩张 :param message: :return: """ message.set( "tagseq", list( self.recognizer.recognize(message.get("tokens"), message.get("poses")))) def extract_parses(self, message: Message): message.set( "arcs", self.parser.parse(message.get("tokens"), message.get("poses"))) def extract_labels(self, message: Message): message.set( "labels", self.labeller.label(message.get("tokens"), message.get("poses"), message.get("arcs"))) def train(self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any) -> None: """Train this component. """ pass def extract_entities(self, message: Message): # step1. 序列标注 self.extract_tagseq(message) # step2. 
tokens, labels = message.get("tokens"), message.get("tagseq") i, start, end = 0, 0, 0 spans = [] while i < len(labels): if labels[i].startswith('E'): dim = labels[i].split('-')[1] # 实体->词条 value = "".join(tokens[start:i + 1]) # 句子开始 _start = get_start(start, tokens=tokens) # 句子结束 _end = get_start(i, tokens=tokens) + len(value) ent = { 'label': self.dimension[dim], 'start': _start, 'end': _end, } spans.append(ent) start = 0 elif labels[i].startswith('B'): start = i elif labels[i].startswith('S'): dim = labels[i].split('-')[1] value = "".join(tokens[i:i + 1]) _start = get_start(i, tokens=tokens) _end = _start + len(value) ent = { 'label': self.dimension[dim], 'start': _start, 'end': _end, } spans.append(ent) else: # O pass i += 1 message.set("spans", spans, add_to_output=True) def extract_pronouns(self, message: Message, **kwargs: Any): pronouns = [] tokens, poses = message.get("tokens"), message.get("poses") for i, (w, p) in enumerate(zip(tokens, poses)): if p == 'r' and legalPronouns(w): # 增加性别、单复数属性 start = get_start(i, tokens=tokens) end = start + len(w) pronouns.append({ 'start': start, 'end': end, 'label': "Pronoun" }) message.set("spans", message.get("spans", []) + pronouns, add_to_output=True) def entity_segment(self, message: 'Message', **kwargs: Any): # type: (List, List[Dict])->List """ 属性链接 :param tokens: [word, word, word] :param entities: [{'entity': 'A', 'body': 'word', 'start': 0, 'end': 1}, ...] :return: [word, word, word] """ entities = message.get("entities") tokens = message.get("tokens") if len(entities) == 0: return tokens else: # 求出tokens中所有词的starts和ends的坐标 lengths = [len(w) for w in tokens] pos = [0] for p in lengths: pos.append(p + pos[-1]) starts = pos[:-1] ends = pos[1:] # 标注长度和位置信息 i = 0 for e in entities: e['length'], e['index'] = e['end'] - e['start'], i i += 1 # 保证entities的start和end,在starts和ends里面,否则筛除 valid_entities = [ e for e in entities if (e['start'] in starts) and (e['end'] in ends) ] token_entities = [{ 'entity': w, 'body': w, 'start': start, 'end': end } for w, start, end in zip(tokens, starts, ends)] # 对entities按长度的降序排列,意味着如果位置相同,长词语优先保留 valid_entities.sort(key=lambda x: x['length'], reverse=True) valid_entities.extend(token_entities) valid_entities.sort(key=lambda x: x['start'], reverse=False) # 筛选实体,如有包含,较长的实体优先;如有交叉,先出现的实体优先;如完全相同,取第1个(意味着随机) p = 0 filtered_entities = [] for e in valid_entities: if e['start'] == p: filtered_entities.append(e) p = e['end'] # 改变token word_tokens = [ message.text[e['start']:e['end']] for e in filtered_entities ] # 记录词语的位置 entity_selected = {} i = 1 for e in filtered_entities: if 'length' in e: e.update({'index': i}) entity_selected.update({i: e}) i += 1 valid_pos = list(entity_selected.keys()) message.set("tokens", word_tokens) message.set("entity_selected", entity_selected) message.set("valid_pos", valid_pos) def link_analyze(self, message: Message, **kwargs: Any): tokens = message.get("tokens", []) postags = message.get("poses", []) arcs = message.get("arcs") arcs = [(arc.head, arc.relation) for arc in arcs] semantic = list( zip(list(range(1, len(tokens) + 1)), tokens, postags, arcs)) logging.debug('semantic structrue: {}'.format(semantic)) # 以下是特殊情况下的句法调整 # 第一种情况:记录动词“是”和“为”的位置 loc = [] for struc in semantic: if (struc[1] in ['是', '为']) and (struc[2] == 'v'): loc.append(struc[0]) for i in loc: pre_loc = 0 suf_loc = 0 for j in range(1, i): if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1] == 'SBV'): pre_loc = j for j in range(i + 1, min(len(semantic) + 1, i + 10)): # 最多间隔10个词语,对于宾语来说已经足够 if (semantic[j - 
1][3][0] == i) and (semantic[j - 1][3][1] == 'VOB'): suf_loc = j if pre_loc and suf_loc: semantic[pre_loc - 1] = (semantic[pre_loc - 1][0], semantic[pre_loc - 1][1], semantic[pre_loc - 1][2], (suf_loc, 'SEO')) # 第二种情况:此处是句法分析出错的情况,将实体识别成谓语成分SBV,词性为i loc = [] for struc in semantic: if struc[2] == 'i': loc.append(struc[0]) for i in loc: for j in range(1, i): if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1] == 'SBV'): semantic[j - 1] = (semantic[j - 1][0], semantic[j - 1][1], semantic[j - 1][2], (i, 'SEO')) # 第三种情况:记录动词“名叫”和“叫”的位置 loc = [] for struc in semantic: if (struc[1] in ['名叫', '叫', '叫做']) and (struc[2] == 'v'): loc.append(struc[0]) for i in loc: for j in range(i + 1, min(len(semantic) + 1, i + 10)): if (semantic[j - 1][3][0] == i) and (semantic[j - 1][3][1] == 'VOB'): semantic[j - 1] = (semantic[j - 1][0], semantic[j - 1][1], semantic[j - 1][2], (semantic[i - 1][3][0], 'SEO')) message.set('semantic', semantic, add_to_output=False) def process(self, message: Message, **kwargs: Any): """Process an incoming message. This is the components chance to process an incoming message. The component can rely on any context attribute to be present, that gets created by a call to :meth:`components.Component.pipeline_init` of ANY component and on any context attributes created by a call to :meth:`components.Component.process` of components previous to this one.""" # TODO 分词, 如果利用其它分词组件, 需要进一步调整 if not message.get("tokens", default=None): self.extract_tokens(message) # 词性标注 self.extract_poses(message) # 句法依存 self.extract_parses(message) # 抽取实体<序列标注+实体提取> self.extract_entities(message) # 抽取代词 self.extract_pronouns(message) else: # rasa tokenizers tokens = message.get("tokens") message.set("tokenizers", tokens) # List tokens tokens = [tokenizer_extract(token) for token in tokens] message.set("tokens", tokens) self.extract_poses(message) # 句法依存 self.extract_parses(message) # 抽取实体<序列标注+实体提取> # 语义分割 -> self.entity_segment(message) # 属性分析 -> self.link_analyze(message) @classmethod def load(cls, meta: Dict[Text, Any], model_dir: Optional[Text] = None, model_metadata: Optional[Metadata] = None, cached_component: Optional = None, **kwargs): return cls(meta)
if __name__ == '__main__': # testLine = '著名相声家成龙的师傅是马季。' while True: testLine = raw_input('请输入字符串:(-1退出)') namedEntityTagTupleList = [] segmentor = Segmentor() # segmentor.load(inout.getLTPPath(index.CWS)) segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt')) words = segmentor.segment(testLine) segmentor.release() postagger = Postagger() postagger.load(inout.getLTPPath(index.POS)) postags = postagger.postag(words) postagger.release() recognizer = NamedEntityRecognizer() recognizer.load(inout.getLTPPath(index.NER)) netags = recognizer.recognize(words, postags) recognizer.release() for word, netag in zip(words, netags): namedEntityTagTupleList.append((word, netag)) neTagList = '\t'.join(netags).split('\t') printEscapeStr(namedEntityTagTupleList) printEscapeStr(neTagList)
class LtpModel(object): """ 封装pyltp model 类,方便使用 """ @pysnooper.snoop() def __init__(self, LTP_DATA_DIR): """加载pyltp模型""" self.LTP_DATA_DIR = LTP_DATA_DIR # pyltp的存放路径 # 分词模型路径,分词模型名称是 'cws.model' cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model') self.segmentor = Segmentor() self.segmentor.load(cws_model_path) # 词性标注模型路径,分词模型名称是 'pos.model' pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model') self.postager = Postagger() self.postager.load(pos_model_path) # 命名实体识别模型路径,模型名称为'ner.model' ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model') self.recognizer = NamedEntityRecognizer() self.recognizer.load(ner_model_path) # 依存句法分析模型路径,模型名称为 'parser.model' par_model_path = os.path.join(self.LTP_DATA_DIR, 'parser.model') self.parser = Parser() self.parser.load(par_model_path) # # 语义角色标注模型目录路径,模型目录为'pisrl.model' # srl_model_path = os.path.join(self.LTP_DATA_DIR, 'pisrl.model') # self.labeller = SementicRoleLabeller() # 初始化实例 # self.labeller.load(srl_model_path) # 加载模型 def load_model(self): # """加载pyltp模型""" # # 分词模型路径,分词模型名称是‘cws.model’ # self.segment = Segmentor() # print(cws_model_path) # self.segment.load(cws_model_path) # # 词性标注模型路径,分词模型名称是‘pos.model’ # self.postager = Postagger() # self.postager.load(pos_model_path) # # # 命名实体识别模型路径,模型名称为`pos.model` # self.recognizer = NamedEntityRecognizer() # self.recognizer.load(ner_model_path) # # # 依存句法分析模型路径,模型名称为`parser.model` # self.parser = Parser() # self.parser.load(par_model_path) # # # 语义角色标注模型目录路径,模型目录为`srl` # self.labeller = SementicRoleLabeller() # 初始化实例 # self.labeller.load(srl_model_path) # 加载模型 # 加载word2vec 模型 pass @pysnooper.snoop() def release_all_model(self): """释放模型""" self.segmentor.release() self.postager.release() self.recognizer.release() self.parser.release() # word2vec 模型的释放 pass # 分句 @pysnooper.snoop() def split_sentences(self, string): sents = SentenceSplitter.split(string) sentences = [s for s in sents if len(s) != 0] return sentences def jieba_word_cut(self, string): string = re.findall( '[\d|\w|\u3002 |\uff1f |\uff01 |\uff0c |\u3001 |\uff1b |\uff1a |\u201c |\u201d |\u2018 |\u2019 |\uff08 |\uff09 |\u300a |\u300b |\u3008 |\u3009 |\u3010 |\u3011 |\u300e |\u300f |\u300c |\u300d |\ufe43 |\ufe44 |\u3014 |\u3015 |\u2026 |\u2014 |\uff5e |\ufe4f |\uffe5]+', string) string = ' '.join(string) return ' '.join(jieba.cut(string)) # 分词 @pysnooper.snoop() def split_words(self, sentences): sents = [self.jieba_word_cut(s) for s in sentences] return sents # 词性分析 @pysnooper.snoop() def get_word_pos(self, sents): postags = [self.postager.postag(words.split()) for words in sents] postags = [list(w) for w in postags] return postags # 依存句法分析 @pysnooper.snoop() def dependency_parsing(self, sents, postags, said): contents = [] for index in range(len(sents)): wo = sents[index].split() po = postags[index] netags = self.recognizer.recognize(wo, po) # 命名实体识别 netags = list(netags) # print(netags) if ('S-Nh' not in netags) and ('S-Ni' not in netags) and ( 'S-Ns' not in netags): # 人名、机构名、地名 当人名、机构名、地名在该句中则进行依存句法分析 continue arcs = self.parser.parse(wo, po) arcs = [(arc.head, arc.relation) for arc in arcs] # print(arcs) #[(2, 'SBV'), (0, 'HED'), (5, 'SBV'), (5, 'ADV'), (2, 'VOB')] arcs = [(i, arc) for i, arc in enumerate(arcs) if arc[1] == 'SBV'] # SBV 主谓关系 找出主谓关系的句子 # print(arcs) #[(0, (2, 'SBV')), (2, (5, 'SBV'))] for arc in arcs: verb = arc[1][0] # 2 5 subject = arc[0] # 0 1 if wo[verb - 1] not in said: # 如果wo[verb - 1]这个所对应的词语 在已建词表said中,则打印出来 continue # print(wo[subject],wo[verb - 1],''.join(wo[verb:])) 
contents.append((wo[subject], wo[verb - 1], ''.join(wo[verb:]))) # 依次为人物、"说"的近义词、文本 return contents @pysnooper.snoop() def get_sentences_json_result(self, string): """ 对输入的句子进行SBV提取 :param string: :return: list of dict [{}] """ sentences = self.split_sentences(string) # 分句 sents = self.split_words(sentences) # 分词 postags = self.get_word_pos(sents) # 词性分析 contents = self.dependency_parsing(sents, postags, txt_said) # 依存句法分析 # 拼装json结果 contents_dict = [] for ones in enumerate(contents): # json 字段 result = { 'name': ones[1][0], 'trigger': ones[1][1], 'content': ones[1][2] } contents_dict.append(result) return contents_dict
class LtpParser: def __init__(self): LTP_DIR = 'E:\LTP\ltp_data_v3.4.0' # ltp模型目录的路径 self.segmentor = Segmentor() self.segmentor.load(os.path.join( LTP_DIR, "cws.model")) # 分词模型路径,模型名称为`cws.model` self.postagger = Postagger() self.postagger.load(os.path.join( LTP_DIR, "pos.model")) # 词性标注模型路径,模型名称为`pos.model` self.parser = Parser() self.parser.load(os.path.join( LTP_DIR, "parser.model")) # 依存句法分析模型路径,模型名称为`parser.model self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join( LTP_DIR, "ner.model")) # 命名实体识别模型路径,模型名称为`ner.model` self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, "pisrl_win.model") ) # 语义角色标注模型目录路径,模型目录为`srl`。注意该模型路径是一个目录,而不是一个文件。 '''语义角色标注''' def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) for role in roles: print(words[role.index]) print( role.index, "".join([ "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments ])) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list '''parser主函数''' def parser_main(self, sentence): words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list def sentence_splitter(self, sentence='你好,你觉得这个例子从哪里来的?当然还是直接复制官方文档'): sents = SentenceSplitter.split(sentence) # 分句 return (list(sents))
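A minimal usage sketch for the LtpParser class above, assuming the Windows model paths configured in __init__ (LTP 3.4.0 with pisrl_win.model) actually exist locally; the sentence is illustrative.

if __name__ == '__main__':
    ltp = LtpParser()
    sentence = '李克强总理今天来我家了。'
    words, postags, child_dict_list, roles_dict, format_parse_list = ltp.parser_main(sentence)
    print(words)
    print(postags)
    for item in format_parse_list:   # [relation, word, idx, postag, head word, head idx, head postag]
        print(item)
    print(roles_dict)                # semantic roles keyed by predicate index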
from pyltp import Postagger
from pyltp import Segmentor
from pyltp import NamedEntityRecognizer
from pyltp import Parser

# assumes `os`, `LTP_DATA_DIR` and `text` are defined earlier in the script
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, file name `pos.model`
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # segmentation model, file name `cws.model`
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, file name `ner.model`
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parsing model, file name `parser.model`

postagger = Postagger()  # init instance
postagger.load_with_lexicon(pos_model_path, './model/lexicon')  # load the model with an external lexicon
# postagger.load(pos_model_path)  # load the model without a lexicon

segmentor = Segmentor()  # init instance
segmentor.load_with_lexicon(cws_model_path, './model/lexicon')  # second argument is the external lexicon file path

recognizer = NamedEntityRecognizer()  # init instance
recognizer.load(ner_model_path)  # load the model

parser = Parser()  # init instance (the original snippet used `parser` without ever creating it)
parser.load(par_model_path)  # load the model

words = segmentor.segment(text)
postags = postagger.postag(words)              # POS tagging
netags = recognizer.recognize(words, postags)  # named entity recognition
arcs = parser.parse(words, postags)            # dependency parsing

print('=' * 30)
print('\t'.join(words))
print('\t'.join(postags))
print('\t'.join(netags))
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

# release the models (the original snippet only released the postagger)
postagger.release()
segmentor.release()
recognizer.release()
parser.release()
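A hedged sketch of preparing the external lexicon that load_with_lexicon consumes above: for the segmentation model the lexicon is generally a plain-text file with one custom word per line. The './model/lexicon' path and the sample words are assumptions for illustration.

custom_words = ['亚硝酸盐', '苯丙酮尿症']
with open('./model/lexicon', 'w', encoding='utf-8') as f:
    f.write('\n'.join(custom_words))

seg = Segmentor()
seg.load_with_lexicon(cws_model_path, './model/lexicon')  # cws_model_path as defined above
print('|'.join(seg.segment('亚硝酸盐是一类无机化合物的总称')))
seg.release()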
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import time
from pyltp import Segmentor
from pyltp import Postagger
from pyltp import NamedEntityRecognizer
import re

segmentor = Segmentor()  # init instance
segmentor.load('/home/sherlock/Documents/ltp_data/cws.model')
# instantiate the POS tagging tool
postagger = Postagger()  # init instance
postagger.load('/home/sherlock/Documents/ltp_data/pos.model')  # load the model
recognizer = NamedEntityRecognizer()
recognizer.load('/home/sherlock/Documents/ltp_data/ner.model')


def wdseg(inputstr, ret_type):
    words = segmentor.segment(inputstr)  # segmentation
    seg_word = ' '.join(words)           # default: a space-joined string ('str')
    if ret_type == 'lst':
        seg_word = seg_word.split()      # return a list of tokens instead
    # segmentor.release()  # release the model
    return seg_word
class LtpParser: def __init__(self): LTP_DIR = "/home/python/ltp/ltp_data_v3.4.0" # 分词模型,单文件 self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) # 词性标注模型,单文件 self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) # 依存句法分析模型,单文件 self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) # 命名实体识别模型,单文件 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) # 语义角色标注模型,多文件 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) def release_model(self): # 释放模型 self.segmentor.release() self.postagger.release() self.recognizer.release() self.parser.release() self.labeller.release() # 命名实体识别 def entity_ner(self, words, postags): netags = self.recognizer.recognize(words, postags) # 命名实体识别 entity_ner = list() for word, ntag in zip(words, netags): entity_ner.append((word, ntag)) return entity_ner # 语义角色标注 def format_labelrole(self, words, postags): arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict # 句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典 def build_parse_child_dict(self, words, postags, arcs): child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: # arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list # 通用实体识别 def comm_ner(self, sentence): class_entity = {"Ni": "institution", "Ns": "place", "Nh": "name"} words = jieba.cut(sentence) words = [word for word in words] postags = list(self.postagger.postag(words)) entity_ner = self.entity_ner(words, postags) result = set() entity = list() index = 0 for item in entity_ner: entity_name = item[0] entity_bz = item[1] temp = entity_bz.split("-") if len(temp) == 2: bz = temp[0] type = temp[1] if bz == "S": result.add((entity_name, class_entity.get(type))) else: entity.append((index, entity_name, class_entity.get(type))) if bz == "E": index = index + 1 if len(entity) > 0: entitydf = pd.DataFrame(entity) resulttemp = entitydf.groupby([ 0, 2 ])[1].apply(lambda x: "".join(list(x))).reset_index(name='实体名称') for item in np.array(resulttemp[["实体名称", 2]]).tolist(): result.add(tuple(item)) print("ltp=", result) return result
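A minimal usage sketch for the comm_ner method above, assuming the LTP 3.4.0 models exist under the configured /home/python/ltp/ltp_data_v3.4.0 path and that jieba, pandas and numpy are importable as in the original project; the sentence is illustrative.

if __name__ == '__main__':
    ltp = LtpParser()
    result = ltp.comm_ner('欧洲东部的罗马尼亚,首都是布加勒斯特,也是一座世界性的城市。')
    print(result)   # e.g. a set of (entity, type) tuples such as ('罗马尼亚', 'place')
    ltp.release_model()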
#!/usr/bin/env python
# coding=utf-8
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer
from pprint import pprint

segmentor = Segmentor()
segmentor.load_with_lexicon("./ltp_data/cws.model", './construct_dict.txt')
# segmentor.load("./ltp_data/cws.model")  # segmentation model
postagger = Postagger()
postagger.load("./ltp_data/pos.model")  # POS tagging
parser = Parser()
parser.load("./ltp_data/parser.model")  # dependency parsing
recognizer = NamedEntityRecognizer()
recognizer.load("./ltp_data/ner.model")  # named entity recognition

in_file_name = "input_test"
out_file_name = "output.txt"
in_file = open(in_file_name, 'r', encoding="utf-8")
out_file = open(out_file_name, 'w+', encoding="utf-8")
construct_list = []


def get_construct_list():
    with open('construct_dict.txt', 'r', encoding="utf-8") as f:
        for line in f:
            construct = line.strip()
            if construct not in construct_list:
                construct_list.append(construct)
class LtpParser: def __init__(self): LTP_DIR = "./ltp_data" self.segmentor = Segmentor() self.segmentor.load(os.path.join(LTP_DIR, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(LTP_DIR, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(LTP_DIR, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(LTP_DIR, "ner.model")) self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model')) def format_labelrole(self, words, postags): '''语义角色标注''' arcs = self.parser.parse(words, postags) roles = self.labeller.label(words, postags, arcs) roles_dict = {} for role in roles: roles_dict[role.index] = { arg.name: [arg.name, arg.range.start, arg.range.end] for arg in role.arguments } return roles_dict def build_parse_child_dict(self, words, postags, arcs): '''句法分析---为句子中的每个词语维护一个保存句法依存儿子节点的字典''' child_dict_list = [] format_parse_list = [] for index in range(len(words)): child_dict = dict() for arc_index in range(len(arcs)): if arcs[arc_index].head == index + 1: #arcs的索引从1开始 if arcs[arc_index].relation in child_dict: child_dict[arcs[arc_index].relation].append(arc_index) else: child_dict[arcs[arc_index].relation] = [] child_dict[arcs[arc_index].relation].append(arc_index) child_dict_list.append(child_dict) rely_id = [arc.head for arc in arcs] # 提取依存父节点id relation = [arc.relation for arc in arcs] # 提取依存关系 heads = ['Root' if id == 0 else words[id - 1] for id in rely_id] # 匹配依存父节点词语 for i in range(len(words)): # ['ATT', '李克强', 0, 'nh', '总理', 1, 'n'] a = [ relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1, postags[rely_id[i] - 1] ] format_parse_list.append(a) return child_dict_list, format_parse_list def parser_main(self, sentence): '''parser主函数''' words = list(self.segmentor.segment(sentence)) postags = list(self.postagger.postag(words)) arcs = self.parser.parse(words, postags) child_dict_list, format_parse_list = self.build_parse_child_dict( words, postags, arcs) roles_dict = self.format_labelrole(words, postags) return words, postags, child_dict_list, roles_dict, format_parse_list
class LTP_CLASS(object): def __init__(self): self.LTP_DATA_DIR = '/Users/yf/Downloads/ltp_data_v3.4.0' # 自定义分词表 self.cut_file = '/Users/yf/Downloads/ltp_data_v3.4.0/cut.txt' # 分词结果 self.cut_list = [] # 依存关系 self.arcs = None # 词性 self.part_speech_list = [] # 分词 self.segmentor = Segmentor() self.segmentor.load_with_lexicon( os.path.join(self.LTP_DATA_DIR, 'cws.model'), self.cut_file) # 词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model')) # 命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.LTP_DATA_DIR, 'ner.model')) # 依存句法分析 self.parser = Parser() self.parser.load(os.path.join(self.LTP_DATA_DIR, 'parser.model')) # 语义角色标注 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(self.LTP_DATA_DIR, 'pisrl.model')) # 词性标注集 self._dict = { "a": "形容词", "ni": "机构名称", "b": "其他名词修饰语", "nl": "位置名词", "c": "连词", "ns": "地名", "d": "副词", "nt": "时态名词", "e": "感叹", "nz": "其他专有名词", "g": "词素", "o": "拟声词", "h": "字首", "p": "介词", "i": "成语", "q": "数量", "j": "缩写", "r": "代词", "k": "后缀", "u": "辅助的", "m": "数", "v": "动词", "n": "一般名词", "wp": "标点", "nd": "方向名词", "ws": "外来词", "nh": "人名", "x": "最小意义单位" } # 依存句法关系 self._dict2 = { "SBV": "主谓关系", "VOB": "动宾关系", "IOB": "间宾关系", "FOB": "前置宾语", "DBL": "兼语", "ATT": "定中关系", "ADV": "状中结构", "CMP": "动补结构", "COO": "并列关系", "POB": "介宾关系", "LAD": "左附加关系", "RAD": "右附加关系", "IS": "独立结构", "HED": "核心关系" } # 命名实体识别标注集 self._idct3 = { "O": "这个词不是NE", "S": "这个词单独构成一个NE", "B": "这个词为一个NE的开始", "I": "这个词为一个NE的中间", "E": "这个词位一个NE的结尾" } self._dict4 = {"Nh": "人名", "Ni": "机构名", "Ns": "地名"} # 语义角色类型 self._dict5 = { "ADV": "默认标记", "BNE": "受益人", "CND": "条件", "DIR": "方向", "DGR": "程度", "EXT": "扩展", "FRQ": "频率", "LOC": "地点", "MNR": "方式", "PRP": "目的或原因", "TMP": "时间", "TPC": "主题", "CRD": "并列参数", "PRD": "谓语动词", "PSR": "持有者", "PSE": "被持有" } # 释放对象 def colse_ltp(self): # 分词释放 self.segmentor.release() # 词性释放 self.postagger.release() # 实体释放 self.recognizer.release() # 依存关系释放 self.parser.release() # 语义角色释放 self.labeller.release() # 分句 def cut_split(self, msg): sents = SentenceSplitter.split(msg) return [i for i in sents] # 分词 def cut_words(self, msg): words = self.segmentor.segment(msg) self.cut_list = [i for i in words] return self.cut_list # 词性标注 def part_speech(self): postags = self.postagger.postag(self.cut_list) # 词性标注 self.part_speech_list = [i for i in postags] return self.part_speech_list # 实体识别 def notional_words(self): return self.recognizer.recognize(self.cut_list, self.part_speech_list) # 命名实体识别 # 依存句法分析 def interdependent(self): self.arcs = self.parser.parse(self.cut_list, self.part_speech_list) # 句法分析 return [(arc.head, arc.relation) for arc in self.arcs] # 语义角色标注 def role(self): roles = self.labeller.label(self.cut_list, self.part_speech_list, self.arcs) # 语义角色标注
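A brief usage sketch for LTP_CLASS above; the hard-coded /Users/yf/... model and lexicon paths in __init__ are the original author's and need adjusting locally, and the sample paragraph is illustrative.

if __name__ == '__main__':
    ltp = LTP_CLASS()
    for sent in ltp.cut_split('元芳你怎么看?我就趴窗口上看呗!'):
        words = ltp.cut_words(sent)
        tags = ltp.part_speech()
        print(list(zip(words, tags)))
        print(list(ltp.notional_words()))   # NER tags, e.g. S-Nh / O
        print(ltp.interdependent())         # (head index, dependency relation) pairs
    ltp.colse_ltp()                         # note: the release method is spelled `colse_ltp` in the class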
from myfuncs import get_person_entity_set
from pyltp import Segmentor, NamedEntityRecognizer, Parser, Postagger
import os
import platform

cwd = os.getcwd()
model_dir = '/Users/karloar/Documents/other/ltp_data_v3.4.0'
if platform.system() == 'Windows':
    model_dir = r'E:\ltp_data'
cws_model = os.path.join(model_dir, 'cws.model')
cwd_dict = os.path.join(cwd, 'dict.txt')
pos_model = os.path.join(model_dir, 'pos.model')
ner_model = os.path.join(model_dir, 'ner.model')
parser_model = os.path.join(model_dir, 'parser.model')

if __name__ == '__main__':
    segmentor = Segmentor()
    segmentor.load_with_lexicon(cws_model, cwd_dict)
    # segmentor.load(cws_model)
    postagger = Postagger()
    postagger.load(pos_model)
    ner = NamedEntityRecognizer()
    ner.load(ner_model)
    parser = Parser()
    parser.load(parser_model)
    sentence = '新加坡《联合早报》曝出了赵薇与上海知名人士汪雨的儿子汪道涵热恋。'
    word_list = segmentor.segment(sentence)
    # NOTE: get_content_from_ltp is not imported in this snippet; it is presumably
    # defined in the same project (e.g. alongside get_person_entity_set in myfuncs).
    content = get_content_from_ltp(' '.join(list(word_list)), 'sdp')
    print(content)
class NLP: """进行自然语言处理,包括分词,词性标注,命名实体识别,依存句法分析 Attributes: default_user_dict_dir: str,用户自定义词典目录 default_model_dir: str,ltp模型文件目录 """ default_user_dict_dir = '../../resource/' # 默认的用户词典目录,清华大学法律词典 default_model_dir = '../../model/' # ltp模型文件目录 def __init__(self, user_dict_dir=default_user_dict_dir, model_dir=default_model_dir): self.default_user_dict_dir = user_dict_dir self.default_model_dir = model_dir # 初始化分词器 self.segmentor = Segmentor() self.segmentor.load(os.path.join(self.default_model_dir, "cws.model")) # pynlpir.open() # 初始化分词器 # 添加用户词典(法律文书大辞典与清华大学法律词典),这种方式是添加进内存中,速度更快 files = os.listdir(user_dict_dir) for file in files: file_path = os.path.join(user_dict_dir, file) # 文件夹则跳过 if os.path.isdir(file): continue with open(file_path, 'r', encoding='utf-8') as f: line = f.readline() while line: word = line.strip('\n').strip() jieba.add_word(word) # print(c_char_p(word.encode())) # pynlpir.nlpir.AddUserWord(c_char_p(word.encode())) line = f.readline() # 加载ltp模型 # 词性标注模型 self.postagger = Postagger() postag_flag = self.postagger.load(os.path.join(self.default_model_dir, 'pos.model')) # 命名实体识别模型 self.recognizer = NamedEntityRecognizer() ner_flag = self.recognizer.load(os.path.join(self.default_model_dir, 'ner.model')) # 依存句法分析模型 self.parser = Parser() parse_flag = self.parser.load(os.path.join(self.default_model_dir, 'parser.model')) if postag_flag or ner_flag or parse_flag: print('load model failed!') def segment(self, sentence, entity_postag=dict()): """采用NLPIR进行分词处理 Args: sentence: string,句子 entity_postag: dict,实体词性词典,默认为空集合,分析每一个案例的结构化文本时产生 Returns: lemmas: list,分词结果 """ # 添加实体词典 # if entity_postag: # for entity in entity_postag: # pynlpir.nlpir.AddUserWord(c_char_p(entity.encode())) # jieba.add_word(entity) # pynlpir.nlpir.AddUserWord(c_char_p('前任'.encode())) # 单个用户词加入示例 # pynlpir.nlpir.AddUserWord(c_char_p('习近平'.encode())) # 单个用户词加入示例 # 分词,不进行词性标注 # lemmas = pynlpir.segment(sentence, pos_tagging=False) # lemmas = jieba.lcut(sentence) # pynlpir.close() # 释放 lemmas = list(self.segmentor.segment(sentence)) return lemmas def postag(self, lemmas): """对分词后的结果进行词性标注 Args: lemmas: list,分词后的结果 entity_dict: set,实体词典,处理具体的一则判决书的结构化文本时产生 Returns: words: WordUnit list,包含分词与词性标注结果 """ words = [] # 存储句子处理后的词单元 # 词性标注 postags = self.postagger.postag(lemmas) for i in range(len(lemmas)): # 存储分词与词性标记后的词单元WordUnit,编号从1开始 word = WordUnit(i+1, lemmas[i], postags[i]) words.append(word) # self.postagger.release() # 释放 return words def get_postag(self, word): """获得单个词的词性标注 Args: word: str,单词 Returns: post_tag: str,该单词的词性标注 """ post_tag = self.postagger.postag([word, ]) return post_tag[0] def netag(self, words): """命名实体识别,并对分词与词性标注后的结果进行命名实体识别与合并 Args: words: WordUnit list,包含分词与词性标注结果 Returns: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 """ lemmas = [] # 存储分词后的结果 postags = [] # 存储词性标书结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 命名实体识别 netags = self.recognizer.recognize(lemmas, postags) # print('\t'.join(netags)) # just for test words_netag = EntityCombine().combine(words, netags) # self.recognizer.release() # 释放 return words_netag def parse(self, words): """对分词,词性标注与命名实体识别后的结果进行依存句法分析(命名实体识别可选) Args: words_netag: WordUnit list,包含分词,词性标注与命名实体识别结果 Returns: *: SentenceUnit,该句子单元 """ lemmas = [] # 分词结果 postags = [] # 词性标注结果 for word in words: lemmas.append(word.lemma) postags.append(word.postag) # 依存句法分析 arcs = self.parser.parse(lemmas, postags) for i in range(len(arcs)): words[i].head = arcs[i].head words[i].dependency = arcs[i].relation # self.parser.release() return 
SentenceUnit(words) def close(self): """关闭与释放nlp""" # pynlpir.close() self.segmentor.release() self.postagger.release() self.recognizer.release() self.parser.release()
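A rough usage sketch of the NLP pipeline above. WordUnit, EntityCombine and SentenceUnit come from the surrounding project, so this only illustrates the intended call order; the directories and the sample sentence are illustrative.

nlp = NLP(user_dict_dir='../../resource/', model_dir='../../model/')
lemmas = nlp.segment('最高人民法院发布了新的司法解释。')
words = nlp.postag(lemmas)               # WordUnit list with POS tags
words_netag = nlp.netag(words)           # merge named entities into WordUnits
sentence = nlp.parse(words_netag)        # SentenceUnit with dependency heads filled in
nlp.close()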
class PyltpAnalyzer(object): def __init__(self, fileDir=LTP_DATA_DIR): """ :param filename: """ print('77777&777777777777777') self.fileDir = fileDir # 初始化分词实例 self.cws_model_path = os.path.join( self.fileDir, 'cws.model') # 分词模型路径,模型名称为`cws.model` self.segmentor = Segmentor() self.segmentor.load(self.cws_model_path) # 加载模型 # 初始化标注实例 self.pos_model_path = os.path.join( self.fileDir, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` self.postagger = Postagger() self.postagger.load(self.pos_model_path) # 加载模型 # 初始化命名实体识别实例 self.ner_model_path = os.path.join( self.fileDir, 'ner.model') # 命名实体识别模型路径,模型名称为`pos.model` self.recognizer = NamedEntityRecognizer() self.recognizer.load(self.ner_model_path) # 加载模型 #依存句法分析 self.par_model_path = os.path.join( self.fileDir, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` self.parser = Parser() # 初始化实例 self.parser.load(self.par_model_path) # 加载模型 def segmentSentence(self, sentence): return list(self.segmentor.segment(sentence)) def segment(self, sentences): """ :param sentences: 句子列表 :return:句子分词结果 """ wordsList = [] if sentences: for sentence in sentences: wordsList.append(list(self.segmentor.segment(sentence))) return wordsList def postag(self, wordsList): """ :param wordsList: 句子分词列表 :return: 句子分词词性标注结果 """ postagsList = [] if wordsList: for words in wordsList: postagsList.append(list(self.postagger.postag(words))) return postagsList def recognize(self, wordsList, postagsList): """ :param wordsList: 句子分词列表 :param postagsList: 句子标注列表 :return: 句子命名实体识别结果 """ netagsList = [] if wordsList and postagsList: if len(wordsList) == len(postagsList): for words, postags in zip(wordsList, postagsList): netagsList.append( list(self.recognizer.recognize(words, postags))) else: print( "wordsList = {} ,len(wordsList) = {} and postagsList = {} ,len(postagsList)" .format(wordsList, len(wordsList), postagsList, len(postagsList))) else: print("wordsList = {} and postagsList = {}".format( wordsList, postagsList)) return netagsList def dependencyParse(self, wordsList, postagsList): """ :param wordsList: 句子分词列表 :param postagsList: 句子标注列表 :return: 句子句法分析结果 """ arcsList = [] if wordsList and postagsList: if len(wordsList) == len(postagsList): for words, postags in zip(wordsList, postagsList): arcsList.append(list(self.parser.parse( words, postags))) #arc.head 父节点, arc.relation 依存关系 else: print( "wordsList = {} ,len(wordsList) = {} and postagsList = {} ,len(postagsList)" .format(wordsList, len(wordsList), postagsList, len(postagsList))) else: print("wordsList = {} and postagsList = {}".format( wordsList, postagsList)) return arcsList def finalize(self): """ 释放所有没用到的模型 :return: """ self.segmentor.release() # 释放分词模型 self.postagger.release() # 释放词性模型 self.recognizer.release() # 释放命名实体模型 self.parser.release() # 释放依存句法模型
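A short usage sketch for PyltpAnalyzer above, assuming LTP_DATA_DIR points at a directory containing cws.model, pos.model, ner.model and parser.model; the two sentences are illustrative.

analyzer = PyltpAnalyzer()
sentences = ['元芳你怎么看?', '他叫汤姆去拿外衣。']
words_list = analyzer.segment(sentences)
postags_list = analyzer.postag(words_list)
netags_list = analyzer.recognize(words_list, postags_list)
arcs_list = analyzer.dependencyParse(words_list, postags_list)
for words, netags in zip(words_list, netags_list):
    print(list(zip(words, netags)))
analyzer.finalize()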
class ltp_api(object): def __init__(self, MODELDIR, exword_path=None): self.MODELDIR = MODELDIR self.output = {} self.words = None self.postags = None self.netags = None self.arcs = None self.exword_path = exword_path # e.x: '/data1/research/matt/ltp/exwords.txt' # 分词 self.segmentor = Segmentor() if not self.exword_path: # 是否加载额外词典 self.segmentor.load(os.path.join(self.MODELDIR, "cws.model")) else: self.segmentor.load_with_lexicon( os.path.join(self.MODELDIR, "cws.model"), self.exword_path) # 词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) # 依存句法 self.parser = Parser() self.parser.load(os.path.join(self.MODELDIR, "parser.model")) # 命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) # 语义角色 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(MODELDIR, "pisrl.model")) # 分词 def ltp_segmentor(self, sentence): words = self.segmentor.segment(sentence) return words # 词性标注 def ltp_postagger(self, words): postags = self.postagger.postag(words) return postags # 依存语法 def ltp_parser(self, words, postags): arcs = self.parser.parse(words, postags) return arcs # 命名实体识别 def ltp_recognizer(self, words, postags): netags = self.recognizer.recognize(words, postags) return netags # 语义角色识别 def ltp_labeller(self, words, postags, arcs): output = [] roles = self.labeller.label(words, postags, arcs) for role in roles: output.append([(role.index, arg.name, arg.range.start, arg.range.end) for arg in role.arguments]) return output def release(self): self.segmentor.release() self.postagger.release() self.parser.release() self.recognizer.release() self.labeller.release() def get_result(self, sentence): self.words = self.ltp_segmentor(sentence) self.postags = self.ltp_postagger(self.words) self.arcs = self.ltp_parser(self.words, self.postags) self.netags = self.ltp_recognizer(self.words, self.postags) self.output['role'] = self.ltp_labeller(self.words, self.postags, self.arcs) # 载入output self.output['words'] = list(self.words) self.output['postags'] = list(self.postags) self.output['arcs'] = [(arc.head, arc.relation) for arc in self.arcs] self.output['netags'] = list(self.netags)
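A minimal usage sketch for the ltp_api wrapper above; '/path/to/ltp_data' is a placeholder for whatever directory holds the cws/pos/parser/ner/pisrl models, and the sentence is illustrative.

ltp = ltp_api('/path/to/ltp_data')
ltp.get_result('国务院总理李克强调研上海外高桥时提出,支持上海积极探索新机制。')
print(ltp.output['words'])
print(ltp.output['postags'])
print(ltp.output['netags'])
print(ltp.output['arcs'])     # (head, relation) pairs
print(ltp.output['role'])     # semantic roles per predicate
ltp.release()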
import os from lxml import etree from pyltp import Segmentor, Postagger, NamedEntityRecognizer # model path MODELDIR="/data/ltp/ltp-models/3.3.0/ltp_data" #MODELDIR="/home/twjiang/01.lab/ltp_model/3.3.0/ltp_data" print "正在加载LTP模型... ..." segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) print "模型加载完毕." print "正在加载大词林实体词库... ..." bigcilin_file = open("/users1/twjiang/03.data/entitys_bigcilin.txt") bigcilin = [] line = bigcilin_file.readline() while line: entity = line.strip() bigcilin.append(entity) line = bigcilin_file.readline() bigcilin_file.close() print "大词林实体词库加载完毕: 已加载%d实体" % (len(bigcilin)) piece_size = 3
try: htmlstr = re_charEntity.sub(CHAR_ENTITIES[key], htmlstr, 1) sz = re_charEntity.search(htmlstr) except KeyError: # 以空串代替 htmlstr = re_charEntity.sub('', htmlstr, 1) sz = re_charEntity.search(htmlstr) return htmlstr segmentor = Segmentor() # 初始化实例 segmentor.load("/Users/guoziyao/repos/pyltp/ltp_data/cws.model") # 加载模型 postagger = Postagger() # 初始化实例 postagger.load("/Users/guoziyao/repos/pyltp/ltp_data/pos.model") # 加载模型 recognizer = NamedEntityRecognizer() # 初始化实例 recognizer.load("/Users/guoziyao/repos/pyltp/ltp_data/ner.model") # 加载模型 def recgonize(url, tag): """ :param url: 网址 :param tag: 实体标记。Ni 机构名,Nh 人名,Ns 地名 :return: 得到的结果 """ text = http_request(url) text = filter_tags(text).replace(' ', '') lines = text.split() result = [] for line in lines: line = line.encode('utf-8')
'''
# Select candidate sentence pairs by part of speech
import os
import codecs
import re
import pandas as pd
import sentence_parser
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller
from pyltp import SentenceSplitter

LTP_DIR = r'D:\LTP\MODEL\ltp_data'  # path to the ltp model directory
segmentor = Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))   # segmentation model, file name `cws.model`
postagger = Postagger()
postagger.load(os.path.join(LTP_DIR, "pos.model"))   # POS tagging model, file name `pos.model`
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, "ner.model"))  # named entity recognition model, file name `ner.model`
parser = Parser()
parser.load(os.path.join(LTP_DIR, "parser.model"))   # dependency parsing model, file name `parser.model`


def postag_list(file, n_postags, line_total):
    with codecs.open(file, 'r', encoding='utf-8') as f:
        lines = [line.split(' ')[0].strip() for line in f]
    for line in lines:
        with codecs.open('../data/causality_sentences_2.txt', 'a', encoding='utf-8') as fw:
            fw.write(line + '\n')
        sus_pos_words = []
        words = list(segmentor.segment(line))
class TripleIE(object): def __init__(self, in_file_path, out_file_path, model_path, clean_output=False): self.logger = logging.getLogger("TripleIE") self.in_file_path = in_file_path self.out_file_path = out_file_path self.model_path = model_path self.clean_output = clean_output # 输出是否有提示 self.out_handle = None self.segmentor = Segmentor() self.segmentor.load(os.path.join(self.model_path, "cws.model")) self.postagger = Postagger() self.postagger.load(os.path.join(self.model_path, "pos.model")) self.parser = Parser() self.parser.load(os.path.join(self.model_path, "parser.model")) self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.model_path, "ner.model")) def run(self, in_file_path=None, out_file_path=None): if in_file_path is not None: self.in_file_path = in_file_path if out_file_path is not None: self.out_file_path = out_file_path self.out_handle = open(self.out_file_path, 'a') with open(self.in_file_path, "r", encoding="utf-8") as rf: self.logger.info("loadding input file {}...".format( self.in_file_path)) text = "" for line in rf: line = line.strip() text += line self.logger.info("done with loadding file...") text = U.rm_html(text) sentences = U.split_by_sign(text) self.logger.info("detect {} sentences".format(len(sentences))) self.logger.info("start to extract...") for sentence in tqdm(sentences): self.extract(sentence) self.logger.info("done with extracting...") self.logger.info("output to {}".format(self.out_file_path)) # close handle self.out_handle.close() def extract(self, sentence): words = self.segmentor.segment(sentence) postags = self.postagger.postag(words) ner = self.recognizer.recognize(words, postags) arcs = self.parser.parse(words, postags) sub_dicts = self._build_sub_dicts(words, postags, arcs) for idx in range(len(postags)): if postags[idx] == 'v': sub_dict = sub_dicts[idx] # 主谓宾 if 'SBV' in sub_dict and 'VOB' in sub_dict: e1 = self._fill_ent(words, postags, sub_dicts, sub_dict['SBV'][0]) r = words[idx] e2 = self._fill_ent(words, postags, sub_dicts, sub_dict['VOB'][0]) if self.clean_output: self.out_handle.write("%s, %s, %s\n" % (e1, r, e2)) else: self.out_handle.write("主谓宾\t(%s, %s, %s)\n" % (e1, r, e2)) self.out_handle.flush() # 定语后置,动宾关系 if arcs[idx].relation == 'ATT': if 'VOB' in sub_dict: e1 = self._fill_ent(words, postags, sub_dicts, arcs[idx].head - 1) r = words[idx] e2 = self._fill_ent(words, postags, sub_dicts, sub_dict['VOB'][0]) temp_string = r + e2 if temp_string == e1[:len(temp_string)]: e1 = e1[len(temp_string):] if temp_string not in e1: if self.clean_output: self.out_handle.write("%s, %s, %s\n" % (e1, r, e2)) else: self.out_handle.write( "动宾定语后置\t(%s, %s, %s)\n" % (e1, r, e2)) self.out_handle.flush() # 抽取命名实体有关的三元组 try: if ner[idx][0] == 'S' or ner[idx][0] == 'B': ni = idx if ner[ni][0] == 'B': while len(ner) > 0 and len( ner[ni]) > 0 and ner[ni][0] != 'E': ni += 1 e1 = ''.join(words[idx:ni + 1]) else: e1 = words[ni] if arcs[ni].relation == 'ATT' and postags[ arcs[ni].head - 1] == 'n' and ner[arcs[ni].head - 1] == 'O': r = self._fill_ent(words, postags, sub_dicts, arcs[ni].head - 1) if e1 in r: r = r[(r.idx(e1) + len(e1)):] if arcs[arcs[ni].head - 1].relation == 'ATT' and ner[ arcs[arcs[ni].head - 1].head - 1] != 'O': e2 = self._fill_ent( words, postags, sub_dicts, arcs[arcs[ni].head - 1].head - 1) mi = arcs[arcs[ni].head - 1].head - 1 li = mi if ner[mi][0] == 'B': while ner[mi][0] != 'E': mi += 1 e = ''.join(words[li + 1:mi + 1]) e2 += e if r in e2: e2 = e2[(e2.idx(r) + len(r)):] if r + e2 in sentence: if self.clean_output: 
self.out_handle.write("%s, %s, %s\n" % (e1, r, e2)) else: self.out_handle.write( "人名/地名/机构\t(%s, %s, %s)\n" % (e1, r, e2)) self.out_handle.flush() except: pass """ :decription: 为句子中的每个词语维护一个保存句法依存儿子节点的字典 :args: words: 分词列表 postags: 词性列表 arcs: 句法依存列表 """ def _build_sub_dicts(self, words, postags, arcs): sub_dicts = [] for idx in range(len(words)): sub_dict = dict() for arc_idx in range(len(arcs)): if arcs[arc_idx].head == idx + 1: if arcs[arc_idx].relation in sub_dict: sub_dict[arcs[arc_idx].relation].append(arc_idx) else: sub_dict[arcs[arc_idx].relation] = [] sub_dict[arcs[arc_idx].relation].append(arc_idx) sub_dicts.append(sub_dict) return sub_dicts """ :decription:完善识别的部分实体 """ def _fill_ent(self, words, postags, sub_dicts, word_idx): sub_dict = sub_dicts[word_idx] prefix = '' if 'ATT' in sub_dict: for i in range(len(sub_dict['ATT'])): prefix += self._fill_ent(words, postags, sub_dicts, sub_dict['ATT'][i]) postfix = '' if postags[word_idx] == 'v': if 'VOB' in sub_dict: postfix += self._fill_ent(words, postags, sub_dicts, sub_dict['VOB'][0]) if 'SBV' in sub_dict: prefix = self._fill_ent(words, postags, sub_dicts, sub_dict['SBV'][0]) + prefix return prefix + words[word_idx] + postfix
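A rough usage sketch for TripleIE above. The file paths are placeholders, and the class also relies on the project's own helpers (the `U` module and tqdm) being importable in its module.

import logging
logging.basicConfig(level=logging.INFO)

ie = TripleIE(in_file_path='news.txt',       # hypothetical input file
              out_file_path='triples.txt',   # hypothetical output file
              model_path='./ltp_data',
              clean_output=True)
ie.run()                                     # with clean_output=True, writes "subject, predicate, object" lines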
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is supported in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
# the older SRL model (a directory named `srl`) is labelled with the four-argument form
roles = labeller.label(words, postags, netags, arcs)
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
labeller.release()
class MyLTP(): def __init__(self): ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) # sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path # Set your own model path self.MODELDIR = os.path.join(ROOTDIR, "./ltp_data") # Init LTP Model self.segmentor = Segmentor() self.postagger = Postagger() self.parser = Parser() self.recognizer = NamedEntityRecognizer() self.labeller = SementicRoleLabeller() self.segmentor.load(os.path.join(self.MODELDIR, "cws.model")) self.postagger.load(os.path.join(self.MODELDIR, "pos.model")) self.parser.load(os.path.join(self.MODELDIR, "parser.model")) self.recognizer.load(os.path.join(self.MODELDIR, "ner.model")) self.labeller.load(os.path.join(self.MODELDIR, "pisrl.model")) # 下述函数返回值均为 list, list[0] 为第一个句子的运行结果 # ---------------------------- 分词 ------------------------------- def MySegmentor(self, paragraph): # 段落分成句子 sentences = SentenceSplitter.split(paragraph) result = [] for sentence in sentences: words = self.segmentor.segment(sentence) # 输出 # print("\t".join(words)) result.append(words) return result # ---------------------------- 词性标注 ------------------------------- def MyPostagger(self, words): result = [] for word in words: postags = self.postagger.postag(word) # list-of-string parameter is support in 0.1.5 # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"]) # 输出 # print("\t".join(postags)) result.append(postags) return result # ---------------------------- 依存句法分析 ------------------------------- def MyParser(self, words, postags): result = [] for index in range(0, len(words)): arcs = self.parser.parse(words[index], postags[index]) # 输出 # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) result.append(arcs) return result # ---------------------------- 命名实体识别 ------------------------------- def MyRecognizer(self, words, postags): result = [] for index in range(0, len(words)): netags = self.recognizer.recognize(words[index], postags[index]) # 输出 # print("\t".join(netags)) result.append(netags) return result # ---------------------------- 语义角色标注 ------------------------------- def MyRoleLabller(self, words, postags, arcs): result = [] for index in range(0, len(words)): roles = self.labeller.label(words[index], postags[index], arcs[index]) # 输出 # for role in roles: # print(role.index, "".join( # ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])) result.append(roles) return result
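A brief usage sketch for MyLTP above, assuming ./ltp_data (relative to the module's parent directory) contains the LTP models, including pisrl.model for the labeller; the paragraph is illustrative.

ltp = MyLTP()
paragraph = '元芳你怎么看?我就趴窗口上看呗!'
words = ltp.MySegmentor(paragraph)                 # one word list per sentence
postags = ltp.MyPostagger(words)
arcs = ltp.MyParser(words, postags)
netags = ltp.MyRecognizer(words, postags)
roles = ltp.MyRoleLabller(words, postags, arcs)
for ws, ts in zip(words, postags):
    print('\t'.join('%s/%s' % (w, t) for w, t in zip(ws, ts)))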