def answersemantic(resultwordlist, resultposlist):
    """Dependency-parse a segmented question and build KG-query dicts.

    Uses LTP POS tagging + dependency parsing to locate, for each entity
    word (whose tag is in the external ``nertypelist``), the property word
    it relates to, then maps that pair to a knowledge-graph relation.

    :param resultwordlist: segmented words of the question (list of str)
    :param resultposlist: per-word semantic tags; the literal tag
        "question" marks the interrogative word (from visible usage below)
    :return: list of dicts with keys headnode/headnodetype/relation/
        endnode/endnodetype/"quesion"
    NOTE(review): the key "quesion" is misspelled; downstream consumers may
    depend on the typo, so fix it only together with them.
    NOTE(review): models are loaded and released on every call — hoisting
    them to module level would avoid repeated disk loads; confirm callers.
    Relies on module-level names not visible here: LTP_DATA_DIR,
    nertypelist, findproperty, questiondict, propertydict, getrelation,
    getnodetype.
    """
    postagger = Postagger()  # initialize POS tagger instance
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger.load(pos_model_path)  # load POS model
    parser = Parser()  # initialize dependency parser instance
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    parser.load(par_model_path)  # load parser model
    postags = postagger.postag(resultwordlist)  # POS tagging
    # Convert the native pyltp vector into a plain list of str tags.
    poslist = []
    for i in postags:
        poslist.append(str(i))
    print(poslist)
    arcs = parser.parse(resultwordlist, poslist)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    # Split arcs into parallel head-index / relation-label lists
    # (arc.head is 1-based; 0 means ROOT).
    arcshead = []
    arcsrela = []
    for i in arcs:
        arcshead.append(i.head)
        arcsrela.append(i.relation)
    print(arcshead)
    print(arcsrela)
    semanticlist = []  # NOTE(review): never used below — dead variable
    length = len(resultwordlist)
    poedictlist = []
    quenum = -1
    # Find the interrogative word; if several are tagged "question" the
    # last one wins (loop does not break).
    for i in range(0, len(resultposlist)):
        if resultposlist[i] == "question":
            quenum = i
    print("resultposlist,resultwordlist: ", resultwordlist, resultposlist)
    for i in range(0, length):
        if resultposlist[i] in nertypelist:
            # findproperty resolves which word carries the queried property
            # for entity i; -1 means no property found.
            num = findproperty(i, arcshead, arcsrela, resultposlist)
            if num != -1:
                # resultposlist[arcshead[i]-1]=="property":  # e.g. "release date of Wolf Warrior 2" — property of a movie
                # if arcsrela[i]=="ATT" or arcsrela[i]=="SBV":
                poedict = {}
                poedict["headnode"] = resultwordlist[i]
                poedict["headnodetype"] = resultposlist[i]
                if quenum == -1:
                    questr = ""
                else:
                    questr = questiondict[resultwordlist[quenum]]
                properresult = getrelation(propertydict[resultwordlist[num]], resultposlist[i], questr)
                endnodetype = getnodetype(propertydict[resultwordlist[num]], resultposlist[i], questr)
                poedict["relation"] = properresult
                poedict["endnode"] = ""  # filled later by the KG lookup, presumably — confirm with caller
                poedict["endnodetype"] = endnodetype
                poedict["quesion"] = questr  # sic — see NOTE in docstring
                poedictlist.append(poedict)
    print(poedictlist)
    postagger.release()  # release model
    parser.release()  # release model
    return poedictlist
LTP_DATA_DIR, 'pisrl.model') # 语义角色标注模型目录路径,模型目录为`srl`。注意该模型路径是一个目录,而不是一个文件。 par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model') # 依存句法分析模型路径,模型名称为`parser.model` cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model') # 分词模型路径,模型名称为`cws.model` pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model') # 词性标注模型路径,模型名称为`pos.model` segmentor = Segmentor() # 初始化实例 segmentor.load(cws_model_path) # 加载模型 labeller = SementicRoleLabeller() # 初始化实例 labeller.load(srl_model_path) # 加载模型 postagger = Postagger() # 初始化实例 postagger.load(pos_model_path) # 加载模型 words = segmentor.segment( '威尔士柯基犬是一种小型犬,它们的胆子很大,也相当机警,能高度警惕地守护家园,是最受欢迎的小型护卫犬之一。') # 分词 print('\t'.join(words)) postags = postagger.postag(words) # 词性标注 print('\t'.join(postags)) parser = Parser() # 初始化实例 parser.load(par_model_path) # 加载模型 # words = ['元芳', '你', '怎么', '看'] # postags = ['nh', 'r', 'r', 'v'] arcs = parser.parse(words, postags) # 句法分析 print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)) # words = ['元芳', '你', '怎么', '看']
def main():
    """Extract candidate answers from crawled Baidu-Zhidao passages (Python 2).

    Reads passages from psgs.txt, question types and question texts from the
    ../questions files, and writes one "qid<TAB>answer" line per answered
    question to zhidao_answer.txt. The segmentation/POS code is commented
    out, so psgs_segged.txt only receives the structural tag lines.
    """
    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()
    # LTP models are loaded but unused below (segmentation is commented out).
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # types[qid-1] is the question type label (e.g. "Q_number"), 1-based qid.
    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()
    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()
    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0
    flag = 0  # 1 once an answer has been emitted for the current question
    while i < len(lines):
        line = lines[i]
        if (i % 50000 == 0):
            # progress indicator; trailing comma = no newline (Python 2)
            print "\r#\t%d" % i,
            sys.stdout.flush()
        if line.startswith("<question"):
            # e.g. <question id=123> — pull the numeric id out of the tag
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            # Doc tag plus its title line are copied through together.
            # NOTE(review): i += 2 here plus the i += 1 at loop end skips
            # THREE input lines but writes only two — confirm the psgs.txt
            # layout really has a third (body) line consumed elsewhere.
            f.write(line + "\n" + lines[i+1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            # Scan for an answer marker; if none is found s ends at L-1 and
            # the t-s > 1 guard below prevents a spurious write.
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案")\
                        or line[s:].startswith("、"+questions[qid-1]):
                    break
            # Skip past the marker itself. NOTE(review): 14 / 15 are magic
            # byte offsets — presumably marker length in this encoding plus
            # punctuation; verify against the raw crawl encoding.
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、"+questions[qid-1]):
                s += len(questions[qid-1])+1
            if s < L and flag == 0:
                # Grow t until a terminator: "更多", " NN" (two digits after a
                # space), or sentence-ending punctuation.
                t = s + 1
                while t < L and line[t:].startswith("更多") == False\
                        and not (t+2<L and line[t]==" " and line[t+1] in "0123456789" and line[t+2] in "0123456789")\
                        and not line[t:].startswith("~")\
                        and not line[t:].startswith("?")\
                        and not line[t:].startswith("!")\
                        and not line[t:].startswith("。"):
                    t += 1
                # Accept answers of 2..199 chars only.
                if s < t and t-s < 200 and t-s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid-1].rstrip() == "Q_number":
                        # numeric questions keep only the first number-bearing token
                        ans = first_con_number(ans)
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
                # words = segmentor.segment(line)
                # postags = postagger.postag(words)
                # for j in range(len(words)):
                #     f.write("%s/%s\t" % (words[j], postags[j]))
                # f.write("\n")
        i += 1
    f.close()
    fans.close()
# -*- coding: utf-8 -*- import os import sys import random import numpy as np import xml.dom.minidom import jieba import pickle from pyltp import Segmentor from pyltp import Postagger seg = Segmentor() seg.load('./ltp_data/cws.model') postag = Postagger() postag.load('./ltp_data/pos.model') def get_features(sentence): # Filter sentence sentence = sentence.replace(' ', '') # Char char = list(sentence) word = seg.segment(sentence) pos = postag.postag(word) # Pos pos_ = [] for i in range(len(word)): for j in range(len(word[i])): pos_.append(('B-' if j == 0 else 'I-') + pos[i])
from pyltp import Parser, SentenceSplitter, Segmentor, Postagger, SementicRoleLabeller
from data_resource import conn
import networkx as nx
import matplotlib.pyplot as mp

MODEL_DIR_PATH = "E:\\ltp_data_v3.4.0\\"
SEGMENTOR_MODEL = MODEL_DIR_PATH + "cws.model"  # LTP segmentation model
POSTAGGER_MODEL = MODEL_DIR_PATH + "pos.model"  # LTP POS-tagging model
PARSER_MODEL = MODEL_DIR_PATH + "parser.model"  # LTP dependency-parsing model
ROLE_LABELLER_MODEL = MODEL_DIR_PATH + "pisrl_win.model"  # LTP semantic-role-labelling model (Windows build)

# All four LTP pipelines are created and loaded once at import time.
segmentor = Segmentor()  # segmentation instance
postagger = Postagger()  # POS-tagging instance
parser = Parser()  # dependency-parsing instance
labeller = SementicRoleLabeller()  # semantic-role-labelling instance
segmentor.load(SEGMENTOR_MODEL)
postagger.load(POSTAGGER_MODEL)
parser.load(PARSER_MODEL)
labeller.load(ROLE_LABELLER_MODEL)


def test_pyltp_sentence_split():
    """Smoke-test LTP sentence splitting on law paragraphs from the DB.

    NOTE(review): truncated in this view — the body continues past what is
    visible here (count is initialized but never updated in the visible part).
    """
    cursor = conn.cursor()
    select_sql = "select p_content from law_content_parse"
    cursor.execute(select_sql)
    results = cursor.fetchall()
    count = 0
    for res in results:
        # only paragraphs containing a colon, and only until count changes
        if ':' in res[0] and count == 0:
            sens = SentenceSplitter.split(res[0])
def children(forder_list):
    """Worker: clean, sentence-split and segment government work reports.

    For each folder in *forder_list*, reads every report file, strips
    non-Chinese characters, replaces digit runs with the placeholder "数",
    then writes the resulting sentences (with/without digits) and the LTP
    segmentation + POS tags of the digit-free ones to parallel output trees.
    Runs inside a thread (see threading.current_thread() below).
    """
    LTP_DATA_DIR = 'E:/Program Files/workspace/ltp_data_v3.4.0'  # LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS-tagging model
    segmentor = Segmentor()  # initialize instance
    segmentor.load(cws_model_path)  # load model
    postagger = Postagger()  # initialize instance
    postagger.load(pos_model_path)  # load model
    read_path = "E:/Program Files/workspace/report/"
    write_sen_path = "E:/Program Files/workspace/report_sentence/"
    write_word_path = "E:/Program Files/workspace/report_word/"
# =============================================================================
#     for forder in tqdm(forder_list, desc='%s loop'%threading.current_thread().name):
#         sleep(0.1)
# =============================================================================
    for forder in forder_list:
        print("thread " + threading.current_thread().name + " is doing " + forder)
        file_list = os.listdir(read_path + forder)
        for file_name in file_list:
            if file_name == "desktop.txt":
                os.remove(read_path + forder + '/' + file_name)
                continue
            elif file_name == "desktop.ini":
                continue
            with open(read_path + forder + '/' + file_name, 'r', encoding="utf-8") as file:
                content = file.readlines()
            # Drop the two header lines / two footer lines when present.
            # NOTE(review): assumes the file has at least two lines.
            if "政府工作报告" in content[1]:
                content.pop(0)
                content.pop(0)
            if "来源" in content[-2]:
                content.pop()
                content.pop()
            # Remove non-Chinese characters and split into clauses.
            content_have_num = []  # clauses "with digits" (see NOTE below)
            content_not_num = []  # clauses without the digit placeholder
            for element in content:
                # drop Chinese/English quotes and the enumeration comma
                temp1_element = re.sub(r'[\、\”\“\"]', '', element)
                # collapse digit runs (and lone digits) into the placeholder "数"
                temp2_element = re.sub(r'[0-9]+\.?[0-9]+|[\d]', '数', temp1_element)
                # replace every non-CJK character with a space, then split
                new_element = re.sub(r'[^\u4e00-\u9fa5]', ' ', temp2_element).split(' ')
                for item in new_element:
                    # drop fragments of a single character
                    # NOTE(review): the branches look inverted w.r.t. the list
                    # names — short items WITHOUT "数" go to content_have_num,
                    # and long items go there unconditionally. Confirm intent
                    # before relying on the have/not naming.
                    if len(item) >= 2 and len(item) < 4:
                        if "数" not in item:
                            content_have_num.append(item.strip())
                    elif len(item) >= 4:
                        content_have_num.append(item.strip())
                        if "数" not in item:
                            content_not_num.append(item.strip())
            # Write the two clause lists for this report.
            with open(write_sen_path + forder +
                      '/have_num_' + file_name, 'w', encoding="utf-8") as file:
                file.write('\n'.join(content_have_num))
            with open(write_sen_path + forder +
                      '/not_num_' + file_name, 'w', encoding="utf-8") as file:
                file.write('\n'.join(content_not_num))
            # Segment + POS-tag the digit-free clauses (alternating lines:
            # words line, then tags line).
            words_postags = []
            for element in content_not_num:
                words = list(segmentor.segment(element))
                postags = list(postagger.postag(words))
                words_postags.append(' '.join(words))
                words_postags.append(' '.join(postags))
            with open(write_word_path + forder + '/' + file_name, 'w', encoding="utf-8") as file:
                file.write('\n'.join(words_postags))
    postagger.release()  # release model
    segmentor.release()  # release model
def _init_postagger(self):
    """Create and return an LTP POS tagger loaded from self.models_path."""
    model_file = os.path.join(self.models_path, 'pos.model')
    tagger = Postagger()
    tagger.load(model_file)
    return tagger
def test_ltp():
    """Ad-hoc Python 2 experiment comparing pyltp segmentation/POS/NER/parsing
    (and jieba) on a set of Chinese news headlines.

    Exercises the full LTP pipeline on hand-picked sentence pairs to eyeball
    how similar titles segment/parse; everything is printed, nothing returned.
    """
    from pyltp import Segmentor
    segmentor = Segmentor()
    #segmentor.load('/Users/a000/Downloads/ltp-models/3.3.2/ltp_data.model')
    segmentor.load('/Users/a000/git/ltp_data/cws.model')
    # Repeated reassignment: only the LAST segment() result is kept; the
    # earlier calls exist just to exercise the segmenter.
    words = segmentor.segment('元芳你怎么看')
    words = segmentor.segment('这本书很好, 我喜欢iphone, 1.5')
    words = segmentor.segment('张子萱怀孕了')
    words = segmentor.segment('我有一本书')
    words = segmentor.segment('今天是2017年3月30日, 清朝的官员')
    words = segmentor.segment('蚂蚁金服近日上市')
    words = segmentor.segment('国家主席习近平抵达美国佛罗里达州')
    words = segmentor.segment('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    total_txt = '<a href=\"http://deeporiginalx.com/search.html#sw=%E7%AC%AC%E4%B8%80%E7%99%BD%E9%93%B6%E7%BD%91\" target=\"_blank\">第一白银网</a>4月19日讯<a href=\"http://deeporiginalx.com/search.html#sw=%E7%8E%B0%E8%B4%A7%E7%99%BD%E9%93%B6\" target=\"_blank\">现货白银</a>今日早盘走势受到美元反弹影响继续走软,目前交投于18.2一线,本周二美国总统特朗普再次提及税改政策,并且宣称将会以“迅雷不及掩耳之势”落地,据小编分析,税改落地将会利好美国经济,从而利好美元,打压白银走势,但问题是,3月份连医改都进展不顺,税改会通过吗?(<a href=\"http://deeporiginalx.com/search.html#sw=%E7%BC%96%E8%BE%91%E6%8E%A8%E8%8D%90%EF%BC%9A%E6%9C%AA%E6%9D%A5%E7%99%BD%E9%93%B6%E8%B5%B0%E5%8A%BF%E5%88%86%E6%9E%90\" target=\"_blank\"><strong><span>编辑推荐:未来白银走势分析</span></strong></a>'
    # second assignment overwrites the first sample
    total_txt = "<span class=\"article_src\">游民星空</span>2017-04-09<span>阅读原文</span>"
    # strip HTML markup before segmenting
    soup = BeautifulSoup(total_txt, 'lxml')
    total_txt = soup.get_text()
    print total_txt
    print type(total_txt)
    # pyltp wants utf-8 bytes here (Python 2 unicode from BeautifulSoup)
    words = segmentor.segment(total_txt.encode('utf-8'))
    #words = segmentor.segment(s)
    for i in words:
        print i
    import jieba
    w_jieba = jieba.cut('独家|你想要的胸以下全是腿, 科切拉潮人用不')
    print '!!!!!'
    for i in w_jieba:
        print i
    from pyltp import Postagger
    poser = Postagger()
    poser.load('/Users/a000/git/ltp_data/pos.model')
    #words_pos = poser.postag(words)
    #for i in xrange(len(words_pos)):
    #    print words[i]
    #    print words_pos[i]
    # Hand-picked near-duplicate headline pairs; only s6/s7 are used below.
    s1 = '张继科:脚伤恢复七八成 现在不是想退役的时候'
    s2 = '张继科:脚伤恢复八成 现在还不是退役的时候'
    #s2 = '张继科和马龙:脚伤恢复八成 现在还不是退役的时候'
    s3 = '张继科:脚伤已恢复7-8成 现在还不是退役的时候'
    s4 = '国际乒联排名:马龙丁宁占据榜首 张继科第四'
    s5 = '国际乒联公布排名:马龙丁宁第一 张继科第四'
    s6 = '国家主席习近平抵达美国佛罗里达州'
    s7 = '习近平抵达美国佛罗里达州'
    s8 = '习近平抵达美国佛罗里达州 同特朗普会晤'
    s9 = '习近平抵达美国佛罗里达州 将与特朗普举行会晤'
    s10 = '习近平抵达美国 将同特朗普举行会晤'
    s11 = '习近平抵达美国佛罗里达州 将同特朗普举行中美元首会晤'
    s12 = '【V观】习近平引用芬兰谚语:没有人的开拓就不会有路'
    s13 = '习近平引用芬兰谚语:没有人的开拓就不会有路'
    s14 = '习近平就圣彼得堡地铁发生爆炸造成伤亡向普京致慰问电'
    # s15 = '习近平就圣彼得堡地铁爆炸事件向普京致慰问电'  #15135383
    ss16 = '习近平就圣彼得堡市地铁发生爆炸造成严重人员伤亡向普京致慰问电'  #15130013
    ss17 = '习近平就圣彼得堡市地铁爆炸向普京致慰问电'  #15127277
    s16 = '习近平离京对芬兰进行国事访问并赴美国举行中美元首会晤'  #15131991
    s17 = '习近平离京对芬兰进行国事访问并赴美举行中美元首会晤'  #15132864
    s18 = '习近平离京对芬兰共和国进行国事访问并赴美国佛罗里达州举行中美元首会晤'  #15131971
    # Segment + tag the s6/s7 pair and print side by side.
    ws1 = segmentor.segment(s6)
    ws2 = segmentor.segment(s7)
    print ' '.join(ws1)
    print ' '.join(ws2)
    pos1 = poser.postag(ws1)
    pos2 = poser.postag(ws2)
    print ' '.join(pos1)
    print ' '.join(pos2)
    # Named-entity recognition on the same pair.
    from pyltp import NamedEntityRecognizer
    reco = NamedEntityRecognizer()
    reco.load('/Users/a000/git/ltp_data/ner.model')
    ne1 = reco.recognize(ws1, pos1)
    ne2 = reco.recognize(ws2, pos2)
    print ' '.join(ne1)
    print ' '.join(ne2)
    # Dependency parsing, printed as head:relation per token.
    from pyltp import Parser
    parser = Parser()
    parser.load('/Users/a000/git/ltp_data/parser.model')
    arc1 = parser.parse(ws1, pos1)
    arc2 = parser.parse(ws2, pos2)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc1)
    print ' '.join("%d:%s" % (arc.head, arc.relation) for arc in arc2)
class Ltp(LtpSegment):
    """LTP NLP pipeline on top of LtpSegment: POS tagging, named-entity
    recognition, dependency parsing and semantic role labelling.

    All four models are loaded once at class-definition time and shared by
    every instance.
    """

    __model_dir = os.path.join('source', 'ltp_data_v3.4.0')

    # POS tagging
    postagger = Postagger()
    postagger.load(os.path.join(__model_dir, "pos.model"))

    # Named-entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(__model_dir, "ner.model"))

    # Dependency parsing
    parser = Parser()
    parser.load(os.path.join(__model_dir, "parser.model"))

    # Semantic role labelling
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(__model_dir, "pisrl.model"))

    def __init__(self):
        super().__init__()

    def postag(self, words):
        """POS-tag segmented words.

        :param words: segmentation result (list of str)
        :return: list of POS tags, one per word
        """
        postags = self.postagger.postag(words)
        return list(postags)

    def recognize(self, words, postags):
        """Named-entity recognition.

        LTP uses the BIESO scheme: B = entity begin, I = inside, E = end,
        S = single-word entity, O = not part of an entity. Entity types are
        Nh (person), Ns (place), Ni (organization); position and type are
        joined with '-', e.g. "S-Nh" (O carries no type suffix).

        :param words: segmentation result (list of str)
        :param postags: POS tags (list of str)
        :return: list of NE tags, one per word
        """
        netags = self.recognizer.recognize(words, postags)
        return list(netags)

    def parse(self, words, postags):
        """Dependency parsing.

        :param words: segmentation result (list of str)
        :param postags: POS tags (list of str)
        :return: native pyltp arcs; for each arc, arc.head is the 1-based
            index of the parent word (0 = ROOT) and arc.relation the
            dependency label, e.g. for ['元芳','你','怎么','看'] /
            ['nh','r','r','v'] the output is 4:SBV 4:SBV 4:ADV 0:HED.
        """
        arcs = self.parser.parse(words, postags)
        return arcs

    def label(self, words, postags, arcs):
        """Semantic role labelling.

        :param words: segmentation result (list of str)
        :param postags: POS tags (list of str)
        :param arcs: dependency-parse result from :meth:`parse`
        :return: native pyltp roles; role.index is the 0-based predicate
            index, and each arg in role.arguments has arg.name (role type)
            plus arg.range.start / arg.range.end word indices. E.g. for
            ['元芳','你','怎么','看'] the predicate 3 ("看") gets
            A0:(0,0) A0:(1,1) ADV:(2,2). A sentence may yield no roles.
        """
        roles = self.labeller.label(words, postags, arcs)
        return roles

    def get_name_entity(self, sentence, entity_type):
        """Collect all named entities of *entity_type* found in *sentence*.

        :param sentence: raw sentence to analyse
        :param entity_type: LTP entity type code: "Nh", "Ns" or "Ni"
        :return: list of unique entity strings
        """
        words = self.segment(sentence)
        postags = self.postag(words)
        ne_tags = self.recognize(words, postags)
        ret_entity = set()
        entity_pattern = ""
        for i in range(len(words)):
            # BUG FIX: the second test used to repeat 'B-', so the middle
            # ('I-') words of multi-word entities were silently dropped.
            if (ne_tags[i] == 'B-' + entity_type) or (ne_tags[i] == 'I-' + entity_type):
                entity_pattern += words[i]
            elif (ne_tags[i] == 'E-' + entity_type) or (ne_tags[i] == 'S-' + entity_type):
                entity_pattern += words[i]
                ret_entity.add(entity_pattern)
                entity_pattern = ""
        return list(ret_entity)
from pyltp import Parser
from pyltp import NamedEntityRecognizer
from pyltp import SentenceSplitter
from scipy.spatial.distance import cosine
from bert_serving.client import BertClient

# Model file locations, resolved relative to the current working directory.
cws_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'cws.model')
pos_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'pos.model')
par_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'parser.model')
ner_model_path = os.path.join(os.path.abspath('./'), 'ltp_Model', 'ner.model')
say_words_path = os.path.join(os.path.abspath('./'), 'data', 'saying_words.pickle')

# LTP pipelines, loaded once at import time.
segmentor = Segmentor()  # segmentation
postagger = Postagger()  # POS tagging
recognizer = NamedEntityRecognizer()  # named-entity recognition
parser = Parser()  # dependency parsing
segmentor.load(cws_model_path)
postagger.load(pos_model_path)
recognizer.load(ner_model_path)
parser.load(par_model_path)

# load "saying" verbs (e.g. 说/表示) used to anchor quote extraction
# NOTE(review): the file handle from open() is never closed here.
say_words = pickle.load(open(say_words_path, 'rb'))


# Sentence dependency analysis
def parsing(sentence):
    """Dependency-parse *sentence*.

    NOTE(review): truncated in this view — only the segmentation step is
    visible; the rest of the body continues past this chunk.
    """
    words = segmentor.segment(sentence)  # pyltp segmentation
def tag(words):
    """POS-tag *words* with a freshly loaded LTP Postagger.

    :param words: segmented words (list of str)
    :return: list of POS tags, one per word
    NOTE: loads the model on every call; hoist to module level if this is hot.
    """
    postagger = Postagger()
    postagger.load(pos_model_path)
    try:
        # Materialize before release(): pyltp's returned vector is backed by
        # native memory owned by the tagger — presumably invalid after
        # release, so copy it into a plain list first.
        postags = list(postagger.postag(words))
    finally:
        postagger.release()  # release the model even if postag() raises
    return postags
def __init__(self):
    """
    init method required. set batch_size, and load some resources.

    Defines all TF app flags, loads the LTP models (segmentation, POS, NER),
    loads vocabulary/label dictionaries, builds the HierarchicalAttention
    model and restores two checkpoints (accusation and law) into two sessions.
    """
    self.batch_size = 256
    FLAGS = tf.app.flags.FLAGS
    # --- checkpoint / data-path flags -------------------------------------
    tf.app.flags.DEFINE_string("ckpt_dir", "predictor/checkpoint/", "checkpoint location for the model")
    tf.app.flags.DEFINE_string("ckpt_dir_accu", "./all_data/han/checkpoint_accu/", "checkpoint location for the model")
    tf.app.flags.DEFINE_string("ckpt_dir_law", "./all_data/han/checkpoint_law/", "checkpoint location for the model")
    tf.app.flags.DEFINE_string("ckpt_dir_imprision", "./all_data/han/checkpoint_imprision/", "checkpoint location for the model")
    tf.app.flags.DEFINE_string("vocab_word_path", "predictor/word_freq.txt", "path of word vocabulary.")
    tf.app.flags.DEFINE_string("accusation_label_path", "predictor/accu.txt", "path of accusation labels.")
    tf.app.flags.DEFINE_string("article_label_path", "predictor/law.txt", "path of law labels.")
    tf.app.flags.DEFINE_string("stopwords_file", "predictor/stopword.txt", "path of stopword")
    # --- training hyper-parameters (inference still needs them to build the graph)
    tf.app.flags.DEFINE_float("learning_rate", 0.001, "learning rate")
    tf.app.flags.DEFINE_integer(
        "decay_steps", 1000, "how many steps before decay learning rate.")
    tf.app.flags.DEFINE_float("decay_rate", 1.0, "Rate of decay for learning rate.")
    tf.app.flags.DEFINE_integer("sentence_len", 400, "max sentence length")
    tf.app.flags.DEFINE_integer("num_sentences", 16, "number of sentences")
    tf.app.flags.DEFINE_integer("embed_size", 300, "embedding size")  #64
    tf.app.flags.DEFINE_integer("hidden_size", 256, "hidden size")  #128
    tf.app.flags.DEFINE_integer(
        "num_filters", 128, "number of filter for a filter map used in CNN.")  #128
    tf.app.flags.DEFINE_integer("embed_size_dpcnn", 64, "embedding size")
    tf.app.flags.DEFINE_integer("hidden_size_dpcnn", 128, "hidden size")
    #tf.app.flags.DEFINE_integer("num_filters_big", 128, "number of filter for a filter map used in CNN.")
    tf.app.flags.DEFINE_string(
        "model_dpcnn", "dp_cnn", "name of model:han,c_gru,c_gru2,gru,text_cnn")
    tf.app.flags.DEFINE_string("ckpt_dir_dpcnn", "./checkpoint_dpcnn_big/checkpoint/", "checkpoint location for the model")
    tf.app.flags.DEFINE_boolean(
        "is_training", False, "is traning.true:tranining,false:testing/inference")
    tf.app.flags.DEFINE_string(
        "model", "han", "name of model:han,c_gru,c_gru2,gru,text_cnn")
    #tf.app.flags.DEFINE_boolean("is_training_flag", False, "is traning.true:tranining,false:testing/inference")
    # --- LTP model paths ---------------------------------------------------
    tf.app.flags.DEFINE_string('cws_model_path', 'predictor/cws.model', 'cws.model path')
    tf.app.flags.DEFINE_string('pos_model_path', 'predictor/pos.model', 'pos.model path')
    tf.app.flags.DEFINE_string('ner_model_path', 'predictor/ner.model', 'ner.model path')
    tf.app.flags.DEFINE_string('gpu', '1', 'help to select gpu divice')
    # Pin GPU selection before any TF session is created.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu
    # Load the LTP pipeline (segmenter, POS tagger, NER).
    segm = Segmentor()
    segm.load(FLAGS.cws_model_path)  # ltp models
    post = Postagger()
    post.load(FLAGS.pos_model_path)
    recognizer = NamedEntityRecognizer()
    recognizer.load(FLAGS.ner_model_path)
    self.ltp_model = [segm, post, recognizer]
    filter_sizes = [2, 3, 4, 5
                    ]  #,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10] # [30,40,50] #8
    #filter_sizes_big= [2,3,4,5]#,6,7,8]#[2,3,4,5]#[6, 7, 8, 9, 10] # [30,40,50] #8
    stride_length = 1
    #1.load label dict, restore model from checkpoint
    # 1.load label dict
    self.vocab_word2index = load_word_vocab(FLAGS.vocab_word_path)
    accusation_label2index = load_label_dict_accu(
        FLAGS.accusation_label_path)
    articles_label2index = load_label_dict_article(
        FLAGS.article_label_path)
    deathpenalty_label2index = {True: 1, False: 0}
    lifeimprisonment_label2index = {True: 1, False: 0}
    vocab_size = len(self.vocab_word2index)
    accusation_num_classes = len(accusation_label2index)
    article_num_classes = len(articles_label2index)
    deathpenalty_num_classes = len(deathpenalty_label2index)
    lifeimprisonment_num_classes = len(lifeimprisonment_label2index)
    # 2.restore checkpoint
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # NOTE(review): as_default() returns a context manager, so `with graph:`
    # works, but the Graph object itself is never bound — confirm nothing
    # else needs a handle to it.
    graph = tf.Graph().as_default()
    with graph:
        # One shared graph; two savers/sessions restore the accusation and
        # law checkpoints independently.
        self.model = HierarchicalAttention(accusation_num_classes,
                                           article_num_classes,
                                           deathpenalty_num_classes,
                                           lifeimprisonment_num_classes,
                                           FLAGS.learning_rate,
                                           self.batch_size,
                                           FLAGS.decay_steps,
                                           FLAGS.decay_rate,
                                           FLAGS.sentence_len,
                                           FLAGS.num_sentences,
                                           vocab_size,
                                           FLAGS.embed_size,
                                           FLAGS.hidden_size,
                                           num_filters=FLAGS.num_filters,
                                           model=FLAGS.model,
                                           filter_sizes=filter_sizes,
                                           stride_length=stride_length)
        saver = tf.train.Saver()
        sess = tf.Session(config=config)
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir_accu))
        self.sess_accu = sess
        saver_law = tf.train.Saver()
        sess_law = tf.Session(config=config)
        saver_law.restore(sess_law,
                          tf.train.latest_checkpoint(FLAGS.ckpt_dir_law))
        self.sess_law = sess_law
        self.FLAGS = FLAGS
def getRelation(paragraph):
    """
    paragraph: a list of string, each string is a sentence
    return: a list of relations and a dict which records the number of
            occurrence of differents DSNF

    For each sentence: runs the LTP pipeline (segment, POS, parse, NER),
    extracts entity index groups, applies the DSNF1/2/3/7 relation patterns,
    and for each extracted triple stores/marks it in the kg/DB layer and
    runs the classifier `pre` on it.
    NOTE(review): all four LTP models are loaded and released on every call;
    hoist them to module level if this is called repeatedly.
    """
    relations = []
    dict_DSNF = {
        'num_DSNF1': 0,
        'num_DSNF2': 0,
        'num_DSNF3': 0,
        'num_DSNF7': 0,
    }
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    for iteration, sentence in enumerate(paragraph):
        # NOTE(review): only the FIRST sub-sentence of each paragraph entry
        # is analysed — confirm that is intentional.
        sentence = SentenceSplitter.split(sentence)[0]
        words = segmentor.segment(sentence)
        # print("\t".join(words))
        postags = postagger.postag(words)
        # list-of-string parameter is support in 0.1.5
        # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
        # print("\t".join(postags))
        arcs = parser.parse(words, postags)
        # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
        netags = recognizer.recognize(words, postags)
        # print("\t".join(netags))
        # labeller = SementicRoleLabeller()
        # labeller.load(os.path.join(MODELDIR, "pisrl.model"))
        # roles = labeller.label(words, postags, arcs)
        # for role in roles:
        #     print(role.index, "".join(
        #         ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
        # entityList: groups of word indices, one group per entity
        entityList = findEntities(netags)
        # print(entityList)
        # join each index group back into the entity surface string
        entities = []
        for i in entityList:
            l = ''
            for j in i:
                l += words[j]
            entities.append(l)
        # Apply the four DSNF dependency patterns.
        DSNF1_ret = DSNF1(arcs, entityList, words, netags)
        DSNF2_ret = DSNF2(arcs, entityList, words)
        DSNF3_ret = DSNF3(arcs, entityList, words, postags)
        DSNF7_ret = DSNF7(arcs, entityList, words)
        # print("DSNF1 result: ", DSNF1_ret)
        # print("DSNF2 result: ", DSNF2_ret)
        # print("DSNF3 result: ", DSNF3_ret)
        # print("DSNF7 result: ", DSNF7_ret)
        relation = []
        # Each pattern returns triples (e1, e2, rel); reorder to
        # [e1, rel, e2] and collect per-sentence and globally.
        for r in DSNF1_ret:
            dict_DSNF['num_DSNF1'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF2_ret:
            dict_DSNF['num_DSNF2'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF3_ret:
            dict_DSNF['num_DSNF3'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        for r in DSNF7_ret:
            dict_DSNF['num_DSNF7'] += 1
            new_r = [r[0], r[2], r[1]]
            relation.append((new_r, sentence))
            relations.append((new_r, sentence))
        if len(relation) > 0:
            print("evaluate the " + str(iteration + 1) + "-th sentences")
            print("entities in " + str(iteration + 1) + "-th sentence : ", entities)
            for one in relation:
                r = one[0]
                data = {'sentence': sentence, 'kg': [r[0], r[1], r[2]]}
                # print('r',r)
                key = get_key(data)
                # Skip triples already marked in the DB.
                # NOTE(review): prefer `old is None` over `old == None`.
                old = DB.kg_mark.find_one({"_id": key})
                if old == None:
                    kg.mark_sentence(key, data)
                else:
                    print("已经存在跳过")
                    continue
                print(one)
                # classify the candidate triple
                p, softmax = pre(data)
                print("with entities relation: ", r)
                print("预测:", p, "概率:", softmax)
                data['label'] = p
                data['state'] = '4'  # state 4 keeps these entries separate
                print(data)
                # if len(relation)==3:
                #     print("关系",relation[1],relation[2],relation[0])
            print("--" * 30)
    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
    # labeller.release()
    return relations, dict_DSNF
def __init__(self):
    """Load the LTP POS-tagging model from data/ltp_data/pos.model."""
    tagger = Postagger()
    tagger.load('data/ltp_data/pos.model')
    self.postagger = tagger
def pos_tag(words: list) -> list:
    """Return LTP part-of-speech tags for *words*.

    Lazily loads the model into the module-level ``postagger`` on first use.
    """
    global postagger
    if not postagger:
        tagger = Postagger()
        tagger.load(pos_path)  # load once, then cache in the module global
        postagger = tagger
    tags = postagger.postag(words)
    return list(tags)
# Normalize raw input lines: newline -> space, then split on tabs.
# NOTE(review): `input` shadows the builtin and, like input_txt/truth_txt/
# truth, is defined outside this view.
for i in range(len(input_txt)):
    input_txt[i] = input_txt[i].replace('\n', ' ')
    input.append(input_txt[i].split('\t'))
# Load the models
LTP_DATA_DIR = '../../../pyltp/model/ltp_data_v3.4.0/'  # LTP model directory
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model path
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS-tagging model path
#par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency-parsing model path
segmentor = Segmentor()  # initialize segmenter
segmentor.load(cws_model_path)  # load segmentation model
postagger = Postagger()  # initialize tagger
postagger.load(pos_model_path)  # load POS model
#parser = Parser()  # initialize instance
#parser.load(par_model_path)  # load model
f = open("../data/CGED16_Input.txt", 'w')
for i in range(len(input)):
    flag = 0
    # NOTE(review): this cleanup loop re-runs for EVERY outer i, appending
    # the whole truth list again each time — looks like it belongs before
    # the outer loop; confirm against the full script (this chunk appears
    # truncated mid-loop).
    for j in range(len(truth_txt)):
        truth_txt[j] = truth_txt[j].replace(' ', '')
        truth_txt[j] = truth_txt[j].replace('\n', '')
        truth.append(truth_txt[j].split(','))
    word_list = segmentor.segment(input[i][1])  # segmentation
    tag_list = postagger.postag(word_list)  # POS tagging
def __init__(self): self.LTP_DATA_DIR = '/Users/yf/Downloads/ltp_data_v3.4.0' # 自定义分词表 self.cut_file = '/Users/yf/Downloads/ltp_data_v3.4.0/cut.txt' # 分词结果 self.cut_list = [] # 依存关系 self.arcs = None # 词性 self.part_speech_list = [] # 分词 self.segmentor = Segmentor() self.segmentor.load_with_lexicon( os.path.join(self.LTP_DATA_DIR, 'cws.model'), self.cut_file) # 词性标注 self.postagger = Postagger() self.postagger.load(os.path.join(self.LTP_DATA_DIR, 'pos.model')) # 命名实体识别 self.recognizer = NamedEntityRecognizer() self.recognizer.load(os.path.join(self.LTP_DATA_DIR, 'ner.model')) # 依存句法分析 self.parser = Parser() self.parser.load(os.path.join(self.LTP_DATA_DIR, 'parser.model')) # 语义角色标注 self.labeller = SementicRoleLabeller() self.labeller.load(os.path.join(self.LTP_DATA_DIR, 'pisrl.model')) # 词性标注集 self._dict = { "a": "形容词", "ni": "机构名称", "b": "其他名词修饰语", "nl": "位置名词", "c": "连词", "ns": "地名", "d": "副词", "nt": "时态名词", "e": "感叹", "nz": "其他专有名词", "g": "词素", "o": "拟声词", "h": "字首", "p": "介词", "i": "成语", "q": "数量", "j": "缩写", "r": "代词", "k": "后缀", "u": "辅助的", "m": "数", "v": "动词", "n": "一般名词", "wp": "标点", "nd": "方向名词", "ws": "外来词", "nh": "人名", "x": "最小意义单位" } # 依存句法关系 self._dict2 = { "SBV": "主谓关系", "VOB": "动宾关系", "IOB": "间宾关系", "FOB": "前置宾语", "DBL": "兼语", "ATT": "定中关系", "ADV": "状中结构", "CMP": "动补结构", "COO": "并列关系", "POB": "介宾关系", "LAD": "左附加关系", "RAD": "右附加关系", "IS": "独立结构", "HED": "核心关系" } # 命名实体识别标注集 self._idct3 = { "O": "这个词不是NE", "S": "这个词单独构成一个NE", "B": "这个词为一个NE的开始", "I": "这个词为一个NE的中间", "E": "这个词位一个NE的结尾" } self._dict4 = {"Nh": "人名", "Ni": "机构名", "Ns": "地名"} # 语义角色类型 self._dict5 = { "ADV": "默认标记", "BNE": "受益人", "CND": "条件", "DIR": "方向", "DGR": "程度", "EXT": "扩展", "FRQ": "频率", "LOC": "地点", "MNR": "方式", "PRP": "目的或原因", "TMP": "时间", "TPC": "主题", "CRD": "并列参数", "PRD": "谓语动词", "PSR": "持有者", "PSE": "被持有" }
def run():
    """Segment and POS-tag review sentences, then build word-frequency files.

    Pipeline:
      1. Segment key/pinglun_resource.txt with a user dictionary; write the
         segmentation, the word/tag pairs, and the filtered keywords
         (n/d/a tags, length > 1) to the key1/ output files.
      2. Count keyword frequencies and write the dictionary file.
      3. Write the dictionary sorted by descending frequency.
    """
    segmentor = Segmentor()  # initialize instance
    # segmentor.load('cws.model')  # load model without a user dictionary
    segmentor.load_with_lexicon('cws.model', 'userdict.txt')  # load model + user dictionary
    postagger = Postagger()  # initialize instance
    postagger.load('pos.model')  # load model
    # sentence1=['操作简单','洗衣效果好']
    # sentence1=open('key/pinglun_filter_all1.txt','r',encoding='utf-8')
    sentence1 = open('key/pinglun_resource.txt', 'r', encoding='utf-8')
    pinglun = open('key1/pinglun_all.txt', 'w', encoding='utf-8')
    pinglun_cut = open('key1/pinglun_cut.txt', 'w', encoding='utf-8')
    pinglun_tag = open('key1/pinglun_tag.txt', 'w', encoding='utf-8')
    for sentence in sentence1:
        words = segmentor.segment(sentence)  # segmentation
        pinglun_cut.write('\t'.join(words) + '\n')
        postags = postagger.postag(words)  # POS tagging
        for word, tag in zip(words, postags):
            # full word/tag listing
            pinglun_tag.write(word + '/' + tag + '\n')
            # keep only nouns/adverbs/adjectives longer than one character
            if ((tag == 'n' or tag == 'd' or tag == 'a') and len(word) > 1):
                pinglun.write(word + '\n')
    segmentor.release()  # release models
    postagger.release()
    sentence1.close()
    pinglun.close()
    pinglun_cut.close()
    pinglun_tag.close()
    # ---- word-frequency count + dictionary file ----
    word_df = []  # [word, count-as-string] pairs for the sorted output
    in_file = 'key1/pinglun_all.txt'
    out_file = 'key1/pinglun_dict.txt'
    word_count = {}  # word -> frequency
    for line in open(in_file, 'r', encoding='utf-8'):
        words = line.strip().split("\n")
        for word in words:
            if word in word_count:
                word_count[word] += 1
            else:
                word_count[word] = 1
    out = open(out_file, 'w', encoding='utf-8')
    # NOTE: plain dict iteration order, i.e. insertion (first-seen) order.
    for word in word_count.keys():
        # BUG FIX: was `word is not ''` — identity comparison with a literal
        # (SyntaxWarning on Python 3.8+, implementation-defined result).
        if word != '':
            out.write(word)
            word_df.append([word, str(word_count.get(word))])
            out.write('\n')
    print('制作的字典已经保存到key1/pinglun_dict.txt')
    out.close()
    # number = 0
    with open("key1/pinglun_reverse.txt", 'w', encoding='utf-8') as wf2:
        # sort by frequency, highest first
        word_df.sort(key=lambda x: int(x[1]), reverse=True)
        wf2.truncate()
        for item in word_df:
            for word in item:
                wf2.write(word + '\t')
            wf2.write('\n')
            # number += 1
            # if number == 50:
            #     break
    # the `with` block closes wf2; the explicit close() was redundant
    print('字典倒序排序已经保存到key1/pinglun_reverse.txt ')
def __init__(self):
    """Load the LTP dependency parser and POS tagger from LTP_DATA_DIR."""
    parser = Parser()
    tagger = Postagger()
    parser.load(f'{LTP_DATA_DIR}/parser.model')
    tagger.load(f'{LTP_DATA_DIR}/pos.model')
    self.parser = parser
    self.postagger = tagger
def simlify(text):
    """Segment, POS-tag and dependency-parse *text* with LTP.

    :param text: raw Chinese sentence
    :return: list of dicts, one per word:
        {"dep": word, "gov": head word or "Root", "pos": dependency relation}

    BUG FIX: the original carried an NER block AFTER the return statement —
    it was unreachable dead code and has been removed; the original also
    never released the segmenter.
    """
    LTP_DATA_DIR = r'E:\anaconda\ltpmoxin\ltp_data'  # LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')  # segmentation model
    lexicon_path = os.path.join(LTP_DATA_DIR, 'lexicon')  # custom segmentation lexicon
    segmentor = Segmentor()
    # load model with the custom lexicon (plain .load() would skip it)
    segmentor.load_with_lexicon(cws_model_path, lexicon_path)
    # Copy out of the native vector before releasing the segmenter —
    # presumably the vector is backed by memory owned by the model.
    words = list(segmentor.segment(text))
    segmentor.release()

    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS-tagging model
    postagger = Postagger()
    postagger.load(pos_model_path)
    postags = list(postagger.postag(words))  # tag the segmented words
    postagger.release()

    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency model
    parser = Parser()
    parser.load(par_model_path)
    arcs = parser.parse(words, postags)  # dependency parsing
    # Extract before release: head index (1-based, 0 = ROOT) and relation.
    rely_id = [arc.head for arc in arcs]
    relation = [arc.relation for arc in arcs]
    parser.release()

    # Resolve each head index to its word ("Root" for the sentence head).
    heads = ['Root' if head_id == 0 else words[head_id - 1] for head_id in rely_id]
    array = []
    for i in range(len(words)):
        array.append({"dep": words[i], "gov": heads[i], "pos": relation[i]})
    return array
class Books:
    '''
    Extract the main characters of every chapter and of the whole book,
    and the main locations, using pyltp segmentation/POS tagging and
    nltk frequency statistics.

    NOTE(review): the Segmentor/Postagger models are loaded in the class
    body, i.e. once at import time and shared by all instances.
    '''
    ltp_model_path = "E:\\NLP-homework\\ltp-data-v3.3.1\\ltp_data"  # LTP model path
    book_root_path = "E:\\NLP-homework\\book"  # books directory
    mainrole_root_path = "E:\\NLP-homework\\MainRole"  # main-character output directory
    mainloc_root_path = "E:\\NLP-homework\\MainLocation"  # main-location output directory
    seg = Segmentor()
    seg.load(ltp_model_path + '/cws.model')
    postagger = Postagger()  # POS tagger instance
    postagger.load(ltp_model_path + '/pos.model')  # load the POS model

    def readBookLines(self, path):
        """Read a UTF-8 text file and return its lines."""
        rf = open(path, "r", encoding="utf-8")
        lines = rf.readlines()
        rf.close()
        return lines

    def writeTxt(self, path, namelist):
        """Write (name, times, freq) triples to *path*, one per line."""
        print("path", path, " namelist:", namelist, "结果写入")
        wf = open(path, "w", encoding="utf-8")
        for name, times, freq in namelist:
            wf.write(str(name) + " " + str(times) + " " + str(freq) + "\n")
        wf.close()
        print(namelist, " writeTxt over")

    def segmentor(self, sentence="这是测试"):
        """Segment *sentence* with the shared LTP segmenter; return a list of words."""
        words = self.seg.segment(sentence)
        words_list = list(words)
        #for word_list in words_list:
        #    print("segmentor 1:", word_list)
        return words_list

    def postagNLNS(self, word_list):
        '''
        LTP POS tagging; keep location nouns.
        nl = location noun, ns = geographical name.
        NOTE(review): the len(word) > 3 filter keeps only words of 4+
        characters — confirm this threshold is intended.
        :param word_list: list of segmented words
        :return: list of location words
        '''
        postags = self.postagger.postag(word_list)  # POS tagging
        locations_list = []
        for word, tag in zip(word_list, postags):
            if (tag == "nl" or tag == "ns") and len(word) > 3:
                #print("postagNLNS :",word," ",tag)
                locations_list.append(word)
        return locations_list

    def postagNH(self, word_list):
        """Keep person names (tag 'nh', 4+ chars); return (all tags, names)."""
        postags = self.postagger.postag(word_list)
        name_list = []
        for word, tag in zip(word_list, postags):
            if tag == "nh" and len(word) > 3:
                print("postagNH :", word, '/', tag)
                name_list.append(word)
        print("postagNH name_list", name_list)
        print("postagNH list(postags)", list(postags))
        return list(postags), name_list

    def getTopTen(self, namelist):
        """Return ([(name, count, relative_freq)], [name]) for the 10 most
        frequent entries of *namelist*; frequencies are relative to the top 10."""
        resultitf = []
        resultname = []
        top10Name = []
        chapter_fdist = nltk.FreqDist(namelist)  # nltk frequency distribution
        top_name_list = sorted(chapter_fdist.items(), key=lambda x: x[1], reverse=True)
        for name, num in top_name_list[0:10]:
            # Re-expand each name `num` times so the second FreqDist below
            # computes frequencies relative to the top-10 subset only.
            tmplist = [name] * num
            top10Name += tmplist
            resultname.append(name)
        chapter_fdist_ten = nltk.FreqDist(top10Name)
        for name1, num1 in sorted(chapter_fdist_ten.items(), key=lambda x: x[1], reverse=True):
            print(name1, num1, round(float(chapter_fdist_ten.freq(name1)), 2))
            resultitf.append(
                (name1, num1, round(float(chapter_fdist_ten.freq(name1)), 2)))
        return resultitf, resultname

    def mainLocation(self, filename="Thethreebodyproblem.txt"):
        """Collect location words per sentence of the book and write the top 10."""
        lines = self.readBookLines(self.book_root_path + "/" + filename)
        print("mainLocation:", "filename", filename)
        lo_list_book = []
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    lo_list_line = self.postagNLNS(words_line)
                    lo_list_book += lo_list_line
        top_itf_book, top_loc_book = self.getTopTen(lo_list_book)
        lo_list_book += top_loc_book
        self.writeTxt(self.mainloc_root_path + "/" + filename, top_itf_book)

    def mainName(self, filename):
        """Collect person names per sentence of the book and write the top 10."""
        lines = self.readBookLines(self.book_root_path + "\\" + filename)
        print("mainName 1 :", filename)
        name_list_book = []
        for line in lines:
            if line != "":
                sents = SentenceSplitter.split(line)
                for sent in sents:
                    words_line = self.segmentor(sent)
                    postags_line, name_list_line = self.postagNH(words_line)
                    name_list_book += name_list_line
        print("mainName 0 name_list_book", name_list_book)
        top_itf_book, top_name_book = self.getTopTen(name_list_book)
        print("mainName 2 top_name_book:", top_name_book)
        print("mainName 3 top_itf_book:", top_itf_book)
        self.writeTxt(self.mainrole_root_path + '\\' + filename, top_itf_book)

    def getAllMainName(self):
        """Run mainName over every file in the book directory."""
        filenames = os.listdir(self.book_root_path)
        print("getAllMainName1:", filenames)
        for filename in filenames:
            print("getAllMainName2 : ", filename)
            self.mainName(filename)

    def getAllMainLoc(self):
        """Run mainLocation over every file in the book directory."""
        filenames = os.listdir(self.book_root_path)
        for filename in filenames:
            print("getAllMainLoc : ", filename)
            self.mainLocation(filename)
def cal_sentiment_NER(df_text, limit=10):
    """Run NLP over the leading rows of the input table.

    For each of the first *limit* rows: preprocess the 'Conclusion' text,
    extract named entities with pyLTP, compute sentiment / keywords /
    summary with SnowNLP, and append the result to a DataFrame.

    Keyword Arguments:
    df_text -- a gftIO.GftTable or pandas DataFrame with 'Conclusion' and
               'WritingDate' columns
    limit   -- number of leading rows to process (default 10, matching the
               original hard-coded slice)
    """
    # POS tagging model
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger = Postagger()
    postagger.load(pos_model_path)
    # named-entity recognition model
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.DEBUG)
    if isinstance(df_text, gftIO.GftTable):
        df_text = df_text.as_mutable_column_tab()
    df_result = pd.DataFrame(columns=[
        'datetime', 'people', 'geography', 'organization', 'keyword',
        'summary', 'score'
    ])
    try:
        for item in df_text[:limit].iterrows():
            logging.info(item[0])
            text = item[1]['Conclusion']
            datetime = item[1]['WritingDate']
            if pd.isnull(text):
                continue  # skip rows without text
            text_split = preprocessing.preprocess_string(text)
            words = text_split.split()  # segmented words
            postags = postagger.postag(words)  # POS tagging
            netags = recognizer.recognize(words, postags)  # NER
            # Group words by their entity tag.
            dict_netags = defaultdict(list)
            for netag, word in zip(netags, words):
                dict_netags[netag].append(word)
            s = SnowNLP(text)
            score = s.sentiments * 2
            # Entity types: person (Nh), place (Ns), organization (Ni),
            # each with B/I/E/S position prefixes.
            ls_organization = [
                dict_netags[x] for x in ['S-Ni', 'B-Ni', 'E-Ni', 'I-Ni']
            ]
            ls_people = [
                dict_netags[x] for x in ['S-Nh', 'B-Nh', 'E-Nh', 'I-Nh']
            ]
            ls_geography = [
                dict_netags[x] for x in ['S-Ns', 'B-Ns', 'E-Ns', 'I-Ns']
            ]
            try:
                df_result = df_result.append(
                    {
                        'datetime': datetime,
                        'keyword': ','.join(s.keywords()),
                        'organization':
                        list(itertools.chain.from_iterable(ls_organization)),
                        'people':
                        list(itertools.chain.from_iterable(ls_people)),
                        'geography':
                        list(itertools.chain.from_iterable(ls_geography)),
                        'summary': ';'.join(s.summary()),
                        'score': score
                    },
                    ignore_index=True)
            except Exception:
                # BUG FIX: the original bare `except:` silently swallowed
                # everything (even KeyboardInterrupt); log and move on.
                logging.exception('failed to append NER/sentiment row')
                continue
    finally:
        # FIX: release the loaded models (the original leaked them).
        postagger.release()
        recognizer.release()
    return df_result
def person_location_entity(word_list):
    """
    Extract person/location/organization entities from a word list with LTP.

    :param word_list: list of segmented words
    :return: dict keyed by entity type ('Nh' person, 'Ns' place, 'Ni'
             organization), each value a list of entity strings (after
             filter_entity post-filtering)
    """
    logging.info('enter person_location_entity...')
    ner_dic = {}
    ner = ''
    if len(word_list) == 0:
        return ner_dic
    MODEL_PATH = r'/home/yanlei/IdeaProjects/hotpot/ltp_model'
    pos_model_path = os.path.join(MODEL_PATH, 'pos.model')  # POS model
    ner_model_path = os.path.join(MODEL_PATH, 'ner.model')  # NER model
    # 1. create instances
    logging.info('initilizing...')
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    # 2. load models
    logging.info('loading...')
    postagger.load(pos_model_path)
    recognizer.load(ner_model_path)
    # 3. POS tagging (list.remove returns None, so call it separately)
    logging.info('postaging...')
    if word_list.count('\n') > 0:
        word_list.remove('\n')
    postags = postagger.postag(word_list)
    # 4. named-entity recognition
    logging.info('recognizering...')
    netags = recognizer.recognize(word_list, postags)
    # 5. collect entities from the tag sequence.
    # Tag format: position-type, where position is B (begin), I (inside),
    # E (end) or S (single-word entity), and type is Nh/Ns/Ni.
    logging.info('result operating...')
    # BUG FIX: the original kept a manual `index` counter whose increment sat
    # AFTER the early `continue` for tags without '-', so every 'O' tag
    # desynchronized the counter from the word list and wrong words were
    # reported as entities. enumerate keeps them aligned.
    for index, tag in enumerate(netags):
        if tag.find('-') == -1:
            continue
        position, ent_type = tag.split('-')  # avoid shadowing builtin `type`
        if position == 'S':
            ner_dic.setdefault(ent_type, [])
            ner_dic[ent_type].append(word_list[index])
        elif position == 'B':
            ner = word_list[index]
        elif position == 'I':
            ner += word_list[index]
        elif position == 'E':
            ner += word_list[index]
            ner_dic.setdefault(ent_type, [])
            ner_dic[ent_type].append(ner)
            ner = ''
    # post-filter each entity list by type
    for ent_type in ner_dic:
        ner_dic[ent_type] = filter_entity(ner_dic.get(ent_type), ent_type)
    logging.info('releasing...')
    postagger.release()  # release the models
    recognizer.release()
    return ner_dic
class LtpFormatter:
    """Thin wrapper around the full pyltp pipeline (segmentation, POS,
    dependency parse, NER, semantic role labelling) that renders results
    as plain dicts.

    All models are loaded once in the class body and shared by instances.
    """
    model_dir = os.path.join("utils", "ltp_data_v3.4.0")  # 注意这里的位置需要调整为运行位置到ltp的相对位置,或者设置为绝对位置
    segmentor = Segmentor()
    segmentor.load(os.path.join(model_dir, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(model_dir, "pos.model"))
    parser = Parser()
    parser.load(os.path.join(model_dir, "parser.model"))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(model_dir, "ner.model"))
    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(model_dir, "pisrl.model"))

    def format_only_pos(self, sentence):
        """Segment and POS-tag *sentence*; return {'basic': [...], 'role': []}."""
        results = {'basic': [], 'role': []}
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        for idx, (word, tag) in enumerate(zip(words, postags)):
            results['basic'].append({
                'index': idx,
                'word': word,
                'pos': tag
            })
        return results

    def format(self, sentence):
        """Run the full pipeline on *sentence*.

        Returns {'basic': per-word dicts with POS/entity/dependency info,
        'role': semantic-role entries keyed by trigger word}. `head` is the
        0-based index of the dependency head (-1 for the root).
        """
        results = {'basic': [], 'role': []}
        words = self.segmentor.segment(sentence)
        postags = self.postagger.postag(words)
        arcs = self.parser.parse(words, postags)
        netags = self.recognizer.recognize(words, postags)
        roles = self.labeller.label(words, postags, arcs)
        for idx, (word, postag, arc, netag) in enumerate(
                zip(words, postags, arcs, netags)):
            results['basic'].append({
                'index': idx,
                'word': word,
                'pos': postag,
                'entity': netag,
                'head': arc.head - 1,
                'relation': arc.relation
            })
        for role in roles:
            relations = [{
                'name': arg.name,
                'start': arg.range.start,
                'end': arg.range.end
            } for arg in role.arguments]
            results['role'].append({
                'trigger': words[role.index],
                'index': role.index,
                'relation': relations
            })
        return results

    def release(self):
        """Release every loaded LTP model."""
        for component in (self.segmentor, self.postagger, self.parser,
                          self.recognizer, self.labeller):
            component.release()
# Read the raw text, segment it, POS-tag it, and run NER over the tokens.
# NOTE(review): `model_path` and `pos_path` are defined elsewhere in the file.
with open('../data/cont.txt', 'r', encoding='utf8') as f:
    content = f.read()
print(content)
for line in content.split('\n'):
    print(line)
    print('----')
# Word segmentation over the whole text (not per line).
seg = Segmentor()
seg.load(model_path)
words = seg.segment(content)
seg.release()
# POS tagging of the segmented words.
pos = Postagger()
pos.load(pos_path)
postag = pos.postag(words)
pos.release()
# Pair each word with its tag as "word :tag" strings.
union = list(zip(list(words), list(postag)))
union_list = [x + ' :' + y for x, y in union]
ner_path = os.path.abspath('./coach/ltp_data_v3.4.0/ner.model')
recognizer = NamedEntityRecognizer()
recognizer.load(ner_path)
# print(list(words))
# print(list(postag))
# NOTE(review): the recognizer is never released in this chunk.
ner_list = recognizer.recognize(list(words), list(postag))
# Keep the NER tags as a pandas Series of strings.
ner_list = pd.Series(ner_list, dtype='str')
# ner_list = ner_list.loc[ner_list.apply(lambda x : str(x) != '0')]
# Derive the project root relative to this training script's location.
# NOTE(review): `ltp_train_path` is defined elsewhere in the file.
trains_path = os.path.dirname(ltp_train_path)
src_path = os.path.dirname(trains_path)
project_path = os.path.dirname(src_path)
cws_model_path = os.path.join(project_path, "models/ltp_data_v3.4.0/cws.model")
user_dict = os.path.join(project_path, "models/ltp_data_v3.4.0/fulluserdict.txt")
pos_model_path = os.path.join(project_path, "models/ltp_data_v3.4.0/pos.model")
parser_model_path = os.path.join(project_path, "models/ltp_data_v3.4.0/parser.model")
sent = "欧洲东部的罗马尼亚,首都是布加勒斯特,也是一座世界性的城市。"
segmentor = Segmentor()  # instantiate the segmentation module
segmentor.load_with_lexicon(cws_model_path, user_dict)
words = segmentor.segment(sent)
postagger = Postagger()  # instantiate the POS tagger
postagger.load(pos_model_path)
postags = postagger.postag(words)
parser = Parser()  # dependency parser
parser.load(parser_model_path)
arcs = parser.parse(words, postags)
arclen = len(arcs)
print("len(arcs): {}".format(arclen))
conll = ""
for i in range(arclen):  # build a CoNLL-style row per token
    if arcs[i].head == 0:
        arcs[i].relation = "ROOT"
    # NOTE(review): the statement below is truncated in the source — the
    # remainder of the CoNLL row construction is missing from this chunk.
    conll += "\t" + words[i] + "(" + postags[i] + ")" + "\t" + postags[i] + "\t" + str(arcs[i].head) + "\t" + arcs[
__author__ = "tianwen jiang" # Set your own model path MODELDIR = "/data/ltp/ltp-models/3.3.0/ltp_data" import sys import os from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer print "正在加载LTP模型... ..." segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) parser = Parser() parser.load(os.path.join(MODELDIR, "parser.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) #labeller = SementicRoleLabeller() #labeller.load(os.path.join(MODELDIR, "srl/")) print "加载模型完毕。" in_file_name = "input.txt" out_file_name = "output.txt"
# Segmenter whose lexicon entries are FORCED: besides preferring lexicon
# words, tokens the model would merge are also split to match the lexicon.
# NOTE(review): `segmentor`, `segmentor_with_vocab`, `sentence` and
# `MODELDIR` are defined elsewhere in the file.
segmentor_with_force_vocab = Segmentor(
    os.path.join(MODELDIR, "cws.model"),
    force_lexicon_path='lexicon.txt'
)
# Compare plain / lexicon / forced-lexicon segmentations of the same sentence.
words = segmentor.segment(sentence)
print("\t".join(words))

words_with_vocab = segmentor_with_vocab.segment(sentence)
print("\t".join(words_with_vocab), "\t\t| With Vocab")

words_with_force_vocab = segmentor_with_force_vocab.segment(sentence)
print("\t".join(words_with_force_vocab), "\t| Force Vocab")

# POS tagging (new-style pyltp API: model path passed to the constructor).
postagger = Postagger(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

# Dependency parsing; each arc is a (head, relation) pair here.
parser = Parser(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print("\t".join("%d:%s" % (head, relation) for (head, relation) in arcs))

# Named-entity recognition over the words and their POS tags.
recognizer = NamedEntityRecognizer(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print("\t".join(netags))

# Semantic role labelling model (used further down in the file).
labeller = SementicRoleLabeller(os.path.join(MODELDIR, "pisrl.model"))
def get_postags(words):
    """POS-tag *words* with LTP and return the tags as a list of strings.

    :param words: list of segmented words
    :return: list of POS tags, one per input word
    """
    postagger = Postagger()
    postagger.load(pos_model_path)
    try:
        postags = postagger.postag(words)
        return list(postags)
    finally:
        # FIX: release the model even if postag() raises — the original
        # leaked the loaded model on any exception.
        postagger.release()
def main():
    """Relation-extraction driver (Python 2).

    Reads a corpus where a line starting with '|' ends a sentence record:
    the first line of a record is the sentence text, subsequent lines are
    known relations "ne1,ne2,type". For every entity pair in a sentence a
    3000+-dim feature vector (entity types, window word embeddings, between
    words) is built, written to feature files, and finally classified with
    the pre-trained model.
    """
    clf = joblib.load('model.pkl')  # pre-trained classifier
    pf = list()     # feature vectors, one per entity pair
    ne1s = list()   # first entity of each pair
    ne2s = list()   # second entity of each pair
    # 300-dim Chinese word2vec embeddings (CBOW)
    model = models.Word2Vec.load_word2vec_format('cn.cbow.bin', binary=True, unicode_errors = 'ignore')
    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    segmentor.load("ltp_data/cws.model")
    postagger.load("ltp_data/pos.model")
    recognizer.load("ltp_data/ner.model")
    ifsen = 1  # 1 => next non-'|' line is a sentence; 0 => it is a relation line
    input = open("trelationExtractionTrainingCorpus.txt", "r")
    outputfv = open('feature_vector.txt', 'w')
    outputfr = open('feature_result.txt', 'w')
    outputp = open('predict_result.txt', 'w')
    line = input.readline()
    senNo = 0
    while line:
        if line[0] == '|':
            # End of a sentence record: locate entity spans in the NER tags
            # (S = single-word entity, B = begin, E = end of a span).
            namedEntityBegin = list()
            namedEntityEnd = list()
            namedEntityCount = 0
            i = 0
            for netag in netags:
                if netag == 'O':
                    i = i + 1
                    continue
                if netag == 'S-Ni' or netag == 'S-Nh' or netag == 'S-Ns':
                    namedEntityBegin.append(i)
                    namedEntityEnd.append(i)
                    namedEntityCount = namedEntityCount + 1
                    i = i + 1
                    continue
                if netag == 'B-Ni' or netag == 'B-Nh' or netag == 'B-Ns':
                    namedEntityBegin.append(i)
                    namedEntityCount = namedEntityCount + 1
                    i = i + 1
                    continue
                if netag == 'E-Ni' or netag == 'E-Nh' or netag == 'E-Ns':
                    namedEntityEnd.append(i)
                    i = i + 1
                    continue
                else:
                    i = i + 1
                    continue
            # Debug: print each recognized entity's words.
            for i in range(namedEntityCount):
                j = namedEntityBegin[i]
                while (j<=namedEntityEnd[i]):
                    print words[j],
                    j = j + 1
                print '\n'
            # Build one feature vector per ordered entity pair (i, j), j > i.
            for i in range(namedEntityCount):
                for j in range(namedEntityCount):
                    if j > i:
                        print '%d, %d' % (i,j)
                        neType1 = neType(netags[namedEntityBegin[i]])
                        neType2 = neType(netags[namedEntityBegin[j]])
                        # Skip same-sign / zero-sum type combinations.
                        # NOTE(review): neType is defined elsewhere; this looks
                        # like it keeps only person-organization style pairs —
                        # confirm against neType's encoding.
                        if neType1*neType2>0 or neType1+neType2==0:
                            continue
                        featureVector = list()
                        featureVector.append(neType1)
                        featureVector.append(neType2)
                        # Left context window: up to 2 words before entity i.
                        if namedEntityBegin[i] < 3:
                            leftWindowScale = namedEntityBegin[i]
                        else:
                            leftWindowScale = 2
                        featureVector.append(leftWindowScale)
                        if leftWindowScale == 0:
                            # No left context: 2 x 300 zero padding.
                            for k in range(300):
                                featureVector.append(0)
                                featureVector.append(0)
                        elif leftWindowScale == 1:
                            # One left word: its embedding (or zeros) + 300 zeros.
                            try:
                                t = model[words[namedEntityBegin[i]-1].decode('utf-8')]
                                for k in t:
                                    featureVector.append(k)
                            except:
                                for k in range(300):
                                    featureVector.append(0)
                            for k in range(300):
                                featureVector.append(0)
                        else:
                            # Two left words: embedding (or zeros) for each.
                            for k in range(2):
                                try:
                                    t = model[words[namedEntityBegin[i]-k-1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        # Right context window: up to 2 words after entity j.
                        wordsLen = len(words)
                        rightWindowScale = wordsLen - namedEntityEnd[j]
                        if rightWindowScale > 2:
                            rightWindowScale = 2
                        featureVector.append(rightWindowScale)
                        if rightWindowScale == 0:
                            for k in range(300):
                                featureVector.append(0)
                                featureVector.append(0)
                        elif rightWindowScale == 1:
                            try:
                                t = model[words[namedEntityEnd[j]+1].decode('utf-8')]
                                for k in t:
                                    featureVector.append(k)
                            except:
                                for k in range(300):
                                    featureVector.append(0)
                            for k in range(300):
                                featureVector.append(0)
                        else:
                            for k in range(2):
                                try:
                                    t = model[words[namedEntityEnd[j]+1+k].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        # Words between the two entities: always 10 x 300 slots
                        # (embeddings for up to 10 between-words, zero-padded;
                        # if more than 10, first 5 and last 5 are used).
                        wordBetweenCount = namedEntityBegin[j] - namedEntityEnd[i] - 1
                        featureVector.append(wordBetweenCount)
                        if wordBetweenCount == 0:
                            for k in range(10):
                                for ktemp in range(300):
                                    featureVector.append(0)
                        elif wordBetweenCount <= 10:
                            for k in range(wordBetweenCount):
                                try:
                                    t = model[words[namedEntityEnd[i]+k+1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                            for k in range(10-wordBetweenCount):
                                for ktemp in range(300):
                                    featureVector.append(0)
                        else:
                            for k in range(5):
                                try:
                                    t = model[words[namedEntityEnd[i]+k+1].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                            for k in range(5):
                                try:
                                    t = model[words[namedEntityBegin[j]-5+k].decode('utf-8')]
                                    for ktemp in t:
                                        featureVector.append(ktemp)
                                except:
                                    for ktemp in range(300):
                                        featureVector.append(0)
                        pf.append(featureVector)
                        # Reconstruct the surface strings of both entities.
                        neIndex = namedEntityBegin[i]
                        ne1 = words[neIndex]
                        while neIndex < namedEntityEnd[i]:
                            neIndex = neIndex + 1
                            ne1 = ne1 + words[neIndex]
                        ne1s.append(ne1)
                        neIndex = namedEntityBegin[j]
                        ne2 = words[neIndex]
                        while neIndex < namedEntityEnd[j]:
                            neIndex = neIndex + 1
                            ne2 = ne2 + words[neIndex]
                        ne2s.append(ne2)
                        # Label: does a known relation connect ne1 and ne2?
                        ifRelation = 0
                        for k in range(relationCount):
                            if (ne1 == relations[k][0] or ne1 == relations[k][1]) and (ne2 == relations[k][0] or ne2 == relations[k][1]) and (ne1 != ne2):
                                ifRelation = 1
                                break
                        if ifRelation == 0:
                            featureResult = 3  # 3 = "no relation" class
                        else:
                            featureResult = relationType(relations[k][2])
                        # Persist the vector and its label.
                        for k in featureVector:
                            outputfv.write('%f ' % k)
                        outputfv.write('\n')
                        outputfr.write(str(featureResult))
                        outputfr.write('\n')
                        print featureResult
            # Record finished: expect a new sentence next.
            ifsen = 1
            line = input.readline()
            print 'senNo: %d' % senNo
            senNo = senNo + 1
            continue
        if ifsen == 1:
            # Sentence line: normalize, segment, POS-tag and NER-tag it.
            print line
            line = unicodedata.normalize('NFKC', line.decode('utf-8')).encode('utf-8')
            words = segmentor.segment(line)
            postags = postagger.postag(words)
            netags = recognizer.recognize(words, postags)
            print "|".join(words)
            print "|".join(postags)
            print "|".join(netags)
            ifsen = 0
            relationCount = 0
            relations = list()
        else:
            # Relation line: "ne1,ne2,type".
            relation = line.split(',')
            relations.append(relation)
            relationCount = relationCount + 1
            print "|".join(relations[relationCount-1])
            print relations[relationCount-1][2]
        line = input.readline()
    segmentor.release()
    postagger.release()
    recognizer.release()
    input.close()
    outputfv.close()
    outputfr.close()
    # Classify every collected feature vector and write the predictions.
    pred_res = clf.predict(pf)
    for i in pred_res:
        outputp.write(str(i))
        outputp.write('\n')