def segmentation(filename, output_filename):
    """Segment and POS-tag every line of a text file with LTP.

    Each input line is written to ``output_filename`` as tab-separated
    ``word/postag`` pairs terminated by a newline. An empty companion file
    ``<prefix>_ner.txt`` is still created, as before; NER output itself is
    disabled.

    Args:
        filename: Path of the input text file, one sentence per line.
        output_filename: Path of the tagged output file.
    """
    print("segmenting '%s' to '%s'" % (filename, output_filename))
    with open(filename, "r") as source:
        lines = source.readlines()

    MODELDIR = "./ltp_data/"
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # The NER and parser models used to be loaded here as well, but nothing
    # in this function consumes them, so they are no longer loaded.

    # NOTE(review): split(".")[0] yields an empty prefix for paths such as
    # "./out.txt"; kept unchanged so existing file names do not move.
    ner_filename = output_filename.split(".")[0] + "_ner.txt"
    try:
        with open(output_filename, "w") as out, open(ner_filename, "w"):
            for raw_line in lines:
                # rstrip handles "\n", "\r\n", blank lines and a final line
                # without a newline; the old slicing crashed on blank lines
                # and dropped a character from an unterminated last line.
                line = raw_line.rstrip("\r\n")
                words = segmentor.segment(line)
                postags = postagger.postag(words)
                for word, postag in zip(words, postags):
                    out.write("%s/%s\t" % (word, postag))
                out.write("\n")
    finally:
        postagger.release()
        segmentor.release()
def segmentsentence(sentence):
    """Segment a sentence with LTP and collect the person names it contains.

    Words tagged as person names are appended to ``entity_list``, a name
    that must be defined outside this function. The segmented words and
    their NER tags are printed space-separated.

    Args:
        sentence: Text to segment.

    Returns:
        The word sequence produced by the segmentor.
    """
    segmentor = Segmentor()
    postagger = Postagger()
    recognizer = NamedEntityRecognizer()
    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")
    try:
        word_list = segmentor.segment(sentence)
        postags_list = postagger.postag(word_list)
        nertags = recognizer.recognize(word_list, postags_list)
        for word, ntag in zip(word_list, nertags):
            # LTP emits position-prefixed tags (S-Nh, B-Nh, I-Nh, E-Nh), so
            # comparing against a bare 'Nh' never matched anything.
            if ntag == 'Nh' or ntag.endswith('-Nh'):
                entity_list.append(word)
        print(" ".join(word_list))
        print(' '.join(nertags))
    finally:
        segmentor.release()
        postagger.release()
        recognizer.release()
    return word_list
def namedEntityRecognize(sentence):
    '''
    Run named-entity recognition on a sentence with the pyltp module.

    Returns: 1) a list of (word, NE tag) tuples, 2) the list of NE tags.
    '''
    # Segmentation: CWS model combined with the user dictionary.
    cutter = Segmentor()
    cutter.load_with_lexicon(inout.getLTPPath(index.CWS),
                             inout.getResourcePath('userDic.txt'))
    words = cutter.segment(sentence)
    cutter.release()

    # Part-of-speech tagging.
    tagger = Postagger()
    tagger.load(inout.getLTPPath(index.POS))
    postags = tagger.postag(words)
    tagger.release()

    # Named-entity recognition.
    ne_model = NamedEntityRecognizer()
    ne_model.load(inout.getLTPPath(index.NER))
    netags = ne_model.recognize(words, postags)
    ne_model.release()

    # Pair every word with its tag.
    namedEntityTagTupleList = [(word, netag) for word, netag in zip(words, netags)]
    neTagList = '\t'.join(netags).split('\t')
    return namedEntityTagTupleList, neTagList
def ltp_ner_data():
    """Run LTP named-entity recognition over the train and test files.

    Each input line is ``<label>\\t<word>/_<pos> <word>/_<pos> ...``. The
    matching output line is ``<label>\\t<word>/_<netag> ...``. Inputs are
    ``const.qc_train_pos`` / ``const.qc_test_pos`` and outputs are
    ``const.qc_train_ner`` / ``const.qc_test_ner``.
    """
    from pyltp import NamedEntityRecognizer

    # Raw string: the plain literal depended on '\B' and '\l' being unknown
    # escape sequences, which newer Python versions warn about. The value is
    # unchanged.
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # LTP model directory
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')  # NER model file
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)

    file_pairs = [(const.qc_train_pos, const.qc_train_ner),
                  (const.qc_test_pos, const.qc_test_ner)]
    try:
        for pos_path, ner_path in file_pairs:
            result = []
            with open(pos_path, 'r', encoding='utf-8') as f:
                for line in f:
                    attr = line.strip().split('\t')
                    pairs = [token.split('/_') for token in attr[1].split(" ")]
                    words = [pair[0] for pair in pairs]
                    postags = [pair[1] for pair in pairs]
                    netags = recognizer.recognize(words, postags)
                    res = ' '.join("{}/_{}".format(word, netag)
                                   for word, netag in zip(words, netags))
                    result.append("{}\t{}\n".format(attr[0], res))
            with open(ner_path, 'w', encoding='utf-8') as f:
                f.writelines(result)
    finally:
        # Free the native model even if a file is missing or malformed.
        recognizer.release()
def __init__(self):
    """Create the LAC, DDParser, jieba and LTP components used by this object.

    LTP models are read from ``./ltp_data``. The LAC customization file is
    ``data/custom.txt``.
    """
    ltp_root = "./ltp_data"

    def ltp_model(name):
        # Full path of one LTP model file.
        return os.path.join(ltp_root, name)

    # Baidu LAC with the custom dictionary, and the DDParser.
    self.lac = LAC(mode='lac')
    self.lac.load_customization('data/custom.txt', sep=None)
    self.ddparser = DDParser(encoding_model='transformer')

    # Helper references; fine_info stores the class itself, not an instance.
    self.fine_info = FineGrainedInfo
    self.keyword = Keyword()
    self.jieba = jieba
    self.posseg = jieba.posseg

    # LTP components, each constructed directly from its model path.
    self.segmentor = Segmentor(ltp_model("cws.model"))
    self.postagger = Postagger(model_path=ltp_model("pos.model"))
    self.parser = Parser(ltp_model("parser.model"))
    self.recognizer = NamedEntityRecognizer(ltp_model("ner.model"))
def restart(self):
    """Release all five LTP models, then reload fresh ones from self.MODELDIR."""
    # Release in the fixed order: segmentor, postagger, recognizer, parser, srler.
    for attr in ("segmentor", "postagger", "recognizer", "parser", "srler"):
        getattr(self, attr).release()

    def reload(attr, model_cls, model_file):
        # Store the new instance on self first, then load its model file.
        instance = model_cls()
        setattr(self, attr, instance)
        instance.load(os.path.join(self.MODELDIR, model_file))

    reload("segmentor", Segmentor, "cws.model")
    reload("postagger", Postagger, "pos.model")
    reload("recognizer", NamedEntityRecognizer, "ner.model")
    reload("parser", Parser, "parser.model")
    reload("srler", SementicRoleLabeller, "pisrl.model")
def __init__(self):
    """Load the five LTP models from ``./ltp_data``.

    The segmentor additionally loads the lexicon file ``./dict.txt``.
    """
    model_root = "./ltp_data"

    def model_file(name):
        # Full path of one model file inside the LTP directory.
        return os.path.join(model_root, name)

    # Word segmentation with an external lexicon.
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(model_file("cws.model"), './dict.txt')

    # Part-of-speech tagging.
    self.postagger = Postagger()
    self.postagger.load(model_file("pos.model"))

    # Dependency parsing.
    self.parser = Parser()
    self.parser.load(model_file("parser.model"))

    # Named-entity recognition.
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(model_file("ner.model"))

    # Semantic role labelling.
    self.labeller = SementicRoleLabeller()
    self.labeller.load(model_file('pisrl.model'))
def __init__(self):
    """Load the five LTP models from the hard-coded ``LTP_PATH`` directory."""
    LTP_PATH = '/root/tmp/pycharm_project_96/pyltp_test/ltp_data'

    # Word segmentation
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_PATH, 'cws.model'))

    # Part-of-speech tagging
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_PATH, 'pos.model'))

    # Dependency parsing
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_PATH, 'parser.model'))

    # Named-entity recognition
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_PATH, 'ner.model'))

    # Semantic role labelling. The model must be loaded with load();
    # the previous code called label(), which is the inference method,
    # so the SRL model was never loaded.
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_PATH, 'pisrl.model'))
def __init__(self):
    """Load the five LTP models from ``D:\\ltp_data\\ltp_data_v3.4.0``.

    Prints ``111`` once before and once after loading.
    """
    print(111)
    model_root = "D:\\ltp_data\\ltp_data_v3.4.0"

    def model_file(name):
        # Full path of one model file inside the LTP directory.
        return os.path.join(model_root, name)

    self.segmentor = Segmentor()
    self.segmentor.load(model_file("cws.model"))

    self.postagger = Postagger()
    self.postagger.load(model_file("pos.model"))

    self.parser = Parser()
    self.parser.load(model_file("parser.model"))

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(model_file("ner.model"))

    self.labeller = SementicRoleLabeller()
    self.labeller.load(model_file('pisrl_win.model'))
    print(111)
def getNameRecognizer(self):
    """Return the shared NamedEntityRecognizer, loading it on first use.

    The instance is cached on ``Config.c_namerecognizer``. The model path
    is built from ``Config.ltp_data_dir`` and ``Config.ner_model``.
    """
    if not Config.c_namerecognizer:
        model_file = os.path.join(Config.ltp_data_dir, Config.ner_model)
        Config.c_namerecognizer = NamedEntityRecognizer()
        Config.c_namerecognizer.load(model_file)
    return Config.c_namerecognizer
def __init__(self):
    """Load the five LTP models from ``SETTINGS.LTP_DATA_DIR``.

    The segmentor and the POS tagger are loaded together with the lexicon
    files ``./segName`` and ``./postagName`` respectively.
    """
    model_dir = SETTINGS.LTP_DATA_DIR  # LTP model directory

    def in_model_dir(name):
        # Path of a model entry inside the LTP model directory.
        return os.path.join(model_dir, name)

    seg_file = in_model_dir('cws.model')      # word segmentation model
    pos_file = in_model_dir('pos.model')      # part-of-speech model
    dep_file = in_model_dir('parser.model')   # dependency parser model
    # Per the original note, the SRL entry is a directory, not a file.
    srl_dir = in_model_dir('srl')
    ner_file = in_model_dir('ner.model')      # named-entity model

    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(seg_file, './segName')

    self.postagger = Postagger()
    self.postagger.load_with_lexicon(pos_file, './postagName')

    self.parser = Parser()
    self.parser.load(dep_file)

    self.labeller = SementicRoleLabeller()
    self.labeller.load(srl_dir)

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(ner_file)
def __init__(self):
    """Load the five LTP models from ``LTP_DIR``.

    ``LTP_DIR`` is not set in this method (the local assignment is
    commented out in the original), so it must be defined elsewhere.
    Progress messages are printed before and after loading.
    """
    print("加载模型路径", LTP_DIR)

    def model_file(name):
        # Full path of one model file inside LTP_DIR.
        return os.path.join(LTP_DIR, name)

    self.segmentor = Segmentor()
    self.segmentor.load(model_file("cws.model"))

    self.postagger = Postagger()
    self.postagger.load(model_file("pos.model"))

    self.parser = Parser()
    self.parser.load(model_file("parser.model"))

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(model_file("ner.model"))

    self.labeller = SementicRoleLabeller()
    self.labeller.load(model_file('pisrl.model'))
    print("加载完毕")
def __init__(self, **kwargs):
    """Create the LTP components and load only the models that are needed.

    The segmentation model is always loaded. The POS model is loaded when
    either 'pos' or 'ner' is requested, because recognition consumes POS
    tags. The NER model is loaded only for 'ner'.

    Args:
        annotators: set that can include pos and ner.
        model: ltp model to use (path). NOTE(review): this key is not read
            here; model paths come from ``cws_model_path``,
            ``pos_model_path`` and ``ner_model_path`` defined elsewhere.
    """
    self.segmentor = Segmentor()              # word segmenter
    self.recognizer = NamedEntityRecognizer() # named-entity recognizer
    self.postagger = Postagger()              # part-of-speech tagger
    self.segmentor.load(cws_model_path)

    self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
    wants_ner = 'ner' in self.annotators
    # Load the POS model exactly once; the old code loaded it a second
    # time when both 'pos' and 'ner' were requested.
    if wants_ner or 'pos' in self.annotators:
        self.postagger.load(pos_model_path)
    if wants_ner:
        self.recognizer.load(ner_model_path)
def dependency_parsing(ltp_model_path, sents, postags, said):
    """Extract (subject, verb, remainder) triples for "said"-type verbs.

    A sentence is considered only if its NER tags contain a single-word
    place, organisation or person entity (``S-Ns``, ``S-Ni``, ``S-Nh``).
    For every ``SBV`` arc whose head word is in ``said``, a triple
    ``(subject word, head word, text after the head word)`` is collected.

    Args:
        ltp_model_path: Directory holding ``parser.model`` and ``ner.model``.
        sents: Sentences as whitespace-separated word strings.
        postags: POS tag sequences, one per sentence in ``sents``.
        said: Container of verbs that introduce reported speech.

    Returns:
        List of ``(subject, verb, content)`` string tuples.
    """
    from pyltp import Parser, NamedEntityRecognizer

    par_model_path = os.path.join(ltp_model_path, 'parser.model')  # dependency parser model
    ner_model_path = os.path.join(ltp_model_path, 'ner.model')     # named-entity model

    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    parser = Parser()
    parser.load(par_model_path)

    entity_tags = ('S-Ns', 'S-Ni', 'S-Nh')
    contents = []
    try:
        for index, sent in enumerate(sents):
            wo = sent.split()
            po = postags[index]
            netags = list(recognizer.recognize(wo, po))
            if not any(tag in netags for tag in entity_tags):
                continue
            for subject, arc in enumerate(parser.parse(wo, po)):
                if arc.relation != 'SBV':
                    continue
                verb = arc.head  # 1-based index of the head word
                if wo[verb - 1] not in said:
                    continue
                contents.append((wo[subject], wo[verb - 1], ''.join(wo[verb:])))
    finally:
        # The models are loaded on every call, so release them on every
        # call too. The collected triples are plain Python strings and
        # remain valid after release.
        parser.release()
        recognizer.release()
    return contents
def name_recognize_one():
    """Demo: run the LTP pipeline on the second sentence of a fixed paragraph.

    Prints the split sentence, segmentation, POS tags, dependency arcs and
    named-entity tags, then tags a second, pre-segmented example. Model
    paths come from ``sg_model_path``, ``ps_model_path``, ``pr_model_path``
    and ``ner_model_path``, which must be defined elsewhere.
    """
    from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer

    paragraph = '叙利亚东古塔地区。7日发生疑似化学武器袭击事件,导致70余人丧生。报道一出,叙利亚反对派、美国、英国、法国等纷纷指责叙政府军使用化学武器袭击无辜平民。但叙利亚坚决否认,并指责西方和叙反对派造谣,目的是保护被围困的恐怖分子。俄外交部则认为,该谣言旨在袒护恐怖分子,并为外部势力发动打击寻找借口。'
    # Sentence splitting; only the second sentence is analysed.
    sentence = SentenceSplitter.split(paragraph)[1]
    print('split {}'.format(sentence))

    # Word segmentation
    segmentor = Segmentor()
    segmentor.load(sg_model_path)
    words = segmentor.segment(sentence)
    print('|'.join(words))

    # Part-of-speech tagging (the dict keeps one tag per distinct word)
    postagger = Postagger()
    postagger.load(ps_model_path)
    postags = postagger.postag(words)
    for k, v in dict(zip(words, postags)).items():
        print(k, v)

    # Dependency parsing
    parser = Parser()
    parser.load(pr_model_path)
    arcs = parser.parse(words, postags)
    print(' '.join('%d:%s ' % (arc.head, arc.relation) for arc in arcs))
    print('#' * 8)

    # Named-entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(ner_model_path)
    netag = recognizer.recognize(words, postags)
    for word, ntag in zip(words, netag):
        if ntag != 'O':
            # Use the per-word tag; the old code concatenated the whole
            # tag vector `netag` to a string.
            print(word + ' / ' + ntag)
    print(' / '.join(netag))

    # Named-entity recognition on a pre-segmented, pre-tagged example
    word_list = ['欧几里得', '是', '西元前', '三', '世纪', '的', '希腊', '数学家', '。']
    postags_list = ['nh', 'v', 'nt', 'm', 'n', 'u', 'ns', 'n', 'wp']
    nertags = recognizer.recognize(word_list, postags_list)
    for word, ntag in zip(word_list, nertags):
        if ntag != 'O':
            print(word + '/' + ntag)
    print(' '.join(nertags))

    segmentor.release()
    postagger.release()
    parser.release()
    recognizer.release()
def get_all_name(r_filename, w_file):
    """Extract named entities and their related sentences from a text file.

    Every input line is normalised with ``raplace_line_feed`` and
    ``more_space_to_one``, then segmented, POS-tagged and NER-tagged.
    ``get_name`` and ``get_some_idea`` (defined elsewhere) produce the
    entity list and a mapping whose entries are written to ``w_file`` as
    ``key<TAB>value<TAB>value...`` lines.

    Args:
        r_filename: UTF-8 input file, processed line by line.
        w_file: UTF-8 output file, overwritten.
    """
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # LTP model directory

    segmentor = Segmentor()
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))
    postagger = Postagger()
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

    try:
        with open(r_filename, "r", encoding="utf-8") as f_r, \
                open(w_file, "w", encoding="utf-8") as f_w:
            for line in f_r:
                # strip("\n") removes newlines. The previous strip(r"\n")
                # removed backslashes and the letter "n" from both ends.
                line = line.strip("\n")
                line = raplace_line_feed(line)
                line = more_space_to_one(line)
                print(line)
                words = segmentor.segment(line)
                postags = postagger.postag(words)
                netags = recognizer.recognize(words, postags)
                name_list = get_name(netags, words)
                if name_list == []:
                    continue
                print(name_list)
                sen = get_some_idea(line, name_list)
                print(sen)
                if not sen:
                    continue
                for key in sen:
                    # Deduplicate, then sort so the output order is stable.
                    sens = "\t".join(sorted({data[1] for data in sen[key]}))
                    f_w.write(key + "\t" + sens + "\n")
    finally:
        # Free the native models even when processing fails midway.
        segmentor.release()
        postagger.release()
        recognizer.release()
def ner(words, postags):
    """Tag ``words`` with LTP named-entity labels and print each word/tag pair.

    The model is read from ``ner.model`` under ``LTP_DATA_DIR`` and is
    released before the tags are returned.
    """
    tagger = NamedEntityRecognizer()
    model_file = os.path.join(LTP_DATA_DIR, 'ner.model')
    tagger.load(model_file)
    netags = tagger.recognize(words, postags)
    for token, label in zip(words, netags):
        print(token + '/' + label)
    tagger.release()
    return netags
def get_ner(words, postags):
    """
    LTP named-entity recognition.

    Loads ``ner.model`` from ``LTP_TOP_DIR``, tags the given words, releases
    the model and returns the tags as a list.
    """
    model_file = os.path.join(LTP_TOP_DIR, 'ner.model')
    tagger = NamedEntityRecognizer()
    tagger.load(model_file)
    labels = tagger.recognize(words, postags)
    tagger.release()
    return list(labels)
def ltp_name_entity_recognizer(LTP_DATA_DIR, words, postags):
    """Return the LTP named-entity tags for ``words``.

    The model file is ``ner.model`` inside ``LTP_DATA_DIR``. It is loaded
    for this call and released before returning.
    """
    model_file = os.path.join(LTP_DATA_DIR, 'ner.model')
    tagger = NamedEntityRecognizer()
    tagger.load(model_file)
    labels = tagger.recognize(words, postags)
    tagger.release()
    return labels
def ner(words, postags, model_path='/Users/chenming/Spyder/3.3.1/ltp_data/ner.model'):
    """Tag ``words`` with LTP named-entity labels and print each word/tag pair.

    Args:
        words: Segmented words.
        postags: POS tags aligned with ``words``.
        model_path: Location of ``ner.model``. Defaults to the path that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The tag sequence produced by the recognizer.
    """
    recognizer = NamedEntityRecognizer()
    recognizer.load(model_path)
    try:
        netags = recognizer.recognize(words, postags)
        for word, ntag in zip(words, netags):
            print(word + '/' + ntag)
    finally:
        # Free the native model even if recognition or printing fails.
        recognizer.release()
    return netags
def entity_recognize(cutting_list, tagging_list):
    """Return LTP named-entity tags for segmented words and their POS tags.

    The model file is ``ner.model`` under ``LtpParser.ltp_path``. It is
    released before returning.
    """
    model_file = os.path.join(LtpParser.ltp_path, 'ner.model')
    from pyltp import NamedEntityRecognizer
    tagger = NamedEntityRecognizer()
    tagger.load(model_file)
    labels = tagger.recognize(cutting_list, tagging_list)
    tagger.release()
    return labels
def e_recognize(words, postags):
    """Return LTP named-entity tags for ``words``.

    The model is loaded from ``ner_model_path`` (defined elsewhere) and
    released before returning.
    """
    tagger = NamedEntityRecognizer()
    tagger.load(ner_model_path)
    labels = tagger.recognize(words, postags)
    tagger.release()
    return labels
def ner(words, postags):
    """Return LTP named-entity tags for ``words``.

    The model is loaded from ``../ltp_data/ner.model`` and released before
    returning.
    """
    tagger = NamedEntityRecognizer()
    tagger.load('../ltp_data/ner.model')
    labels = tagger.recognize(words, postags)
    tagger.release()
    return labels
def get_ner(self, word_list, postag_list, model):
    """Return the LTP named-entity tags for ``word_list`` as a list.

    ``model`` is the path handed to the recognizer's ``load``. The model
    is released before the tags are converted and returned.
    """
    tagger = NamedEntityRecognizer()
    tagger.load(model)
    labels = tagger.recognize(word_list, postag_list)
    tagger.release()
    return list(labels)
class LtpLanguageAnalysis(object):
    """Thin wrapper around the LTP segmentor, POS tagger, parser and NER model."""

    def __init__(self, model_dir="D:/ltp_data_v3.4.0"):
        """Load all four models from ``model_dir``."""
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(model_dir, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(model_dir, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(model_dir, "parser.model"))
        self.recognizer = NamedEntityRecognizer()
        self.recognizer.load(os.path.join(model_dir, "ner.model"))

    def analyze(self, text):
        """Segment ``text`` and print the words tab-separated."""
        words = self.segmentor.segment(text)
        print('\t'.join(words))

    def postags(self, words):
        """Return the POS tags of ``words`` as a list."""
        return list(self.postagger.postag(words))

    def parse(self, words, postags):
        """Return the dependency relation of every word, tab-separated."""
        arcs = self.parser.parse(words, postags)
        return "\t".join(arc.relation for arc in arcs)

    def ner(self, words, postags):
        """Print each non-'O' word/tag pair, then all NER tags tab-separated."""
        netag = self.recognizer.recognize(words, postags)
        for word, ntag in zip(words, netag):
            if ntag != 'O':
                print(word + '/' + ntag)
        print("\t".join(netag))

    def release_model(self):
        """Release every model loaded in ``__init__``."""
        self.segmentor.release()
        self.postagger.release()
        self.parser.release()
        # The recognizer is loaded in __init__ but was never released here.
        self.recognizer.release()
def __init__(self, data_dir):
    """Load the segmentation, POS, NER and parser models from ``data_dir``.

    The directory is also kept on ``self.LTP_DATA_DIR``.
    """
    self.LTP_DATA_DIR = data_dir

    def attach(attr, model_cls, model_name):
        # Resolve the path, store a fresh instance on self, then load it.
        model_file = os.path.join(self.LTP_DATA_DIR, model_name)
        setattr(self, attr, model_cls())
        getattr(self, attr).load(model_file)

    attach("segmentor", Segmentor, 'cws.model')             # word segmentation
    attach("postagger", Postagger, 'pos.model')             # part-of-speech tagging
    attach("recognizer", NamedEntityRecognizer, 'ner.model')  # named entities
    attach("parser", Parser, 'parser.model')                # dependency parsing
def __init__(self):
    """Load the five LTP models, using a user lexicon for segmentation and tagging.

    Models come from ``../../res/ltp/ltp_data_v3.4.0``. The lexicon
    ``fulluserdict.txt`` is looked up under ``LTP_DIR_USER``.
    """
    LTP_DIR = "../../res/ltp/ltp_data_v3.4.0"
    LTP_DIR_USER = "******"
    user_lexicon = os.path.join(LTP_DIR_USER, "fulluserdict.txt")

    def model_file(name):
        # Full path of one model file inside LTP_DIR.
        return os.path.join(LTP_DIR, name)

    # Segmentor and POS tagger share the same user lexicon.
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(model_file("cws.model"), user_lexicon)

    self.postagger = Postagger()
    self.postagger.load_with_lexicon(model_file("pos.model"), user_lexicon)

    self.parser = Parser()
    self.parser.load(model_file("parser.model"))

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(model_file("ner.model"))

    self.labeller = SementicRoleLabeller()
    self.labeller.load(model_file('pisrl_win.model'))
def segment(self, texts, use_tag_filter=True):
    """Run segmentation, POS tagging, NER and dependency parsing on each text.

    Each text is lower-cased and segmented. Single-character words and
    stop words are dropped before tagging. With ``use_tag_filter`` the
    returned word list is rebuilt from a word-to-tag dict and keeps only
    words whose tag is in ``self.tags_filter``, so it can be shorter than
    the tag lists.

    Returns:
        Five parallel lists with one entry per text: words, POS tags,
        NER tags, dependency relations and head words.
    """
    all_words, all_pos, all_ner, all_rel, all_heads = [], [], [], [], []

    # The segmentor loads a user dictionary alongside the CWS model.
    cutter = Segmentor()
    cutter.load_with_lexicon(self.cws_model_path, './dict/user_recg.dic')
    tagger = Postagger()
    tagger.load(self.pos_model_path)
    ne_model = NamedEntityRecognizer()
    ne_model.load(self.ner_model_path)
    dep_parser = Parser()
    dep_parser.load(self.pas_model_path)

    for text in texts:
        text = text.lower()
        tokens = cutter.segment(text)
        tokens = [tok for tok in tokens if len(tok) > 1]
        # Remove stop words.
        tokens = [tok.strip() for tok in tokens if tok.strip() not in self.stop_words]

        # Part-of-speech tagging.
        postags = list(tagger.postag(tokens))
        # Named-entity recognition.
        netags = ne_model.recognize(tokens, postags)
        # Dependency parsing.
        arcs = dep_parser.parse(tokens, postags)
        head_ids = [arc.head for arc in arcs]      # parent node index of each word
        relation = [arc.relation for arc in arcs]  # dependency relation label
        # Head index 0 means the root; otherwise indices are 1-based.
        heads = ['Root' if head_id == 0 else tokens[head_id - 1] for head_id in head_ids]

        if use_tag_filter:
            tag_of = dict(zip(tokens, postags))
            tokens = [tok for tok, tag in tag_of.items() if tag in self.tags_filter]

        all_words.append(tokens)
        all_pos.append(postags)
        all_ner.append(netags)
        all_rel.append(relation)
        all_heads.append(heads)

    cutter.release()
    tagger.release()
    ne_model.release()
    dep_parser.release()
    return all_words, all_pos, all_ner, all_rel, all_heads
def __init__(self):
    """Load the five LTP models; the segmentor also loads a custom lexicon.

    Models are read from ``data/ltp_data`` and the lexicon from
    ``dictionary/Dir1.txt``, both relative to the working directory.
    """
    # Built with os.path.join instead of the literals "data\ltp_data" and
    # "dictionary\Dir1.txt", which contained the invalid escape sequences
    # '\l' and '\D'. On Windows the resulting strings are identical.
    LTP_DIR = os.path.join("data", "ltp_data")
    cws_model_path = os.path.join(LTP_DIR, 'cws.model')  # word segmentation model
    lexicon_path = os.path.join("dictionary", "Dir1.txt")  # custom dictionary file

    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(cws_model_path, lexicon_path)

    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))

    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))

    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))

    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))
class LTP:
    """Bundle of the LTP segmentor, POS tagger, NER tagger and parser."""

    def __init__(self, base_dir, is_custom_seg_dict=False):
        """Load models from ``base_dir``; ``None`` selects 'lib/ltp_data_v3.4.0'."""
        model_dir = 'lib/ltp_data_v3.4.0' if base_dir is None else base_dir
        self.init(model_dir, is_custom_seg_dict)

    def init(self, base_dir, is_custom_seg_dict):
        """Create and load all four models.

        With ``is_custom_seg_dict`` the segmentor also loads
        ``vertical_domain_baike_dict.txt`` from ``dict_dir`` (defined
        elsewhere).
        """
        def model_file(name):
            # Full path of one model file inside base_dir.
            return os.path.join(base_dir, name)

        cws_file = model_file('cws.model')
        pos_file = model_file('pos.model')
        ner_file = model_file('ner.model')
        dep_file = model_file('parser.model')
        lexicon_file = os.path.join(dict_dir, 'vertical_domain_baike_dict.txt')

        self.segmentor = Segmentor()
        if not is_custom_seg_dict:
            self.segmentor.load(cws_file)
        else:
            self.segmentor.load_with_lexicon(cws_file, lexicon_file)

        self.tagger = Postagger()
        self.tagger.load(pos_file)

        self.nertagger = NamedEntityRecognizer()
        self.nertagger.load(ner_file)

        self.parser = Parser()
        self.parser.load(dep_file)

    def parse(self, sentence, parse_tree=True):
        """Analyse ``sentence`` and wrap the results in an ``LTPResult``.

        Dependency arcs are computed only when ``parse_tree`` is true;
        otherwise the arcs slot is ``None``.
        """
        tokens = list(self.segmentor.segment(sentence))
        pos = list(self.tagger.postag(tokens))
        entities = list(self.nertagger.recognize(tokens, pos))
        arcs = list(self.parser.parse(tokens, pos)) if parse_tree else None
        return LTPResult(tokens, pos, entities, arcs, sentence)

    def cut(self, sentence):
        """Return the segmentor's output for ``sentence`` unchanged."""
        return self.segmentor.segment(sentence)
def test(sentence):
    """Demo: run the LTP pipeline on ``sentence`` and print one line per word.

    Sets the Stanford parser environment variables and constructs a
    ``StanfordParser`` (whose result is not used afterwards). It then loads
    the four LTP models from ``LTP_DATA_DIR`` and prints
    ``word,postag,netag head:relation`` for each word.
    """
    os.environ['STANFORD_PARSER'] = STANFORD_PARSER_PATH
    os.environ['STANFORD_MODELS'] = STANFORD_MODELS_PATH
    os.environ['JAVAHOME'] = JAVA_HOME
    stanford_model_path = CHINESE_MODEL_PATH
    s_parser = stanford.StanfordParser(model_path=stanford_model_path)

    # Dependency parser model
    dep_file = os.path.join(LTP_DATA_DIR, 'parser.model')
    from pyltp import Parser
    dep_parser = Parser()
    dep_parser.load(dep_file)

    # Word segmentation model
    seg_file = os.path.join(LTP_DATA_DIR, 'cws.model')
    from pyltp import Segmentor
    cutter = Segmentor()
    cutter.load(seg_file)

    # Part-of-speech model
    pos_file = os.path.join(LTP_DATA_DIR, 'pos.model')
    from pyltp import Postagger
    tagger = Postagger()
    tagger.load(pos_file)

    # Named-entity model
    ner_file = os.path.join(LTP_DATA_DIR, 'ner.model')
    from pyltp import NamedEntityRecognizer
    ne_model = NamedEntityRecognizer()
    ne_model.load(ner_file)

    words = cutter.segment(sentence)
    postags = tagger.postag(words)
    netags = ne_model.recognize(words, postags)
    arcs = dep_parser.parse(words, postags)  # dependency analysis

    for word, postag, netag, arc in zip(words, postags, netags, arcs):
        print(','.join((word, postag, netag)), str(arc.head) + ':' + arc.relation)
def __init__(self):
    """Segment, POS-tag and NER-tag ``data``, storing the results on self.

    Model paths are built from ``self.LTP_DATA_DIR``. ``data`` is not a
    parameter, so it must be defined elsewhere. Each model is released as
    soon as its step is done.
    """
    model_dir = self.LTP_DATA_DIR
    self.cws_model_path = os.path.join(model_dir, 'cws.model')  # word segmentation model
    self.pos_model_path = os.path.join(model_dir, 'pos.model')  # part-of-speech model
    self.ner_model_path = os.path.join(model_dir, 'ner.model')  # named-entity model

    # Word segmentation
    cutter = Segmentor()
    cutter.load(self.cws_model_path)
    self.words = cutter.segment(data)
    cutter.release()

    # Part-of-speech tagging
    tagger = Postagger()
    tagger.load(self.pos_model_path)
    self.postags = tagger.postag(self.words)
    tagger.release()

    # Named-entity recognition
    ne_model = NamedEntityRecognizer()
    ne_model.load(self.ner_model_path)
    self.netags = ne_model.recognize(self.words, self.postags)
    ne_model.release()
# Tail of an LTP demo script (Python 2 print statements). `words`,
# `segmentor` and `MODELDIR` are created before this excerpt.
print "\t".join(words)

# Part-of-speech tagging.
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameter is support in 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print "\t".join(postags)

# Dependency parsing: print each word as head:relation.
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)

print "\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs)

# Named-entity recognition.
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)
print "\t".join(netags)

# Semantic role labelling: the model is loaded from the "srl/" path, and
# label() takes the words, POS tags, NER tags and arcs.
labeller = SementicRoleLabeller()
labeller.load(os.path.join(MODELDIR, "srl/"))
roles = labeller.label(words, postags, netags, arcs)

# For every role print its index followed by name:(start,end) per argument.
for role in roles:
    print role.index, "".join(
        ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments])

# NOTE(review): only three of the five models are released here;
# `recognizer` and `labeller` are left loaded. Confirm whether intended.
segmentor.release()
postagger.release()
parser.release()
import sys, os ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir) sys.path.append(os.path.join(ROOTDIR, "lib")) # Set your own model path MODELDIR = os.path.join("/home/fish/", "ltp_data") from pyltp import Segmentor, Postagger, NamedEntityRecognizer # @UnresolvedImport # 分词功能 segmentor = Segmentor() segmentor.load(os.path.join(MODELDIR, "cws.model")) postagger = Postagger() postagger.load(os.path.join(MODELDIR, "pos.model")) recognizer = NamedEntityRecognizer() recognizer.load(os.path.join(MODELDIR, "ner.model")) def ltp(sentence): words = segmentor.segment(sentence) # 词性标注功能 postags = postagger.postag(words) # 实体识别 netags = recognizer.recognize(words, postags) l = [] li = zip(list(words), list(postags), list(netags)) for a, b, c in li: # 去掉命名实体 if c == "O": # 去掉所有名词
def mingming_shiti(words, postags):
    """Named entities: organisation (Ni), person (Nh), place (Ns).

    Loads ``ner.model`` from ``MODELDIR``, tags ``words`` and prints the
    tags tab-separated. Nothing is returned.
    """
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    try:
        netags = recognizer.recognize(words, postags)
        print("\t".join(netags))
    finally:
        # The model is loaded on every call, so free it on every call.
        recognizer.release()