def segmentation(filename, output_filename):
    print("segmenting '%s' to '%s'" % (filename, output_filename))
    f = open(filename, "r")
    lines = f.readlines()
    f.close()

    MODELDIR = "./ltp_data/"
    # Segmentation
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    # POS tagging
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    # Named entity recognition
    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    # Dependency parsing, to get SVO triples
    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))

    f = open(output_filename, "w")
    fner = open(output_filename.split(".")[0] + "_ner.txt", "w")
    for _line in lines:
        line = _line.rstrip("\n\r")  # strip trailing newline characters safely, even on empty lines
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        # netags = recognizer.recognize(words, postags)
        # arcs = parser.parse(words, postags)
        for i in range(len(words)):
            f.write("%s/%s\t" % (words[i], postags[i]))
            # if netags[i] != 'O':
            #     fner.write("%s/%s\t" % (words[i], netags[i]))
        f.write("\n")
        # fner.write("\n")
    f.close()
    fner.close()

def get_postags(self, words):
    postagger = Postagger()               # initialize the tagger
    postagger.load(self.pos_model_path)   # load the model
    postags = postagger.postag(words)     # POS tagging
    print('\t'.join(postags))
    postagger.release()                   # release the model
    return list(postags)

def ltp_pos_data():
    """POS tagging with LTP."""
    LTP_DATA_DIR = r'D:\BaiduNetdiskDownload\ltp_data_v3.4.0'  # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')   # POS tagging model, named `pos.model`
    from pyltp import Postagger
    postagger = Postagger()         # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    result = []
    files = [(const.qc_train_seg, const.qc_train_pos),
             (const.qc_test_seg, const.qc_test_pos)]
    for i in range(2):
        with open(files[i][0], 'r', encoding='utf-8') as f:
            for line in f.readlines():
                attr = line.strip().split('\t')
                words = attr[1].split(" ")
                words_pos = postagger.postag(words)
                res = ' '.join("{}/_{}".format(w, p) for w, p in zip(words, words_pos))
                result.append("{}\t{}\n".format(attr[0], res))
        with open(files[i][1], 'w', encoding='utf-8') as f:
            f.writelines(result)
        result.clear()
    postagger.release()  # release the model

def cut_words():
    # Segmentation + removal of empty lines
    # POS tagset: http://ltp.readthedocs.io/zh_CN/latest/appendix.html
    cont = open('resource_new.txt', 'r', encoding='utf-8')
    f = open('key/cut_resouce.txt', 'w', encoding='utf-8')
    segmentor = Segmentor()  # initialize the segmenter
    # segmentor.load('cws.model')  # load the model without a user dictionary
    segmentor.load_with_lexicon('module/cws.model', 'userdict.txt')  # load the model with a user dictionary
    postagger = Postagger()  # initialize the tagger
    postagger.load('module/pos.model')  # load the model
    for sentence in cont:
        if sentence.strip() != '':
            words = segmentor.segment(sentence)  # segmentation
            pos_tags = postagger.postag(words)   # POS tagging
            for word, tag in zip(words, pos_tags):
                if tag != 'wp':  # keep everything except punctuation
                    f.write(word)
                else:
                    f.write('\n')
            f.write('\n')
    cont.close()
    f.close()
    segmentor.release()
    postagger.release()

class LtpParser:
    def __init__(self):
        LTP_DIR = "ltp_data_v3.4.0"
        self.segmentor = Segmentor()
        self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
        self.postagger = Postagger()
        self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
        self.parser = Parser()
        self.parser.load(os.path.join(LTP_DIR, "parser.model"))

    '''Dependency parsing: for every word, keep a dict of its dependents grouped by relation'''
    def build_parse_child_dict(self, words, postags, arcs):
        child_dict_list = []
        format_parse_list = []
        for index in range(len(words)):
            child_dict = dict()
            for arc_index in range(len(arcs)):
                if arcs[arc_index].head == index + 1:  # arc heads are 1-based
                    child_dict.setdefault(arcs[arc_index].relation, []).append(arc_index)
            child_dict_list.append(child_dict)
        rely_id = [arc.head for arc in arcs]       # head id of each word
        relation = [arc.relation for arc in arcs]  # dependency relation of each word
        heads = ['Root' if head == 0 else words[head - 1] for head in rely_id]  # head word of each word
        for i in range(len(words)):
            # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
            a = [relation[i], words[i], i, postags[i], heads[i], rely_id[i] - 1,
                 postags[rely_id[i] - 1] if rely_id[i] != 0 else 'Root']  # avoid postags[-1] for the root
            format_parse_list.append(a)
        return child_dict_list, format_parse_list

    '''Main parser entry point'''
    def parser_main(self, sentence):
        words = list(self.segmentor.segment(sentence))  # segmentation
        postags = list(self.postagger.postag(words))    # POS tagging
        arcs = self.parser.parse(words, postags)        # dependency parsing
        child_dict_list, format_parse_list = self.build_parse_child_dict(words, postags, arcs)
        return words, postags, child_dict_list, format_parse_list

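# A minimal usage sketch (not part of the original source) for the LtpParser
# class above, assuming the ltp_data_v3.4.0 models are available locally;
# the sample sentence echoes the words in the comment above:
if __name__ == '__main__':
    ltp = LtpParser()
    words, postags, child_dict_list, format_parse_list = \
        ltp.parser_main('李克强总理今天来我家了。')
    for row in format_parse_list:
        print(row)  # e.g. ['ATT', '李克强', 0, 'nh', '总理', 1, 'n']
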
def test_ltp(document):
    LTP_DATA_DIR = r"D:\anaconda\envs\TF+3.5\Lib\site-packages\pyltp-model"  # path to the LTP model directory
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model, named `parser.model`
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # segmentation model, named `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, named `pos.model`

    segmentor = Segmentor()         # initialize the segmenter
    segmentor.load(cws_model_path)  # load the model
    words = list(segmentor.segment(document))  # segmentation; copy before releasing the model
    print("\n")
    print("Segmentation result:")
    print('\t'.join(words))
    segmentor.release()  # release the model

    postagger = Postagger()         # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = list(postagger.postag(words))  # POS tagging
    print("\n")
    print("POS tagging result:")
    print('\t'.join(postags))
    postagger.release()  # release the model

    parser = Parser()            # initialize the parser
    parser.load(par_model_path)  # load the model
    arcs = parser.parse(words, postags)  # dependency parsing
    print("\n")
    print("Dependency parsing result:")
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    parser.release()  # release the model

def get_postag_list(self, word_list, model):
    # Return the POS tag list for word_list
    postag = Postagger()
    postag.load(model)
    postag_list = list(postag.postag(word_list))
    postag.release()
    return postag_list

def namedEntityRecognize(sentence):
    '''
    Run named entity recognition with pyltp.
    Returns: 1) a list of (entity, tag) tuples and 2) the list of entity tags.
    '''
    namedEntityTagTupleList = []

    segmentor = Segmentor()
    # segmentor.load(inout.getLTPPath(index.CWS))
    segmentor.load_with_lexicon(inout.getLTPPath(index.CWS), inout.getResourcePath('userDic.txt'))
    words = list(segmentor.segment(sentence))  # copy out of the native vector before releasing
    segmentor.release()

    postagger = Postagger()
    postagger.load(inout.getLTPPath(index.POS))
    postags = list(postagger.postag(words))
    postagger.release()

    recognizer = NamedEntityRecognizer()
    recognizer.load(inout.getLTPPath(index.NER))
    netags = list(recognizer.recognize(words, postags))
    recognizer.release()

    # Pack the results as (word, tag) tuples
    for word, netag in zip(words, netags):
        namedEntityTagTupleList.append((word, netag))
    neTagList = '\t'.join(netags).split('\t')

    return namedEntityTagTupleList, neTagList

def postags_opt(words):
    # Set the pyltp POS tagger model path
    LTP_DATA_DIR = '../ltp_data_v3.4.0'
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    # Initialize the tagger and load the model
    postagger = Postagger()
    postagger.load(pos_model_path)
    # Get the POS tags, then release the tagger
    postags = list(postagger.postag(words))
    postagger.release()
    # Keep only the words tagged as verbs
    saying_words = []
    for index, tag in enumerate(postags):
        if tag == 'v':
            saying_words.append(words[index])
    return saying_words

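# Hypothetical usage of postags_opt above (the word list is illustrative and
# the ../ltp_data_v3.4.0 models are assumed to exist):
saying = postags_opt(['他', '说', '中国银行', '加强', '合作'])
print(saying)  # expected to keep only the verbs, e.g. ['说', '加强']
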
def locationNER(text):
    results = []
    # Segment first
    segmentor = Segmentor()          # initialize the segmenter
    segmentor.load(cws_model_path)   # load the model
    words = list(segmentor.segment(text))  # segmentation
    # print('\t'.join(words))
    segmentor.release()
    # Then POS-tag
    postagger = Postagger()          # initialize the tagger
    postagger.load(pos_model_path)   # load the model
    postags = list(postagger.postag(words))  # POS tagging
    postagger.release()  # release the model
    # Finally, recognize location entities
    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(ner_model_path)       # load the model
    netags = list(recognizer.recognize(words, postags))  # named entity recognition
    recognizer.release()  # release the model
    for i in range(len(netags)):
        # Ns marks place names, Ni organization names
        if 'I-Ns' in netags[i] or 'I-Ni' in netags[i]:
            if 0 < i < len(words) - 1:  # guard the neighboring-word lookups
                results.append(words[i - 1] + words[i] + words[i + 1])
        if 'S-Ns' in netags[i] or 'S-Ni' in netags[i]:
            results.append(words[i])
    return results

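# The neighboring-word trick above only recovers three-token entities. LTP's
# NER tags use a B/I/E/S position prefix over the types Nh (person),
# Ni (organization) and Ns (place), so a more general assembly (a sketch,
# not part of the original source) walks the tag sequence and stitches
# B-...-E spans together:
def collect_entities(words, netags, types=('Ns', 'Ni')):
    entities, buffer = [], []
    for word, tag in zip(words, netags):
        if tag == 'O':       # not part of any entity
            buffer = []
            continue
        position, etype = tag.split('-')  # e.g. 'B-Ns' -> ('B', 'Ns')
        if etype not in types:
            continue
        if position == 'S':      # single-token entity
            entities.append(word)
        elif position == 'B':    # entity starts
            buffer = [word]
        elif position == 'I':    # entity continues
            buffer.append(word)
        elif position == 'E':    # entity ends
            buffer.append(word)
            entities.append(''.join(buffer))
            buffer = []
    return entities
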
def word_pos():
    # POS tagging with LTP
    candidate = pd.read_csv(r'../data/candidate_sentiment.csv', header=None)
    can_word = candidate[0].tolist()
    # Add columns to hold the POS tags
    candidate.insert(2, 'ltp_pos', 0)
    candidate.insert(3, 'jieba_pos', 0)
    candidate.columns = ['word', 'freq', 'ltp_pos', 'jieba_pos']

    LTP_DATA_DIR = '../ltp_data_v3.4.0/ltp_data_v3.4.0'       # path to the LTP model directory
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
    postagger = Postagger()             # initialize the tagger
    postagger.load(pos_model_path)      # load the model
    postags = postagger.postag(can_word)  # POS tagging
    postagger.release()                 # release the model
    candidate['ltp_pos'] = list(postags)

    # POS tagging with jieba
    jieba_pos = []
    for index, row in candidate.iterrows():
        words = pseg.cut(row['word'])
        pos = ' '.join(w.flag for w in words)
        jieba_pos.append(pos)
    candidate['jieba_pos'] = jieba_pos

    # Write back with the new header
    candidate.to_csv(r'../data/candidate_sentiment.csv', index=None)

def get_postag_list(words_list):
    postag = Postagger()
    postag.load(pos_model_path)
    postag_list = list(postag.postag(words_list))
    postag.release()
    return postag_list

def segmentsentence(sentence):
    segmentor = Segmentor()
    postagger = Postagger()
    parser = Parser()
    recognizer = NamedEntityRecognizer()
    segmentor.load("./ltpdata/ltp_data_v3.4.0/cws.model")
    postagger.load("./ltpdata/ltp_data_v3.4.0/pos.model")
    # parser.load("./ltpdata/ltp_data_v3.4.0/parser.model")
    recognizer.load("./ltpdata/ltp_data_v3.4.0/ner.model")

    word_list = list(segmentor.segment(sentence))
    postags_list = list(postagger.postag(word_list))
    nertags = list(recognizer.recognize(word_list, postags_list))

    # Collect person names; entity_list is assumed to be a module-level list
    for word, ntag in zip(word_list, nertags):
        if 'Nh' in ntag:  # Nh marks person names; tags carry B/I/E/S prefixes
            entity_list.append(word)
    print(" ".join(word_list))
    print(' '.join(nertags))

    segmentor.release()
    postagger.release()
    # parser.release()
    recognizer.release()
    return word_list

def __init__(self):
    LTP_DATA_DIR = SETTINGS.LTP_DATA_DIR  # path to the LTP model directory
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')     # segmentation model, named `cws.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')     # POS tagging model, named `pos.model`
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')  # dependency parser model, named `parser.model`
    srl_model_path = os.path.join(LTP_DATA_DIR, 'srl')  # semantic role labelling model directory `srl`; note this path is a directory, not a file
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')     # named entity recognition model, named `ner.model`
    self.segmentor = Segmentor()  # initialize the segmenter
    self.segmentor.load_with_lexicon(cws_model_path, './segName')  # load the model with a lexicon
    self.postagger = Postagger()  # initialize the tagger
    self.postagger.load_with_lexicon(pos_model_path, './postagName')  # load the model with a lexicon
    self.parser = Parser()  # initialize the parser
    self.parser.load(par_model_path)  # load the model
    self.labeller = SementicRoleLabeller()  # initialize the labeller
    self.labeller.load(srl_model_path)  # load the model
    self.recognizer = NamedEntityRecognizer()  # initialize the recognizer
    self.recognizer.load(ner_model_path)  # load the model

def __init__(self, **kwargs):
    """
    Args:
        annotators: set that can include pos and ner.
        model: ltp model to use (path).
    """
    self.segmentor = Segmentor()               # segmenter instance
    self.recognizer = NamedEntityRecognizer()  # named entity recognizer instance
    self.postagger = Postagger()               # POS tagger instance
    self.segmentor.load(cws_model_path)        # load the segmentation model
    self.annotators = copy.deepcopy(kwargs.get('annotators', set()))
    # NER needs POS tags, so load the tagger (once) when either annotator is requested
    if {'pos', 'ner'} & self.annotators:
        self.postagger.load(pos_model_path)
    if {'ner'} & self.annotators:
        self.recognizer.load(ner_model_path)

def restart(self):
    self.segmentor.release()
    self.postagger.release()
    self.recognizer.release()
    self.parser.release()
    self.srler.release()

    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(self.MODELDIR, "cws.model"))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(self.MODELDIR, "pos.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(self.MODELDIR, "ner.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(self.MODELDIR, "parser.model"))
    self.srler = SementicRoleLabeller()
    self.srler.load(os.path.join(self.MODELDIR, "pisrl.model"))

def __init__(self):
    LTP_DIR = "D:\\ltp_data\\ltp_data_v3.4.0"
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl_win.model'))

def __init__(self, input_file, output_file, pyltp_path=None):
    # LTP model files
    LTP_DATA_DIR = 'pyltp-resource/ltp-model' if pyltp_path is None else pyltp_path
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    cut_text_path = os.path.join(LTP_DATA_DIR, 'word_segmentation.txt')

    self.input_file = input_file
    self.output_file = output_file
    self.raw_data = []
    self.sentence_cutted = []  # segmented output: one word list per app description

    self.postagger = Postagger()         # initialize the POS tagger
    self.postagger.load(pos_model_path)  # load the POS tagging model
    self.parser = Parser()               # initialize the dependency parser
    self.parser.load(par_model_path)     # load the parsing model
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(cws_model_path, cut_text_path)

def ltp_postagger(self, data, is_zip=True):
    """
    POS tag reference:
    https://blog.csdn.net/leiting_imecas/article/details/68485254
    :param data: a string to segment first, or a pre-segmented word list
    :return: (word, tag) pairs if is_zip, else (words, postags)
    """
    if self._postagger is None:  # lazy-load the model on first use
        self._postagger = Postagger()
        self._postagger.load(os.path.join(self._MODELDIR, "pos.model"))
    if isinstance(data, str):
        words = self.ltp_segmentor(data)
    else:
        words = data
    postags = self._postagger.postag(words)
    if is_zip:
        return list(zip(words, postags))
    else:
        return words, postags

def getPostagger(self):
    if Config.c_postagger:
        return Config.c_postagger
    else:
        pos_model_path = os.path.join(Config.ltp_data_dir, Config.pos_model)
        Config.c_postagger = Postagger()
        Config.c_postagger.load(pos_model_path)
        return Config.c_postagger

def __init__(self):
    LTP_PATH = '/root/tmp/pycharm_project_96/pyltp_test/ltp_data'
    # Segmentation
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_PATH, 'cws.model'))
    # POS tagging
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_PATH, 'pos.model'))
    # Dependency parsing
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_PATH, 'parser.model'))
    # Named entity recognition
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_PATH, 'ner.model'))
    # Semantic role labelling
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_PATH, 'pisrl.model'))

def __init__(self):
    LTP_DIR = './ltp_data_v3.4.0'
    print("Loading models from", LTP_DIR)
    self.segmentor = Segmentor()
    self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))
    print("Loading finished")

def __init__(self):
    LTP_DIR = "./ltp_data"
    self.lac = LAC(mode='lac')
    self.lac.load_customization('data/custom.txt', sep=None)
    self.ddparser = DDParser(encoding_model='transformer')
    self.fine_info = FineGrainedInfo
    self.keyword = Keyword()
    self.jieba = jieba
    self.posseg = jieba.posseg
    self.segmentor = Segmentor(os.path.join(LTP_DIR, "cws.model"))
    self.postagger = Postagger(model_path=os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer(os.path.join(LTP_DIR, "ner.model"))

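# Note: unlike the load()-based snippets elsewhere in this file, the class
# above passes model paths directly to the constructors; this appears to match
# the newer pyltp releases, in which the separate load() step was dropped.
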
def __init__(self):
    LTP_DIR = "./ltp_data"
    self.segmentor = Segmentor()
    # self.segmentor.load(os.path.join(LTP_DIR, "cws.model"))
    self.segmentor.load_with_lexicon(os.path.join(LTP_DIR, "cws.model"), './dict.txt')
    self.postagger = Postagger()
    self.postagger.load(os.path.join(LTP_DIR, "pos.model"))
    self.parser = Parser()
    self.parser.load(os.path.join(LTP_DIR, "parser.model"))
    self.recognizer = NamedEntityRecognizer()
    self.recognizer.load(os.path.join(LTP_DIR, "ner.model"))
    self.labeller = SementicRoleLabeller()
    self.labeller.load(os.path.join(LTP_DIR, 'pisrl.model'))

def __init__(self, dic):
    cws_model_path = os.path.join(config.LTP_DATA_DIR, 'cws.model')
    self.segmentor = Segmentor()
    self.segmentor.load_with_lexicon(cws_model_path, config.dic_path)  # a single load with the lexicon

    pos_model_path = os.path.join(config.LTP_DATA_DIR, 'pos.model')
    self.postagger = Postagger()
    self.postagger.load(pos_model_path)

    par_model_path = os.path.join(config.LTP_DATA_DIR, 'parser.model')
    self.parser = Parser()
    self.parser.load(par_model_path)

    jieba.load_userdict(dic.keys())
    self.dic = dic

def ltp_labeling(info_list, path, mode='train'):
    """Segment and POS-tag each query, then write the results to disk."""
    new_info_list = list()
    segmentor = Segmentor()
    segmentor.load('E://Github/CCF_Competition/data/ltp_data/cws.model')
    postagger = Postagger()
    postagger.load('E://Github/CCF_Competition/data/ltp_data/pos.model')
    for idx, info in enumerate(info_list):
        if idx % 100 == 0:
            print(1.0 * idx / len(info_list))  # progress
        if mode == 'train':
            user, age, gender, education, querys = info
        elif mode == 'test':
            user, querys = info
        new_querys = list()
        for query in querys:
            words = segmentor.segment(query)
            postags = postagger.postag(words)
            new_querys.append(list(zip(words, postags)))
        if mode == 'train':
            new_info_list.append((user, age, gender, education, new_querys))
        elif mode == 'test':
            new_info_list.append((user, new_querys))
    segmentor.release()
    postagger.release()
    # Write to disk
    with open(path, 'w', encoding='gb18030') as fw:
        for info in new_info_list:
            if mode == 'train':
                user, age, gender, education, querys = info
                query_str = '\t'.join(' '.join(word + '<:>' + pos for word, pos in query) for query in querys)
                fw.write(user + '\t' + age + '\t' + gender + '\t' + education + '\t' + query_str + '\n')
            elif mode == 'test':
                user, querys = info
                query_str = '\t'.join(' '.join(word + '<:>' + pos for word, pos in query) for query in querys)
                print(user, query_str)
                fw.write(user + '\t' + query_str + '\n')

def get_all_name(r_filename, w_file):
    LTP_DATA_DIR = r'ltp_data_v3.4.0'  # path to the LTP model directory
    # Segmentation
    segmentor = Segmentor()  # initialize
    segmentor.load(os.path.join(LTP_DATA_DIR, 'cws.model'))  # load the model
    # POS tagging
    postagger = Postagger()  # initialize
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))  # load the model
    # Named entity recognition
    recognizer = NamedEntityRecognizer()  # initialize
    recognizer.load(os.path.join(LTP_DATA_DIR, 'ner.model'))

    f_r = open(r_filename, "r", encoding="utf-8")
    f_w = open(w_file, "w", encoding="utf-8")
    count = 0
    for line in f_r:
        count += 1
        line = line.strip("\n")  # strip the newline, not the literal characters '\' and 'n'
        line = raplace_line_feed(line)
        line = more_space_to_one(line)
        print(line)
        words = segmentor.segment(line)
        postags = postagger.postag(words)
        netags = recognizer.recognize(words, postags)
        name_list = get_name(netags, words)
        if name_list != []:
            print(name_list)
            sen = get_some_idea(line, name_list)
            print(sen)
            if sen:
                for key in sen:
                    sens = "\t".join(list(set([data[1] for data in sen[key]])))
                    f_w.write(key + "\t" + sens + "\n")
    segmentor.release()
    postagger.release()
    recognizer.release()
    f_r.close()
    f_w.close()

def new_relation_find(words, sentence):
    """
    Discover a new relation between two known entities in a sentence.
    :param words: segmented words of the sentence
    :param sentence: the raw sentence text
    :return: (entity1, entity2, relation_word), or ("", "", "") if none found
    """
    # Dict holding the two entity mentions, keyed by their offsets in the sentence
    tuple_dict = dict()
    index0 = -1
    index1 = -1
    found = False
    for entity_word in entity_words:
        if sentence.find(entity_word) != -1:
            if tuple_dict:
                # Overlaps with the first entity: skip
                if has_same(tuple_dict[index0], entity_word):
                    continue
                index1 = sentence.find(entity_word)
                tuple_dict[index1] = entity_word
                found = True
                break
            else:
                index0 = sentence.find(entity_word)
                tuple_dict[index0] = entity_word
    if found is False:
        return "", "", ""
    # Sorting would return a list instead:
    # tuple_dict = sorted(tuple_dict.items(), key=lambda d: d[0])
    words = "/".join(words).split("/")
    for tuple_word in tuple_dict.values():
        words = init_words(tuple_word, words)
    # POS-tag the reconstructed word list
    postagger = Postagger()  # initialize the tagger
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
    postagger.load_with_lexicon(pos_model_path, 'data/postagger.txt')  # load the model with a lexicon
    postags = postagger.postag(words)  # POS tagging
    print('\t'.join(postags))
    postagger.release()  # release the model
    # Discover the new relation: a verb close to both entities
    relation_word = ""
    index_word = 0
    for index, postag in enumerate('\t'.join(postags).split('\t')):
        index_word += len(words[index])
        if index_word >= len(sentence):
            break
        if postag == 'v' and index_word - min(index0, index1) <= 2 and max(index0, index1) - index_word <= 2 \
                and not has_same(tuple_dict[index0], words[index]) and not has_same(tuple_dict[index1], words[index]) \
                and words[index] not in wrong_relation:
            relation_word = words[index]
            break
    if relation_word == "":
        return "", "", ""
    return tuple_dict[min(index0, index1)], tuple_dict[max(index0, index1)], relation_word

def posttagger(words):
    postagger = Postagger()          # initialize the tagger
    postagger.load(pos_model_path)   # load the model
    postags = list(postagger.postag(words))  # POS tagging; copy out of the native vector before releasing
    # for word, tag in zip(words, postags):
    #     print(word + '/' + tag)
    postagger.release()  # release the model
    return postags

def posttagger(words):
    postagger = Postagger()
    postagger.load(r'D:\Corpus\ltp_data_v3.4.0\pos.model')
    postags = list(postagger.postag(words))  # POS tagging
    postagger.release()  # release the model
    return postags

def posttagger(words):
    postagger = Postagger()  # initialize the tagger
    postagger.load(
        r'D:\SUFE\ComputerContest\QASystem\DrQA-CN-master\data\ltp_data_v3.4.0\pos.model'
    )
    postags = list(postagger.postag(words))  # POS tagging; copy before releasing
    postagger.release()
    return postags

def words_cixing(words=["中国", "进出口", "银行", "与", "中国银行", "加强", "合作"], type_list=0, pos=0):
    """POS tagging. LTP uses the 863 POS tagset; see http://www.ltp-cloud.com/intro/ for details.

    If type_list is truthy, return the tags as a list, e.g.
        ['ns', 'v', 'n', 'c', 'ni', 'v', 'v']
    If pos is truthy, return word/tag strings, e.g.
        ['中国/ns', '进出口/v', '银行/n', '与/c', '中国银行/ni', '加强/v', '合作/v']
    By default, return the raw tag sequence (a generator-like native vector).
    """
    if type(words) == str:
        words = split_words(words)
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    postags = postagger.postag(words)
    # list-of-string parameters are supported since pyltp 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    if type_list:
        return list(postags)
    if pos:
        return ['{}/{}'.format(k, v) for k, v in zip(words, postags)]
    return postags

def __init__(self, data):
    # data: the raw text to analyze
    self.cws_model_path = os.path.join(self.LTP_DATA_DIR, 'cws.model')  # segmentation model, named `cws.model`
    self.pos_model_path = os.path.join(self.LTP_DATA_DIR, 'pos.model')  # POS tagging model, named `pos.model`
    self.ner_model_path = os.path.join(self.LTP_DATA_DIR, 'ner.model')  # named entity recognition model, named `ner.model`

    segmentor = Segmentor()
    segmentor.load(self.cws_model_path)
    self.words = list(segmentor.segment(data))  # copy before releasing the model
    # print("|".join(self.words))
    segmentor.release()

    postagger = Postagger()              # initialize the tagger
    postagger.load(self.pos_model_path)  # load the model
    self.postags = list(postagger.postag(self.words))  # POS tagging
    # print('\t'.join(self.postags))
    postagger.release()                  # release the model

    recognizer = NamedEntityRecognizer()  # initialize the recognizer
    recognizer.load(self.ner_model_path)  # load the model
    self.netags = list(recognizer.recognize(self.words, self.postags))  # named entity recognition
    # print('\t'.join(self.netags))
    recognizer.release()                  # release the model

# Set your own model path
MODELDIR = os.path.join(ROOTDIR, "ltp_data")

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作!'

sentence = SentenceSplitter.split(paragraph)[0]

segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
words = segmentor.segment(sentence)
print("\t".join(words))

postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
postags = postagger.postag(words)
# list-of-string parameters are supported since pyltp 0.1.5
# postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
print("\t".join(postags))

parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))
arcs = parser.parse(words, postags)
print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))

recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))
netags = recognizer.recognize(words, postags)

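# The example above stops after NER without printing the tags or releasing the
# models; a minimal continuation in the same style (a sketch, not part of the
# original source):
print("\t".join(netags))

segmentor.release()
postagger.release()
parser.release()
recognizer.release()
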
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack
import sys, os

ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
sys.path.append(os.path.join(ROOTDIR, "lib"))

# Set your own model path
MODELDIR = os.path.join("/home/fish/", "ltp_data")

from pyltp import Segmentor, Postagger, NamedEntityRecognizer  # @UnresolvedImport

# Segmentation
segmentor = Segmentor()
segmentor.load(os.path.join(MODELDIR, "cws.model"))
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))

def ltp(sentence):
    words = segmentor.segment(sentence)
    # POS tagging
    postags = postagger.postag(words)
    # Entity recognition
    netags = recognizer.recognize(words, postags)
    l = []
    li = zip(list(words), list(postags), list(netags))
    for a, b, c in li:
        # Drop named entities; the original snippet breaks off here, and keeping
        # only 'O'-tagged (non-entity) words is the apparent intent
        if c == 'O':
            l.append((a, b))
    return l

def main():
    f = open("psgs.txt", "r")
    lines = [line.rstrip() for line in f.readlines()]
    f.close()

    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))

    f = open("../questions/q_facts_segged_clf.txt", "r")
    types = f.readlines()
    f.close()
    f = open("../questions/provided/q_facts.txt", "r")
    questions = [line.rstrip() for line in f.readlines()]
    f.close()

    f = open("psgs_segged.txt", "w")
    fans = open("zhidao_answer.txt", "w")
    i = 0
    qid = 0
    flag = 0
    while i < len(lines):
        line = lines[i]
        if i % 50000 == 0:
            print("\r#\t%d" % i, end="")
            sys.stdout.flush()
        if line.startswith("<question"):
            qid = int(line.split(" ")[1].split("=")[1].split(">")[0])
            flag = 0
            f.write(line + "\n")
        elif line.startswith("</doc") or line.startswith("</question"):
            f.write(line + "\n")
        elif line.startswith("<doc"):
            f.write(line + "\n" + lines[i + 1] + "\n")
            i += 2
        else:
            L = len(line)
            s = 0
            # Find where the answer text starts
            for s in range(L):
                if line[s:].startswith("最佳答案:") \
                        or line[s:].startswith("[专业]答案") \
                        or line[s:].startswith("、" + questions[qid - 1]):
                    break
            if line[s:].startswith("最佳答案"):
                s += 14
            elif line[s:].startswith("[专业]答案"):
                s += 15
            elif line[s:].startswith("、" + questions[qid - 1]):
                s += len(questions[qid - 1]) + 1
            if s < L and flag == 0:
                # Scan forward to where the answer ends
                t = s + 1
                while t < L and not line[t:].startswith("更多") \
                        and not (t + 2 < L and line[t] == " "
                                 and line[t + 1] in "0123456789"
                                 and line[t + 2] in "0123456789") \
                        and not line[t:].startswith("~") \
                        and not line[t:].startswith("?") \
                        and not line[t:].startswith("!") \
                        and not line[t:].startswith("。"):
                    t += 1
                if s < t and t - s < 200 and t - s > 1:
                    ans = line[s:t].rstrip(".。 ??,,")
                    if types[qid - 1].rstrip() == "Q_number":
                        ans = first_con_number(ans)
                    fans.write("%d\t%s\n" % (qid, ans))
                    flag = 1
            # words = segmentor.segment(line)
            # postags = postagger.postag(words)
            # for j in range(len(words)):
            #     f.write("%s/%s\t" % (words[j], postags[j]))
            # f.write("\n")
        i += 1
    f.close()
    fans.close()