def __init__(self, articles_filename='articles.csv', record_filename='record.csv',
             rule_reference_filename='rule_reference.txt', LTP_DIR="ltp_data_v3.4.0/",
             filter_dictionary=['有限公司']):
    self.articles_filename = articles_filename
    self.record_filename = record_filename
    self.rule_reference_filename = rule_reference_filename
    ############################### Load the LTP models #########################################
    self.LTP_DIR = LTP_DIR
    # Word segmentation model
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load(os.path.join(self.LTP_DIR, "cws.model"))
    # POS tagging model
    self.postagger = pyltp.Postagger()
    self.postagger.load(os.path.join(self.LTP_DIR, 'pos.model'))
    # Named entity recognition model
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(os.path.join(self.LTP_DIR, 'ner.model'))
    # Dependency parsing model
    self.parser = pyltp.Parser()
    self.parser.load(os.path.join(self.LTP_DIR, 'parser.model'))
    self.filter_dictionary = filter_dictionary
    self.left_postags_dict = {}
    self.left_word_dict = {}
    self.mid_postags_dict = {}
    self.mid_word_dict = {}
    self.right_postags_dict = {}
    self.right_word_dict = {}
    self.CMP_dict = {}
    self.SBV_dict = {}
    self.VOB_dict = {}
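# A companion sketch, not part of the original class: pyltp models hold native
# resources, so a teardown method mirroring the load calls above is a common pattern.
# The method name `release_models` is hypothetical.
def release_models(self):
    self.segmentor.release()
    self.postagger.release()
    self.recognizer.release()
    self.parser.release()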
def __init__(self, model_dir_path, blacklist_path):
    '''
    model_dir_path: path to the pyltp model files
    blacklist_path: path to the blacklist file
    '''
    # Initialize the model file paths
    self.model_dir_path = model_dir_path
    self.cws_model_path = os.path.join(
        self.model_dir_path, 'cws.model')  # word segmentation model, file name `cws.model`
    self.pos_model_path = os.path.join(
        self.model_dir_path, 'pos.model')  # POS tagging model, file name `pos.model`
    self.ner_model_path = os.path.join(
        self.model_dir_path, 'ner.model')  # named entity recognition model, file name `ner.model`
    # Initialize the segmentation model
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load(self.cws_model_path)
    # Initialize the POS tagging model
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.pos_model_path)
    # Initialize the NER model
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.recognizer.load(self.ner_model_path)
    # Initialize the company-name blacklist
    self.com_blacklist = set()
    with open(blacklist_path, 'r', encoding='utf-8') as f_com_blacklist:
        for line in f_com_blacklist:
            if len(line.strip()) > 0:
                self.com_blacklist.add(line.strip())
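# A minimal extraction sketch on top of the models loaded above (the method name
# `extract_companies` is hypothetical, and it assumes LTP's B/I/E/S-Ni tagging scheme
# for organization names): collect organization entities and drop blacklisted names.
def extract_companies(self, text):
    words = list(self.segmentor.segment(text))
    postags = list(self.postagger.postag(words))
    netags = list(self.recognizer.recognize(words, postags))
    companies, buf = [], ''
    for word, tag in zip(words, netags):
        if tag in ('B-Ni', 'I-Ni'):
            buf += word                      # inside a multi-word organization name
        elif tag in ('E-Ni', 'S-Ni'):
            buf += word                      # end of a multi-word / single-word name
            if buf not in self.com_blacklist:
                companies.append(buf)
            buf = ''
    return companies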
def build_files(self):
    """
    Walk over the raw documents, run word segmentation and POS tagging,
    remove stop words, etc., and build the collection of FileItem objects.
    """
    files = []
    category_id = 0
    segmentor = pyltp.Segmentor()
    segmentor.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\cws.hyp')
    postagger = pyltp.Postagger()
    postagger.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\pos.hyp')
    parser = pyltp.Parser()
    parser.load(r'C:\Users\51694\PycharmProjects\paper\ltp_model\parser.hyp')
    for ids, path in enumerate(self.file_paths()):
        with open(path, 'r', encoding='utf-8') as f:
            try:
                category = self.path2category(path)
                if category not in self.category_ids:
                    self.category_ids[category] = category_id
                    category_id += 1
                raw = self.process_line(f.read())
                words = self.remove_stop_words(list(segmentor.segment(raw)))
                words = self.clean_specific(words)
                pos = list(postagger.postag(words))
                parse_result = list(parser.parse(words, pos))
                files.append(FileItem(ids, category, words, pos, parse_result))
            except UnicodeDecodeError:
                logging.warning(path + ' UTF-8 decoding failed, please check the text encoding')
                continue
    segmentor.release()
    postagger.release()
    parser.release()
    return files
def tag_text(segment_text_state, segment_text_queue, tag_result_queue):
    postagger = pyltp.Postagger()  # instantiate the POS tagging module
    postagger.load(model_path)  # load the POS tagging model
    while (not segment_text_queue.empty()) or segment_text_state.value == 'have':
        # get() removes and returns an item from the head of the queue.
        # If the queue is empty, get() blocks the process for up to `timeout` seconds.
        # If an item becomes available within the timeout, execution continues;
        # otherwise an exception is raised.
        try:
            segment_text_list = segment_text_queue.get(block=True, timeout=0.1)
            tag_text_list = list()
            # All texts produced by one segmentation process go into one list;
            # each list item is one document.
            for text in segment_text_list:
                words_list = text.split("|")
                postags_list = postagger.postag(words_list)
                tag_result_list = list()
                for word, postag in zip(words_list, postags_list):
                    tag_result_list.append(word + "/" + postag)
                tag_result_text = ' '.join(tag_result_list)
                tag_text_list.append(tag_result_text)
            # put() appends an item at the tail of the queue.
            # If the queue is full, put() blocks the child process until a slot frees up.
            tag_result_queue.put(tag_text_list, block=True, timeout=None)
        except:
            pass
    return
def word_posttagger(sentence):
    pos_ = pyltp.Postagger()
    pos_.load(pos_model_path)
    # Postagger.postag expects an iterable of already-segmented words;
    # copy the native result into a plain list before releasing the model.
    result = list(pos_.postag(sentence))
    print(type(result))
    print('\t'.join(result))
    pos_.release()
    return result
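# Usage sketch (assumption: `pos_model_path` is defined at module level and points to an
# LTP `pos.model`). The argument should be a list of already-segmented words, as produced
# by pyltp.Segmentor.segment, not a raw string.
example_words = ['中国', '进出口', '银行', '与', '中国银行', '加强', '合作']
example_tags = word_posttagger(example_words)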
def __init__(self):
    model_path = '/home/lnn/Documents/postag/ltp_data_v3.4.0/'
    self.seg = pyltp.Segmentor()
    self.seg.load(model_path + 'cws.model')
    self.pos = pyltp.Postagger()
    self.pos.load(model_path + 'pos.model')
    self.parser = pyltp.Parser()
    self.parser.load(model_path + 'parser.model')
def ltpSetup():
    LTP_DATA_DIR = './ltp_data_v3.4.0/'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    segmentor = pyltp.Segmentor()
    segmentor.load(cws_model_path)
    postagger = pyltp.Postagger()
    postagger.load(pos_model_path)
    return segmentor, postagger
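# Usage sketch for ltpSetup (assumption: ./ltp_data_v3.4.0/ contains cws.model and
# pos.model): segment first, then tag the resulting word list, and release when done.
segmentor, postagger = ltpSetup()
words = list(segmentor.segment('中国进出口银行与中国银行加强合作'))
postags = list(postagger.postag(words))
print('\t'.join('%s/%s' % (w, p) for w, p in zip(words, postags)))
segmentor.release()
postagger.release()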
def load_model():
    '''
    Load the word segmentation and POS tagging models.
    '''
    segmentor = pyltp.Segmentor()
    segmentor.load("./ltp_data/cws.model")
    postagger = pyltp.Postagger()
    postagger.load("./ltp_data/pos.model")
    return segmentor, postagger
def words_mark(array):
    # POS tagging model path; the model file is named `pos.model`
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    postagger = pyltp.Postagger()  # initialize the tagger
    postagger.load(pos_model_path)  # load the model
    postags = postagger.postag(array)  # POS tagging
    pos_str = ' '.join(postags)
    pos_array = pos_str.split(" ")
    postagger.release()  # release the model
    return pos_array
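# Equivalent sketch (assumption: LTP_DATA_DIR is defined at module level): the
# join/split round-trip above only copies the native result, so list(postags) gives
# the same output directly. The name `words_mark_simple` is hypothetical.
def words_mark_simple(array):
    postagger = pyltp.Postagger()
    postagger.load(os.path.join(LTP_DATA_DIR, 'pos.model'))
    pos_array = list(postagger.postag(array))  # copy before releasing the model
    postagger.release()
    return pos_array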
def __init__(self, config_lib="ltp", config_dict=None, config_stop=None,
             config_dir=None, seg_out_list=False):
    self.input_type = str
    self.config_dir = config_dir
    if config_dir is None:
        self.config_dir = 'E:/Data/' if 'windows' in platform.architecture()[1].lower() \
            else '/users/fanzfeng/Data/'
    self.stop_config = False
    if config_stop is not None and isinstance(config_stop, str) and os.path.exists(config_stop):
        self.stop_config = True
        with open(config_stop, "r", encoding="utf-8") as fp:
            self.stop_words = [k.strip() for k in fp.readlines() if len(k.strip()) > 0]
    elif isinstance(config_stop, (list, tuple, set)) and len(config_stop) > 0:
        self.stop_config = True
        self.stop_words = config_stop
    self.all_cut = False
    self.seg_out_list = seg_out_list
    self.config_lib = config_lib
    if config_lib == "jieba":
        self.jieba_ner = "nr ns nt m".split()
        if config_dict is not None and isinstance(config_dict, str) and os.path.exists(config_dict):
            jieba.load_userdict(config_dict)
        self.seg = jieba.cut
        self.pos_seg = pseg.cut
    elif config_lib == "ltp":
        import pyltp
        self.segmentor = pyltp.Segmentor()
        if config_dict is not None and isinstance(config_dict, str) and os.path.exists(config_dict):
            self.segmentor.load_with_lexicon(
                os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"), config_dict)
        else:
            self.segmentor.load(os.path.join(self.config_dir, "ltp_data_v3.4.0/cws.model"))
        self.seg = self.segmentor.segment
        self.postagger = pyltp.Postagger()
        self.text_splitter = pyltp.SentenceSplitter.split
        self.postagger.load(os.path.join(self.config_dir, "ltp_data_v3.4.0/pos.model"))
        self.recognizer = pyltp.NamedEntityRecognizer()
        self.recognizer.load(self.config_dir + "ltp_data_v3.4.0/ner.model")
def sentence(
        articles: List[Dict],
        project: Path = os.getcwd(),
        ltp_dir=os.path.abspath(os.path.join(os.path.realpath(__file__), "../..")) + '/ltp_data'
) -> List[Dict]:
    logger = hlogger(project)
    start_time = datetime.datetime.now()
    logger.info('Starting to process sentences')
    # Load the LTP models
    # Word segmentation model
    segmentor = pyltp.Segmentor()
    segmentor.load(os.path.join(ltp_dir, "cws.model"))
    # POS tagging model
    postagger = pyltp.Postagger()
    postagger.load(os.path.join(ltp_dir, 'pos.model'))
    # Named entity recognition model
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load(os.path.join(ltp_dir, 'ner.model'))
    if_force = False
    # If a lexicon exists, force the entity tagger to load it
    if os.path.exists(project + '/lexicon'):
        logger.info('Ner will use lexicon')
        if_force = force_segmentor()
        if_force.load(project + '/lexicon')
    logger.info('Processing sentences')
    results = []
    for article in articles:
        result = extract_information(article['id'], article['content'],
                                     segmentor, postagger, recognizer, if_force)
        results.extend(result)
    length = len(results)
    end_time = datetime.datetime.now()
    logger.info('Sentences have been processed successfully, and there are %s sentences'
                % length)
    logger.info('FINISHED! using time : %s\n' % get_time((end_time - start_time).seconds))
    return results
def do_pos(self, intxt):
    words = self.do_seg(intxt)
    if self.postagger is None:
        self.postagger = pyltp.Postagger()
        if self.debug:
            load_start = default_timer()
        self.postagger.load(os.path.join(self.model_dir, 'pos.model'))
        if self.debug:
            load_use = default_timer() - load_start
            self.loger.debug("load pos.model use [ %f ] s" % load_use)
    postags = self.postagger.postag(words)
    return words, list(postags)
def Postagger(self, words=None, sent=None):
    if self.__postagger is None:
        self.__postagger = pyltp.Postagger()
        if self.__seg_lexicon_path is None:
            self.__postagger.load(self.__pos_model_path)
        else:
            self.__postagger.load_with_lexicon(self.__pos_model_path,
                                               self.__seg_lexicon_path)
    postags = None
    if sent is not None:
        words = self.Segmentor(sent)
        postags = self.__postagger.postag(words)
    else:
        postags = self.__postagger.postag(words)
    return postags
def __init__(self):
    self.path = 'ltp_data_v3.4.0/'  # download from https://ltp.ai/download.html, version 3.4.0
    self.segmentor = pp.Segmentor()
    self.segmentor.load(self.path + "cws.model")  # load the word segmentation model
    self.postagger = pp.Postagger()
    self.postagger.load(self.path + "pos.model")  # load the POS tagging model
    self.recognizer = pp.NamedEntityRecognizer()
    self.recognizer.load(self.path + "ner.model")  # load the named entity recognition model
    self.parser = pp.Parser()
    self.parser.load(self.path + "parser.model")  # load the dependency parsing model
    self.labeller = pp.SementicRoleLabeller()
    self.labeller.load(self.path + "pisrl.model")  # load the semantic role labelling model
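# A minimal full-pipeline sketch built on the models loaded above (the method name
# `analyze` is hypothetical, not part of the original class): it chains segmentation,
# POS tagging, NER, dependency parsing and semantic role labelling for one sentence.
def analyze(self, sentence):
    words = list(self.segmentor.segment(sentence))
    postags = list(self.postagger.postag(words))
    netags = list(self.recognizer.recognize(words, postags))
    arcs = self.parser.parse(words, postags)
    roles = self.labeller.label(words, postags, arcs)  # SRL needs the dependency arcs
    heads = ["%d:%s" % (arc.head, arc.relation) for arc in arcs]
    return words, postags, netags, heads, roles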
def __init__(self, *args, **kwargs):
    self.__LTP_DATA_DIR = 'D:\\NLP\\ltp_data'
    self.__cws_model_path = os.path.join(self.__LTP_DATA_DIR, 'cws.model')
    self.__pos_model_path = os.path.join(self.__LTP_DATA_DIR, 'pos.model')
    self.__par_model_path = os.path.join(self.__LTP_DATA_DIR, 'parser.model')
    self.segmentor = pyltp.Segmentor()
    self.segmentor.load_with_lexicon(self.__cws_model_path, './../data/word_dict.txt')
    self.postagger = pyltp.Postagger()
    self.postagger.load(self.__pos_model_path)
    self.parser = pyltp.Parser()
    self.parser.load(self.__par_model_path)
    self.tags_dict = {}
def __init__(self, ltp_path, dependency=False):
    self.dependency = dependency
    cws_model_path = os.path.join(ltp_path, 'cws.model')
    pos_model_path = os.path.join(ltp_path, 'pos.model')
    ner_model_path = os.path.join(ltp_path, 'ner.model')
    dp_model_path = os.path.join(ltp_path, 'parser.model')
    self.seg = pyltp.Segmentor()
    self.pos = pyltp.Postagger()
    self.ner = pyltp.NamedEntityRecognizer()
    # self.srl = pyltp.SementicRoleLabeller()
    self.seg.load(cws_model_path)
    self.pos.load(pos_model_path)
    self.ner.load(ner_model_path)
    # self.srl.load(srl_model_path)
    if dependency:
        self.dp = pyltp.Parser()
        self.dp.load(dp_model_path)
def __init__(self,
             seg_model_path='ltp_data_v3/ltp_data_v3.4.0/cws.model',
             seg_lexicon_path='lexicon/lexicon_test',
             pos_model_path='ltp_data_v3/ltp_data_v3.4.0/pos.model',
             rec_model_path='ltp_data_v3/ltp_data_v3.4.0/ner.model',
             par_model_path='ltp_data_v3/ltp_data_v3.4.0/parser.model'):
    self.seg_lexicon_path = seg_lexicon_path
    self.segmentor = pyltp.Segmentor()
    self.seg_model_path = seg_model_path
    self.segmentor.load_with_lexicon(self.seg_model_path, self.seg_lexicon_path)
    self.postagger = pyltp.Postagger()
    self.pos_model_path = pos_model_path
    self.postagger.load(self.pos_model_path)
    self.recognizer = pyltp.NamedEntityRecognizer()
    self.rec_model_path = rec_model_path
    self.recognizer.load(rec_model_path)
    self.parser = pyltp.Parser()
    self.par_model_path = par_model_path
    self.parser.load(self.par_model_path)
def ltp_init(self):
    import pyltp
    LTP_DATA_DIR = '/nas/data/m1/panx2/lib/ltp/ltp_data_v3.4.0'
    cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
    pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
    ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
    par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')
    self.model_ltp_splitter = pyltp.SentenceSplitter()
    self.model_ltp_segmentor = pyltp.Segmentor()
    self.model_ltp_segmentor.load(cws_model_path)
    self.model_ltp_postagger = pyltp.Postagger()
    self.model_ltp_postagger.load(pos_model_path)
    self.model_ltp_recognizer = pyltp.NamedEntityRecognizer()
    self.model_ltp_recognizer.load(ner_model_path)
    self.model_ltp_dparser = pyltp.Parser()
    self.model_ltp_dparser.load(par_model_path)
    self.parse = self._parse
    self.sent_seger = self.ltp_sent_seger
    self.tokenizer = self.ltp_tokenizer
    self.processor = self.ltp_processor
def _model_initialize(self):
    if self.__segmentor is None:
        self.__segmentor = pyltp.Segmentor()
        if self.__seg_lexicon_path is None:
            self.__segmentor.load(self.__seg_model_path)
        else:
            self.__segmentor.load_with_lexicon(self.__seg_model_path,
                                               self.__seg_lexicon_path)
    if self.__postagger is None:
        self.__postagger = pyltp.Postagger()
        if self.__seg_lexicon_path is None:
            self.__postagger.load(self.__pos_model_path)
        else:
            self.__postagger.load_with_lexicon(self.__pos_model_path,
                                               self.__seg_lexicon_path)
    if self.__recognizer is None:
        self.__recognizer = pyltp.NamedEntityRecognizer()
        self.__recognizer.load(self.__rec_model_path)
    if self.__parser is None:
        self.__parser = pyltp.Parser()
        self.__parser.load(self.__par_model_path)
def ltp_process(sentence):
    stop_words = get_stops_words()  # load stop words; SBV predicates that are stop words (e.g. '是') will be dropped
    # Word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./cws.model")
    words = segmentor.segment(sentence)
    print("\t".join(words))
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./pos.model")
    postags = postagger.postag(words)
    # list-of-string parameter is supported since 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))
    postagger.release()
    # Dependency parsing
    parser = pyltp.Parser()
    parser.load("./parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # Semantic role labelling, not used for now.
    # Placed up front on purpose: after checking that the verb governed by the current SBV
    # is related to "说" (a "say"-type verb), that word has to be extracted; in short, if it
    # is an SBV and also corresponds to A0, the word must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Arguably not rigorous enough: it should not rely on SBV alone, although once
            # an SBV is found this word is necessarily A0.
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate verb the SBV points to
            Subject_label_set.append(words[Index_of_Subjet])  # if there is an SBV, the word at this position must be the subject
            Word_of_speech_content.append(words[arc.head:])  # roughly the part of the sentence after the SBV subject word
            Index_of_Subjet += 1
        else:
            Index_of_Subjet += 1
            continue
    # If the lists are empty, the sentence is not worth analysing
    '''
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./ner.model")
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    return SBV_set, Subject_label_set, Word_of_speech_content
    # Returns lists: the first holds the predicate words of the SBV arcs (HED),
    # the second the subjects of those SBVs. Be sure to check whether they are all [].
import pandas as pd
import literature
import pyltp
import pickle
import os
from trigger_dict import TriggerDict
from math import inf

MIN_SENTENCE_NUM = 140
STOP_WORD_PATH = './相关词表/停用词词表.txt'
LTP_SEGMENT_MODE = './LTP_model/cws.model'
LTP_POS_MODE = './LTP_model/pos.model'
LTP_PARSE_MODE = './LTP_model/parser.model'
SEGMENTOR = pyltp.Segmentor()
POSTARGGER = pyltp.Postagger()
PARSER = pyltp.Parser()
with open('./相关词表/线索词词表.txt', 'r', encoding='utf-8') as f:
    CLUE_WORDS = f.read().splitlines()


def load_model():
    """
    Load the LTP word segmentation, POS tagging and dependency parsing models.
    """
    SEGMENTOR.load(LTP_SEGMENT_MODE)
    POSTARGGER.load(LTP_POS_MODE)
    PARSER.load(LTP_PARSE_MODE)


def release_model():
    """
    Release the LTP word segmentation, POS tagging and dependency parsing models.
    """
    SEGMENTOR.release()
    POSTARGGER.release()
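# Usage sketch for the module-level models above (assumption: the ./LTP_model/ files
# exist on disk): load once, analyse a sentence, then release. Note that release_model()
# as shown only releases SEGMENTOR and POSTARGGER.
load_model()
demo_words = list(SEGMENTOR.segment('中国进出口银行与中国银行加强合作'))
demo_postags = list(POSTARGGER.postag(demo_words))
demo_arcs = PARSER.parse(demo_words, demo_postags)
print([(w, p) for w, p in zip(demo_words, demo_postags)])
print(["%d:%s" % (arc.head, arc.relation) for arc in demo_arcs])
release_model()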
# encoding:utf-8
from deepdive import *
from transform import *
import pyltp
import numpy as np
import os
import sys

# Load the LTP models
LTP_DIR = "/root/transaction/udf/model/ltp_data_v3.4.0"
# Word segmentation model
segmentor = pyltp.Segmentor()
segmentor.load(os.path.join(LTP_DIR, "cws.model"))
# POS tagging model
postagger = pyltp.Postagger()
postagger.load(os.path.join(LTP_DIR, 'pos.model'))
# Named entity recognition model
recognizer = pyltp.NamedEntityRecognizer()
recognizer.load(os.path.join(LTP_DIR, 'ner.model'))
# Dependency parsing model
parser = pyltp.Parser()
parser.load(os.path.join(LTP_DIR, 'parser.model'))


@tsv_extractor
@returns(lambda doc_id="text", sentence_index="int", sentence_text="text",
         tokens="text[]", lemmas="text[]", pos_tags="text[]", ner_tags="text[]",
         dep_types="text[]", dep_tokens="int[]", : [])
def __init__(self):
    self.tagger = pyltp.Postagger()
    self.parser = pyltp.Parser()
    self.tagger.load(path_to_tagger)
    self.parser.load(path_to_parser)
def pos_words(words):
    postagger = pyltp.Postagger()
    postagger.load(ltp_path + 'pos.model')
    postags_lst = [pos for pos in postagger.postag(words)]
    postagger.release()
    return postags_lst
def make_instances(samples, char_voc, word_voc, sentiment_words_path,
                   question2targets, need_augment, is_training, use_extra_feature,
                   ner_dict_path, pos_dict_path, dtype=np.int32):
    # TODO: build sentiment words for own data
    positive_words, negative_words = load_sentiment_words(sentiment_words_path)
    if need_augment:
        samples = instance_augment(samples, question2targets)
    questions = unique_list([sample['question'] for sample in samples])
    question2skeleton = gather_skeleton_indicator(questions)
    if use_extra_feature:
        with open(ner_dict_path, 'rb') as infile:
            ner_dict = pickle.load(infile)
        with open(pos_dict_path, 'rb') as infile:
            pos_dict = pickle.load(infile)
        ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')
        cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
        pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')
        segmentor = pyltp.Segmentor()
        segmentor.load(cws_model_path)
        postagger = pyltp.Postagger()
        postagger.load(pos_model_path)
        recognizer = pyltp.NamedEntityRecognizer()
        recognizer.load(ner_model_path)
    assert set(positive_words) & set(negative_words) == set()
    sentiment_words = positive_words + negative_words
    sentiment_words.sort(key=lambda sen_word: len(sen_word), reverse=True)
    sentiment_words_dic = {}
    for sen_word in sentiment_words:
        if sen_word in positive_words:
            sentiment_words_dic[sen_word] = 1
        else:
            sentiment_words_dic[sen_word] = 0
    for sample in samples:
        assert isinstance(sample, dict)
        que_ww2v_index_sequence = []
        ans_ww2v_index_sequence = []
        ans = sample["answer"]
        que = sample["question"]
        for word in jieba.lcut(que):
            for _ in range(len(word)):
                que_ww2v_index_sequence.append(word_voc[word])
        for word in jieba.lcut(ans):
            for _ in range(len(word)):
                ans_ww2v_index_sequence.append(word_voc[word])
        que_ww2v_index_sequence = np.array(que_ww2v_index_sequence, dtype=dtype)
        ans_ww2v_index_sequence = np.array(ans_ww2v_index_sequence, dtype=dtype)

        def make_extra_index_sequence(str_):
            pos_end = []
            ner_end = []
            words = segmentor.segment(str_)
            postages = postagger.postag(words)
            netags = recognizer.recognize(words, postages)
            for word_index in range(len(list(words))):
                word = list(words)[word_index]
                for _ in range(len(word)):
                    pos_end.append(pos_dict[list(postages)[word_index]])
                    ner_temp = list(netags)[word_index]
                    if '-' in ner_temp:
                        ner_temp = ner_temp[ner_temp.index('-') + 1:]
                    ner_end.append(ner_dict[ner_temp])
            return np.array(ner_end, dtype=dtype), np.array(pos_end, dtype=dtype)

        if use_extra_feature:
            que_ner_index_sequence, que_pos_index_sequence = make_extra_index_sequence(que)
            ans_ner_index_sequence, ans_pos_index_sequence = make_extra_index_sequence(ans)
        else:
            que_ner_index_sequence = np.array([0] * len(que), dtype=dtype)
            que_pos_index_sequence = np.array([0] * len(que), dtype=dtype)
            ans_ner_index_sequence = np.array([0] * len(ans), dtype=dtype)
            ans_pos_index_sequence = np.array([0] * len(ans), dtype=dtype)
        que_cw2v_index_sequence = [char_voc[char] for char in sample['question']]
        que_cw2v_index_sequence = np.array(que_cw2v_index_sequence, dtype=dtype)
        ans_cw2v_index_sequence = [char_voc[char] for char in sample['answer']]
        ans_cw2v_index_sequence = np.array(ans_cw2v_index_sequence, dtype=dtype)
        que_skeleton_label = question2skeleton[que]
        assert len(que_cw2v_index_sequence) == len(que_ww2v_index_sequence)
        assert len(ans_cw2v_index_sequence) == len(ans_ww2v_index_sequence)
        if use_extra_feature:
            assert len(que_ner_index_sequence) == len(que_pos_index_sequence)
            assert len(ans_ner_index_sequence) == len(ans_pos_index_sequence)
            assert len(que_cw2v_index_sequence) == len(que_ner_index_sequence)
            assert len(ans_cw2v_index_sequence) == len(ans_ner_index_sequence)
        if len(que_cw2v_index_sequence) != len(que_skeleton_label):
            print(que)
            print(len(que_cw2v_index_sequence))
            print(len(que_skeleton_label))

        def make_sentiment_polarity_labels(str_):
            # 0: neutral, 1: positive, 2: negative
            ans_temp = str_
            sentiment_polarity_labels = np.array([0] * len(str_), dtype=dtype)
            for sen_word in sentiment_words:
                if sen_word in ans_temp and sen_word in str_:
                    if sentiment_words_dic[sen_word] == 1:
                        sentiment_polarity_labels[str_.index(sen_word):
                                                  str_.index(sen_word) + len(sen_word)] = \
                            np.ones(len(sen_word))
                        ans_temp = ans_temp[:ans_temp.index(sen_word)] + \
                            ans_temp[ans_temp.index(sen_word) + len(sen_word):]
                    else:
                        sentiment_polarity_labels[str_.index(sen_word):
                                                  str_.index(sen_word) + len(sen_word)] = \
                            np.array([2] * len(sen_word), dtype=dtype)
                        ans_temp = ans_temp[:ans_temp.index(sen_word)] + \
                            ans_temp[ans_temp.index(sen_word) + len(sen_word):]
            return sentiment_polarity_labels

        def make_indicate_target_labels():
            # 0: not, 1: is
            specify_target = str(sample["target"])
            indicate_target_labels = np.array([0] * len(que), dtype=dtype)
            if specify_target in que:
                indicate_target_labels[que.index(specify_target):
                                       que.index(specify_target) + len(specify_target)] = \
                    np.ones(len(specify_target), dtype=dtype)
            return indicate_target_labels

        ans_sentiment_polarity_labels = make_sentiment_polarity_labels(ans)
        que_sentiment_polarity_labels = make_sentiment_polarity_labels(que)
        indicate_target_labels = make_indicate_target_labels()
        sample.update({
            'que_ww2v_index_sequence': que_ww2v_index_sequence,
            'ans_ww2v_index_sequence': ans_ww2v_index_sequence,
            'que_cw2v_index_sequence': que_cw2v_index_sequence,
            'ans_cw2v_index_sequence': ans_cw2v_index_sequence,
            'ans_sentiment_polarity_labels': ans_sentiment_polarity_labels,
            'que_sentiment_polarity_labels': que_sentiment_polarity_labels,
            'que_indicate_target_labels': indicate_target_labels,
            'ans_indicate_target_labels': np.array([0] * len(ans), dtype=dtype),
            'que_skeleton_label': que_skeleton_label,
            'que_ner_index_sequence': que_ner_index_sequence,
            'que_pos_index_sequence': que_pos_index_sequence,
            'ans_ner_index_sequence': ans_ner_index_sequence,
            'ans_pos_index_sequence': ans_pos_index_sequence
        })
        # 'question_id': sen_voc[sample["question"]],
        # 'answer_id': sen_voc[sample["answer"]]
    return samples
def ltp_process(sentence, old_SI={}):
    stop_words = get_stops_words()  # load stop words; SBV predicates that are stop words (e.g. '是') will be dropped
    # Word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    postagger.release()
    # Dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    # nh: person name, nd: direction noun, n: general noun, ni: organization name, nl: location noun
    # ns: geographical name, nt: temporal noun, nz: other proper noun
    SI_words = {}  # word -> index
    for tag in noun_tags:
        # Find the positions of words whose POS tag is in noun_tags
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()  # SI_index stores the positions of words with this tag
        for j in SI_index:
            # Locate the word and record its position (e.g. 蒋丽芸)
            SI_words[words[j]] = j
    # print(SI_words)
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Arguably not rigorous enough: it should not rely on SBV alone, although once
            # an SBV is found this word is necessarily A0.
            # print(arc.head, words[arc.head - 1])
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate verb the SBV points to
            # Subject handling
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: if old_SI contains the same role, replace the
                # pronoun with the highest-scoring entity. A correction is still needed:
                # noun phrases such as 习近平+总书记 should be treated as one word, or
                # 习近平 should get the same weight as 总书记.
                if old_SI:
                    ag2entity = np.argmax(old_SI.params.keys())
                    words[Index_of_Subjet] = list(old_SI.params.keys())[ag2entity]
                else:
                    pass
                Subject_label_set.append(words[Index_of_Subjet])
            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so this position must be the subject
            # SI_postag[words[Index_of_Subjet].split(':')[1]] = Index_of_Subjet
            if postags[arc.head - 1] == 'v':
                si_SBV_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the quoted speech content
            # print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag
        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # Object handling
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: would need the object position and the
                # highest-scoring entity from the previous sentence
                pass
            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so this position must be the subject
            si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag
        else:
            Index_of_Subjet += 1
            continue
    # If the lists are empty, the sentence is not worth analysing
    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set; self.params needs updating
    Forcus_point.score()
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point
    # Returns lists: the first holds the predicate words of the SBV arcs (HED),
    # the second the subjects of those SBVs; the results may be empty.
def load_model():
    segmentor = pyltp.Segmentor()
    segmentor.load("./ltp_data/cws.model")
    postagger = pyltp.Postagger()
    postagger.load("./ltp_data/pos.model")
    return segmentor, postagger
def postag(words):
    global postagger_
    if postagger_ is None:
        postagger_ = pyltp.Postagger()
        postagger_.load(ltp_models['pos'])
    return postagger_.postag(words)
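# Usage sketch for the lazily-loaded tagger above (assumptions: `postagger_` is a
# module-level global initialised to None and `ltp_models['pos']` points to an LTP
# `pos.model`): the model is loaded on the first call and reused afterwards.
first_tags = list(postag(['中国', '进出口', '银行']))   # triggers the model load
second_tags = list(postag(['加强', '合作']))            # reuses the loaded model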
def ltp_process(sentence, old_SI={}):
    stop_words = get_stops_words()  # load stop words; SBV predicates that are stop words (e.g. '是') will be dropped
    # Word segmentation
    segmentor = pyltp.Segmentor()
    segmentor.load("./model/cws.model")
    words = segmentor.segment(sentence)
    # print("\t".join(words))
    segmentor.release()
    # POS tagging
    postagger = pyltp.Postagger()
    postagger.load("./model/pos.model")
    postags = postagger.postag(words)  # list-of-string parameter is supported since 0.1.5
    # print("\t".join(postags))
    postagger.release()
    # Dependency parsing
    parser = pyltp.Parser()
    parser.load("./model/parser.model")
    arcs = parser.parse(words, postags)
    parser.release()
    # Placed up front on purpose: after checking that the verb governed by the current SBV
    # is related to "说" (a "say"-type verb), that word has to be extracted; in short, if it
    # is an SBV and also corresponds to A0, the word must be the subject.
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./model/pisrl.model")
    roles = labeller.label(words, postags, arcs)
    # print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    # Dependency analysis
    noun_tags = ['nh', 'nd', 'n', 'ni', 'nl', 'ns', 'nt', 'nz']
    SI_words = {}  # word -> index
    for tag in noun_tags:
        SI_index = np.argwhere(np.array(postags) == tag).reshape(-1).tolist()
        for j in SI_index:
            SI_words[words[j]] = j
    # print(SI_words)
    SBV_set = list()
    Subject_label_set = list()
    Word_of_speech_content = list()
    Index_of_Subjet = 0
    SI_postags = {}
    si_SBV_postag = []
    si_VOB_postag = []
    for arc in arcs:
        # SBV_index = get_repeat(arc.head, "SBV")
        k = Index_of_Subjet
        if arc.relation == "SBV" and words[arc.head - 1] not in stop_words:
            # Arguably not rigorous enough: it should not rely on SBV alone, although once
            # an SBV is found this word is necessarily A0.
            # print(arc.head, words[arc.head - 1])
            SBV_set.append(words[arc.head - 1])  # arc.head counts from 1; store the predicate verb the SBV points to
            # Subject handling
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: if old_SI contains the same role, replace the
                # pronoun with the highest-scoring entity. A correction is still needed:
                # noun phrases such as 习近平+总书记 should be treated as one word, or
                # 习近平 should get the same weight as 总书记.
                if old_SI:
                    ag2entity = np.argmax(old_SI.params.keys())
                    words[Index_of_Subjet] = list(old_SI.params.keys())[ag2entity]
                else:
                    pass
                Subject_label_set.append(words[Index_of_Subjet])
            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so this position must be the subject
            # SI_postag[words[Index_of_Subjet].split(':')[1]] = Index_of_Subjet
            if postags[arc.head - 1] == 'v':
                si_SBV_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Word_of_speech_content.append(intro_speech(''.join(words[arc.head:])))  # extract the quoted speech content
            # print(intro_speech(''.join(words[arc.head:])))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_SBV_postag
        elif arc.relation == 'VOB' and words[arc.head - 1] not in stop_words:
            # Object handling
            if words[Index_of_Subjet] in ['他', '他们', '你', '你们', '我', '我们', '她', '她们']:
                # Coreference resolution: would need the object position and the
                # highest-scoring entity from the previous sentence
                pass
            else:
                Subject_label_set.append(words[Index_of_Subjet])  # not a pronoun, so this position must be the subject
            si_VOB_postag.append((words[Index_of_Subjet], Index_of_Subjet))
            Index_of_Subjet += 1
            SI_postags[arc.relation] = si_VOB_postag
        else:
            Index_of_Subjet += 1
            continue
    # If the lists are empty, the sentence is not worth analysing
    Forcus_point = Si(SI_words, SI_postags, old_SI)  # focus-of-attention set; self.params needs updating
    Forcus_point.score()
    recognizer = pyltp.NamedEntityRecognizer()
    recognizer.load("./model/ner.model")
    netags = recognizer.recognize(words, postags)
    # print("\t".join(netags))
    '''
    labeller = pyltp.SementicRoleLabeller()
    labeller.load("./pisrl.model")
    roles = labeller.label(words, postags, arcs)
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))
    '''
    return SBV_set, Subject_label_set, Word_of_speech_content, Forcus_point
    # Returns lists: the first holds the predicate words of the SBV arcs (HED),
    # the second the subjects of those SBVs; the results may be empty.