class Ner:
    """Extract place-name ('Ns') entity spans from sentences with LTP."""

    def __init__(self):
        # Default LTP pipeline (segmentation + NER).
        self.ltp = LTP()

    def preprocess(self, sent):
        """Strip ALL whitespace from *sent* (LTP input must contain no spaces)."""
        # BUGFIX: raw string — '\s+' is an invalid escape sequence in a
        # normal string literal (SyntaxWarning on modern Python).
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        """Return, for each sentence in *sents*, a list of [start, end)
        character spans of 'Ns' (place-name) entities.

        Empty sentences are allowed and yield empty result lists; the output
        list is aligned 1:1 with the input list.
        """
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if len(psents) == 0:
            return [[] for x in sents]
        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)
        anes = []
        for sseg, sne in zip(segment, ne):
            nes = []
            # Prefix sums of word lengths: slens[k] = char offset of word k.
            slens = [0] + [len(x) for x in sseg]
            for i in range(1, len(slens)):
                slens[i] += slens[i - 1]
            for t, x, y in sne:
                if t == 'Ns':
                    # LTP gives an inclusive word span [x, y];
                    # convert to a half-open character span.
                    nes.append([slens[x], slens[y + 1]])
            anes.append(nes)
        # Re-align results with the original input, restoring empty sentences.
        fnes = []
        cur = 0
        for s in sents:
            if s == '':
                fnes.append([])
            else:
                fnes.append(anes[cur])
                cur += 1
        return fnes
class NamedEntity:
    def __init__(self, user_dict):
        self.ltp = LTP()  # loads the Small model by default
        # user_dict is a dictionary file; max_window is the maximum
        # forward-segmentation window.
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """
        Named-entity recognition.

        :param text: original text (list with one sentence, as expected by ltp.seg)
        :return: named entities extracted from the first sentence
        """
        seg, hidden = self.ltp.seg(text)  # segmentation
        ner = self.ltp.ner(hidden)
        entity = []
        for tag, start, end in ner[0]:
            # BUGFIX: join the whole inclusive token span [start, end];
            # the original appended only the first token of each entity
            # (seg[0][start:end+1][0]), truncating multi-word entities.
            entity.append("".join(seg[0][start:end + 1]))
        return entity
def new_generate_ltp_results():
    """Run the full LTP pipeline over the train_base sentences, pickle all
    results to 'new_ltp_results.pk' and return them."""
    # Load the model from the local checkpoint directory.
    ltp = LTP(path='../../ltp_models/base1')

    # Read the original sentences.
    data = read_file_in_ltp('../data/train_base.json')
    sentences = [item['content'] for item in data]

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        words, hidden = ltp.seg([sent])                     # segmentation
        segmented.append(words[0])
        pos.append(ltp.pos(hidden)[0])                      # POS tagging
        ner.append(ltp.ner(hidden)[0])                      # named-entity recognition
        srl.append(ltp.srl(hidden)[0])                      # semantic role labelling
        dep.append(ltp.dep(hidden)[0])                      # dependency parsing
        sdp_tree.append(ltp.sdp(hidden, mode='tree')[0])    # semantic dependencies (tree)
        sdp_graph.append(ltp.sdp(hidden, mode='graph')[0])  # semantic dependencies (graph)

    # Build the sentence <-> segmentation correspondence.
    sent_seg_matches = sentence_segment_match(data, segmented)
    pickle.dump(
        [segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches],
        open('new_ltp_results.pk', 'wb'))
    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
class Server(object):
    """Tornado-based LTP analysis server (batched prediction + HTTP serving)."""

    def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
        # NOTE(review): the `onnx` flag is accepted but unused here (compare the
        # FastLTP variant elsewhere in this file) — confirm whether intended.
        self.ltp = LTP(path=path, device=device)
        # Split a list into consecutive chunks of at most batch_size items.
        self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        """Merge segmentation, POS and dependency results into word dicts.

        Offsets are character offsets within the sentence, accumulated from the
        previous word's offset + length.  A virtual 'root' entry seeds the
        accumulation and is dropped before returning.
        """
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,                 # LTP ids are 1-based; store 0-based
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,         # -1 therefore means the root
                'relation': relation,
                'roles': [],                  # filled later from SRL
                'parents': []                 # filled later from SDP
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        """Run seg/pos/ner/srl/dep/sdp over *sentences* (batched) and assemble
        one result dict per sentence: {'text', 'nes', 'words'}."""
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden, fast=False)
            batch_sdp = self.ltp.sdp(hidden, mode='mix')
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                # Attach semantic roles: each word carries the roles of which
                # it is the predicate; spans are inclusive token ranges.
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                # Semantic-dependency edges (1-based ids -> 0-based indices).
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                # Named entities; NOTE(review): 'offset' here is the token
                # index, not a character offset — confirm consumers expect that.
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result

    def serve(self, port: int = 5000, n_process: int = None):
        """Start the HTTP server.

        :param port: TCP port to bind
        :param n_process: worker process count; defaults to 1 on Windows
            (no fork) and 8 elsewhere
        """
        if n_process is None:
            n_process = 1 if sys.platform == 'win32' else 8
        # Route all logging (console + server.log) through one formatter.
        fmt = LogFormatter(fmt='%(asctime)s - %(levelname)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S',
                           color=True)
        root_logger = logging.getLogger()
        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler('server.log')
        console_handler.setFormatter(fmt)
        file_handler.setFormatter(fmt)
        root_logger.addHandler(console_handler)
        root_logger.addHandler(file_handler)
        app_log.setLevel(logging.INFO)
        gen_log.setLevel(logging.INFO)
        access_log.setLevel(logging.INFO)
        # NOTE(review): the model was already loaded in __init__; these two
        # messages are emitted back-to-back.
        app_log.info("Model is loading...")
        app_log.info("Model Has Been Loaded!")
        # Every path is handled by LTPHandler, which receives this instance.
        app = Application([(r"/.*", LTPHandler, dict(ltp=self))])
        server = HTTPServer(app)
        server.bind(port)
        server.start(n_process)
        ioloop.IOLoop.instance().start()
## #--------从测试看出来,ner本身对问题有干扰,所以在kglist里面要去除. # tiaozhuan=searchKG(kglist=['地点','地址','大小','颜色','老婆','丈夫'],text='我家住在和平区哪个地方') # print(tiaozhuan,"jieguo shi !!!!!!!!!!!!!!!!") ## # 加入句子成分跳转. seg, hidden = ltp.seg([text]) # sdp = ltp.sdp(hidden, graph=False) print(seg, "seg") pos = ltp.pos(hidden) ner = ltp.ner(hidden) print("ner", ner) srl = ltp.srl(hidden) dep = ltp.dep(hidden) sdp = ltp.sdp(hidden) print(ner, "ner结果") seg = seg[0] dep = dep[0] sdp = sdp[0] print(sdp, "语义分析!!!!!!!!!!!!!!!!!!!") # 太难用了. print(dep) for i in dep: #dep算法目前识别不出来老婆的跳转. print(i, seg[i[0] - 1], seg[i[1] - 1]) # 注意下标会多一个, 箭1后为真正下标. '''
def save_as_txt(data):
    """For each record write a cleaned .txt file plus a brat-style .ann file.

    Each row of *data* is (id, school_id, text).  The text is re-wrapped into
    chunks of at most 100 characters, saved to
    ./data/<school_id>/<school_id>-<id>.txt, then LTP NER and a few regexes
    produce annotations: person names (姓名, T3xx), known school organisations
    (单位, T4xx), job titles (职称, T5xx) and dates (日期, T6xx).

    Example .ann lines:
        T2  报告人 68 71  曹进德
        R2  报告人_单位 Arg1: T2 Arg2: T1
    """
    from ltp import LTP
    ltp = LTP()
    for row in data:
        # Avoid shadowing the builtin `id`.
        row_id = row[0]
        # Zero-pad the school id to 4 digits.
        school_id = ("000" + str(row[1]))[-4:]
        texts = row[2]

        # Re-wrap over-long lines into <= 100-character chunks.
        shortened_textlines = []
        for line in texts.split('\n'):
            line_len = len(line)
            if line_len > 100:
                # BUGFIX: ceil-divide so the trailing partial chunk is kept;
                # the original floor division silently dropped it.
                for i in range((line_len + 99) // 100):
                    shortened_textlines.append(line[i * 100:(i + 1) * 100])
            else:
                shortened_textlines.append(line)
        text = ' '.join(shortened_textlines)

        path = './data/' + str(school_id)
        if not os.path.exists(path):
            os.makedirs(path)
        # `with` closes the file; the original's explicit close() was redundant.
        with open(path + '/' + str(school_id) + "-" + str(row_id) + ".txt",
                  'w', encoding='UTF-8') as file:
            file.write(text)
        print("\r已保存 " + str(school_id) + "-" + str(row_id) + ".txt", end="")

        # NER over the cleaned text.
        seg, hidden = ltp.seg([text])
        ner = ltp.ner(hidden)
        ner_info = []
        entities_nh = []  # person names (LTP tag Nh)
        entities_ni = []  # organisations (LTP tag Ni) restricted to known schools
        for tag, start, end in ner[0]:
            # LTP spans are inclusive token ranges.
            entity = "".join(seg[0][start:end + 1])
            if tag == 'Nh':
                if len(entity) > 1:  # skip single-character "names"
                    entities_nh.append(entity)
            elif tag == 'Ni':
                if entity in schoolnames:
                    entities_ni.append(entity)

        # Person-name annotations (T300, T301, ...).
        # BUGFIX: the counter now spans all entities; the original reset it per
        # entity, producing duplicate brat ids (every entity started at T300).
        count = 0
        for entity in set(entities_nh):
            # BUGFIX: re.escape so entities containing regex metacharacters
            # are matched literally instead of crashing or mis-matching.
            for record in re.compile(re.escape(entity)).finditer(text):
                ner_info.append("T" + str(300 + count) + "\t姓名 " + str(record.span()[0])
                                + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
                count += 1

        # Organisation annotations (T400, ...).
        count = 0
        for entity in set(entities_ni):
            for record in re.compile(re.escape(entity)).finditer(text):
                ner_info.append("T" + str(400 + count) + "\t单位 " + str(record.span()[0])
                                + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
                count += 1

        # Job-title annotations (T500, ...).
        count = 0
        for record in re.compile('教授|副教授|讲师|研究员|副研究员|助理教授|助理研究员').finditer(text):
            ner_info.append("T" + str(500 + count) + "\t职称 " + str(record.span()[0])
                            + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
            count += 1

        # Date annotations (T600, ...).
        date_1 = r"([0-9]+年[0-9]+月[0-9]+日)"  # e.g. 2021年3月5日
        date_2 = r"([零〇一二三四五六七八九]年[十]?[一二三四五六七八九]月[一二三]?[十]?[一二三四五六七八九十]日)"
        date_3 = r"([0-9]+月[0-9]+日)"  # month-day fallback, e.g. 3月5日
        flag = False
        count = 0
        ## Method 1: numeric full dates; fall back to month-day when none found.
        for record in re.compile(date_1).finditer(text):
            ner_info.append("T" + str(600 + count) + "\t日期 " + str(record.span()[0])
                            + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
            count += 1
            flag = True
        if flag is False:
            for record in re.compile(date_3).finditer(text):
                ner_info.append("T" + str(600 + count) + "\t日期 " + str(record.span()[0])
                                + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
                count += 1
        ## Method 2: Chinese-numeral dates are always collected as well.
        for record in re.compile(date_2).finditer(text):
            ner_info.append("T" + str(600 + count) + "\t日期 " + str(record.span()[0])
                            + " " + str(record.span()[1]) + "\t" + str(record.group()) + "\n")
            count += 1

        with open(path + '/' + str(school_id) + "-" + str(row_id) + ".ann",
                  'w', encoding='UTF-8') as file:
            file.writelines(ner_info)
        print("\r已保存 " + str(school_id) + "-" + str(row_id) + ".ann", end="")
class Run(object):
    """Command-line front end for LTP: batched prediction, model export and
    sanity checks for the pre-segmented code path."""

    def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        # Split a list into consecutive chunks of at most batch_size items.
        self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        """Merge segmentation, POS and dependency results into word dicts.

        Character offsets accumulate from the previous word; a virtual 'root'
        entry seeds the accumulation and is dropped before returning.
        """
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,              # 1-based LTP id -> 0-based
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,      # -1 therefore means the root
                'relation': relation,
                'roles': [],               # filled from SRL
                'parents': []              # filled from SDP
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        """Run seg/pos/ner/srl/dep/sdp over *sentences* (batched) and assemble
        one result dict per sentence: {'text', 'nes', 'words'}."""
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                # Semantic roles: spans are inclusive token ranges.
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                # Semantic-dependency edges (1-based ids -> 0-based indices).
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                # Named entities; NOTE(review): 'offset' is the token index
                # here, not a character offset — confirm consumers expect that.
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result

    def test(self, sentences: List[str] = None):
        """Smoke test: predict a sample sentence and pretty-print the JSON."""
        # Register a custom word so it segments as one token.
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        """Export model weights to an .npz archive plus config.json.

        Parameter names are normalised (gamma/beta -> weight/bias) for
        compatibility with other runtimes.
        """
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())
        np.savez(out, **nps)
        config = self.ltp.config
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(config, f, indent=2)

    def test_seged(self):
        """Check that feeding the segmenter's own output back with
        is_preseged=True reproduces the same hidden states and segmentation."""
        import torch
        sentences = [
            'My name is tom.', 'He called Tom to get coats.', '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。', "我去长江大桥玩。"
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                # Sum of elementwise differences; 0 means identical tensors.
                test = torch.sum(value.float() - hidden_seged[key].float()).numpy()
                print(key, test)
        print(seg == seged)
class Server(object):
    """LTP analysis server core (FastLTP/ONNX-capable variant): batched
    prediction assembling per-sentence result dicts."""

    def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device)
        else:
            self.ltp = LTP(path=path, device=device)
        # Split a list into consecutive chunks of at most batch_size items.
        self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        """Merge segmentation, POS and dependency results into word dicts.

        Character offsets accumulate from the previous word; a virtual 'root'
        entry seeds the accumulation and is dropped before returning.
        """
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (id, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': id - 1,              # 1-based LTP id -> 0-based
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,      # -1 therefore means the root
                'relation': relation,
                'roles': [],               # filled from SRL
                'parents': []              # filled from SDP
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        """Run seg/pos/ner/srl/dep/sdp over *sentences* (batched) and assemble
        one result dict per sentence: {'text', 'nes', 'words'}."""
        result = []
        for sentences_batch in self.split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                # Semantic roles: spans are inclusive token ranges.
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        offset = words[start]['offset']
                        word['roles'].append({
                            'text': text,
                            'offset': offset,
                            'length': len(text),
                            'type': role
                        })
                # Semantic-dependency edges (1-based ids -> 0-based indices).
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({
                        'parent': end - 1,
                        'relate': label
                    })
                # Named entities; NOTE(review): 'offset' is the token index
                # here, not a character offset — confirm consumers expect that.
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result
class CnProcessor:
    r"""
    Chinese text processor implementing NER and POS tagging via LTP.
    Implemented as a thread-safe singleton with lazily created LTP models.
    """
    _instance_lock = threading.Lock()

    def __init__(self):
        # Lazily created LTP instances (one for NER, one for POS).
        self.__ner = None
        self.__pos = None

    # Single-instance mode: classic double-checked locking on __new__.
    def __new__(cls, *args, **kwargs):
        if not hasattr(CnProcessor, "_instance"):
            with CnProcessor._instance_lock:
                if not hasattr(CnProcessor, "_instance"):
                    CnProcessor._instance = object.__new__(cls)
        return CnProcessor._instance

    @staticmethod
    def word_tokenize(sent):
        r"""
        Tokenize a sentence into individual characters.

        :param str sent: the sentence to be tokenized
        :return: list, the tokens (characters) in it
        """
        assert isinstance(sent, str)
        return [word for word in sent]

    def get_ner(self, sentence):
        r"""
        NER function.

        :param sentence: the sentence (str, or list of strings to concatenate)
        :return: two forms of tags
            The first is the triple form (tag, start, end) with CHARACTER
            offsets (inclusive end).
            The second is the list form, which marks the NER label of each
            character, e.g. 周小明去玩 -> ['Nh', 'Nh', 'Nh', 'O', 'O']
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into a single sentence string.
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp
        if not sentence:
            return [], []
        if self.__ner is None:
            # Lazily create the LTP model on first use.
            self.__ner = LTP()
        seg, hidden = self.__ner.seg([sentence])
        seg = seg[0]
        ner = self.__ner.ner(hidden)
        ner = ner[0]
        ner_label = len(sentence) * ['O']
        for i in range(len(ner)):
            tag, start, end = ner[i]
            # Convert word-level indices to character offsets by summing the
            # lengths of the preceding words.
            tmp = 0
            for j in range(start):
                tmp += len(seg[j])
            start = tmp
            tmp = 0
            for j in range(end + 1):
                tmp += len(seg[j])
            end = tmp
            # Store the inclusive character span [start, end - 1].
            ner[i] = (tag, start, end - 1)
            for j in range(start, end):
                ner_label[j] = tag
        return ner, ner_label

    def get_pos_tag(self, sentence):
        r"""
        POS-tagging function.

        :param sentence: the sentence (str, or list of strings to concatenate)
        :return: list of triples [tag, start, end] with CHARACTER offsets
            (inclusive end), one per word
        """
        assert isinstance(sentence, (list, str))
        from ltp import LTP
        if isinstance(sentence, list):
            # Turn the list into a single sentence string.
            tmp = ''
            for word in sentence:
                tmp += word
            sentence = tmp
        if not sentence:
            return []
        if self.__pos is None:
            # Lazily create the POS tagger on first use.
            self.__pos = LTP()
        seg, hidden = self.__pos.seg([sentence])
        pos = self.__pos.pos(hidden)
        seg = seg[0]
        pos = pos[0]
        pos_tag = []
        cnt = 0
        for tag in range(len(pos)):
            # Each entry covers the character span of the corresponding word.
            pos_tag.append([pos[tag], cnt, cnt + len(seg[tag]) - 1])
            cnt += len(seg[tag])
        return pos_tag
class NLP:
    """Natural-language processing wrapper around LTP: word segmentation,
    POS tagging, named-entity recognition and dependency parsing.

    Attributes:
        default_user_dict_dir: str, directory of user-defined dictionaries
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # Load the LTP model.
        self.ltp = LTP(model_type)
        # Register every user dictionary file (legal-domain dictionaries);
        # loaded into memory, which is faster than on-disk lookup.
        files = os.listdir(user_dict_dir)
        for file in files:
            file_path = os.path.join(user_dict_dir, file)
            # BUGFIX: test the joined path — the original checked the bare
            # file name, so sub-directories were never actually skipped.
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)

    def segment(self, sentence, entity_postag=None):
        """Segment a sentence, optionally registering entity words first.

        Args:
            sentence: str, the sentence to segment
            entity_postag: dict, entity->POS mapping produced when analysing a
                structured case document; None/empty means no extra entities
                (replaces the original mutable default `dict()` — same behavior)
        Returns:
            (list, hidden): segmentation of the sentence and the LTP hidden
            state needed by the downstream pos/ner/dep calls
        """
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """POS-tag a segmented sentence.

        Args:
            segment: list, segmentation result
            hidden: LTP hidden state from segment()
        Returns:
            words: WordUnit list with segmentation and POS information
        """
        words = []
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # WordUnit ids start at 1.
            words.append(WordUnit(i + 1, segment[i], postags[0][i]))
        return words

    def get_postag(self, word):
        """Return the POS tag of a single word.

        Args:
            word: str, the word
        Returns:
            str, its POS tag
        """
        _, hidden = self.ltp.seg([word], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        return post_tag[0]

    def netag(self, words, hidden):
        """Run NER and merge entity tokens into the word list.

        Args:
            words: WordUnit list with segmentation and POS results
            hidden: LTP hidden state
        Returns:
            words_netag: WordUnit list including NER results
        """
        # NOTE: the original also built unused lemma/postag lists here — removed.
        netags = self.ltp.ner(hidden, as_entities=False)
        return EntityCombine.combine(words, netags[0])

    def parse_seged(self, words):
        """Dependency-parse an already segmented/tagged word list.

        Args:
            words: WordUnit list
        Returns:
            SentenceUnit wrapping the words with head/dependency filled in
        """
        lemmas = [word.lemma for word in words]
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        # arcs[i] = (id, head, relation)
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """Dependency-parse words after segmentation/POS/NER.

        Args:
            words: WordUnit list (possibly NER-merged)
            hidden: LTP hidden state
        Returns:
            SentenceUnit wrapping the words with head/dependency filled in
        """
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """Release NLP resources (nothing to free for LTP)."""
        pass
class NLP:
    """
    A thin wrapper over LTP analysis results.
    """

    def __init__(self, default_model_dir=LTP4_MODEL_DIR):
        self.ltp = LTP(path=default_model_dir)

    def segment(self, sentences):
        # Returns (lemmas, hidden): per-sentence word lists plus the LTP
        # hidden state consumed by the pos/ner/dep calls below.
        lemmas, hidden = self.ltp.seg(sentences)
        return lemmas, hidden

    def postag(self, lemmas, hidden):
        """
        Build WordUnit lists from the POS-tagging result.
        :param lemmas: per-sentence word lists from segment()
        :param hidden: LTP hidden state
        :return: per-sentence lists of WordUnit (word ids start at 1)
        """
        words = []
        postags = self.ltp.pos(hidden)
        for idx_sent, postags_sent in enumerate(postags):
            words_sent = []
            for i in range(len(postags_sent)):
                # Word ids start at 1.
                word = WordUnit(i + 1, lemmas[idx_sent][i], postags_sent[i])
                words_sent.append(word)
            words.append(words_sent)
        # for i in range(len(postags)):
        #     word = WordUnit(i+1, lemmas[i], postags[i])
        #     words.append(word)
        return words

    def nertag(self, words, hidden):
        """
        Enrich words with NER results, using them to correct/refine the POS
        tags, e.g. n -> ni/ns/nl.
        :param words: per-sentence WordUnit lists from postag()
        :param hidden: LTP hidden state
        :return: the same word lists with nertag/postag updated in place
        """
        # Nh = person name, Ni = organisation name, Ns = place name
        nertags = self.ltp.ner(hidden)
        '''
        Triple extraction uses the NER information: some tokens have to be
        merged into a single word after NER analysis.
        NOTE: NER may merge several tokens into one word.
        Example:
        [['高克', '访问', '中国', ',', '并', '在', '同济', '大学', '发表', '演讲', '。']]
        [['nh', 'v', 'ns', 'wp', 'c', 'p', 'nz', 'n', 'v', 'v', 'wp']]
        [[('Nh', 0, 0), ('Ns', 2, 2), ('Ni', 6, 7)]]
        [[(1, 2, 'SBV'), (2, 0, 'HED'), (3, 2, 'VOB'), (4, 2, 'WP'), (5, 9, 'ADV'),
          (6, 9, 'ADV'), (7, 8, 'ATT'), (8, 6, 'POB'), (9, 2, 'COO'), (10, 9, 'VOB'),
          (11, 2, 'WP')]]
        '''
        ner2pos = {'Nh': 'nh', 'Ns': 'ns', 'Ni': 'ni'}
        n = 1  # NOTE(review): unused leftover variable
        #for i in range(len(words)):
        for idx_sent, nertags_sent in enumerate(nertags):
            for item in nertags_sent:
                # item = (tag, start, end) with an inclusive token span.
                for i in range(item[1], item[2] + 1):
                    words[idx_sent][i].nertag = item[0]
                    words[idx_sent][i].postag = ner2pos[item[0]]
        # for item in nertags:
        #     for i in range(item[1], item[2]+1):
        #         words[i].postag = ner2pos[item[0]]
        return words

    def dependency(self, words, hidden):
        """
        Attach dependency-parse information (head + relation) to the words,
        for later triple extraction.
        :param words: per-sentence WordUnit lists
        :param hidden: LTP hidden state
        :return: list of SentenceUnit, one per sentence
        """
        sentences = []
        dep = self.ltp.dep(hidden)
        for idx_sent, dep_sent in enumerate(dep):
            for i in range(len(words[idx_sent])):
                if i < len(
                        dep_sent
                ):  # e.g. [(1, 2, 'ATT'), (2, 3, 'ATT')] omits (3, 0, 'HED')
                    words[idx_sent][i].head = dep_sent[i][1]
                    words[idx_sent][i].dependency = dep_sent[i][2]
            sentences.append(SentenceUnit(words[idx_sent]))
        return sentences
class Conllu(object):
    """
    Predict text with LTP and export the analyses in CoNLL-U format.

    :param path: model path, or one of ['base', 'small', 'tiny'] to auto-download
    :param batch_size: maximum batch size; input is split automatically
    :param device: ['cpu', 'cuda']
    :param onnx: whether to use the ONNX (FastLTP) backend
    """

    def __init__(self,
                 path: str = 'small',
                 batch_size: int = 50,
                 device: str = None,
                 onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        # Split a list into consecutive chunks of at most batch_size items.
        self._split = lambda a: map(lambda b: a[b:b + batch_size],
                                    range(0, len(a), batch_size))

    def _predict(self, sentences: List[str], pos=True, ner=True, srl=True,
                 dep=True, sdp=True):
        """Run the requested analyses batch-wise.

        Disabled analyses are padded with []/None so every result tuple keeps
        the fixed layout (seg, pos, ner, dep, sdp, srl).
        """
        result = []
        for sentences_batch in self._split(sentences):
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_size = len(sentences_batch)
            batch_pos = self.ltp.pos(hidden) if pos else ([[]] * batch_size)
            batch_ner = self.ltp.ner(hidden) if ner else ([None] * batch_size)
            batch_srl = self.ltp.srl(
                hidden, keep_empty=False) if srl else ([None] * batch_size)
            batch_dep = self.ltp.dep(hidden) if dep else ([None] * batch_size)
            batch_sdp = self.ltp.sdp(hidden) if sdp else ([None] * batch_size)
            result += list(
                zip(batch_seg, batch_pos, batch_ner, batch_dep, batch_sdp,
                    batch_srl))
        return result

    def predict(self,
                input: str,
                output: Optional[str] = None,
                pos: bool = True,
                ner: bool = False,
                srl: bool = False,
                dep: bool = True,
                sdp: bool = False):
        """
        Predict a text file and write the result in CoNLL-U format.

        :param input: file to predict, one sentence per line
        :param output: result file; defaults to the input path + .conllu suffix
        :param pos: output POS tags ['True','False']
        :param ner: output named entities ['True','False'], uses the conllu feats column
        :param srl: output semantic roles ['True','False'], uses the conllu misc column
        :param dep: output dependency parse ['True','False']
        :param sdp: output semantic dependencies ['True','False']
        """
        if output is None:
            output = f"{input}.conllu"
        with open(output, mode='w', encoding='utf-8') as f:
            sentences = sum([sent for idx, sent in iter_lines(input)], [])
            results = self._predict(sentences, pos, ner, srl, dep, sdp)
            for text, (seg_s, pos_s, ner_s, dep_s, sdp_s, srl_s) in zip(
                    sentences, results):
                # One Token per word; columns default to '_' when the
                # corresponding analysis is disabled.
                tokens = conllu.TokenList([
                    conllu.models.Token(id=idx + 1,
                                        form=token,
                                        lemma=token,
                                        upos=pos if pos else '_',
                                        xpos=pos if pos else '_',
                                        feats='O' if ner else '_',
                                        head=idx,
                                        deprel='_',
                                        deps='' if sdp else '_',
                                        misc='SpaceAfter=No')
                    for idx, (token, pos) in enumerate(zip_longest(seg_s, pos_s))
                ], conllu.models.Metadata(text=text))
                if ner:
                    # BIO-encode entity spans; LTP spans are end-INCLUSIVE
                    # (cf. seg[start:end + 1] elsewhere), hence end + 1.
                    for tag, start, end in ner_s:
                        tokens[start]['feats'] = f'B-{tag}'
                        for i in range(start + 1, end + 1):
                            # BUGFIX: original wrote tokens[start] here, which
                            # overwrote the B- tag and left the continuation
                            # tokens untagged.
                            tokens[i]['feats'] = f'I-{tag}'
                if dep:
                    for id, head, tag in dep_s:
                        tokens[id - 1]['head'] = head
                        tokens[id - 1]['deprel'] = tag
                if sdp:
                    # Accumulate multi-head edges as 'head:tag|head:tag|...'.
                    for id, head, tag in sdp_s:
                        if tokens[id - 1]['deps']:
                            tokens[id - 1]['deps'] = tokens[
                                id - 1]['deps'] + f"|{head}:{tag}"
                        else:
                            tokens[id - 1]['deps'] = f"{head}:{tag}"
                # Guard against sentences with no predicates (possible with
                # keep_empty=False) — zip(*[]) would raise on unpacking.
                if srl and srl_s:
                    srl_predicate, srl_roles = list(zip(*srl_s))
                    srl_predicate_num = len(srl_predicate)
                    if srl_predicate_num > 0:
                        # misc per token: predicate flag + one role slot per predicate.
                        srl_misc = [[
                            f'Predicate={"Y" if i in srl_predicate else "_"}',
                            ['O'] * srl_predicate_num
                        ] for i in range(len(tokens))]
                        for idx, srl_role in enumerate(srl_roles):
                            for tag, start, end in srl_role:
                                srl_misc[start][-1][idx] = f'B-{tag}'
                                for i in range(start + 1, end + 1):
                                    # BUGFIX: original wrote srl_misc[start]
                                    # here as well, clobbering the B- tag.
                                    srl_misc[i][-1][idx] = f'I-{tag}'
                        srl_misc = [
                            "|".join([s[0], "Role=" + ",".join(s[-1])])
                            for s in srl_misc
                        ]
                        for token, misc in zip(tokens, srl_misc):
                            token['misc'] = f"{token['misc']}|{misc}"
                f.write(tokens.serialize())