import json
from typing import List

from ltp import LTP, FastLTP


class Run(object):
    def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
        if onnx:
            self.ltp = FastLTP(path=path, device=device, need_config=True)
        else:
            self.ltp = LTP(path=path, device=device, need_config=True)
        # Split the input into batches of batch_size sentences.
        self.split = lambda a: map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))

    def _build_words(self, words, pos, dep):
        """Merge segmentation, POS, and dependency results into word dicts."""
        # A virtual root keeps the running-offset computation uniform.
        res = [{'id': -1, 'length': 0, 'offset': 0, 'text': 'root'}]
        for word, p, (idx, parent, relation) in zip(words, pos, dep):
            offset = res[-1]['offset'] + res[-1]['length']
            res.append({
                'id': idx - 1,
                'length': len(word),
                'offset': offset,
                'text': word,
                'pos': p,
                'parent': parent - 1,
                'relation': relation,
                'roles': [],
                'parents': []
            })
        return res[1:]

    def _predict(self, sentences: List[str]):
        result = []
        for sentences_batch in self.split(sentences):
            # Run the full pipeline; each task reuses the shared hidden states.
            batch_seg, hidden = self.ltp.seg(sentences_batch)
            batch_pos = self.ltp.pos(hidden)
            batch_ner = self.ltp.ner(hidden)
            batch_srl = self.ltp.srl(hidden)
            batch_dep = self.ltp.dep(hidden)
            batch_sdp = self.ltp.sdp(hidden)
            for sent, seg, pos, ner, srl, dep, sdp in \
                    zip(sentences_batch, batch_seg, batch_pos, batch_ner, batch_srl, batch_dep, batch_sdp):
                words = self._build_words(seg, pos, dep)
                # Attach semantic roles to each predicate word.
                for word, token_srl in zip(words, srl):
                    for role, start, end in token_srl:
                        text = "".join(seg[start:end + 1])
                        word['roles'].append({
                            'text': text,
                            'offset': words[start]['offset'],
                            'length': len(text),
                            'type': role
                        })
                # Attach semantic dependency (SDP) arcs.
                for start, end, label in sdp:
                    words[start - 1]['parents'].append({'parent': end - 1, 'relate': label})
                # Collect named entities.
                nes = []
                for role, start, end in ner:
                    text = "".join(seg[start:end + 1])
                    nes.append({
                        'text': text,
                        'offset': start,
                        'ne': role.lower(),
                        'length': len(text)
                    })
                result.append({'text': sent, 'nes': nes, 'words': words})
        return result

    def test(self, sentences: List[str] = None):
        self.ltp.add_words("DMI与主机通讯中断")
        if sentences is None:
            sentences = ["他叫汤姆去拿外衣。"]
        res = self._predict([sentence.strip() for sentence in sentences])
        print(json.dumps(res, indent=2, sort_keys=True, ensure_ascii=False))

    def save(self, out='ltp.npz'):
        """Export the weights to npz (renaming LayerNorm params) plus config.json."""
        import numpy as np
        nps = {}
        for k, v in self.ltp.model.state_dict().items():
            k = k.replace("gamma", "weight").replace("beta", "bias")
            nps[k] = np.ascontiguousarray(v.cpu().numpy())
        np.savez(out, **nps)
        with open('config.json', 'w', encoding='utf-8') as f:
            json.dump(self.ltp.config, f, indent=2)

    def test_seged(self):
        """Check that pre-segmented input reproduces the model's own hidden states."""
        import torch
        sentences = [
            'My name is tom.',
            'He called Tom to get coats.',
            '他叫Tom去拿外衣。',
            '他叫汤姆去拿外衣。',
            '我去长江大桥玩。'
        ]
        seg, hidden = self.ltp.seg(sentences)
        seged, hidden_seged = self.ltp.seg(seg, is_preseged=True)
        hidden: dict
        hidden_seged: dict
        for key, value in hidden.items():
            if isinstance(value, torch.Tensor):
                diff = torch.sum(value.float() - hidden_seged[key].float()).numpy()
                print(key, diff)
        print(seg == seged)
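# A minimal driver sketch for the Run class above (hypothetical entry point;
# the original script's __main__ block is not shown). Assumes the 'small'
# LTP model is available locally or can be downloaded.
if __name__ == '__main__':
    runner = Run(path='small', batch_size=50, device='cpu')
    runner.test(["他叫汤姆去拿外衣。", "我去长江大桥玩。"])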
import os

from ltp import LTP

# WordUnit, EntityCombine, and SentenceUnit are this project's own helper
# classes, imported from its other modules.


class NLP:
    """Natural language processing: segmentation, POS tagging, named entity
    recognition, and dependency parsing.

    Attributes:
        default_user_dict_dir: str, directory of user-defined dictionaries
    """
    RESOURCE_DIR = os.path.abspath(
        os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
            "resource"))

    def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
        self.default_user_dict_dir = user_dict_dir
        # Load the LTP model.
        self.ltp = LTP(model_type)
        # Add the user dictionaries (a legal-document dictionary and the
        # Tsinghua legal lexicon); loading them into memory is faster.
        for file in os.listdir(user_dict_dir):
            file_path = os.path.join(user_dict_dir, file)
            # Skip directories.
            if os.path.isdir(file_path):
                continue
            self.ltp.init_dict(file_path)
        # The legacy pyltp models (Postagger, NamedEntityRecognizer, Parser)
        # were replaced by the LTP 4 pipeline above.

    def segment(self, sentence, entity_postag=dict()):
        """Segment a sentence with LTP.

        Args:
            sentence: str, the sentence
            entity_postag: dict, entity-POS dictionary, empty by default;
                produced while analysing each case's structured text
        Returns:
            lemmas: list, segmentation result
            hidden: hidden states for the downstream tasks
        """
        # Add the entity dictionary.
        if entity_postag:
            for entity in entity_postag:
                self.ltp.add_words([entity])
        segment, hidden = self.ltp.seg([sentence])
        return segment[0], hidden

    def postag(self, segment, hidden):
        """POS-tag a segmented sentence.

        Args:
            segment: list, segmentation result
            hidden: hidden states from segment()
        Returns:
            words: list of WordUnit with segmentation and POS results
        """
        words = []  # word units for the processed sentence
        postags = self.ltp.pos(hidden)
        for i in range(len(segment)):
            # WordUnit ids start from 1.
            word = WordUnit(i + 1, segment[i], postags[0][i])
            words.append(word)
        return words

    def get_postag(self, word):
        """Get the POS tag of a single word.

        Args:
            word: str, the word
        Returns:
            post_tag: str, the word's POS tag
        """
        # is_preseged expects a list of word lists, hence [[word]].
        _, hidden = self.ltp.seg([[word]], is_preseged=True)
        post_tag = self.ltp.pos(hidden)
        return post_tag[0][0]

    def netag(self, words, hidden):
        """Named entity recognition; merge entities into the tagged words.

        Args:
            words: list of WordUnit with segmentation and POS results
            hidden: hidden states from segment()
        Returns:
            words_netag: list of WordUnit with segmentation, POS, and NER results
        """
        netags = self.ltp.ner(hidden, as_entities=False)
        return EntityCombine.combine(words, netags[0])

    def parse_seged(self, words):
        """Dependency-parse a sentence that is already segmented and tagged."""
        lemmas = [word.lemma for word in words]
        _, hidden = self.ltp.seg([lemmas], is_preseged=True)
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def parse(self, words, hidden):
        """Dependency-parse words after segmentation, POS tagging, and
        (optionally) NER.

        Args:
            words: list of WordUnit with segmentation, POS, and NER results
            hidden: hidden states from segment()
        Returns:
            SentenceUnit for the sentence
        """
        arcs = self.ltp.dep(hidden)[0]
        for i in range(len(arcs)):
            words[i].head = arcs[i][1]
            words[i].dependency = arcs[i][2]
        return SentenceUnit(words)

    def close(self):
        """Shut down and release NLP resources."""
        pass
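# A minimal end-to-end sketch of the NLP pipeline above (hypothetical driver
# code; assumes the resource/ dictionary directory and the project's WordUnit,
# EntityCombine, and SentenceUnit helpers are available).
nlp = NLP(model_type='base')
lemmas, hidden = nlp.segment('他叫汤姆去拿外衣。')
words = nlp.postag(lemmas, hidden)
words_netag = nlp.netag(words, hidden)
sentence = nlp.parse(words_netag, hidden)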
import numpy as np
from ltp import LTP

# text = '我现在在天津,我想知道这里的大学都有什么学校.'
# Load a user dictionary.
ltp = LTP()
# user_dict.txt is the dictionary file; max_window is the maximum forward
# segmentation window. Make sure max_window is at least as large as the
# longest entry in the dictionary.
ltp.init_dict(path="user_dict.txt", max_window=6)

# Custom words can also be added in code. Compute the window automatically
# from the longest word instead of hard-coding it.
words = ["肖申克的救赎", "长江大桥", "负重前行"]
max_window = max(len(w) for w in words)
ltp.add_words(words=words, max_window=max_window)


def searchKG(kglist, text):
    """Return the KG triple in kglist closest to text (BERT-based distance)."""
    tmp3 = []
    for i in kglist:
        # vec2() and cosine_distance() are defined elsewhere in this project.
        t = cosine_distance(vec2(i), vec2(text))
        tmp3.append(t)
    tmp3 = np.array(tmp3)
    print('All distances:', tmp3)
    # The nearest KG triple:
    dix = np.argmin(tmp3)
    print('Nearest triple:', kglist[dix], 'at distance', tmp3[dix])
    return kglist[dix]
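# searchKG() above assumes vec2() and cosine_distance() exist. A minimal
# sketch of what they could look like (an assumption, not the original code),
# using mean-pooled bert-base-chinese embeddings from Hugging Face
# transformers:
import numpy as np
import torch
from transformers import BertModel, BertTokenizer

_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
_bert = BertModel.from_pretrained('bert-base-chinese')


def vec2(text):
    """Embed text as the mean of its last-layer BERT token vectors (sketch)."""
    inputs = _tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        hidden = _bert(**inputs).last_hidden_state  # (1, seq_len, 768)
    return hidden.mean(dim=1).squeeze(0).numpy()


def cosine_distance(a, b):
    """1 - cosine similarity: smaller values mean more similar texts."""
    return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))


# Example call (hypothetical triples):
# searchKG(['天津 拥有 南开大学', '北京 拥有 清华大学'], '天津的大学')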