def get_pos_tag(self, sentence):
    r"""Run LTP part-of-speech tagging and return character-level spans.

    :param sentence: the text to tag; either a string or a list of string
        pieces, which are concatenated before tagging
    :return: a list of ``[tag, start, end]`` triples, one per segmented
        word, where ``start`` and ``end`` are inclusive character offsets
        into the concatenated sentence; ``[]`` when the input is empty
    """
    assert isinstance(sentence, (list, str))
    if isinstance(sentence, list):
        # Collapse the pieces into a single string.
        sentence = ''.join(sentence)
    if not sentence:
        return []

    if self.__pos is None:
        # Import lazily: empty input must not require (or pay for) ltp, and
        # the pipeline is built only once per instance.
        from ltp import LTP
        self.__pos = LTP()

    seg, hidden = self.__pos.seg([sentence])
    words = seg[0]
    tags = self.__pos.pos(hidden)[0]

    pos_tag = []
    offset = 0  # character offset at which the current word starts
    for tag, word in zip(tags, words):
        pos_tag.append([tag, offset, offset + len(word) - 1])
        offset += len(word)
    return pos_tag
class Ner:
    """Extract character spans of ``Ns``-tagged entities using LTP."""

    def __init__(self):
        """Build the LTP pipeline with its default model."""
        self.ltp = LTP()

    def preprocess(self, sent):
        """Return ``sent`` with every run of whitespace removed."""
        # Raw string: '\s' in a plain literal is an invalid escape sequence.
        return re.sub(r'\s+', '', sent)

    def ner(self, sents):
        """Return the ``Ns`` entity spans for each input sentence.

        :param sents: sentences without any whitespace (see ``preprocess``);
            empty strings are allowed and yield an empty result
        :return: one list per input sentence, each holding ``[start, end]``
            character offsets (``end`` exclusive) of the entities tagged
            ``'Ns'``
        :raises AssertionError: if any sentence contains whitespace
        """
        assert not any(re.search(r'\s', x) for x in sents), "no space is allowed"
        psents = [x for x in sents if x != '']
        if not psents:
            return [[] for _ in sents]

        segment, hidden = self.ltp.seg(psents)
        ne = self.ltp.ner(hidden)

        anes = []
        for sseg, sne in zip(segment, ne):
            # offsets[k] is the character position where word k starts;
            # offsets[-1] is the total length of the segmented sentence.
            offsets = [0]
            for word in sseg:
                offsets.append(offsets[-1] + len(word))
            anes.append([[offsets[x], offsets[y + 1]]
                         for t, x, y in sne if t == 'Ns'])

        # Re-align with the original input: empty sentences were never sent
        # to LTP, so they get an empty span list.
        remaining = iter(anes)
        return [[] if s == '' else next(remaining) for s in sents]
def seg_with_ltp40(in_file, out_file_path, manual_seg_file):
    """Segment a corpus with LTP, then report throughput and accuracy.

    :param in_file: corpus file passed to ``construct_corpus``
    :param out_file_path: where the segmentation is written, one sentence
        per line with words joined by ``"="``
    :param manual_seg_file: reference segmentation passed to ``evaluate``
    :return: ``(qps, p, r, f1, line_aver_length)`` where ``qps`` is
        sentences per second rounded to two decimals and the remaining
        values are whatever ``evaluate`` returns
    """
    ltp = LTP()

    # Segment one sentence per call and save the result.  The ``with`` block
    # guarantees the file is closed (and fully written) before ``evaluate``
    # reads it back.
    corpus = construct_corpus(in_file)
    with open(out_file_path, "w", encoding='utf-8') as f:
        for line in corpus:
            seg_result, hidden = ltp.seg([line])
            f.write("=".join(seg_result[0]) + "\n")

    # Throughput test.
    # NOTE(review): the meaning of the second construct_corpus argument is
    # not visible here; presumably it changes the shape of each item.
    corpus = construct_corpus(in_file, 1)
    start = time.time()
    for line in corpus:
        segment, hidden = ltp.seg(list(line))
    elapsed = time.time() - start
    # An empty corpus can finish within timer resolution; avoid dividing by 0.
    qps = round(len(corpus) / elapsed, 2) if elapsed > 0 else 0.0

    # Accuracy test against the manual segmentation.
    p, r, f1, line_aver_length = evaluate(out_file_path, manual_seg_file)
    return qps, p, r, f1, line_aver_length
def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
    """Create the LTP pipeline and a batching helper.

    :param path: model name or path forwarded to ``LTP``
    :param batch_size: maximum number of items per batch from ``self.split``
    :param device: device forwarded to ``LTP``
    :param onnx: accepted but not used by this constructor
    """
    def split(a):
        # Lazily produce consecutive slices of ``a``, each at most
        # ``batch_size`` items long.
        starts = range(0, len(a), batch_size)
        return map(lambda b: a[b:b + batch_size], starts)

    self.ltp = LTP(path=path, device=device)
    self.split = split
def ltp_func(text_list):
    """Segment and POS-tag ``text_list``; format the first sentence.

    :param text_list: list of sentences passed to ``LTP.seg``
    :return: ``"word/tag"`` strings for the words of the first sentence only
    """
    ltp = LTP()
    seg, hidden = ltp.seg(text_list)
    pos = ltp.pos(hidden)
    # Only the first sentence of the batch is reported.
    return ['/'.join([word, pos[0][i]]) for i, word in enumerate(seg[0])]
def dependency(self):
    """Print the segmentation and dependency parse of ``self.sentence``.

    A new LTP pipeline is created on every call; nothing is returned.
    """
    batch = [self.sentence]
    ltp = LTP()
    seg, hidden = ltp.seg(batch)
    dep = ltp.dep(hidden)
    for result in (seg, dep):
        print(result)
def __init__(self, path: str = 'small', batch_size: int = 10, device: str = None, onnx: str = None, vocab: str = None):
    """Create the LTP pipeline.

    :param path: model name or path forwarded to ``LTP``
    :param batch_size: batch size forwarded to ``LTP``
    :param device: device forwarded to ``LTP``
    :param onnx: accepted but not used by this constructor
    :param vocab: vocabulary forwarded to ``LTP``
    """
    ltp_options = {
        'path': path,
        'batch_size': batch_size,
        'device': device,
        'vocab': vocab,
    }
    self.ltp = LTP(**ltp_options)
def __init__(self, default_model_dir=LTP4_MODEL_DIR, user_dict_dir=USER_DICT_DIR):
    """Load the LTP model, register user dictionaries, reset result buffers.

    :param default_model_dir: model path forwarded to ``LTP``
    :param user_dict_dir: directory whose every entry is passed to
        ``LTP.init_dict``
    """
    self.ltp = LTP(path=default_model_dir)
    dict_paths = [os.path.join(user_dict_dir, name)
                  for name in os.listdir(user_dict_dir)]
    for dict_path in dict_paths:
        self.ltp.init_dict(path=dict_path)
    # Per-analysis result buffers, filled elsewhere.
    self.sentences, self.postags, self.nertags, self.dep = [], [], [], []
def __init__(self, path: str = 'small', batch_size: int = 50, device: str = None, onnx: bool = False):
    """Create an LTP (or FastLTP) pipeline and a batching helper.

    :param path: model name or path forwarded to the pipeline class
    :param batch_size: maximum number of items per batch from ``self._split``
    :param device: device forwarded to the pipeline class
    :param onnx: when truthy use ``FastLTP``, otherwise ``LTP``
    """
    # Only the selected class name is looked up, as in an if/else.
    pipeline_cls = FastLTP if onnx else LTP
    self.ltp = pipeline_cls(path=path, device=device, need_config=True)

    def _split(a):
        # Lazily produce consecutive slices of at most ``batch_size`` items.
        return map(lambda b: a[b:b + batch_size], range(0, len(a), batch_size))

    self._split = _split
def work_summary_parser_ltp():
    """Segment and POS-tag one field of every data row of the task sheet.

    For each row after the first, the second column is stripped of its
    first five characters and split on commas; the third piece is collected.

    :return: ``(words, pos_tags)`` as returned by ``LTP.seg`` / ``LTP.pos``
    """
    rows = csvReader("标准工作任务单")
    ltp = LTP()
    # Row 0 is skipped (presumably a header row).
    paList = [row[1][5:].split(',')[2]
              for i, row in enumerate(rows) if i != 0]
    wa, ha = ltp.seg(paList)
    return wa, ltp.pos(ha)
def __init__(self, model_type='base', user_dict_dir=RESOURCE_DIR):
    """Load the LTP model and register every user dictionary file.

    :param model_type: model name or path passed to ``LTP``
    :param user_dict_dir: directory containing user dictionary files;
        sub-directories inside it are skipped
    """
    self.default_user_dict_dir = user_dict_dir
    # Load the LTP model.
    self.ltp = LTP(model_type)
    # Add the user dictionaries (legal-document and Tsinghua law
    # dictionaries); init_dict loads them into memory, which is faster.
    for file in os.listdir(user_dict_dir):
        file_path = os.path.join(user_dict_dir, file)
        # Skip directories.  The check must use the joined path: the bare
        # name would be resolved against the current working directory.
        if os.path.isdir(file_path):
            continue
        self.ltp.init_dict(file_path)
def mongo2ner(idx, ltp, offset, size):
    """Extract entities from a slice of MongoDB articles and dump them.

    Reads ``size`` articles starting at ``offset``, runs
    ``get_article_entities`` on ``title + content`` of each one, and writes
    every ``word<TAB>label`` pair to ``ners_<idx>.txt`` under
    ``USER_DICT_DIR``.

    :param idx: worker index; used in the output file name and forwarded to
        ``get_article_entities``
    :param ltp: ignored; a fresh ``LTP`` is always created inside the call
        (NOTE(review): presumably so each worker process owns its model;
        confirm before reusing the argument)
    :param offset: number of documents to skip
    :param size: maximum number of documents to read
    :return: the collected entities; on any error the message is printed
        and whatever was collected so far is returned
    """
    entities = []
    try:
        ltp = LTP(path=LTP4_MODEL_DIR)
        db_connect = MongoClient(host=MONGODB_HOST, port=MONGODB_PORT)
        try:
            coll = db_connect[MONGODB_DATABASE_NAME][MONGODB_ENTMT_COLLECTION]
            for art in coll.find(skip=offset, limit=size):
                debug_logger.debug(art['title'])
                text = art['title'] + art['content']
                entities += get_article_entities(idx, text, ltp)
        finally:
            # Always release the connection, even if extraction fails.
            db_connect.close()

        out_path = os.path.join(USER_DICT_DIR, 'ners_' + str(idx) + '.txt')
        with open(out_path, 'w') as fw:
            for item in entities:
                for word, label in item:
                    fw.write(word + '\t' + label + '\n')
    except Exception as e:
        # Best-effort worker: report the failure and return partial results.
        print("ERROR mongo2ner: {}".format(e))
    return entities
def findFood(self, sentence):
    """Return the objects of the verb '吃' found in ``sentence``.

    A word is selected when its POS tag contains ``'n'``, its dependency
    head is the word '吃' and its relation is ``'VOB'``.

    :param sentence: the sentence to analyse
    :return: the selected words, each followed by one space, concatenated
        into a single string (empty string when nothing matches)
    """
    ltp = LTP()
    seg, hidden = ltp.seg([sentence])
    tags = ltp.pos(hidden)
    tokens = seg[0]                      # segmentation result
    tags = tags[0]                       # POS tag per token
    arcs = ltp.dep(hidden)[0]            # dependency arcs
    parent_ids = [arc[1] for arc in arcs]
    relation = [arc[2] for arc in arcs]
    # Head word of each token; parent id 0 denotes the root.
    heads = ['Root' if parent == 0 else tokens[parent - 1]
             for parent in parent_ids]
    matches = [
        tokens[i] + ' '
        for i in range(len(tokens))
        if 'n' in tags[i] and heads[i] == '吃' and relation[i] == 'VOB'
    ]
    return ''.join(matches)
def WriteTest(readfile, savefile):
    """Analyse tab-separated sentence pairs and write one JSON line each.

    Every input line must contain exactly two tab-separated fields.  For
    each pair the segmentation, POS tags and tree-mode semantic dependency
    parse are stored under ``'sentence1'`` and ``'sentence2'``.

    :param readfile: UTF-8 input file, one sentence pair per line
    :param savefile: UTF-8 output file, one JSON object per line
    """
    with open(readfile, "r", encoding="utf-8") as rfp:
        ltp = LTP()
        logger.info("Processing file:%s ." % (readfile))
        with open(savefile, 'w', encoding='utf-8') as wfp:
            progress = tqdm(rfp, desc="file %s process" % (readfile))
            for row in progress:
                # NOTE(review): the line is not stripped, so the second
                # sentence keeps its trailing newline.
                first, second = row.split('\t')
                seg, hid = ltp.seg([first, second])
                sdp = ltp.sdp(hid, mode='tree')
                pos = ltp.pos(hid)
                record = {
                    key: [seg[k], pos[k], sdp[k]]
                    for k, key in enumerate(('sentence1', 'sentence2'))
                }
                wfp.write(json.dumps(record) + "\n")
def get_ner(self, sentence):
    r"""Run LTP named-entity recognition and return character-level results.

    :param sentence: the text to analyse; either a string or a list of
        string pieces, which are concatenated first
    :return: two forms of tags.
        The first is a list of ``(tag, start, end)`` triples with inclusive
        character offsets.
        The second is a list with one label per character, ``'O'`` outside
        entities, such as 周小明去玩 ['Nh', 'Nh', 'Nh', 'O', 'O'].
        Both are empty lists when the input is empty.
    """
    assert isinstance(sentence, (list, str))
    if isinstance(sentence, list):
        # Collapse the pieces into a single string.
        sentence = ''.join(sentence)
    if not sentence:
        return [], []

    if self.__ner is None:
        # Import lazily: empty input must not require (or pay for) ltp, and
        # the pipeline is built only once per instance.
        from ltp import LTP
        self.__ner = LTP()

    seg, hidden = self.__ner.seg([sentence])
    words = seg[0]
    ner = self.__ner.ner(hidden)[0]

    # bounds[k] is the character offset where word k starts; computing it
    # once avoids re-summing the word lengths for every entity.
    bounds = [0]
    for word in words:
        bounds.append(bounds[-1] + len(word))

    ner_label = len(sentence) * ['O']
    for i, (tag, first_word, last_word) in enumerate(ner):
        start = bounds[first_word]
        end = bounds[last_word + 1]      # exclusive
        ner[i] = (tag, start, end - 1)   # word indices -> char offsets
        for j in range(start, end):
            ner_label[j] = tag
    return ner, ner_label
def findFood(sentence):
    """Print LTP analysis of ``sentence`` and every object of the verb '吃'.

    Prints the segmentation, the POS tags and each dependency arc, then one
    line per word whose POS tag contains ``'n'``, whose head is '吃' and
    whose relation is ``'VOB'``.  Nothing is returned.
    """
    ltp = LTP()
    seg, hidden = ltp.seg([sentence])
    tags = ltp.pos(hidden)
    tokens = seg[0]                      # segmentation result
    print(tokens)
    tags = tags[0]                       # POS tag per token
    print(tags)
    arcs = ltp.dep(hidden)[0]            # dependency arcs
    for arc in arcs:
        print(arc)
    parent_ids = [arc[1] for arc in arcs]
    relation = [arc[2] for arc in arcs]
    # Head word of each token; parent id 0 denotes the root.
    heads = ['Root' if parent == 0 else tokens[parent - 1]
             for parent in parent_ids]
    for i, token in enumerate(tokens):
        is_object_of_eat = heads[i] == '吃' and relation[i] == 'VOB'
        if 'n' in tags[i] and is_object_of_eat:
            print("找到了一种食物:" + token)
class NamedEntity:
    """Named-entity extractor backed by LTP with a user dictionary."""

    def __init__(self, user_dict):
        """Load the default LTP model and register ``user_dict``.

        :param user_dict: path of the dictionary file; it is registered
            with a maximum forward segmentation window of 4
        """
        self.ltp = LTP()  # loads the default model
        self.ltp.init_dict(path=user_dict, max_window=4)

    def entity_recognition(self, text: list):
        """Return the named entities of the first sentence in ``text``.

        :param text: list of raw sentences; only the first one is reported
        :return: entity strings extracted from the first sentence
        """
        seg, hidden = self.ltp.seg(text)  # word segmentation
        ner = self.ltp.ner(hidden)
        words = seg[0]
        # ``start``/``end`` are inclusive word indices; join the whole span
        # so multi-word entities are not truncated to their first word.
        return [''.join(words[start:end + 1]) for tag, start, end in ner[0]]
def __init__(self, seq_len=512):
    """Set up the Chinese tokenizers.

    Loads the Hugging Face ``bert-base-chinese`` tokenizer and the LTP
    ``small`` model.

    :param seq_len: stored as ``self.max_seq_length``
    """
    self.max_seq_length = seq_len
    bert_name, ltp_name = "bert-base-chinese", "small"
    self.tokenizer_cn = AutoTokenizer.from_pretrained(bert_name)
    self.tokenizer_ltp = LTP(ltp_name)
def is_word(sentence):
    r"""Judge whether the input string is a word.

    Returns ``True`` immediately when the first two characters are equal.
    Otherwise the string is segmented and tagged by LTP, and the result is
    ``False`` only when LTP yields exactly one token tagged ``'n'``.

    :param str sentence: input sentence string
    :return bool: is a word or not
    """
    # Guard the length so inputs shorter than two characters do not raise
    # IndexError; they are handed to LTP instead.
    if len(sentence) >= 2 and sentence[0] == sentence[1]:
        return True

    # Imported lazily so the cheap path above does not need ltp.
    from ltp import LTP
    ltp = LTP()
    _, hidden = ltp.seg([sentence])
    pos = ltp.pos(hidden)[0]
    return not (len(pos) == 1 and pos[0] == 'n')
def work_detail_parser_ltp():
    """Segment and POS-tag the summary and detail fields of the task sheet.

    For each row after the first, the second column is stripped of its
    first five characters and split on commas; the third piece goes to the
    first list and every piece from the fourth onward goes to the second.

    :return: ``(sa, sb, pa, pb)``: segmentations of the two lists followed
        by their POS tags
    """
    rows = csvReader("标准工作任务单")
    ltp = LTP()
    paList, pbList = [], []
    for i, row in enumerate(rows):
        if i == 0:
            # Row 0 is skipped (presumably a header row).
            continue
        val = row[1][5:].split(',')
        paList.append(val[2])
        pbList.extend(val[3:])
    sa, ha = ltp.seg(paList)
    sb, hb = ltp.seg(pbList)
    pa = ltp.pos(ha)
    pb = ltp.pos(hb)
    return sa, sb, pa, pb
def test_nlp_model(self):
    """Load seven LTP models at once, then keep them alive for ten seconds.

    NOTE(review): presumably a manual check of load time or memory use;
    nothing is asserted.
    """
    # The list keeps all seven instances referenced during the sleep.
    models = [LTP(LTP4_MODEL_DIR) for _ in range(7)]
    print('-------')
    import time
    time.sleep(10)
def create():
    """Create the profession keywords JSON file.

    Reads ``./dataset/profession.json``, adds a ``'kwords'`` entry (from
    ``find_kwords_by_ltp`` applied to the ``'name'`` field) to every item
    under ``'data'``, and writes the result to
    ``./dataset/profession2.json``.
    """
    ltp = LTP()  # loads the default model
    # Import the professions file.  json.load no longer accepts an
    # ``encoding`` argument (removed in Python 3.9); it detects the
    # encoding of binary input by itself.
    with open('./dataset/profession.json', 'rb') as jsonfile:
        profession_json = json.load(jsonfile)
    for profession in profession_json['data']:
        profession['kwords'] = find_kwords_by_ltp(profession['name'], ltp)
    with open('./dataset/profession2.json', 'w', encoding='utf-8') as jsonfile:
        json.dump(profession_json, jsonfile, ensure_ascii=False)
def main(args):
    """Build whole-word reference ids for a text file and save them.

    Reads ``args.file_name``, computes the reference ids with
    ``prepare_ref`` and writes one JSON list per line to ``args.save_path``.

    :param args: namespace with ``file_name``, ``ltp``, ``bert`` and
        ``save_path`` attributes
    """
    # For Chinese (Ro)Bert the best results come from RoBERTa-wwm-ext
    # (https://github.com/ymcui/Chinese-BERT-wwm).  Fine-tuning those models
    # requires the same word segmenter: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8") as src:
        lines = src.readlines()

    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
    ref_ids = prepare_ref(lines, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as dst:
        dst.writelines([json.dumps(ref) + "\n" for ref in ref_ids])
def new_generate_ltp_results():
    """Run every LTP task on the training sentences and cache the results.

    Sentences come from the ``'content'`` field of
    ``../data/train_base.json``.  All results, plus the sentence/segment
    alignment from ``sentence_segment_match``, are pickled to
    ``new_ltp_results.pk``.

    :return: ``(segmented, pos, ner, srl, dep, sdp_tree, sdp_graph,
        sent_seg_matches)``; the first seven are lists with one entry per
        sentence
    """
    # Load the model.
    ltp_model = '../../ltp_models/base1'
    ltp = LTP(path=ltp_model)

    # Read the original sentences.
    data = read_file_in_ltp('../data/train_base.json')
    sentences = [item['content'] for item in data]

    segmented, pos, ner, srl, dep, sdp_tree, sdp_graph = [], [], [], [], [], [], []
    for sent in tqdm(sentences):
        # One sentence per call, so index [0] below picks its result.
        segmented0, hidden = ltp.seg([sent])            # word segmentation
        cur_pos = ltp.pos(hidden)                       # POS tagging
        cur_ner = ltp.ner(hidden)                       # named entities
        cur_srl = ltp.srl(hidden)                       # semantic roles
        cur_dep = ltp.dep(hidden)                       # dependency parse
        cur_sdp_tree = ltp.sdp(hidden, mode='tree')     # semantic dep (tree)
        cur_sdp_graph = ltp.sdp(hidden, mode='graph')   # semantic dep (graph)

        segmented.append(segmented0[0])
        pos.append(cur_pos[0])
        ner.append(cur_ner[0])
        srl.append(cur_srl[0])
        dep.append(cur_dep[0])
        sdp_tree.append(cur_sdp_tree[0])
        sdp_graph.append(cur_sdp_graph[0])

    # Map each sentence to its segmentation.
    sent_seg_matches = sentence_segment_match(data, segmented)

    results = [segmented, pos, ner, srl, dep, sdp_tree, sdp_graph,
               sent_seg_matches]
    # Use a context manager so the cache file is flushed and closed.
    with open('new_ltp_results.pk', 'wb') as cache_file:
        pickle.dump(results, cache_file)
    return segmented, pos, ner, srl, dep, sdp_tree, sdp_graph, sent_seg_matches
def main(args):
    """Build whole-word reference ids for a text file and save them.

    Reads ``args.file_name`` (ignoring undecodable bytes), drops empty and
    whitespace-only lines, computes the reference ids with ``prepare_ref``
    and writes one JSON list per line to ``args.save_path``.

    :param args: namespace with ``file_name``, ``ltp``, ``bert`` and
        ``save_path`` attributes
    """
    # For Chinese (Ro)Bert the best results come from RoBERTa-wwm-ext
    # (https://github.com/ymcui/Chinese-BERT-wwm).  Fine-tuning those models
    # requires the same word segmenter: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8", errors='ignore') as src:
        raw_lines = src.readlines()
    print(f'开始处理数据,共有{len(raw_lines)}条')

    # Avoid delimiters such as '\u2029': keep only lines with real content.
    lines = []
    for line in raw_lines:
        if len(line) > 0 and not line.isspace():
            lines.append(line.strip())

    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    # Prepare the mapping.
    ref_ids = prepare_ref(lines, ltp_tokenizer, bert_tokenizer)

    # Save the mapping.
    with open(args.save_path, "w", encoding="utf-8") as dst:
        serialized = [json.dumps(ref) + "\n" for ref in ref_ids]
        dst.writelines(serialized)
    print(f"保存所有{len(serialized)}条数据的映射关系到文件{args.save_path}")
def thread_main(args, gpu=True):
    """Build whole-word reference ids with a pool of worker processes.

    The non-blank lines of ``args.file_name`` are cut into chunks of 1000
    and handed to ``prepare_ref`` through a ``multiprocessing.Pool``; the
    results are written one JSON list per line to ``args.save_path``.

    Args:
        args: namespace with ``file_name``, ``ltp``, ``bert``, ``processes``
            and ``save_path`` attributes
        gpu: whether a GPU is used; when truthy the torch multiprocessing
            start method is set to ``'spawn'``

    Returns:
        None
    """
    from functools import partial
    from multiprocessing import Pool
    from tqdm import tqdm

    # For Chinese (Ro)Bert the best results come from RoBERTa-wwm-ext
    # (https://github.com/ymcui/Chinese-BERT-wwm).  Fine-tuning those models
    # requires the same word segmenter: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8") as src:
        raw_lines = src.readlines()
    print(f'开始处理数据,共有{len(raw_lines)}条')

    # Avoid delimiters such as '\u2029': keep only lines with real content.
    lines = []
    for line in raw_lines:
        if len(line) > 0 and not line.isspace():
            lines.append(line.strip())

    print(f"开始加载ltp和bert的tokenizer模型")
    ltp_tokenizer = LTP(path=args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    chunk_len = 1000
    chunks = [lines[start:start + chunk_len]
              for start in range(0, len(lines), chunk_len)]

    # With a GPU the workers must be spawned rather than forked.
    if gpu:
        import torch
        torch.multiprocessing.set_start_method('spawn')

    with Pool(processes=args.processes) as pool:
        # Bind both tokenizers so each task only receives its chunk.
        worker = partial(prepare_ref,
                         ltp_tokenizer=ltp_tokenizer,
                         bert_tokenizer=bert_tokenizer)
        # chunksize=8: tasks are sent to the workers eight at a time.
        ref_ids_nest = list(
            tqdm(pool.imap(worker, chunks, chunksize=8), desc="开始处理数据"))

    ref_ids = []
    for nest in ref_ids_nest:
        ref_ids.extend(nest)

    # Save the mapping.
    with open(args.save_path, "w", encoding="utf-8") as dst:
        serialized = [json.dumps(ref) + "\n" for ref in ref_ids]
        dst.writelines(serialized)
    print(f"保存所有{len(serialized)}条数据的映射关系到文件{args.save_path}")
def load_word_segmentation_tool():
    """Load the word segmentation tools.

    Loads the HanLP multi-task model, removes every task not listed in
    ``TASK``, registers a small set of words on the ``TASK[0]`` component
    via ``dict_combine``, and creates a default LTP pipeline.

    :return: ``(HanLP, ltp)``: the pruned HanLP pipeline and the LTP one
    """
    logger.info("loading word segmentation tool")
    model_id = hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH
    HanLP = hanlp.load(model_id, verbose=True)

    # Snapshot the task names first: entries are deleted while pruning.
    unwanted = [task for task in list(HanLP.tasks.keys()) if task not in TASK]
    for task in unwanted:
        del HanLP[task]

    HanLP[TASK[0]].dict_combine = {'新冠', '新冠病毒', '新冠肺炎'}

    ltp = LTP()
    logger.info("loaded word segmentation tool")
    return HanLP, ltp
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
    """Compute, per line, the positions of Chinese ``##`` sub-word tokens.

    Each line is segmented by LTP (filtered through ``get_chinese_word``)
    and tokenized by BERT, both in batches of 100.  ``add_sub_symbol`` then
    rewrites the BERT tokens using the LTP words, and the indices of tokens
    of the form ``##<single Chinese character>`` are collected.

    :param lines: input sentences
    :param ltp_tokenizer: LTP pipeline used for word segmentation
    :param bert_tokenizer: BERT tokenizer used for sub-word tokenization
    :return: one list of token indices per input line
    """
    batch = 100
    batch_starts = range(0, len(lines), batch)

    ltp_res = []
    for start in batch_starts:
        segmented = ltp_tokenizer.seg(lines[start:start + batch])[0]
        ltp_res.extend([get_chinese_word(words) for words in segmented])
    assert len(ltp_res) == len(lines)

    bert_res = []
    for start in batch_starts:
        encoded = bert_tokenizer(lines[start:start + batch],
                                 add_special_tokens=True,
                                 truncation=True,
                                 max_length=512)
        bert_res.extend(encoded["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = [bert_tokenizer._convert_id_to_token(token_id)
                        for token_id in input_ids]
        input_tokens = add_sub_symbol(input_tokens, chinese_word)

        # Only positions of Chinese sub-words starting with "##" are saved,
        # i.e. tokens that continue a whole word.
        ref_id = []
        for position, token in enumerate(input_tokens):
            if token[:2] != "##":
                continue
            clean_token = token[2:]
            if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                ref_id.append(position)
        ref_ids.append(ref_id)

    assert len(ref_ids) == len(bert_res)
    return ref_ids
def load_ltp_weights(weights_type):
    '''
    Load the LTP weight files and instantiate the LTP model.

    The weights are looked up in ``<LtpModelPath>/<weights_type>`` or, when
    ``LtpModelPath`` is ``None``, in ``weights/<weights_type>`` relative to
    the current working directory.

    :param weights_type: which weights to load; must be one of base,
        small or tiny
    :return: the LTP model with the weights loaded
    '''
    # Validate the model type.
    allowed = ['base', 'small', 'tiny']
    assert weights_type in allowed, 'LTP 模型只能采用 base、small、tiny三种类型的参数'

    # Resolve the weights directory.
    if LtpModelPath is not None:
        base_dir = LtpModelPath
    else:
        base_dir = os.path.join(os.path.dirname('.'), 'weights')
    file_path = os.path.abspath(os.path.join(base_dir, weights_type))

    # Load the weights.
    return LTP(path=file_path)
# [[('every', 5)], [('自然数', 'x'), 'and', ('奇数', 'x')]] from ltp import LTP ltp = LTP() class NlpCtr(object): def __init__(self): self.seg = None self.words = None self.dep = None def trans_result(self, depArr, posArr): tempposArr = posArr[0] tempdepArr = depArr[0] tempArr = [] for item in tempdepArr: dic = { 'dep': item[0], 'gov': item[1], 'type': item[2], # 'pos': tempposArr[item[0] - 1] } tempArr.append(dic) return tempArr def getHED(self, words): root = None for word in words: if word['gov'] == 0 and word['type'] == 'HED': root = word['dep']