def test_ner(self):
    """Check that jiagu.ner labels a raw sentence character by character."""
    sentence = '厦门明天会不会下雨'
    labels = jiagu.ner(sentence)  # named entity recognition
    print('NER result:', list(zip(sentence, labels)))

    # Character-level labeling: exactly one tag per input character.
    self.assertEqual(len(labels), len(sentence))
    # The first two characters are tagged as a location; the rest are 'O'.
    expected = ['B-LOC', 'I-LOC'] + ['O'] * 7
    self.assertEqual(labels, expected)
def nercontent(chapter_id):
    """Return the stored NER tag of every segmented word of a chapter as JSON.

    The chapter's word segments come from ``getContentSeg(chapter_id)``,
    which is treated as a sequence of sentences, each a sequence of word
    objects exposing ``id`` and ``wordseg``. If no ``NerWordSeg`` row exists
    for the first word of the chapter, every sentence is passed through
    ``jiagu.ner`` and one ``NerWordSeg`` row per word is added to the
    session. The tags are then read back from ``NerWordSeg``.

    Args:
        chapter_id: Identifier forwarded to ``getContentSeg``.

    Returns:
        A ``(response, 200)`` tuple whose JSON body is
        ``{'ners': [[nertag, ...], ...]}`` with one inner list per sentence.
    """
    wordsegss = getContentSeg(chapter_id)

    # The first word of the chapter serves as the "already tagged?" marker.
    # Looking for the first non-empty sentence avoids an IndexError when the
    # chapter is empty or starts with an empty sentence.
    first_word = next((wordsegs[0] for wordsegs in wordsegss if wordsegs), None)
    if first_word is None:
        # No words at all: nothing to tag and nothing to look up.
        return jsonify({'ners': [[] for _ in wordsegss]}), 200

    # An existence check only needs one row, not the full result list.
    already_tagged = NerWordSeg.query.filter_by(word_id=first_word.id).first()
    if already_tagged is None:
        for wordsegs in wordsegss:
            if not wordsegs:
                continue
            # Named entity recognition on the pre-segmented words.
            tags = jiagu.ner([wordseg.wordseg for wordseg in wordsegs])
            for wordseg, tag in zip(wordsegs, tags):
                db.session.add(
                    NerWordSeg(nertag=ner_dict[tag], word_id=wordseg.id)
                )
        # NOTE(review): nothing is committed here; presumably the session is
        # committed elsewhere (e.g. on request teardown) - confirm.

    ners = [
        [
            NerWordSeg.query.filter_by(word_id=wordseg.id).first().nertag
            for wordseg in wordsegs
        ]
        for wordsegs in wordsegss
    ]
    return jsonify({'ners': ners}), 200
def read_txt(file_path):
    """Run NER over each line of a UTF-8 text file, then print knowledge
    extracted from a built-in sample paragraph.

    For every line of ``file_path`` the labels returned by ``jiagu.ner`` are
    filtered down to those different from ``'O'``; lines with at least one
    such label contribute a list to ``result``. Afterwards the output of
    ``jiagu.knowledge`` for a hard-coded sample text is printed.

    NOTE(review): the original layout was collapsed onto one line; the
    sample-text statements are assumed to belong to this function - confirm.

    Args:
        file_path: Path of the text file to read.

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Strip only the newline so other whitespace in the line is kept.
        all_data = [line.strip('\n') for line in f]

    result = []
    for data in all_data:
        # Compare by value: identity checks against a string literal depend
        # on interpreter interning and raise a SyntaxWarning on Python 3.8+.
        one = [ner for ner in jiagu.ner(data) if ner != 'O']
        if one:
            result.append(one)
    # NOTE(review): `result` is built but never returned or used below.

    text = ''' 4. 香农的信息定义 假定事物状态可以用一个以经典集合论为基础的概率模型来描述,则信息就是用来消除不确定性的东西,或信息是事物运动状态或存在方式的不确定性描述。 但在实际中要寻找一个合适的概率模型往往是非常困难的,有时是否存在这样一种模型还值得探讨。此外,信息有很强的主观性和实用性,但该定义没有考虑信息接收者的主观特性和主观意义,不顾信息的具体含义、具体用途、重要程度和可能引起的后果等因素,这就与实际情况不完全一致。 '''
    keywords = jiagu.knowledge(text)  # original comment labelled this "keywords"
    print(keywords)
import jiagu # jiagu.init() # 可手动初始化,也可以动态初始化 text = '苏州的天气不错' words = jiagu.seg(text) # 分词 print(words) words = jiagu.cut(text) # 分词 print(words) pos = jiagu.pos(words) # 词性标注 print(pos) ner = jiagu.ner(words) # 命名实体识别 print(ner) # 字典模式分词 text = '思知机器人挺好用的' words = jiagu.seg(text) print(words) # jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。 jiagu.load_userdict(['思知机器人']) words = jiagu.seg(text) print(words) text = ''' 该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
import jiagu

# jiagu.init()  # Optional: initialise manually; otherwise it happens on demand.

text = '厦门明天会不会下雨'

# Word segmentation with the default segmenter.
print(jiagu.seg(text))

# Word segmentation with the mmseg model; these words feed the POS tagger.
words = jiagu.cws(text, model="mmseg")
print(words)

# Part-of-speech tagging on the segmented words.
pos = jiagu.pos(words)
print(pos)

# Named entity recognition on the raw text.
ner = jiagu.ner(text)
print(ner)
import jiagu

# Register the custom dictionary before segmenting.
jiagu.load_userdict('dict/user.dict')

# Load the contract text. The context manager closes the file handle, which
# the plain open() call used before never did.
# NOTE(review): no explicit encoding is given, so the platform default is
# used, as before - confirm the file's actual encoding.
with open("contract.txt", "r") as contract_file:
    contract_contents = contract_file.read()

segs = jiagu.seg(contract_contents)
ner = jiagu.ner(segs)
print(segs)
print(ner)

# Print each token with its index and NER label; a newline token becomes an
# empty output line instead.
content_arr = list(segs)
for i, token in enumerate(content_arr):
    if token != '\n':
        print('index: ' + str(i) + ':' + token + ' ' + ner[i])
    else:
        print()