def split_by_pos(self, sentence):
    pos_dict = {}
    pos_dict_out = {}
    input_words = jiagu.seg(sent_flush(sentence))  # word segmentation
    input_pos = jiagu.pos(input_words)             # part-of-speech tagging
    for word, po in zip(input_words, input_pos):
        if pos_dict.get(po) is None:
            pos_dict[po] = [word]
        else:
            pos_dict[po].append(word)
    # POS tags that matched only one word are collected under 'other'.
    for key, val in pos_dict.items():
        if len(val) == 1:
            if pos_dict_out.get('other') is None:
                pos_dict_out['other'] = [val[0]]
            else:
                pos_dict_out['other'].append(val[0])
        else:
            pos_dict_out[key] = val
    return pos_dict_out
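# Illustrative usage sketch for split_by_pos; "PosSplitter" is an assumed
# host class, not from the original source. POS tags that matched two or
# more words keep their own bucket; single-word tags end up under 'other'.
splitter = PosSplitter()
buckets = splitter.split_by_pos('苏州的天气不错')
for tag, bucket in buckets.items():
    print(tag, bucket)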
from math import exp, log

# classify() belongs to the Bayes class; x is a list of words.
def classify(self, x):
    tmp = {}
    # log-joint score for each class k: log P(k) + sum of log P(word | k)
    for k in self.d:
        tmp[k] = log(self.d[k].getsum()) - log(self.total)
        for word in x:
            tmp[k] += log(self.d[k].freq(word))
    ret, prob = 0, 0
    for k in self.d:
        now = 0
        try:
            # posterior for class k: 1 / sum_j exp(tmp[j] - tmp[k])
            for otherk in self.d:
                now += exp(tmp[otherk] - tmp[k])
            now = 1 / now
        except OverflowError:
            now = 0
        if now > prob:
            ret, prob = k, now
    return (ret, prob)


if __name__ == '__main__':
    classifier = Bayes()
    classifier.load('model/1.model')  # load a trained model for prediction

    import jiagu

    words = jiagu.seg('今天真的开心')
    ret, prob = classifier.classify(words)
    print(ret, prob)
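# Illustrative sketch: the try/except above computes a softmax over the
# log-joint scores, P(k | x) = exp(tmp[k]) / sum_j exp(tmp[j]), rewritten as
# 1 / sum_j exp(tmp[j] - tmp[k]) so the exponents stay small. The function
# name "posterior" is illustrative, not from the original source.
from math import exp

def posterior(log_joint):
    probs = {}
    for k, s_k in log_joint.items():
        probs[k] = 1.0 / sum(exp(s_j - s_k) for s_j in log_joint.values())
    return probs

print(posterior({'pos': -40.2, 'neg': -43.7}))  # -> {'pos': ~0.97, 'neg': ~0.03}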
import jiagu

# jiagu.init()  # manual initialization is optional; jiagu also initializes lazily

text = '苏州的天气不错'

words = jiagu.seg(text)  # word segmentation
print(words)

words = jiagu.cut(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(words)  # named entity recognition
print(ner)

# dictionary-mode segmentation
text = '思知机器人挺好用的'
words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict')  # load a user dictionary; accepts a file path or a list of words
jiagu.load_userdict(['思知机器人'])

words = jiagu.seg(text)
print(words)

text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
'''
import jiagu

text = '有意义'

words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict')  # load a user dictionary; accepts a file path or a list of words
jiagu.load_userdict(['意义'])

words = jiagu.seg(text)  # custom segmentation; takes effect in dictionary mode
print(words)
def test_seg_one(self):
    sentence = "人要是行干一行行一行"
    words = jiagu.seg(sentence, model="mmseg")
    self.assertEqual(list(words), ['人', '要是', '行', '干一行', '行', '一行'])
import jiagu

# jiagu.init()  # manual initialization is optional; jiagu also initializes lazily

text = '厦门明天会不会下雨'

words = jiagu.seg(text)  # segmentation; the model argument selects the algorithm: default if omitted, "mmseg" for the mmseg algorithm
print(words)

# words = jiagu.seg(text, model="mmseg")  # mmseg segmentation returns a generator; convert it with list()
# print(list(words))

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(text)  # named entity recognition
print(ner)

text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
NASA埃姆斯研究中心的科学家拉玛·内曼尼(Rama Nemani)说,“这一长期数据能让我们深入分析地表绿化背后的影响因素。我们一开始以为,植被增加是由于更多二氧化碳排放,导致气候更加温暖、潮湿,适宜生长。”
“MODIS的数据让我们能在非常小的尺度上理解这一现象,我们发现人类活动也作出了贡献。”
NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来源于植树造林工程,对于减少土壤侵蚀、空气污染与气候变化发挥了作用。
据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
'''

keywords = jiagu.keywords(text, 5)  # keyword extraction
print(keywords)

summarize = jiagu.summarize(text, 3)  # extractive summarization
print(summarize)
import jiagu

# jiagu.init()  # manual initialization is optional; jiagu also initializes lazily

text = '厦门明天会不会下雨'

words = jiagu.seg(text)  # word segmentation
print(words)

mmseg_words = jiagu.seg(text, model="mmseg")  # mmseg segmentation returns a generator
print(list(mmseg_words))

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

ner = jiagu.ner(text)  # named entity recognition
print(ner)
import jiagu

jiagu.load_userdict('dict/user.dict')

# read the contract file
with open("contract.txt", "r") as contract_file:
    contract_contents = contract_file.read()

segs = jiagu.seg(contract_contents)
ner = jiagu.ner(segs)
print(segs)
print(ner)

content_arr = list(segs)
for i in range(len(content_arr)):
    if content_arr[i] != '\n':
        print('index: ' + str(i) + ':' + content_arr[i] + ' ' + ner[i])
    else:
        print()
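# Illustrative alternative: the index loop above written with enumerate/zip;
# behavior is unchanged, just less index bookkeeping.
for i, (token, tag) in enumerate(zip(content_arr, ner)):
    if token != '\n':
        print('index: {}:{} {}'.format(i, token, tag))
    else:
        print()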
import jiagu

# jiagu.init()  # manual initialization is optional; jiagu also initializes lazily

text = '携手推动民族复兴,实现和平统一目标;探索“两制”台湾方案,丰富和平统一实践;坚持一个中国原则,维护和平统一前景;深化两岸融合发展,夯实和平统一基础;实现同胞心灵契合,增进和平统一认同。在《告台湾同胞书》发表40周年纪念会上,习近平总书记提出的这五个方面重大政策主张,系统阐释了实现国家统一的目标内涵、基本方针、路径模式,深刻指明了今后一个时期对台工作的基本思路、重点任务和前进方向,既有坚定的原则性又有极强的针对性和极大的包容性,展现了非凡的政治勇气和政治智慧。'

words = jiagu.seg(text)

stop_words = ['的', ',', ';', '、']
words = [w for w in words if w not in stop_words]  # drop stop words, punctuation, etc.

keywords = jiagu.keywords(words)  # keyword extraction
print(keywords)
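# Illustrative sketch: hardcoded stop-word lists get unwieldy; a common
# alternative loads them from a one-word-per-line file. "stopwords.txt" is
# an assumed file name, not from the original source.
with open('stopwords.txt', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f if line.strip()}

words = [w for w in jiagu.seg(text) if w not in stop_words]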
from datetime import datetime

import jiagu
import pandas as pd
from transformers import AutoTokenizer

# get_last_ten_trady_date, get_market_news, get_stock_concept and
# get_stock_money_flow are helper functions defined elsewhere in the project.

stock_code_list = ['000513.SZ', '000698.SZ']
trade_date = str(datetime.now().strftime('%Y%m%d'))
last_ten_trady_date = get_last_ten_trady_date(trade_date)
df_market_news = get_market_news(trade_date)
df_stock_message_and_money_flow = pd.DataFrame(columns=[
    'tscode', 'message_original', 'message_parsed', 'label', 'score'
])

for i in range(len(stock_code_list)):
    stock_code = stock_code_list[i]
    stock_concept = get_stock_concept(stock_code)
    # load the stock's concept keywords as a jiagu user dictionary
    jiagu.load_userdict(stock_concept)
    for j in range(len(df_market_news)):
        content = df_market_news.loc[j]['content']
        # Chinese word segmentation
        words = jiagu.seg(content)
        for k in range(len(words)):
            if stock_concept.count(words[k]) > 0:
                message_parsed = content + get_stock_money_flow(
                    stock_code, last_ten_trady_date, trade_date)
                # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
                df_stock_message_and_money_flow = df_stock_message_and_money_flow.append(
                    pd.DataFrame(
                        data={
                            'tscode': [stock_code],
                            'message_original': [content],
                            'message_parsed': [message_parsed],
                            'label': [0],
                            'score': [0]
                        }),
                    ignore_index=True)

tokenizer = AutoTokenizer.from_pretrained("./bert-base-chinese")
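# Illustrative sketch: stock_concept.count(...) rescans the whole concept
# list for every token. Building a set once per stock makes each membership
# test O(1); note this variant also appends at most one row per article
# instead of one per matching token. "mentions_concept" is an illustrative
# helper name, not from the original source.
def mentions_concept(content, concept_set):
    return any(w in concept_set for w in jiagu.seg(content))

concept_set = set(stock_concept)  # build once per stock
if mentions_concept(content, concept_set):
    pass  # build and append the row as in the loop above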
def test_seg_two(self):
    sentence = "武汉市长江大桥上的日落,很喜欢看日出日落。"
    words = jiagu.seg(sentence, model="mmseg")
    self.assertEqual(
        list(words),
        ['武汉市', '长江大桥', '上', '的', '日落', ',', '很', '喜欢', '看', '日出日落', '。'])
def jiagu(self, text):
    # Jiagu segmentation (the method name shadows the module on the class,
    # but the global module reference inside the body still resolves)
    jiagu_result = jiagu.seg(text)
    return jiagu_result
import time

import thulac
import pkuseg
import jiagu
import fool
from pyhanlp import HanLP

# samples: a list of test sentences defined earlier in the script

start_time = time.time()
thul = thulac.thulac(seg_only=True)
for sample in samples:
    print(thul.cut(sample, text=True))
end_time = time.time()
print('thulac time', (end_time - start_time) * 1000)

start_time = time.time()
seg = pkuseg.pkuseg()
for sample in samples:
    print(seg.cut(sample))
end_time = time.time()
print('pkuseg time', (end_time - start_time) * 1000)

start_time = time.time()
for sample in samples:
    print(jiagu.seg(sample))
end_time = time.time()
print('jiagu time', (end_time - start_time) * 1000)

start_time = time.time()
for sample in samples:
    print(fool.cut(sample))
end_time = time.time()
print('fool time', (end_time - start_time) * 1000)

start_time = time.time()
for sample in samples:
    print(HanLP.segment(sample))
end_time = time.time()
print('HanLP time', (end_time - start_time) * 1000)
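# Illustrative sketch: the repeated start/stop timing boilerplate can be
# folded into one helper ("bench" is an illustrative name, not from the
# original source).
def bench(name, segment, samples):
    start = time.time()
    for sample in samples:
        print(segment(sample))
    print(name, 'time', (time.time() - start) * 1000)

# e.g. bench('jiagu', jiagu.seg, samples)
#      bench('thulac', lambda s: thul.cut(s, text=True), samples)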
import os
import json

import jiagu
from _utils import find_all_file, cut_filename

ROOT = os.path.dirname(os.path.abspath(__file__))
fc = lambda s: " ".join(jiagu.seg(s))  # segment a string and join the tokens with spaces
path = lambda ROOT, *a: os.path.join(ROOT, *a)


def get_target_file(need_TW=False):
    """Yield the test files to process."""
    for file_path in find_all_file(path(ROOT, 'res'), suffix_filter='utf8'):
        name = cut_filename(file_path)
        if name[2] == ".utf8":
            if "tw" not in name[1]:
                yield file_path
            elif "tw" in name[1] and need_TW:
                yield file_path


if __name__ == "__main__":
    # segment each target file with jiagu and write the output
    for file_address in get_target_file():
        print('file_address', file_address)
        out = []
        with open(file_address, "r", encoding="utf-8") as file:
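            # Hypothetical continuation (the original snippet is truncated
            # here): segment each line with fc and write the joined result to
            # an output file; the ".seg" suffix is an assumption, not from
            # the original source.
            for line in file:
                out.append(fc(line.strip()))
        out_path = file_address + ".seg"
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write("\n".join(out))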