# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53
# Naive-Bayes sentiment-polarity demo on the ChnSentiCorp hotel-review corpus.
from pyhanlp import *
from test_utility import ensure_data

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# ChnSentiCorp: Chinese sentiment-mining corpus of hotel reviews (Tan Songbo).
# ensure_data downloads and unpacks the corpus if it is not already present.
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    """Print *text* together with the sentiment polarity the classifier assigns it."""
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    # Create the classifier; see the IClassifier interface for more advanced features.
    classifier = NaiveBayesClassifier()
    # Train on the corpus.  The trained model supports persistence, so training
    # need not be repeated on subsequent runs.
    classifier.train(chn_senti_corp)
    predict(classifier, "前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!")
    predict(classifier, "结果大失所望,灯光昏暗,空间极其狭小,床垫质量恶劣,房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析,效果不是不行")
# "Introduction to Natural Language Processing" 7.2.1
# The People's Daily corpus and the PKU tagset.
# Downloads the PKU98 corpus on demand and exposes the paths other demos import.
import sys
import os

# Append the current working directory so project-local modules resolve
# regardless of where the script is launched from.
sys.path.append(os.getcwd())

from test_utility import ensure_data

# Root of the (auto-downloaded) PKU98 corpus.
PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')              # full January-1998 corpus
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')  # training split
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')    # test split
POS_MODEL = os.path.join(PKU98, 'pos.bin')                 # part-of-speech model file
NER_MODEL = os.path.join(PKU98, 'ner.bin')                 # named-entity model file
for (start, end) in A & B: word = text[start:end] if dic.containsKey(word): IV_R += 1 else: OOV_R += 1 p, r = A_cap_B_size / B_size * 100, A_cap_B_size / A_size * 100 return p, r, 2 * p * r / (p + r), OOV_R / OOV * 100, IV_R / IV * 100 if __name__ == '__main__': print(to_region('商品 和 服务')) sighan05 = ensure_data( 'icwb2-data', 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip') msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8') msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8') msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt') msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8') DoubleArrayTrieSegment = JClass( 'com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment') segment = DoubleArrayTrieSegment([msr_dict ]).enablePartOfSpeechTagging(True) with open(msr_gold) as test, open(msr_output, 'w') as output: for line in test: output.write(" ".join( term.word for term in segment.seg(re.sub("\\s+", "", line)))) output.write("\n")
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-07 18:37
# "Introduction to Natural Language Processing" 3.6.1
# Japanese word-segmentation corpus: train and run a bigram segmenter.
import sys
import os

# Append the current working directory so project-local modules resolve.
sys.path.append(os.getcwd())

from E_330_ngram_segment import train_bigram, load_bigram
from test_utility import ensure_data

# Download (if needed) the Japanese corpus; jp_corpus first names the corpus
# root, then is deliberately rebound to the UD-GSD training file inside it.
jp_corpus = ensure_data('jpcorpus', 'http://file.hankcs.com/corpus/jpcorpus.zip')
jp_bigram = os.path.join(jp_corpus, 'jp_bigram')  # where the bigram model is stored
jp_corpus = os.path.join(jp_corpus, 'ja_gsd-ud-train.txt')

if __name__ == '__main__':
    train_bigram(jp_corpus, jp_bigram)               # train
    segment = load_bigram(jp_bigram, verbose=False)  # load
    print(segment.seg('自然言語処理入門という本が面白いぞ!'))  # segment Japanese text
# "Introduction to Natural Language Processing" 7.4.2: annotated corpus.
# Train a perceptron POS tagger on the ZhuXian corpus and run joint
# segmentation + tagging on a sample sentence.
import sys
import os

# Append the current working directory so project-local modules resolve.
sys.path.append(os.getcwd())

from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos
from test_utility import ensure_data

# Download (if needed) the ZhuXian corpus and point at its training split.
ZHUXIAN = ensure_data(
    "zhuxian", "http://file.hankcs.com/corpus/zhuxian.zip") + "/train.txt"

# Guard added for consistency with the sibling demo scripts, so importing
# this module does not trigger training.
if __name__ == '__main__':
    posTagger = train_perceptron_pos(ZHUXIAN)  # train
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), posTagger)  # wrap
    print(analyzer.analyze("陆雪琪的天琊神剑不做丝毫退避,直冲而上,瞬间,这两道奇光异宝撞到了一起。"))  # segment + tag
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-21 19:46
# "Introduction to Natural Language Processing" 5.3
# Perceptron-based classification of Chinese given names by gender.
import sys
import os

# Append the current working directory so project-local modules resolve.
sys.path.append(os.getcwd())

from pyhanlp import *
from test_utility import ensure_data

PerceptronNameGenderClassifier = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronNameGenderClassifier')
# Download (if needed) the cnname corpus and locate its train/test splits.
cnname = ensure_data('cnname', 'http://file.hankcs.com/corpus/cnname.zip')
TRAINING_SET = os.path.join(cnname, 'train.csv')
TESTING_SET = os.path.join(cnname, 'test.csv')
MODEL = cnname + ".bin"  # path used by the (optional) persistence example below


def run_classifier(averaged_perceptron):
    """Train, inspect and evaluate a name-gender classifier.

    averaged_perceptron -- True for the averaged-perceptron algorithm,
    False for the naive (plain) perceptron.
    """
    print('=====%s=====' % ('平均感知机算法' if averaged_perceptron else '朴素感知机算法'))
    classifier = PerceptronNameGenderClassifier()
    # train() returns the accuracy on the training set; 10 iterations.
    print('训练集准确率:', classifier.train(TRAINING_SET, 10, averaged_perceptron))
    model = classifier.getModel()
    print('特征数量:', len(model.parameter))
    # Deliberately left commented: how to persist the model and reload it.
    # model.save(MODEL, model.featureMap.entrySet(), 0, True)
    # classifier = PerceptronNameGenderClassifier(MODEL)
    for name in "魏勇刚", "沈雁冰", "陆雪琪", "李冰冰":
        print('%s=%s' % (name, classifier.predict(name)))
    print('测试集准确率:', classifier.evaluate(TESTING_SET))