# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53

from pyhanlp import *
from test_utility import ensure_data

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# Chinese sentiment mining corpus ChnSentiCorp, by Tan Songbo
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    # Create the classifier; see the IClassifier interface for more advanced features
    classifier = NaiveBayesClassifier()
    classifier.train(chn_senti_corp)
    # The trained model can be persisted so that training can be skipped next time
    predict(classifier, "前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!")
    predict(classifier, "结果大失所望,灯光昏暗,空间极其狭小,床垫质量恶劣,房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析,效果不是不行")
Example #2
File: E_721_pku.py  Project: HBU/NLP
# 《自然语言处理入门》 7.2.1 The People's Daily corpus and the PKU tagset
import sys
import os
# Get the current root directory
o_path = os.getcwd()  # returns the current working directory
sys.path.append(o_path)  # add it to the module search path

from test_utility import ensure_data

PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
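
# This module only defines corpus and model paths. A hedged sketch of how
# POS_MODEL might be produced and loaded with HanLP's perceptron API
# (POSTrainer and PerceptronPOSTagger are existing HanLP classes; the demo
# sentence and driver below are illustrative, not from the original file):
if __name__ == '__main__':
    from pyhanlp import *
    POSTrainer = JClass('com.hankcs.hanlp.model.perceptron.POSTrainer')
    PerceptronPOSTagger = JClass('com.hankcs.hanlp.model.perceptron.PerceptronPOSTagger')
    POSTrainer().train(PKU199801_TRAIN, POS_MODEL)  # train on the PKU training split and save
    tagger = PerceptronPOSTagger(POS_MODEL)  # load the saved model
    print(tagger.tag("他", "的", "希望", "是", "希望", "上学"))  # tag a pre-segmented word sequence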
Example #3
# -*- coding:utf-8 -*-
# Evaluating a word segmenter on the SIGHAN 2005 MSR corpus.
# NOTE: the head of this snippet was truncated in the source; the imports and
# the function definitions below are reconstructed so that the fragment runs.
import os
import re

from pyhanlp import *
from test_utility import ensure_data


def to_region(segmentation: str) -> list:
    # Convert a space-delimited segmentation into (start, end) character intervals
    region = []
    start = 0
    for word in re.compile("\\s+").split(segmentation.strip()):
        end = start + len(word)
        region.append((start, end))
        start = end
    return region


def prf(gold: str, pred: str, dic) -> tuple:
    # Precision, recall, F1, OOV recall and IV recall of pred against gold;
    # dic is the segmenter's dictionary, used to tell IV words from OOV words
    A_size, B_size, A_cap_B_size, OOV, IV, OOV_R, IV_R = 0, 0, 0, 0, 0, 0, 0
    with open(gold) as gd, open(pred) as pd:
        for g, p in zip(gd, pd):
            A, B = set(to_region(g)), set(to_region(p))
            A_size += len(A)
            B_size += len(B)
            A_cap_B_size += len(A & B)
            text = re.sub("\\s+", "", g)
            for (start, end) in A:
                word = text[start:end]
                if dic.containsKey(word):
                    IV += 1
                else:
                    OOV += 1
            for (start, end) in A & B:
                word = text[start:end]
                if dic.containsKey(word):
                    IV_R += 1
                else:
                    OOV_R += 1
    p, r = A_cap_B_size / B_size * 100, A_cap_B_size / A_size * 100
    return p, r, 2 * p * r / (p + r), OOV_R / OOV * 100, IV_R / IV * 100


if __name__ == '__main__':
    print(to_region('商品 和 服务'))

    sighan05 = ensure_data(
        'icwb2-data',
        'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
    msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
    msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
    msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt')
    msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')

    DoubleArrayTrieSegment = JClass('com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment')
    segment = DoubleArrayTrieSegment([msr_dict]).enablePartOfSpeechTagging(True)
    # Strip the gold segmentation back to raw text, re-segment, and save the output
    with open(msr_gold) as test, open(msr_output, 'w') as output:
        for line in test:
            output.write("  ".join(
                term.word for term in segment.seg(re.sub("\\s+", "", line))))
            output.write("\n")
Example #4
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-07 18:37
# 《自然语言处理入门》 3.6.1 Japanese word segmentation corpus
import sys
import os
# Get the current root directory
o_path = os.getcwd()  # returns the current working directory
sys.path.append(o_path)  # add it to the module search path

from E_330_ngram_segment import train_bigram, load_bigram
from test_utility import ensure_data

jp_corpus = ensure_data('jpcorpus',
                        'http://file.hankcs.com/corpus/jpcorpus.zip')
jp_bigram = os.path.join(jp_corpus, 'jp_bigram')  # path for the trained bigram model
jp_corpus = os.path.join(jp_corpus, 'ja_gsd-ud-train.txt')  # training corpus file (note: reuses the variable)

if __name__ == '__main__':
    train_bigram(jp_corpus, jp_bigram)  # train the bigram model
    segment = load_bigram(jp_bigram, verbose=False)  # load it into a segmenter
    print(segment.seg('自然言語処理入門という本が面白いぞ!'))  # Japanese word segmentation
Example #5
# 《自然语言处理入门》 7.4.2 Annotated corpus
import sys
import os
# Get the current root directory
o_path = os.getcwd()  # returns the current working directory
sys.path.append(o_path)  # add it to the module search path
from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos
from test_utility import ensure_data

ZHUXIAN = ensure_data(
    "zhuxian", "http://file.hankcs.com/corpus/zhuxian.zip") + "/train.txt"
posTagger = train_perceptron_pos(ZHUXIAN)  # train
analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), posTagger)  # wrap segmenter and tagger into an analyzer
print(analyzer.analyze("陆雪琪的天琊神剑不做丝毫退避,直冲而上,瞬间,这两道奇光异宝撞到了一起。"))  # segmentation + POS tagging
Example #6
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2018-06-21 19:46
# 《自然语言处理入门》 5.3 Perceptron-based classification of gender from personal names
import sys
import os
# Get the current root directory
o_path = os.getcwd()  # returns the current working directory
sys.path.append(o_path)  # add it to the module search path

from pyhanlp import *
from test_utility import ensure_data

PerceptronNameGenderClassifier = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronNameGenderClassifier')
cnname = ensure_data('cnname', 'http://file.hankcs.com/corpus/cnname.zip')
TRAINING_SET = os.path.join(cnname, 'train.csv')
TESTING_SET = os.path.join(cnname, 'test.csv')
MODEL = cnname + ".bin"


def run_classifier(averaged_perceptron):
    print('=====%s=====' % ('averaged perceptron' if averaged_perceptron else 'naive perceptron'))
    classifier = PerceptronNameGenderClassifier()
    print('Training set accuracy:', classifier.train(TRAINING_SET, 10, averaged_perceptron))
    model = classifier.getModel()
    print('Number of features:', len(model.parameter))
    # Optionally persist the trained model and reload it:
    # model.save(MODEL, model.featureMap.entrySet(), 0, True)
    # classifier = PerceptronNameGenderClassifier(MODEL)
    for name in "魏勇刚", "沈雁冰", "陆雪琪", "李冰冰":
        print('%s=%s' % (name, classifier.predict(name)))
    print('Test set accuracy:', classifier.evaluate(TESTING_SET))
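
# The snippet defines run_classifier but never calls it; a minimal driver in the
# style of the other examples (that the original file runs both variants is an
# assumption):
if __name__ == '__main__':
    run_classifier(False)  # naive perceptron
    run_classifier(True)   # averaged perceptron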