Example #1
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')


def train_or_load_classifier():
    corpus_path = ensure_data('搜狗文本分类语料库迷你版',
                              'http://hanlp.linrunsoft.com/release/corpus/sogou-text-classification-corpus-mini.zip')
    model_path = corpus_path + '.ser'
    if os.path.isfile(model_path):  # reuse a previously serialized model
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    classifier = NaiveBayesClassifier()
    classifier.train(corpus_path)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)  # wrap the model so callers always get a classifier
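
A minimal driver sketch (hedged: the sample headline is illustrative; classify() returns the predicted category name as a string):

if __name__ == '__main__':
    classifier = train_or_load_classifier()
    print(classifier.classify("奥运会女排夺冠"))  # e.g. a category such as 体育
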
Example #2
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-06-07 18:37
# 《自然语言处理入门》3.6.1: Japanese word segmentation corpus
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/
import os

from tests.book.ch03.ngram_segment import train_bigram, load_bigram
from tests.test_utility import ensure_data

jp_corpus = ensure_data('jpcorpus',
                        'http://file.hankcs.com/corpus/jpcorpus.zip')
jp_bigram = os.path.join(jp_corpus, 'jp_bigram')
jp_corpus = os.path.join(jp_corpus, 'ja_gsd-ud-train.txt')

if __name__ == '__main__':
    train_bigram(jp_corpus, jp_bigram)  # train
    segment = load_bigram(jp_bigram, verbose=False)  # load
    print(segment.seg('自然言語処理入門という本が面白いぞ!'))  # Japanese word segmentation
Example #3
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-07-30 21:03
# 《自然语言处理入门》9.1: New word extraction
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/
from pyhanlp import *
from tests.test_utility import ensure_data

HLM_PATH = ensure_data("红楼梦.txt", "http://file.hankcs.com/corpus/红楼梦.zip")
XYJ_PATH = ensure_data("西游记.txt", "http://file.hankcs.com/corpus/西游记.zip")
SHZ_PATH = ensure_data("水浒传.txt", "http://file.hankcs.com/corpus/水浒传.zip")
SAN_PATH = ensure_data("三国演义.txt", "http://file.hankcs.com/corpus/三国演义.zip")
WEIBO_PATH = ensure_data(
    "weibo-classification",
    "http://file.hankcs.com/corpus/weibo-classification.zip")


def test_weibo():
    for folder in os.listdir(WEIBO_PATH):
        print(folder)
        big_text = ""
        for file in os.listdir(os.path.join(WEIBO_PATH, folder)):
            with open(os.path.join(WEIBO_PATH, folder, file),
                      encoding='utf-8') as src:
                big_text += "".join(src.readlines())
        word_info_list = HanLP.extractWords(big_text, 100)
        print(word_info_list)


def extract(corpus):
Example #4
            for (start, end) in A & B:
                word = text[start:end]
                # tally correctly segmented words by dictionary membership
                # (IV = in-vocabulary, OOV = out-of-vocabulary)
                if dic.containsKey(word):
                    IV_R += 1
                else:
                    OOV_R += 1
    # A = gold segmentation regions, B = predicted regions
    p, r = A_cap_B_size / B_size * 100, A_cap_B_size / A_size * 100
    # precision, recall, F1, OOV recall and IV recall, all in percent
    return p, r, 2 * p * r / (p + r), OOV_R / OOV * 100, IV_R / IV * 100


if __name__ == '__main__':
    print(to_region('商品 和 服务'))

    sighan05 = ensure_data(
        'icwb2-data',
        'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip')
    msr_dict = os.path.join(sighan05, 'gold', 'msr_training_words.utf8')
    msr_test = os.path.join(sighan05, 'testing', 'msr_test.utf8')
    msr_output = os.path.join(sighan05, 'testing', 'msr_output.txt')
    msr_gold = os.path.join(sighan05, 'gold', 'msr_test_gold.utf8')

    DoubleArrayTrieSegment = JClass(
        'com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment')
    segment = DoubleArrayTrieSegment([msr_dict]).enablePartOfSpeechTagging(True)
    with open(msr_gold, encoding='utf-8') as test, \
            open(msr_output, 'w', encoding='utf-8') as output:
        for line in test:
Example #5
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53

from pyhanlp import *
from tests.test_utility import ensure_data

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# Chinese sentiment mining corpus ChnSentiCorp, by Tan Songbo (谭松波)
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://hanlp.linrunsoft.com/release/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    # Create the classifier; for more advanced features, see the IClassifier interface
    classifier = NaiveBayesClassifier()
    classifier.train(chn_senti_corp)
    # The trained model can be persisted so that it need not be retrained next time
    predict(classifier, "前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!")
    predict(classifier, "结果大失所望,灯光昏暗,空间极其狭小,床垫质量恶劣,房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析,效果不是不行")
Example #6
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
sogou_corpus_path = ensure_data(
    '搜狗文本分类语料库迷你版',
    'http://file.hankcs.com/corpus/sogou-text-classification-corpus-mini.zip')


def train_or_load_classifier():
    model_path = sogou_corpus_path + '.ser'
    if os.path.isfile(model_path):
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    classifier = NaiveBayesClassifier()
    classifier.train(sogou_corpus_path)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)


def predict(classifier, text):
    print("《%16s》\t属于分类\t【%s】" % (text, classifier.classify(text)))
    # 如需获取离散型随机变量的分布,请使用predict接口
    # print("《%16s》\t属于分类\t【%s】" % (text, classifier.predict(text)))
Example #7
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-07 13:53

from pyhanlp import *
from tests.test_utility import ensure_data

IClassifier = JClass('com.hankcs.hanlp.classification.classifiers.IClassifier')
NaiveBayesClassifier = JClass('com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# Chinese sentiment mining corpus ChnSentiCorp, by Tan Songbo (谭松波)
chn_senti_corp = ensure_data("ChnSentiCorp情感分析酒店评论", "http://file.hankcs.com/corpus/ChnSentiCorp.zip")


def predict(classifier, text):
    print("《%s》 情感极性是 【%s】" % (text, classifier.classify(text)))


if __name__ == '__main__':
    # Create the classifier; for more advanced features, see the IClassifier interface
    classifier = NaiveBayesClassifier()
    classifier.train(chn_senti_corp)
    # The trained model can be persisted so that it need not be retrained next time
    predict(classifier, "前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!")
    predict(classifier, "结果大失所望,灯光昏暗,空间极其狭小,床垫质量恶劣,房间还伴着一股霉味。")
    predict(classifier, "可利用文本分类实现情感分析,效果不是不行")
Example #8
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-04-28 10:07

from pyhanlp import *
from tests.test_utility import ensure_data

WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')
DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')
model_path = os.path.join(
    ensure_data(
        'hanlp-wiki-vec-zh',
        'http://hanlp.linrunsoft.com/release/model/hanlp-wiki-vec-zh.zip'),
    'hanlp-wiki-vec-zh.txt')
word2vec = WordVectorModel(model_path)
doc2vec = DocVectorModel(word2vec)
docs = ["山东苹果丰收", "农民在江苏种水稻", "奥运会女排夺冠", "世界锦标赛胜出", "中国足球失败"]
for idx, doc in enumerate(docs):
    doc2vec.addDocument(idx, doc)

print(word2vec.nearest('语言'))

for res in doc2vec.nearest('我要看比赛'):
    print('%s = %.2f' %
          (docs[res.getKey().intValue()], res.getValue().floatValue()))
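
WordVectorModel also supports direct word-to-word similarity queries; a minimal sketch (the word pair is illustrative, taken from the docs above):

print(word2vec.similarity('苹果', '水稻'))  # cosine similarity between the two word vectors
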
Example #9
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-04-28 10:07

from pyhanlp import *
from tests.test_utility import ensure_data

WordVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.WordVectorModel')
DocVectorModel = JClass('com.hankcs.hanlp.mining.word2vec.DocVectorModel')
model_path = os.path.join(
    ensure_data('hanlp-wiki-vec-zh', 'http://file.hankcs.com/model/hanlp-wiki-vec-zh.zip'),
    'hanlp-wiki-vec-zh.txt')
word2vec = WordVectorModel(model_path)
doc2vec = DocVectorModel(word2vec)
docs = ["山东苹果丰收", "农民在江苏种水稻", "奥运会女排夺冠", "世界锦标赛胜出", "中国足球失败"]
for idx, doc in enumerate(docs):
    doc2vec.addDocument(idx, doc)

print(word2vec.nearest('语言'))

for res in doc2vec.nearest('我要看比赛'):
    print('%s = %.2f' % (docs[res.getKey().intValue()], res.getValue().floatValue()))
Example #10
# Date: 2018-07-29 23:24
# 《自然语言处理入门》8.6: Custom-domain named entity recognition
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/
import os
import sys

# make the project root importable
o_path = os.getcwd()  # current working directory
sys.path.append(o_path)  # add it to the module search path
from tests.book.ch05.perceptron_cws import CWSTrainer
from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import PerceptronPOSTagger
from tests.book.ch08.demo_sp_ner import NERTrainer, PerceptronNERecognizer
from tests.test_utility import ensure_data

PLANE_ROOT = ensure_data("plane-re",
                         "http://file.hankcs.com/corpus/plane-re.zip")
PLANE_CORPUS = os.path.join(PLANE_ROOT, 'train.txt')
PLANE_MODEL = os.path.join(PLANE_ROOT, 'model.bin')

if __name__ == '__main__':
    trainer = NERTrainer()
    trainer.tagSet.nerLabels.clear()  # do not recognize nr, ns or nt
    trainer.tagSet.nerLabels.add("np")  # the goal is to recognize np only
    recognizer = PerceptronNERecognizer(
        trainer.train(PLANE_CORPUS, PLANE_MODEL).getModel())
    # NER needs a segmenter before prediction, ideally one trained on the same corpus
    CWS_MODEL = CWSTrainer().train(PLANE_CORPUS,
                                   PLANE_MODEL.replace('model.bin',
                                                       'cws.bin')).getModel()
    analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(CWS_MODEL),
                                       PerceptronPOSTagger(), recognizer)
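
The analyzer is constructed but never called in this snippet; a hedged usage line (the sample sentence is a hypothetical stand-in for aircraft-domain text):

print(analyzer.analyze("歼-7的研制历程"))  # np entities such as aircraft names should be tagged
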
Example #11
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-07-06 13:54
# 《自然语言处理入门》7.4.2: Annotating a corpus
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/
from tests.book.ch07.demo_hmm_pos import AbstractLexicalAnalyzer, PerceptronSegmenter
from tests.book.ch07.demo_perceptron_pos import train_perceptron_pos
from tests.test_utility import ensure_data

ZHUXIAN = ensure_data(
    "zhuxian", "http://file.hankcs.com/corpus/zhuxian.zip") + "/train.txt"
posTagger = train_perceptron_pos(ZHUXIAN)  # train
analyzer = AbstractLexicalAnalyzer(PerceptronSegmenter(), posTagger)  # wrap into an analyzer
print(analyzer.analyze("陆雪琪的天琊神剑不做丝毫退避,直冲而上,瞬间,这两道奇光异宝撞到了一起。"))  # segmentation + POS tagging
Example #12
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-06-21 19:46
# 《自然语言处理入门》5.3: Perceptron-based classification of names by gender
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/

from pyhanlp import *
from tests.test_utility import ensure_data

PerceptronNameGenderClassifier = JClass(
    'com.hankcs.hanlp.model.perceptron.PerceptronNameGenderClassifier')
cnname = ensure_data('cnname', 'http://file.hankcs.com/corpus/cnname.zip')
TRAINING_SET = os.path.join(cnname, 'train.csv')
TESTING_SET = os.path.join(cnname, 'test.csv')
MODEL = cnname + ".bin"


def run_classifier(averaged_perceptron):
    print('=====%s=====' % ('averaged perceptron' if averaged_perceptron else 'naive perceptron'))
    classifier = PerceptronNameGenderClassifier()
    print('Training set accuracy:', classifier.train(TRAINING_SET, 10, averaged_perceptron))
    model = classifier.getModel()
    print('Number of features:', len(model.parameter))
    # model.save(MODEL, model.featureMap.entrySet(), 0, True)
    # classifier = PerceptronNameGenderClassifier(MODEL)
    for name in "赵建军", "沈雁冰", "陆雪琪", "李冰冰":
        print('%s=%s' % (name, classifier.predict(name)))
    print('Test set accuracy:', classifier.evaluate(TESTING_SET))
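
Presumably the demo is driven by a main guard comparing both variants; a sketch consistent with run_classifier's signature:

if __name__ == '__main__':
    run_classifier(False)  # naive perceptron
    run_classifier(True)   # averaged perceptron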

Example #13
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
sogou_corpus_path = ensure_data(
    '搜狗文本分类语料库迷你版',
    'http://hanlp.linrunsoft.com/release/corpus/sogou-text-classification-corpus-mini.zip'
)


def train_or_load_classifier():
    model_path = sogou_corpus_path + '.ser'
    if os.path.isfile(model_path):
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    classifier = NaiveBayesClassifier()
    classifier.train(sogou_corpus_path)
    model = classifier.getModel()
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)


def predict(classifier, text):
    print("《%16s》\t属于分类\t【%s】" % (text, classifier.classify(text)))
    # 如需获取离散型随机变量的分布,请使用predict接口
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-05-23 17:26
import os

from pyhanlp import SafeJClass
from tests.test_utility import ensure_data

# Bind the NaiveBayesClassifier Java class
NaiveBayesClassifier = SafeJClass(
    'com.hankcs.hanlp.classification.classifiers.NaiveBayesClassifier')
# Bind the file I/O utility class
IOUtil = SafeJClass('com.hankcs.hanlp.corpus.io.IOUtil')
# Your own Sina Weibo training corpus (if pyhanlp was installed under Anaconda,
# place the training folder under the path below:
# C:\ProgramData\Anaconda3\Lib\site-packages\pyhanlp\static\data\test)
sentiment_corpus_path = ensure_data('新浪微博/train', '')


def train_or_load_classifier():
    # Naive Bayes model file name
    model_path = sentiment_corpus_path + '.ser'
    # If a model file already exists, load it and return a classifier built from it
    if os.path.isfile(model_path):
        return NaiveBayesClassifier(IOUtil.readObjectFrom(model_path))
    # Otherwise build a fresh Naive Bayes classifier
    classifier = NaiveBayesClassifier()
    # Train it on the corpus directory
    classifier.train(sentiment_corpus_path)
    # Fetch the trained model
    model = classifier.getModel()
    # Persist the model to a file
    IOUtil.saveObjectTo(model, model_path)
    return NaiveBayesClassifier(model)
Example #15
File: pku.py Project: HBU/NLP
# -*- coding:utf-8 -*-
# Author:hankcs
# Date: 2018-07-04 17:41
# 《自然语言处理入门》7.2.1: The People's Daily corpus and the PKU tagset
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/
import os

from tests.test_utility import ensure_data

PKU98 = ensure_data("pku98", "http://file.hankcs.com/corpus/pku98.zip")
PKU199801 = os.path.join(PKU98, '199801.txt')
PKU199801_TRAIN = os.path.join(PKU98, '199801-train.txt')
PKU199801_TEST = os.path.join(PKU98, '199801-test.txt')
POS_MODEL = os.path.join(PKU98, 'pos.bin')
NER_MODEL = os.path.join(PKU98, 'ner.bin')
Example #16
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-02-11 23:18
# 《自然语言处理入门》12.5.1: Training the model
# Companion book: http://nlp.hankcs.com/book.php
# Questions & discussion: https://bbs.hankcs.com/

from pyhanlp import *
from tests.test_utility import ensure_data

KBeamArcEagerDependencyParser = JClass(
    'com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser'
)
CTB_ROOT = ensure_data("ctb8.0-dep",
                       "http://file.hankcs.com/corpus/ctb8.0-dep.zip")
CTB_TRAIN = CTB_ROOT + "/train.conll"
CTB_DEV = CTB_ROOT + "/dev.conll"
CTB_TEST = CTB_ROOT + "/test.conll"
CTB_MODEL = CTB_ROOT + "/ctb.bin"
BROWN_CLUSTER = ensure_data(
    "wiki-cn-cluster.txt", "http://file.hankcs.com/corpus/wiki-cn-cluster.zip")

if __name__ == '__main__':
    parser = KBeamArcEagerDependencyParser.train(CTB_TRAIN, CTB_DEV,
                                                 BROWN_CLUSTER, CTB_MODEL)
    print(parser.parse("人吃鱼"))
    score = parser.evaluate(CTB_TEST)
    print("UAS=%.1f LAS=%.1f\n" % (score[0], score[1]))